From a51835bb9ef04f0e36056536aaf005a22cc08ad3 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 1 Jun 2020 08:30:23 +0200 Subject: [PATCH 01/29] POC: ArrayManager -- array-based data manager for columnar store --- pandas/core/frame.py | 12 +- pandas/core/generic.py | 12 +- pandas/core/internals/__init__.py | 1 + pandas/core/internals/concat.py | 20 +- pandas/core/internals/managers.py | 554 +++++++++++++++++++++++++++++- 5 files changed, 590 insertions(+), 9 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c48bec9b670ad..7f94232237d54 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -128,7 +128,7 @@ from pandas.core.indexes.multi import MultiIndex, maybe_droplevels from pandas.core.indexes.period import PeriodIndex from pandas.core.indexing import check_bool_indexer, convert_to_index_sliceable -from pandas.core.internals import BlockManager +from pandas.core.internals import ArrayManager, BlockManager from pandas.core.internals.construction import ( arrays_to_mgr, dataclasses_to_dicts, @@ -446,6 +446,7 @@ def __init__( columns: Optional[Axes] = None, dtype: Optional[Dtype] = None, copy: bool = False, + manager: str = "array", ): if data is None: data = {} @@ -455,7 +456,7 @@ def __init__( if isinstance(data, DataFrame): data = data._mgr - if isinstance(data, BlockManager): + if isinstance(data, (BlockManager, ArrayManager)): if index is None and columns is None and dtype is None and copy is False: # GH#33357 fastpath NDFrame.__init__( @@ -564,6 +565,11 @@ def __init__( values, index, columns, dtype=values.dtype, copy=False ) + if manager == "array" and not isinstance(mgr, ArrayManager): + # TODO proper initialization + df = DataFrame(mgr, manager="block") + arrays = [arr.copy() for arr in df._iter_column_arrays()] + mgr = ArrayManager(arrays, [mgr.axes[1], mgr.axes[0]]) NDFrame.__init__(self, mgr) # ---------------------------------------------------------------------- @@ -638,6 +644,8 @@ def _is_homogeneous_type(self) -> bool: ... 
"B": np.array([1, 2], dtype=np.int64)})._is_homogeneous_type False """ + if isinstance(self._mgr, ArrayManager): + return False if self._mgr.any_extension_types: return len({block.dtype for block in self._mgr.blocks}) == 1 else: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6c8780a0fc186..52ee65afecb2b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -100,7 +100,7 @@ from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexes.period import Period, PeriodIndex import pandas.core.indexing as indexing -from pandas.core.internals import BlockManager +from pandas.core.internals import ArrayManager, BlockManager from pandas.core.missing import find_valid_index from pandas.core.ops import _align_method_FRAME from pandas.core.shared_docs import _shared_docs @@ -197,7 +197,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): _deprecations: FrozenSet[str] = frozenset(["get_values", "tshift"]) _metadata: List[str] = [] _is_copy = None - _mgr: BlockManager + _mgr: Union[BlockManager, ArrayManager] _attrs: Dict[Optional[Hashable], Any] _typ: str @@ -206,7 +206,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): def __init__( self, - data: BlockManager, + data: Union[BlockManager, ArrayManager], copy: bool = False, attrs: Optional[Mapping[Optional[Hashable], Any]] = None, ): @@ -223,7 +223,9 @@ def __init__( object.__setattr__(self, "_flags", Flags(self, allows_duplicate_labels=True)) @classmethod - def _init_mgr(cls, mgr, axes, dtype=None, copy: bool = False) -> BlockManager: + def _init_mgr( + cls, mgr, axes, dtype=None, copy: bool = False + ) -> Union[BlockManager, ArrayManager]: """ passed a manager and a axes dict """ for a, axe in axes.items(): if axe is not None: @@ -5372,6 +5374,8 @@ def _protect_consolidate(self, f): Consolidate _mgr -- if the blocks have changed, then clear the cache """ + if isinstance(self._mgr, ArrayManager): + return f() blocks_before = len(self._mgr.blocks) result = f() if len(self._mgr.blocks) != blocks_before: diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py index e12e0d7760ea7..dd06955e26081 100644 --- a/pandas/core/internals/__init__.py +++ b/pandas/core/internals/__init__.py @@ -16,6 +16,7 @@ from pandas.core.internals.concat import concatenate_block_managers from pandas.core.internals.managers import ( BlockManager, + ArrayManager, SingleBlockManager, create_block_manager_from_arrays, create_block_manager_from_blocks, diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 88839d2211f81..06a5ba20fb35a 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -1,5 +1,6 @@ from collections import defaultdict import copy +import itertools from typing import Dict, List import numpy as np @@ -26,7 +27,7 @@ import pandas.core.algorithms as algos from pandas.core.arrays import ExtensionArray from pandas.core.internals.blocks import make_block -from pandas.core.internals.managers import BlockManager +from pandas.core.internals.managers import ArrayManager, BlockManager def concatenate_block_managers( @@ -46,6 +47,23 @@ def concatenate_block_managers( ------- BlockManager """ + # breakpoint() + + if isinstance(mgrs_indexers[0][0], ArrayManager): + + if concat_axis == 1: + # TODO for now only fastpath without indexers + mgrs = [t[0] for t in mgrs_indexers] + arrays = [ + np.concatenate([mgrs[i].arrays[j] for i in range(len(mgrs))]) + for j in range(len(mgrs[0].arrays)) + ] + return 
ArrayManager(arrays, [axes[1], axes[0]]) + elif concat_axis == 0: + mgrs = [t[0] for t in mgrs_indexers] + arrays = list(itertools.chain.from_iterable([mgr.arrays for mgr in mgrs])) + return ArrayManager(arrays, [axes[1], axes[0]]) + concat_plans = [ _get_mgr_concatenation_plan(mgr, indexers) for mgr, indexers in mgrs_indexers ] diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 2e3098d94afcb..822c9a46f8aa0 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -18,7 +18,7 @@ import numpy as np -from pandas._libs import internals as libinternals, lib +from pandas._libs import algos as libalgos, internals as libinternals, lib from pandas._typing import ArrayLike, DtypeObj, Label, Scalar from pandas.util._validators import validate_bool_kwarg @@ -33,6 +33,7 @@ is_dtype_equal, is_extension_array_dtype, is_list_like, + is_numeric_dtype, is_numeric_v_string_like, is_scalar, ) @@ -42,6 +43,7 @@ from pandas.core.dtypes.missing import array_equals, isna import pandas.core.algorithms as algos +from pandas.core.arrays import ExtensionArray from pandas.core.arrays.sparse import SparseDtype from pandas.core.base import PandasObject import pandas.core.common as com @@ -66,7 +68,555 @@ T = TypeVar("T", bound="BlockManager") -class BlockManager(PandasObject): +class DataManager(PandasObject): + + pass + + +class ArrayManager(DataManager): + + __slots__ = [ + "_axes", + "arrays", + ] + + arrays: List[np.ndarray] + axes: Sequence[Index] + + def __init__( + self, + arrays: List[np.ndarray], + axes: Sequence[Index], + do_integrity_check: bool = True, + ): + self._axes = axes + self.arrays = arrays + + if do_integrity_check: + self._axes = [ensure_index(ax) for ax in axes] + self._verify_integrity() + + @property + def items(self) -> Index: + return self._axes[1] + + @property + def axes(self) -> Sequence[Index]: + return [self._axes[1], self._axes[0]] + + @property + def shape(self) -> Tuple[int, ...]: + # this still gives the "old" transposed shape + return tuple(len(ax) for ax in self.axes) + + @property + def shape_proper(self) -> Tuple[int, ...]: + # this still gives the "old" transposed shape + return tuple(len(ax) for ax in self._axes) + + @staticmethod + def _normalize_axis(axis): + # switch axis + axis = 1 if axis == 0 else 0 + return axis + + # TODO can be shared + @property + def ndim(self) -> int: + return len(self.axes) + + def consolidate(self) -> "ArrayManager": + return self + + def is_consolidated(self) -> bool: + return True + + def _consolidate_inplace(self) -> None: + pass + + # TODO can be shared + def set_axis(self, axis: int, new_labels: Index) -> None: + # Caller is responsible for ensuring we have an Index object. 
+ axis = self._normalize_axis(axis) + old_len = len(self._axes[axis]) + new_len = len(new_labels) + + if new_len != old_len: + raise ValueError( + f"Length mismatch: Expected axis has {old_len} elements, new " + f"values have {new_len} elements" + ) + + self._axes[axis] = new_labels + + def get_dtypes(self): + return np.array([arr.dtype for arr in self.arrays], dtype="object") + + # TODO setstate getstate + + # TODO can be shared + def __len__(self) -> int: + return len(self.items) + + def __repr__(self) -> str: + output = type(self).__name__ + output += f"\nIndex: {self._axes[0]}" + output += f"\nColumns: {self._axes[1]}" + output += f"\n{len(self.arrays)} arrays:" + for arr in self.arrays: + output += f"\n{arr.dtype}" + return output + + def _verify_integrity(self) -> None: + pass + # TODO + # mgr_shape = self.shape + # tot_items = sum(len(x.mgr_locs) for x in self.blocks) + # for block in self.blocks: + # if block._verify_integrity and block.shape[1:] != mgr_shape[1:]: + # raise construction_error(tot_items, block.shape[1:], self.axes) + # if len(self.items) != tot_items: + # raise AssertionError( + # "Number of manager items must equal union of " + # f"block items\n# manager items: {len(self.items)}, # " + # f"tot_items: {tot_items}" + # ) + + def apply(self: T, f, align_keys=None, **kwargs) -> T: + """ + Iterate over the blocks, collect and create a new BlockManager. + + Parameters + ---------- + f : str or callable + Name of the Block method to apply. + + Returns + ------- + BlockManager + """ + assert "filter" not in kwargs + + align_keys = align_keys or [] + result_arrays: List[ExtensionArray] = [] + # fillna: Series/DataFrame is responsible for making sure value is aligned + + aligned_args = {k: kwargs[k] for k in align_keys} + + for a in self.arrays: + + if aligned_args: + + raise NotImplementedError + + if callable(f): + applied = f(a, **kwargs) + else: + applied = getattr(a, f)(**kwargs) + result_arrays.append(applied) + + if len(result_arrays) == 0: + return self.make_empty(self._axes) + + return type(self)(result_arrays, self._axes) + + def where( + self, other, cond, align: bool, errors: str, try_cast: bool, axis: int + ) -> "ArrayManager": + # TODO can be shared + if align: + align_keys = ["other", "cond"] + else: + align_keys = ["cond"] + other = extract_array(other, extract_numpy=True) + + return self.apply( + "where", + align_keys=align_keys, + other=other, + cond=cond, + errors=errors, + try_cast=try_cast, + axis=axis, + ) + + def operate_blockwise(self, other: "ArrayManager", array_op) -> "ArrayManager": + """ + Apply array_op blockwise with another (aligned) BlockManager. + """ + left_arrays = self.arrays + right_arrays = other.arrays + result_arrays = [array_op(l, r) for l, r in zip(left_arrays, right_arrays)] + return type(self)(result_arrays, self._axes) + + def copy(self: T, deep=True) -> T: + """ + Make deep or shallow copy of BlockManager + + Parameters + ---------- + deep : bool or string, default True + If False, return shallow copy (do not copy data) + If 'all', copy data and a deep copy of the index + + Returns + ------- + BlockManager + """ + # this preserves the notion of view copying of axes + if deep: + # hit in e.g. 
tests.io.json.test_pandas + + def copy_func(ax): + return ax.copy(deep=True) if deep == "all" else ax.view() + + new_axes = [copy_func(ax) for ax in self._axes] + else: + new_axes = list(self._axes) + + res = self.apply("copy") # , deep=deep) + res._axes = new_axes + return res + + def astype( + self, dtype, copy: bool = False, errors: str = "raise" + ) -> "BlockManager": + return self.apply("astype", dtype=dtype, copy=copy) # , errors=errors) + + def iget(self, i: int) -> "SingleBlockManager": + """ + Return the data as a SingleBlockManager. + """ + values = self.arrays[i] + block = make_block(values, placement=slice(0, len(values)), ndim=1) + + return SingleBlockManager(block, self._axes[0]) + + def iget_values(self, i: int) -> ArrayLike: + """ + Return the data for column i as the values (ndarray or ExtensionArray). + """ + return self.arrays[i] + + def take(self, indexer, axis: int = 1, verify: bool = True, convert: bool = True): + """ + Take items along any axis. + """ + axis = self._normalize_axis(axis) + + indexer = ( + np.arange(indexer.start, indexer.stop, indexer.step, dtype="int64") + if isinstance(indexer, slice) + else np.asanyarray(indexer, dtype="int64") + ) + + n = self.shape_proper[axis] + if convert: + indexer = maybe_convert_indices(indexer, n) + + if verify: + if ((indexer == -1) | (indexer >= n)).any(): + raise Exception("Indices must be nonzero and less than the axis length") + + new_labels = self._axes[axis].take(indexer) + return self._reindex_indexer( + new_axis=new_labels, indexer=indexer, axis=axis, allow_dups=True + ) + + def _make_na_array(self, fill_value=None): + if fill_value is None: + fill_value = np.nan + + dtype, fill_value = infer_dtype_from_scalar(fill_value) + values = np.empty(self.shape_proper[0], dtype=dtype) + values.fill(fill_value) + return values + + def reindex_indexer( + self: T, + new_axis, + indexer, + axis: int, + fill_value=None, + allow_dups: bool = False, + copy: bool = True, + ) -> T: + axis = self._normalize_axis(axis) + return self._reindex_indexer( + new_axis, indexer, axis, fill_value, allow_dups, copy + ) + + def _reindex_indexer( + self: T, + new_axis, + indexer, + axis: int, + fill_value=None, + allow_dups: bool = False, + copy: bool = True, + ) -> T: + """ + Parameters + ---------- + new_axis : Index + indexer : ndarray of int64 or None + axis : int + fill_value : object, default None + allow_dups : bool, default False + copy : bool, default True + + + pandas-indexer with -1's only. 
+ """ + if indexer is None: + if new_axis is self._axes[axis] and not copy: + return self + + result = self.copy(deep=copy) + result._axes = list(self._axes) + result._axes[axis] = new_axis + return result + + # some axes don't allow reindexing with dups + if not allow_dups: + self._axes[axis]._can_reindex(indexer) + + # if axis >= self.ndim: + # raise IndexError("Requested axis not found in manager") + + if axis == 1: + new_arrays = [] + for i in indexer: + if i == -1: + arr = self._make_na_array(fill_value=fill_value) + else: + arr = self.arrays[i] + new_arrays.append(arr) + + else: + new_arrays = [ + algos.take( + array, + indexer, + allow_fill=True, + fill_value=fill_value, # if fill_value is not None else blk.fill_value + ) + for array in self.arrays + ] + + new_axes = list(self._axes) + new_axes[axis] = new_axis + + return type(self)(new_arrays, new_axes) + + def get_slice(self, slobj: slice, axis: int = 0) -> "BlockManager": + axis = self._normalize_axis(axis) + + if axis == 0: + arrays = [arr[slobj] for arr in self.arrays] + elif axis == 1: + arrays = self.arrays[slobj] + + new_axes = list(self._axes) + new_axes[axis] = new_axes[axis][slobj] + + return type(self)(arrays, new_axes, do_integrity_check=False) + + def iset(self, loc: Union[int, slice, np.ndarray], value): + """ + Set new item in-place. Does not consolidate. Adds new Block if not + contained in the current set of items + """ + if lib.is_integer(loc): + # TODO normalize array + assert isinstance(value, np.ndarray) + value = value[0, :] + assert len(value) == len(self._axes[0]) + self.arrays[loc] = value + return + + # TODO + raise Exception + + def insert(self, loc: int, item: Label, value, allow_duplicates: bool = False): + """ + Insert item at selected position. + + Parameters + ---------- + loc : int + item : hashable + value : array_like + allow_duplicates: bool + If False, trying to insert non-unique item will raise + + """ + if not allow_duplicates and item in self.items: + # Should this be a different kind of error?? + raise ValueError(f"cannot insert {item}, already exists") + + if not isinstance(loc, int): + raise TypeError("loc must be int") + + # insert to the axis; this could possibly raise a TypeError + new_axis = self.items.insert(loc, item) + + if value.ndim == 2: + value = value[0, :] + assert len(value) == len(self.arrays[0]) + + # TODO is this copy needed? 
+ arrays = self.arrays.copy() + arrays.insert(loc, value) + + self.arrays = arrays + self._axes[1] = new_axis + + def fillna(self, value, limit, inplace: bool, downcast) -> "ArrayManager": + + inplace = validate_bool_kwarg(inplace, "inplace") + + def array_fillna(array, value, limit, inplace): + + mask = isna(array) + if limit is not None: + limit = libalgos._validate_limit(None, limit=limit) + mask[mask.cumsum() > limit] = False + + # if not self._can_hold_na: + # if inplace: + # return [self] + # else: + # return [self.copy()] + if not inplace: + array = array.copy() + + np.putmask(array, mask, value) + return array + + return self.apply(array_fillna, value=value, limit=limit, inplace=inplace) + + # if self._can_hold_element(value): + # # equivalent: _try_coerce_args(value) would not raise + # blocks = self.putmask(mask, value, inplace=inplace) + # return self._maybe_downcast(blocks, downcast) + + # # we can't process the value, but nothing to do + # if not mask.any(): + # return [self] if inplace else [self.copy()] + + # # operate column-by-column + # def f(mask, val, idx): + # block = self.coerce_to_target_dtype(value) + + # # slice out our block + # if idx is not None: + # # i.e. self.ndim == 2 + # block = block.getitem_block(slice(idx, idx + 1)) + # return block.fillna(value, limit=limit, inplace=inplace, downcast=None) + + # return self.split_and_operate(None, f, inplace) + + # return self.apply( + # "fillna", value=value, limit=limit, inplace=inplace, downcast=downcast + # ) + + def as_array( + self, + transpose: bool = False, + dtype=None, + copy: bool = False, + na_value=lib.no_default, + ) -> np.ndarray: + """ + Convert the blockmanager data into an numpy array. + + Parameters + ---------- + transpose : bool, default False + If True, transpose the return array. + dtype : object, default None + Data type of the return array. + copy : bool, default False + If True then guarantee that a copy is returned. A value of + False does not guarantee that the underlying data is not + copied. + na_value : object, default lib.no_default + Value to be used as the missing value sentinel. + + Returns + ------- + arr : ndarray + """ + if len(self.arrays) == 0: + arr = np.empty(self.shape, dtype=float) + return arr.transpose() if transpose else arr + + # We want to copy when na_value is provided to avoid + # mutating the original object + copy = copy or na_value is not lib.no_default + + if not dtype: + dtype = _interleaved_dtype(self.arrays) + + result = np.empty(self.shape_proper, dtype=dtype) + + for i, arr in enumerate(self.arrays): + arr = arr.astype(dtype, copy=copy) + result[:, i] = arr + + if na_value is not lib.no_default: + result[isna(result)] = na_value + + return result + # return arr.transpose() if transpose else arr + + def get_bool_data(self, copy: bool = False) -> "BlockManager": + """ + Parameters + ---------- + copy : bool, default False + Whether to copy the blocks + """ + mask = self.get_dtypes() == np.dtype("bool") + arrays = [self.arrays[i] for i in np.nonzero(mask)[0]] + # TODO copy? + new_axes = [self._axes[0], self._axes[1][mask]] + return type(self)(arrays, new_axes) + + def get_numeric_data(self, copy: bool = False) -> "BlockManager": + """ + Parameters + ---------- + copy : bool, default False + Whether to copy the blocks + """ + mask = np.array([is_numeric_dtype(t) for t in self.get_dtypes()]) + arrays = [self.arrays[i] for i in np.nonzero(mask)[0]] + # TODO copy? 
+ new_axes = [self._axes[0], self._axes[1][mask]] + return type(self)(arrays, new_axes) + + @property + def is_view(self) -> bool: + """ return a boolean if we are a single block and are a view """ + return False + + @property + def is_mixed_type(self) -> bool: + return True + + @property + def is_numeric_mixed_type(self) -> bool: + return False + + @property + def any_extension_types(self) -> bool: + """Whether any of the blocks in this manager are extension blocks""" + return False # any(block.is_extension for block in self.blocks) + + +class BlockManager(DataManager): """ Core internal data structure to implement DataFrame, Series, etc. From 591579b30564a073160add089285a929d589d8ed Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 27 Aug 2020 19:26:53 +0200 Subject: [PATCH 02/29] Update with latest master + some fixes --- asv_bench/benchmarks/stat_ops.py | 3 + pandas/core/frame.py | 4 + pandas/core/generic.py | 5 +- pandas/core/internals/concat.py | 2 - pandas/core/internals/managers.py | 143 +++++++++++++++++++------- pandas/tests/frame/test_api.py | 3 +- pandas/tests/frame/test_arithmetic.py | 2 +- 7 files changed, 116 insertions(+), 46 deletions(-) diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py index 5639d6702a92c..74a1fe7295273 100644 --- a/asv_bench/benchmarks/stat_ops.py +++ b/asv_bench/benchmarks/stat_ops.py @@ -11,6 +11,9 @@ class FrameOps: param_names = ["op", "dtype", "axis"] def setup(self, op, dtype, axis): + if dtype == "Int64": + # XXX only dealing with numpy arrays in ArrayManager right now + raise NotImplementedError if op == "mad" and dtype == "Int64": # GH-33036, GH#33600 raise NotImplementedError diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 7f94232237d54..e091ec1cff917 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -446,6 +446,8 @@ def __init__( columns: Optional[Axes] = None, dtype: Optional[Dtype] = None, copy: bool = False, + # TODO setting default to "array" for testing purposes (the actual default + # needs to stay "block" initially of course for backwards compatibility) manager: str = "array", ): if data is None: @@ -657,6 +659,8 @@ def _can_fast_transpose(self) -> bool: """ Can we transpose this DataFrame without creating any new array objects. 
""" + if isinstance(self._data, ArrayManager): + return False if self._data.any_extension_types: # TODO(EA2D) special case would be unnecessary with 2D EAs return False diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 52ee65afecb2b..322516a56c30b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -238,8 +238,9 @@ def _init_mgr( mgr = mgr.copy() if dtype is not None: # avoid further copies if we can - if len(mgr.blocks) > 1 or mgr.blocks[0].values.dtype != dtype: - mgr = mgr.astype(dtype=dtype) + # TODO + # if len(mgr.blocks) > 1 or mgr.blocks[0].values.dtype != dtype: + mgr = mgr.astype(dtype=dtype) return mgr # ---------------------------------------------------------------------- diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 06a5ba20fb35a..c604ffa273c72 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -47,8 +47,6 @@ def concatenate_block_managers( ------- BlockManager """ - # breakpoint() - if isinstance(mgrs_indexers[0][0], ArrayManager): if concat_axis == 1: diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 822c9a46f8aa0..d93e9ed52d861 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -70,7 +70,35 @@ class DataManager(PandasObject): - pass + # TODO share more methods/attributes + + def __len__(self) -> int: + return len(self.items) + + @property + def ndim(self) -> int: + return len(self.axes) + + def reindex_axis( + self, + new_index, + axis: int, + method=None, + limit=None, + fill_value=None, + copy: bool = True, + ): + """ + Conform block manager to new index. + """ + new_index = ensure_index(new_index) + new_index, indexer = self.axes[axis].reindex( + new_index, method=method, limit=limit + ) + + return self.reindex_indexer( + new_index, indexer, axis=axis, fill_value=fill_value, copy=copy + ) class ArrayManager(DataManager): @@ -111,7 +139,7 @@ def shape(self) -> Tuple[int, ...]: @property def shape_proper(self) -> Tuple[int, ...]: - # this still gives the "old" transposed shape + # this returns (n_rows, n_columns) return tuple(len(ax) for ax in self._axes) @staticmethod @@ -120,10 +148,13 @@ def _normalize_axis(axis): axis = 1 if axis == 0 else 0 return axis - # TODO can be shared - @property - def ndim(self) -> int: - return len(self.axes) + def make_empty(self: T, axes=None) -> T: + """ return an empty BlockManager with the items axis of len 0 """ + if axes is None: + axes = [self.axes[1:], Index([])] + + arrays = [] + return type(self)(arrays, axes) def consolidate(self) -> "ArrayManager": return self @@ -154,10 +185,6 @@ def get_dtypes(self): # TODO setstate getstate - # TODO can be shared - def __len__(self) -> int: - return len(self.items) - def __repr__(self) -> str: output = type(self).__name__ output += f"\nIndex: {self._axes[0]}" @@ -182,6 +209,19 @@ def _verify_integrity(self) -> None: # f"tot_items: {tot_items}" # ) + def reduce(self: T, func) -> T: + # TODO this still fails because `func` assumes to work on 2D arrays + assert self.ndim == 2 + + res_arrays = [] + for array in self.arrays: + res = func(array) + res_arrays.append(np.array([res])) + + index = Index([0]) # placeholder + new_mgr = type(self)(res_arrays, [index, self.items]) + return new_mgr + def apply(self: T, f, align_keys=None, **kwargs) -> T: """ Iterate over the blocks, collect and create a new BlockManager. 
@@ -203,10 +243,13 @@ def apply(self: T, f, align_keys=None, **kwargs) -> T: aligned_args = {k: kwargs[k] for k in align_keys} + if f == "apply": + f = kwargs.pop("func") + for a in self.arrays: if aligned_args: - + # TODO raise NotImplementedError if callable(f): @@ -220,6 +263,9 @@ def apply(self: T, f, align_keys=None, **kwargs) -> T: return type(self)(result_arrays, self._axes) + def isna(self, func) -> "BlockManager": + return self.apply("apply", func=func) + def where( self, other, cond, align: bool, errors: str, try_cast: bool, axis: int ) -> "ArrayManager": @@ -240,6 +286,12 @@ def where( axis=axis, ) + def replace(self, value, **kwargs) -> "ArrayManager": + assert np.ndim(value) == 0, value + # TODO "replace" is right now implemented on the blocks, we should move + # it to general array algos so it can be reused here + return self.apply("replace", value=value, **kwargs) + def operate_blockwise(self, other: "ArrayManager", array_op) -> "ArrayManager": """ Apply array_op blockwise with another (aligned) BlockManager. @@ -298,6 +350,16 @@ def iget_values(self, i: int) -> ArrayLike: """ return self.arrays[i] + def idelete(self, indexer): + """ + Delete selected locations in-place (new block and array, same BlockManager) + """ + to_keep = np.ones(self.shape[0], dtype=np.bool_) + to_keep[indexer] = False + + self.arrays = [self.arrays[i] for i in np.nonzero(to_keep)[0]] + self._axes = [self._axes[0], self._axes[1][to_keep]] + def take(self, indexer, axis: int = 1, verify: bool = True, convert: bool = True): """ Take items along any axis. @@ -428,9 +490,15 @@ def iset(self, loc: Union[int, slice, np.ndarray], value): contained in the current set of items """ if lib.is_integer(loc): - # TODO normalize array - assert isinstance(value, np.ndarray) - value = value[0, :] + # TODO normalize array -> this should in theory not be needed + if isinstance(value, ExtensionArray): + import pytest + + pytest.skip() + value = np.asarray(value) + # assert isinstance(value, np.ndarray) + if value.ndim == 2: + value = value[0, :] assert len(value) == len(self._axes[0]) self.arrays[loc] = value return @@ -463,7 +531,8 @@ def insert(self, loc: int, item: Label, value, allow_duplicates: bool = False): if value.ndim == 2: value = value[0, :] - assert len(value) == len(self.arrays[0]) + # TODO self.arrays can be empty + # assert len(value) == len(self.arrays[0]) # TODO is this copy needed? arrays = self.arrays.copy() @@ -472,6 +541,21 @@ def insert(self, loc: int, item: Label, value, allow_duplicates: bool = False): self.arrays = arrays self._axes[1] = new_axis + def fast_xs(self, loc: int) -> ArrayLike: + """ + Return the array corresponding to `frame.iloc[loc]`. 
+ + Parameters + ---------- + loc : int + + Returns + ------- + np.ndarray or ExtensionArray + """ + dtype = _interleaved_dtype(self.arrays) + return np.array([a[loc] for a in self.arrays], dtype=dtype) + def fillna(self, value, limit, inplace: bool, downcast) -> "ArrayManager": inplace = validate_bool_kwarg(inplace, "inplace") @@ -496,31 +580,6 @@ def array_fillna(array, value, limit, inplace): return self.apply(array_fillna, value=value, limit=limit, inplace=inplace) - # if self._can_hold_element(value): - # # equivalent: _try_coerce_args(value) would not raise - # blocks = self.putmask(mask, value, inplace=inplace) - # return self._maybe_downcast(blocks, downcast) - - # # we can't process the value, but nothing to do - # if not mask.any(): - # return [self] if inplace else [self.copy()] - - # # operate column-by-column - # def f(mask, val, idx): - # block = self.coerce_to_target_dtype(value) - - # # slice out our block - # if idx is not None: - # # i.e. self.ndim == 2 - # block = block.getitem_block(slice(idx, idx + 1)) - # return block.fillna(value, limit=limit, inplace=inplace, downcast=None) - - # return self.split_and_operate(None, f, inplace) - - # return self.apply( - # "fillna", value=value, limit=limit, inplace=inplace, downcast=downcast - # ) - def as_array( self, transpose: bool = False, @@ -615,6 +674,10 @@ def any_extension_types(self) -> bool: """Whether any of the blocks in this manager are extension blocks""" return False # any(block.is_extension for block in self.blocks) + # TODO + # unstack + # to_dict + class BlockManager(DataManager): """ diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index b1c31a6f90133..dafb5aab34c65 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -354,7 +354,7 @@ def test_to_numpy_dtype(self): def test_to_numpy_copy(self): arr = np.random.randn(4, 3) - df = pd.DataFrame(arr) + df = pd.DataFrame(arr, manager="block") assert df.values.base is arr assert df.to_numpy(copy=False).base is arr assert df.to_numpy(copy=True).base is not arr @@ -446,6 +446,7 @@ def test_with_datetimelikes(self): expected = Series({np.dtype("object"): 10}) tm.assert_series_equal(result, expected) + @pytest.mark.skip def test_values(self, float_frame): float_frame.values[:, 0] = 5.0 assert (float_frame.values[:, 0] == 5).all() diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index e17357e9845b5..6a3080828d37d 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -846,7 +846,7 @@ def test_align_frame(self): result = ts + ts[::2] expected = ts + ts - expected.values[1::2] = np.nan + expected.iloc[1::2] = np.nan tm.assert_frame_equal(result, expected) half = ts[::2] From 896080ae515fd25fb7a21edc7bf4a0f90c021a1a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 4 Sep 2020 11:50:29 +0200 Subject: [PATCH 03/29] add pd.options.mode.data_manager to switch --- pandas/core/config_init.py | 6 ++++++ pandas/core/frame.py | 7 ++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 0c23f1b4bcdf2..fde070f254b74 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -482,6 +482,12 @@ def use_inf_as_na_cb(key): cf.register_option( "use_inf_as_null", False, use_inf_as_null_doc, cb=use_inf_as_na_cb ) + cf.register_option( + "data_manager", + "block", + "internal manager type", + validator=is_one_of_factory(["block", "array"]), + ) 
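# --- Editorial aside (not part of the patch above) --------------------------
# A sketch of how the option registered above is meant to be used together
# with the DataFrame(..., manager=...) keyword from the earlier patches in
# this series. Both the "mode.data_manager" option and the "manager" keyword
# exist only on this proof-of-concept branch; on a pandas install without
# these patches the calls below may raise.
import pandas as pd

pd.set_option("mode.data_manager", "array")       # opt in to ArrayManager-backed frames
df = pd.DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]})
print(type(df._mgr).__name__)                     # expected on this branch: ArrayManager

pd.set_option("mode.data_manager", "block")       # restore the BlockManager default
df_block = pd.DataFrame({"a": [1, 2, 3]}, manager="block")  # per-frame override (POC only)
# -----------------------------------------------------------------------------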
cf.deprecate_option( "mode.use_inf_as_null", msg=use_inf_as_null_doc, rkey="mode.use_inf_as_na" diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e091ec1cff917..da02704399490 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -448,7 +448,7 @@ def __init__( copy: bool = False, # TODO setting default to "array" for testing purposes (the actual default # needs to stay "block" initially of course for backwards compatibility) - manager: str = "array", + manager: Optional[str] = None, ): if data is None: data = {} @@ -567,11 +567,16 @@ def __init__( values, index, columns, dtype=values.dtype, copy=False ) + if manager is None: + manager = get_option("mode.data_manager") + if manager == "array" and not isinstance(mgr, ArrayManager): # TODO proper initialization df = DataFrame(mgr, manager="block") arrays = [arr.copy() for arr in df._iter_column_arrays()] mgr = ArrayManager(arrays, [mgr.axes[1], mgr.axes[0]]) + # TODO check for case of manager="block" but mgr is ArrayManager + NDFrame.__init__(self, mgr) # ---------------------------------------------------------------------- From d18082aaa59328e0286cbdbd70b09bb46bc729be Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 5 Sep 2020 09:40:41 +0200 Subject: [PATCH 04/29] add apply_with_block workaround --- pandas/core/config_init.py | 2 +- pandas/core/frame.py | 4 +- pandas/core/internals/managers.py | 94 +++++++++++++++++++++++++------ 3 files changed, 80 insertions(+), 20 deletions(-) diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index fde070f254b74..9a5b1aa36e8bb 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -484,7 +484,7 @@ def use_inf_as_na_cb(key): ) cf.register_option( "data_manager", - "block", + "array", "internal manager type", validator=is_one_of_factory(["block", "array"]), ) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index da02704399490..3fcaa906c3526 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5529,10 +5529,10 @@ def sort_index( new_data = self._mgr.take(indexer, axis=baxis, verify=False) # reconstruct axis if needed - new_data.axes[baxis] = new_data.axes[baxis]._sort_levels_monotonic() + new_data.set_axis(baxis, new_data.axes[baxis]._sort_levels_monotonic()) if ignore_index: - new_data.axes[1] = ibase.default_index(len(indexer)) + new_data.set_axis(1, ibase.default_index(len(indexer))) result = self._constructor(new_data) if inplace: diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 4f15392365d7b..fe0a6d9e52cb2 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -188,19 +188,18 @@ def __repr__(self) -> str: return output def _verify_integrity(self) -> None: - pass - # TODO - # mgr_shape = self.shape - # tot_items = sum(len(x.mgr_locs) for x in self.blocks) - # for block in self.blocks: - # if block._verify_integrity and block.shape[1:] != mgr_shape[1:]: - # raise construction_error(tot_items, block.shape[1:], self.axes) - # if len(self.items) != tot_items: - # raise AssertionError( - # "Number of manager items must equal union of " - # f"block items\n# manager items: {len(self.items)}, # " - # f"tot_items: {tot_items}" - # ) + n_rows, n_columns = self.shape_proper + if not len(self.arrays) == n_columns: + raise ValueError( + "Number of passed arrays must equal the size of the column Index: " + f"{len(self.arrays)} arrays vs {n_columns} columns." 
+ ) + for array in self.arrays: + if not len(array) == n_rows: + raise ValueError( + "Passed arrays should have the same length as the rows Index: " + f"{len(array)} vs {n_rows} rows" + ) def reduce(self: T, func) -> T: # TODO this still fails because `func` assumes to work on 2D arrays @@ -256,6 +255,21 @@ def apply(self: T, f, align_keys=None, **kwargs) -> T: return type(self)(result_arrays, self._axes) + def apply_with_block(self: T, f, align_keys=None, **kwargs) -> T: + + result_arrays = [] + + for array in self.arrays: + block = make_block(np.atleast_2d(array), placement=slice(0, 1, 1), ndim=2) + applied = getattr(block, f)(**kwargs) + while isinstance(applied, list): + # ObjectBlock gives double nested result?, some functions give no list + applied = applied[0] + applied_array = applied.values[0, :] + result_arrays.append(applied_array) + + return type(self)(result_arrays, self._axes) + def isna(self, func) -> "BlockManager": return self.apply("apply", func=func) @@ -283,7 +297,51 @@ def replace(self, value, **kwargs) -> "ArrayManager": assert np.ndim(value) == 0, value # TODO "replace" is right now implemented on the blocks, we should move # it to general array algos so it can be reused here - return self.apply("replace", value=value, **kwargs) + return self.apply_with_block("replace", value=value, **kwargs) + + def replace_list( + self: T, + src_list: List[Any], + dest_list: List[Any], + inplace: bool = False, + regex: bool = False, + ) -> T: + """ do a list replace """ + inplace = validate_bool_kwarg(inplace, "inplace") + + return self.apply_with_block( + "_replace_list", + src_list=src_list, + dest_list=dest_list, + inplace=inplace, + regex=regex, + ) + + def diff(self, n: int, axis: int) -> "ArrayManager": + return self.apply_with_block("diff", n=n, axis=axis) + + def interpolate(self, **kwargs) -> "ArrayManager": + return self.apply_with_block("interpolate", **kwargs) + + def downcast(self) -> "ArrayManager": + return self.apply_with_block("downcast") + + def convert( + self, + copy: bool = True, + datetime: bool = True, + numeric: bool = True, + timedelta: bool = True, + coerce: bool = False, + ) -> "ArrayManager": + return self.apply_with_block( + "convert", + copy=copy, + datetime=datetime, + numeric=numeric, + timedelta=timedelta, + coerce=coerce, + ) def operate_blockwise(self, other: "ArrayManager", array_op) -> "ArrayManager": """ @@ -319,9 +377,11 @@ def copy_func(ax): else: new_axes = list(self._axes) - res = self.apply("copy") # , deep=deep) - res._axes = new_axes - return res + if deep: + new_arrays = [arr.copy() for arr in self.arrays] + else: + new_arrays = self.arrays + return type(self)(new_arrays, new_axes) def astype( self, dtype, copy: bool = False, errors: str = "raise" From cf3c07acb1765ef38a9917818572ae6099327a00 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 5 Sep 2020 10:41:16 +0200 Subject: [PATCH 05/29] fix alignment in apply --- pandas/core/internals/managers.py | 112 +++++++++++++++++++++++++----- 1 file changed, 96 insertions(+), 16 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index fe0a6d9e52cb2..ca6129f7fbafa 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -214,23 +214,34 @@ def reduce(self: T, func) -> T: new_mgr = type(self)(res_arrays, [index, self.items]) return new_mgr - def apply(self: T, f, align_keys=None, **kwargs) -> T: + def apply( + self: T, + f, + align_keys: Optional[List[str]] = None, + ignore_failures: bool = False, + 
**kwargs, + ) -> T: """ - Iterate over the blocks, collect and create a new BlockManager. + Iterate over the arrays, collect and create a new ArrayManager. Parameters ---------- f : str or callable - Name of the Block method to apply. + Name of the Array method to apply. + align_keys: List[str] or None, default None + ignore_failures: bool, default False + **kwargs + Keywords to pass to `f` Returns ------- - BlockManager + ArrayManager """ assert "filter" not in kwargs align_keys = align_keys or [] - result_arrays: List[ExtensionArray] = [] + result_arrays: List[np.ndarray] = [] + result_indices: List[int] = [] # fillna: Series/DataFrame is responsible for making sure value is aligned aligned_args = {k: kwargs[k] for k in align_keys} @@ -238,28 +249,68 @@ def apply(self: T, f, align_keys=None, **kwargs) -> T: if f == "apply": f = kwargs.pop("func") - for a in self.arrays: + for i, arr in enumerate(self.arrays): if aligned_args: - # TODO - raise NotImplementedError - if callable(f): - applied = f(a, **kwargs) - else: - applied = getattr(a, f)(**kwargs) + for k, obj in aligned_args.items(): + if isinstance(obj, (ABCSeries, ABCDataFrame)): + # The caller is responsible for ensuring that + # obj.axes[-1].equals(self.items) + if obj.ndim == 1: + kwargs[k] = obj.iloc[i] + else: + kwargs[k] = obj.iloc[:, i]._values + else: + # otherwise we have an ndarray + kwargs[k] = obj[i] + + try: + if callable(f): + applied = f(arr, **kwargs) + else: + applied = getattr(arr, f)(**kwargs) + except (TypeError, NotImplementedError): + if not ignore_failures: + raise + continue result_arrays.append(applied) + result_indices.append(i) + + if ignore_failures: + # TODO copy? + new_axes = [self._axes[0], self._axes[1][result_indices]] + else: + new_axes = self._axes if len(result_arrays) == 0: - return self.make_empty(self._axes) + return self.make_empty(new_axes) - return type(self)(result_arrays, self._axes) + return type(self)(result_arrays, new_axes) def apply_with_block(self: T, f, align_keys=None, **kwargs) -> T: + align_keys = align_keys or [] + aligned_args = {k: kwargs[k] for k in align_keys} + result_arrays = [] - for array in self.arrays: + for i, array in enumerate(self.arrays): + + if aligned_args: + + for k, obj in aligned_args.items(): + if isinstance(obj, (ABCSeries, ABCDataFrame)): + # The caller is responsible for ensuring that + # obj.axes[-1].equals(self.items) + if obj.ndim == 1: + kwargs[k] = obj.iloc[[i]] + else: + kwargs[k] = obj.iloc[:, [i]]._values + else: + # otherwise we have an ndarray + kwargs[k] = obj[[i]] + block = make_block(np.atleast_2d(array), placement=slice(0, 1, 1), ndim=2) applied = getattr(block, f)(**kwargs) while isinstance(applied, list): @@ -283,7 +334,7 @@ def where( align_keys = ["cond"] other = extract_array(other, extract_numpy=True) - return self.apply( + return self.apply_with_block( "where", align_keys=align_keys, other=other, @@ -293,6 +344,25 @@ def where( axis=axis, ) + def putmask(self, mask, new, align: bool = True, axis: int = 0): + transpose = self.ndim == 2 + + if align: + align_keys = ["new", "mask"] + else: + align_keys = ["mask"] + new = extract_array(new, extract_numpy=True) + + return self.apply_with_block( + "putmask", + align_keys=align_keys, + mask=mask, + new=new, + inplace=True, + axis=axis, + transpose=transpose, + ) + def replace(self, value, **kwargs) -> "ArrayManager": assert np.ndim(value) == 0, value # TODO "replace" is right now implemented on the blocks, we should move @@ -323,6 +393,15 @@ def diff(self, n: int, axis: int) -> 
"ArrayManager": def interpolate(self, **kwargs) -> "ArrayManager": return self.apply_with_block("interpolate", **kwargs) + def shift(self, periods: int, axis: int, fill_value) -> "ArrayManager": + if axis == 0 and self.ndim == 2: + # TODO column-wise shift + raise NotImplementedError + + return self.apply_with_block( + "shift", periods=periods, axis=axis, fill_value=fill_value + ) + def downcast(self) -> "ArrayManager": return self.apply_with_block("downcast") @@ -730,6 +809,7 @@ def any_extension_types(self) -> bool: # TODO # unstack # to_dict + # quantile class BlockManager(DataManager): From b252c6d2564876dd20b6cc9aeeb045a77e07cdc7 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 5 Sep 2020 11:06:57 +0200 Subject: [PATCH 06/29] reorder methods to match BlockManager --- pandas/core/internals/managers.py | 609 ++++++++++++++++-------------- 1 file changed, 318 insertions(+), 291 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index ca6129f7fbafa..5d97dbdc171a4 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -95,6 +95,21 @@ def reindex_axis( class ArrayManager(DataManager): + """ + Core internal data structure to implement DataFrame and Series. + + Alternative to the BlockManager, storing a list of 1D arrays instead of + Blocks. + + This is *not* a public API class + + Parameters + ---------- + arrays : Sequence of arrays + axes : Sequence of Index + do_integrity_check : bool, default True + + """ __slots__ = [ "_axes", @@ -110,6 +125,8 @@ def __init__( axes: Sequence[Index], do_integrity_check: bool = True, ): + # Note: we are storing the axes in "_axes" in the (row, columns) order + # which contrasts the order how it is stored in BlockManager self._axes = axes self.arrays = arrays @@ -117,17 +134,26 @@ def __init__( self._axes = [ensure_index(ax) for ax in axes] self._verify_integrity() + def make_empty(self: T, axes=None) -> T: + """Return an empty ArrayManager with the items axis of len 0 (no columns)""" + if axes is None: + axes = [self.axes[1:], Index([])] + + arrays = [] + return type(self)(arrays, axes) + @property def items(self) -> Index: return self._axes[1] @property def axes(self) -> Sequence[Index]: + """Axes is BlockManager-compatible order (columns, rows)""" return [self._axes[1], self._axes[0]] @property def shape(self) -> Tuple[int, ...]: - # this still gives the "old" transposed shape + # this still gives the BlockManager-compatible transposed shape return tuple(len(ax) for ax in self.axes) @property @@ -141,23 +167,6 @@ def _normalize_axis(axis): axis = 1 if axis == 0 else 0 return axis - def make_empty(self: T, axes=None) -> T: - """ return an empty BlockManager with the items axis of len 0 """ - if axes is None: - axes = [self.axes[1:], Index([])] - - arrays = [] - return type(self)(arrays, axes) - - def consolidate(self) -> "ArrayManager": - return self - - def is_consolidated(self) -> bool: - return True - - def _consolidate_inplace(self) -> None: - pass - # TODO can be shared def set_axis(self, axis: int, new_labels: Index) -> None: # Caller is responsible for ensuring we have an Index object. 
@@ -173,6 +182,15 @@ def set_axis(self, axis: int, new_labels: Index) -> None: self._axes[axis] = new_labels + def consolidate(self) -> "ArrayManager": + return self + + def is_consolidated(self) -> bool: + return True + + def _consolidate_inplace(self) -> None: + pass + def get_dtypes(self): return np.array([arr.dtype for arr in self.arrays], dtype="object") @@ -214,6 +232,16 @@ def reduce(self: T, func) -> T: new_mgr = type(self)(res_arrays, [index, self.items]) return new_mgr + def operate_blockwise(self, other: "ArrayManager", array_op) -> "ArrayManager": + """ + Apply array_op blockwise with another (aligned) BlockManager. + """ + # TODO what if `other` is BlockManager ? + left_arrays = self.arrays + right_arrays = other.arrays + result_arrays = [array_op(l, r) for l, r in zip(left_arrays, right_arrays)] + return type(self)(result_arrays, self._axes) + def apply( self: T, f, @@ -321,13 +349,14 @@ def apply_with_block(self: T, f, align_keys=None, **kwargs) -> T: return type(self)(result_arrays, self._axes) - def isna(self, func) -> "BlockManager": + # TODO quantile + + def isna(self, func) -> "ArrayManager": return self.apply("apply", func=func) def where( self, other, cond, align: bool, errors: str, try_cast: bool, axis: int ) -> "ArrayManager": - # TODO can be shared if align: align_keys = ["other", "cond"] else: @@ -344,6 +373,10 @@ def where( axis=axis, ) + # TODO what is this used for? + # def setitem(self, indexer, value) -> "ArrayManager": + # return self.apply_with_block("setitem", indexer=indexer, value=value) + def putmask(self, mask, new, align: bool = True, axis: int = 0): transpose = self.ndim == 2 @@ -363,30 +396,6 @@ def putmask(self, mask, new, align: bool = True, axis: int = 0): transpose=transpose, ) - def replace(self, value, **kwargs) -> "ArrayManager": - assert np.ndim(value) == 0, value - # TODO "replace" is right now implemented on the blocks, we should move - # it to general array algos so it can be reused here - return self.apply_with_block("replace", value=value, **kwargs) - - def replace_list( - self: T, - src_list: List[Any], - dest_list: List[Any], - inplace: bool = False, - regex: bool = False, - ) -> T: - """ do a list replace """ - inplace = validate_bool_kwarg(inplace, "inplace") - - return self.apply_with_block( - "_replace_list", - src_list=src_list, - dest_list=dest_list, - inplace=inplace, - regex=regex, - ) - def diff(self, n: int, axis: int) -> "ArrayManager": return self.apply_with_block("diff", n=n, axis=axis) @@ -402,9 +411,38 @@ def shift(self, periods: int, axis: int, fill_value) -> "ArrayManager": "shift", periods=periods, axis=axis, fill_value=fill_value ) + def fillna(self, value, limit, inplace: bool, downcast) -> "ArrayManager": + + inplace = validate_bool_kwarg(inplace, "inplace") + + def array_fillna(array, value, limit, inplace): + + mask = isna(array) + if limit is not None: + limit = libalgos._validate_limit(None, limit=limit) + mask[mask.cumsum() > limit] = False + + # if not self._can_hold_na: + # if inplace: + # return [self] + # else: + # return [self.copy()] + if not inplace: + array = array.copy() + + np.putmask(array, mask, value) + return array + + return self.apply(array_fillna, value=value, limit=limit, inplace=inplace) + def downcast(self) -> "ArrayManager": return self.apply_with_block("downcast") + def astype( + self, dtype, copy: bool = False, errors: str = "raise" + ) -> "ArrayManager": + return self.apply("astype", dtype=dtype, copy=copy) # , errors=errors) + def convert( self, copy: bool = True, @@ -422,18 +460,78 
@@ def convert( coerce=coerce, ) - def operate_blockwise(self, other: "ArrayManager", array_op) -> "ArrayManager": + def replace(self, value, **kwargs) -> "ArrayManager": + assert np.ndim(value) == 0, value + # TODO "replace" is right now implemented on the blocks, we should move + # it to general array algos so it can be reused here + return self.apply_with_block("replace", value=value, **kwargs) + + def replace_list( + self: T, + src_list: List[Any], + dest_list: List[Any], + inplace: bool = False, + regex: bool = False, + ) -> T: + """ do a list replace """ + inplace = validate_bool_kwarg(inplace, "inplace") + + return self.apply_with_block( + "_replace_list", + src_list=src_list, + dest_list=dest_list, + inplace=inplace, + regex=regex, + ) + + @property + def is_mixed_type(self) -> bool: + return True + + @property + def is_numeric_mixed_type(self) -> bool: + return False + + @property + def any_extension_types(self) -> bool: + """Whether any of the blocks in this manager are extension blocks""" + return False # any(block.is_extension for block in self.blocks) + + @property + def is_view(self) -> bool: + """ return a boolean if we are a single block and are a view """ + # TODO what is this used for? + return False + + def get_bool_data(self, copy: bool = False) -> "BlockManager": """ - Apply array_op blockwise with another (aligned) BlockManager. + Parameters + ---------- + copy : bool, default False + Whether to copy the blocks """ - left_arrays = self.arrays - right_arrays = other.arrays - result_arrays = [array_op(l, r) for l, r in zip(left_arrays, right_arrays)] - return type(self)(result_arrays, self._axes) + mask = self.get_dtypes() == np.dtype("bool") + arrays = [self.arrays[i] for i in np.nonzero(mask)[0]] + # TODO copy? + new_axes = [self._axes[0], self._axes[1][mask]] + return type(self)(arrays, new_axes) + + def get_numeric_data(self, copy: bool = False) -> "BlockManager": + """ + Parameters + ---------- + copy : bool, default False + Whether to copy the blocks + """ + mask = np.array([is_numeric_dtype(t) for t in self.get_dtypes()]) + arrays = [self.arrays[i] for i in np.nonzero(mask)[0]] + # TODO copy? + new_axes = [self._axes[0], self._axes[1][mask]] + return type(self)(arrays, new_axes) def copy(self: T, deep=True) -> T: """ - Make deep or shallow copy of BlockManager + Make deep or shallow copy of ArrayManager Parameters ---------- @@ -462,159 +560,108 @@ def copy_func(ax): new_arrays = self.arrays return type(self)(new_arrays, new_axes) - def astype( - self, dtype, copy: bool = False, errors: str = "raise" - ) -> "BlockManager": - return self.apply("astype", dtype=dtype, copy=copy) # , errors=errors) - - def iget(self, i: int) -> "SingleBlockManager": - """ - Return the data as a SingleBlockManager. + def as_array( + self, + transpose: bool = False, + dtype=None, + copy: bool = False, + na_value=lib.no_default, + ) -> np.ndarray: """ - values = self.arrays[i] - block = make_block(values, placement=slice(0, len(values)), ndim=1) - - return SingleBlockManager(block, self._axes[0]) + Convert the blockmanager data into an numpy array. - def iget_values(self, i: int) -> ArrayLike: - """ - Return the data for column i as the values (ndarray or ExtensionArray). - """ - return self.arrays[i] + Parameters + ---------- + transpose : bool, default False + If True, transpose the return array. + dtype : object, default None + Data type of the return array. + copy : bool, default False + If True then guarantee that a copy is returned. 
A value of + False does not guarantee that the underlying data is not + copied. + na_value : object, default lib.no_default + Value to be used as the missing value sentinel. - def idelete(self, indexer): - """ - Delete selected locations in-place (new block and array, same BlockManager) + Returns + ------- + arr : ndarray """ - to_keep = np.ones(self.shape[0], dtype=np.bool_) - to_keep[indexer] = False + if len(self.arrays) == 0: + arr = np.empty(self.shape, dtype=float) + return arr.transpose() if transpose else arr - self.arrays = [self.arrays[i] for i in np.nonzero(to_keep)[0]] - self._axes = [self._axes[0], self._axes[1][to_keep]] + # We want to copy when na_value is provided to avoid + # mutating the original object + copy = copy or na_value is not lib.no_default - def take(self, indexer, axis: int = 1, verify: bool = True, convert: bool = True): - """ - Take items along any axis. - """ - axis = self._normalize_axis(axis) + if not dtype: + dtype = _interleaved_dtype(self.arrays) - indexer = ( - np.arange(indexer.start, indexer.stop, indexer.step, dtype="int64") - if isinstance(indexer, slice) - else np.asanyarray(indexer, dtype="int64") - ) + result = np.empty(self.shape_proper, dtype=dtype) - n = self.shape_proper[axis] - if convert: - indexer = maybe_convert_indices(indexer, n) + for i, arr in enumerate(self.arrays): + arr = arr.astype(dtype, copy=copy) + result[:, i] = arr - if verify: - if ((indexer == -1) | (indexer >= n)).any(): - raise Exception("Indices must be nonzero and less than the axis length") + if na_value is not lib.no_default: + result[isna(result)] = na_value - new_labels = self._axes[axis].take(indexer) - return self._reindex_indexer( - new_axis=new_labels, indexer=indexer, axis=axis, allow_dups=True - ) + return result + # return arr.transpose() if transpose else arr - def _make_na_array(self, fill_value=None): - if fill_value is None: - fill_value = np.nan + def get_slice(self, slobj: slice, axis: int = 0) -> "BlockManager": + axis = self._normalize_axis(axis) - dtype, fill_value = infer_dtype_from_scalar(fill_value) - values = np.empty(self.shape_proper[0], dtype=dtype) - values.fill(fill_value) - return values + if axis == 0: + arrays = [arr[slobj] for arr in self.arrays] + elif axis == 1: + arrays = self.arrays[slobj] - def reindex_indexer( - self: T, - new_axis, - indexer, - axis: int, - fill_value=None, - allow_dups: bool = False, - copy: bool = True, - ) -> T: - axis = self._normalize_axis(axis) - return self._reindex_indexer( - new_axis, indexer, axis, fill_value, allow_dups, copy - ) + new_axes = list(self._axes) + new_axes[axis] = new_axes[axis][slobj] - def _reindex_indexer( - self: T, - new_axis, - indexer, - axis: int, - fill_value=None, - allow_dups: bool = False, - copy: bool = True, - ) -> T: + return type(self)(arrays, new_axes, do_integrity_check=False) + + def fast_xs(self, loc: int) -> ArrayLike: """ + Return the array corresponding to `frame.iloc[loc]`. + Parameters ---------- - new_axis : Index - indexer : ndarray of int64 or None - axis : int - fill_value : object, default None - allow_dups : bool, default False - copy : bool, default True - + loc : int - pandas-indexer with -1's only. 
+ Returns + ------- + np.ndarray or ExtensionArray """ - if indexer is None: - if new_axis is self._axes[axis] and not copy: - return self - - result = self.copy(deep=copy) - result._axes = list(self._axes) - result._axes[axis] = new_axis - return result - - # some axes don't allow reindexing with dups - if not allow_dups: - self._axes[axis]._can_reindex(indexer) - - # if axis >= self.ndim: - # raise IndexError("Requested axis not found in manager") - - if axis == 1: - new_arrays = [] - for i in indexer: - if i == -1: - arr = self._make_na_array(fill_value=fill_value) - else: - arr = self.arrays[i] - new_arrays.append(arr) - - else: - new_arrays = [ - algos.take( - array, - indexer, - allow_fill=True, - fill_value=fill_value, # if fill_value is not None else blk.fill_value - ) - for array in self.arrays - ] - - new_axes = list(self._axes) - new_axes[axis] = new_axis + dtype = _interleaved_dtype(self.arrays) + return np.array([a[loc] for a in self.arrays], dtype=dtype) - return type(self)(new_arrays, new_axes) + def iget(self, i: int) -> "SingleBlockManager": + """ + Return the data as a SingleBlockManager. + """ + values = self.arrays[i] + block = make_block(values, placement=slice(0, len(values)), ndim=1) - def get_slice(self, slobj: slice, axis: int = 0) -> "BlockManager": - axis = self._normalize_axis(axis) + return SingleBlockManager(block, self._axes[0]) - if axis == 0: - arrays = [arr[slobj] for arr in self.arrays] - elif axis == 1: - arrays = self.arrays[slobj] + def iget_values(self, i: int) -> ArrayLike: + """ + Return the data for column i as the values (ndarray or ExtensionArray). + """ + return self.arrays[i] - new_axes = list(self._axes) - new_axes[axis] = new_axes[axis][slobj] + def idelete(self, indexer): + """ + Delete selected locations in-place (new block and array, same BlockManager) + """ + to_keep = np.ones(self.shape[0], dtype=np.bool_) + to_keep[indexer] = False - return type(self)(arrays, new_axes, do_integrity_check=False) + self.arrays = [self.arrays[i] for i in np.nonzero(to_keep)[0]] + self._axes = [self._axes[0], self._axes[1][to_keep]] def iset(self, loc: Union[int, slice, np.ndarray], value): """ @@ -673,140 +720,120 @@ def insert(self, loc: int, item: Label, value, allow_duplicates: bool = False): self.arrays = arrays self._axes[1] = new_axis - def fast_xs(self, loc: int) -> ArrayLike: - """ - Return the array corresponding to `frame.iloc[loc]`. 
+ def reindex_indexer( + self: T, + new_axis, + indexer, + axis: int, + fill_value=None, + allow_dups: bool = False, + copy: bool = True, + ) -> T: + axis = self._normalize_axis(axis) + return self._reindex_indexer( + new_axis, indexer, axis, fill_value, allow_dups, copy + ) + def _reindex_indexer( + self: T, + new_axis, + indexer, + axis: int, + fill_value=None, + allow_dups: bool = False, + copy: bool = True, + ) -> T: + """ Parameters ---------- - loc : int - - Returns - ------- - np.ndarray or ExtensionArray - """ - dtype = _interleaved_dtype(self.arrays) - return np.array([a[loc] for a in self.arrays], dtype=dtype) - - def fillna(self, value, limit, inplace: bool, downcast) -> "ArrayManager": - - inplace = validate_bool_kwarg(inplace, "inplace") - - def array_fillna(array, value, limit, inplace): - - mask = isna(array) - if limit is not None: - limit = libalgos._validate_limit(None, limit=limit) - mask[mask.cumsum() > limit] = False - - # if not self._can_hold_na: - # if inplace: - # return [self] - # else: - # return [self.copy()] - if not inplace: - array = array.copy() - - np.putmask(array, mask, value) - return array + new_axis : Index + indexer : ndarray of int64 or None + axis : int + fill_value : object, default None + allow_dups : bool, default False + copy : bool, default True - return self.apply(array_fillna, value=value, limit=limit, inplace=inplace) - def as_array( - self, - transpose: bool = False, - dtype=None, - copy: bool = False, - na_value=lib.no_default, - ) -> np.ndarray: + pandas-indexer with -1's only. """ - Convert the blockmanager data into an numpy array. - - Parameters - ---------- - transpose : bool, default False - If True, transpose the return array. - dtype : object, default None - Data type of the return array. - copy : bool, default False - If True then guarantee that a copy is returned. A value of - False does not guarantee that the underlying data is not - copied. - na_value : object, default lib.no_default - Value to be used as the missing value sentinel. 
+ if indexer is None: + if new_axis is self._axes[axis] and not copy: + return self - Returns - ------- - arr : ndarray - """ - if len(self.arrays) == 0: - arr = np.empty(self.shape, dtype=float) - return arr.transpose() if transpose else arr + result = self.copy(deep=copy) + result._axes = list(self._axes) + result._axes[axis] = new_axis + return result - # We want to copy when na_value is provided to avoid - # mutating the original object - copy = copy or na_value is not lib.no_default + # some axes don't allow reindexing with dups + if not allow_dups: + self._axes[axis]._can_reindex(indexer) - if not dtype: - dtype = _interleaved_dtype(self.arrays) + # if axis >= self.ndim: + # raise IndexError("Requested axis not found in manager") - result = np.empty(self.shape_proper, dtype=dtype) + if axis == 1: + new_arrays = [] + for i in indexer: + if i == -1: + arr = self._make_na_array(fill_value=fill_value) + else: + arr = self.arrays[i] + new_arrays.append(arr) - for i, arr in enumerate(self.arrays): - arr = arr.astype(dtype, copy=copy) - result[:, i] = arr + else: + new_arrays = [ + algos.take( + array, + indexer, + allow_fill=True, + fill_value=fill_value, + # if fill_value is not None else blk.fill_value + ) + for array in self.arrays + ] - if na_value is not lib.no_default: - result[isna(result)] = na_value + new_axes = list(self._axes) + new_axes[axis] = new_axis - return result - # return arr.transpose() if transpose else arr + return type(self)(new_arrays, new_axes) - def get_bool_data(self, copy: bool = False) -> "BlockManager": + def take(self, indexer, axis: int = 1, verify: bool = True, convert: bool = True): """ - Parameters - ---------- - copy : bool, default False - Whether to copy the blocks + Take items along any axis. """ - mask = self.get_dtypes() == np.dtype("bool") - arrays = [self.arrays[i] for i in np.nonzero(mask)[0]] - # TODO copy? - new_axes = [self._axes[0], self._axes[1][mask]] - return type(self)(arrays, new_axes) + axis = self._normalize_axis(axis) - def get_numeric_data(self, copy: bool = False) -> "BlockManager": - """ - Parameters - ---------- - copy : bool, default False - Whether to copy the blocks - """ - mask = np.array([is_numeric_dtype(t) for t in self.get_dtypes()]) - arrays = [self.arrays[i] for i in np.nonzero(mask)[0]] - # TODO copy? 
- new_axes = [self._axes[0], self._axes[1][mask]] - return type(self)(arrays, new_axes) + indexer = ( + np.arange(indexer.start, indexer.stop, indexer.step, dtype="int64") + if isinstance(indexer, slice) + else np.asanyarray(indexer, dtype="int64") + ) - @property - def is_view(self) -> bool: - """ return a boolean if we are a single block and are a view """ - return False + n = self.shape_proper[axis] + if convert: + indexer = maybe_convert_indices(indexer, n) - @property - def is_mixed_type(self) -> bool: - return True + if verify: + if ((indexer == -1) | (indexer >= n)).any(): + raise Exception("Indices must be nonzero and less than the axis length") - @property - def is_numeric_mixed_type(self) -> bool: - return False + new_labels = self._axes[axis].take(indexer) + return self._reindex_indexer( + new_axis=new_labels, indexer=indexer, axis=axis, allow_dups=True + ) - @property - def any_extension_types(self) -> bool: - """Whether any of the blocks in this manager are extension blocks""" - return False # any(block.is_extension for block in self.blocks) + def _make_na_array(self, fill_value=None): + if fill_value is None: + fill_value = np.nan + + dtype, fill_value = infer_dtype_from_scalar(fill_value) + values = np.empty(self.shape_proper[0], dtype=dtype) + values.fill(fill_value) + return values # TODO + # equals # unstack # to_dict # quantile From 0fb645ed8e2ec67475fb0b798e8dee4b2d7bee9f Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 5 Sep 2020 11:45:02 +0200 Subject: [PATCH 07/29] skip json tests for now --- pandas/tests/io/json/test_compression.py | 4 ++++ pandas/tests/io/json/test_deprecated_kwargs.py | 6 ++++++ pandas/tests/io/json/test_json_table_schema.py | 4 ++++ pandas/tests/io/json/test_normalize.py | 5 +++++ pandas/tests/io/json/test_pandas.py | 5 +++++ pandas/tests/io/json/test_readlines.py | 4 ++++ pandas/tests/io/json/test_ujson.py | 5 +++++ 7 files changed, 33 insertions(+) diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py index c0e3220454bf1..8ffbac13103aa 100644 --- a/pandas/tests/io/json/test_compression.py +++ b/pandas/tests/io/json/test_compression.py @@ -5,6 +5,10 @@ import pandas as pd import pandas._testing as tm +pytestmark = pytest.mark.skipif( + pd.get_option("mode.data_manager") == "array", reason="JSON C code relies on Blocks" +) + def test_compression_roundtrip(compression): df = pd.DataFrame( diff --git a/pandas/tests/io/json/test_deprecated_kwargs.py b/pandas/tests/io/json/test_deprecated_kwargs.py index 79245bc9d34a8..c09d754444d83 100644 --- a/pandas/tests/io/json/test_deprecated_kwargs.py +++ b/pandas/tests/io/json/test_deprecated_kwargs.py @@ -2,11 +2,17 @@ Tests for the deprecated keyword arguments for `read_json`. 
""" +import pytest + import pandas as pd import pandas._testing as tm from pandas.io.json import read_json +pytestmark = pytest.mark.skipif( + pd.get_option("mode.data_manager") == "array", reason="JSON C code relies on Blocks" +) + def test_deprecated_kwargs(): df = pd.DataFrame({"A": [2, 4, 6], "B": [3, 6, 9]}, index=[0, 1, 2]) diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index 8f1ed193b100f..c8274c498e2d1 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -20,6 +20,10 @@ set_default_names, ) +pytestmark = pytest.mark.skipif( + pd.get_option("mode.data_manager") == "array", reason="JSON C code relies on Blocks" +) + class TestBuildSchema: def setup_method(self, method): diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index 8d93fbcc063f4..1caa2ed4eb694 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -3,11 +3,16 @@ import numpy as np import pytest +import pandas as pd from pandas import DataFrame, Index, Series, json_normalize import pandas._testing as tm from pandas.io.json._normalize import nested_to_record +pytestmark = pytest.mark.skipif( + pd.get_option("mode.data_manager") == "array", reason="JSON C code relies on Blocks" +) + @pytest.fixture def deep_nested(): diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 59d64e1a6e909..d9ece1095c092 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -16,6 +16,11 @@ from pandas import DataFrame, DatetimeIndex, Series, Timestamp, compat, read_json import pandas._testing as tm +pytestmark = pytest.mark.skipif( + pd.get_option("mode.data_manager") == "array", reason="JSON C code relies on Blocks" +) + + _seriesd = tm.getSeriesData() _frame = DataFrame(_seriesd) diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py index b475fa2c514ff..9c28621e4406f 100644 --- a/pandas/tests/io/json/test_readlines.py +++ b/pandas/tests/io/json/test_readlines.py @@ -9,6 +9,10 @@ from pandas.io.json._json import JsonReader +pytestmark = pytest.mark.skipif( + pd.get_option("mode.data_manager") == "array", reason="JSON C code relies on Blocks" +) + @pytest.fixture def lines_json_df(): diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index e2007e07c572a..32d04a59da908 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -17,9 +17,14 @@ from pandas._libs.tslib import Timestamp from pandas.compat import IS64, is_platform_windows +import pandas as pd from pandas import DataFrame, DatetimeIndex, Index, NaT, Series, Timedelta, date_range import pandas._testing as tm +pytestmark = pytest.mark.skipif( + pd.get_option("mode.data_manager") == "array", reason="JSON C code relies on Blocks" +) + def _clean_dict(d): """ From eb55fef4a4797f94fe6444b7d3fadfa40b4132d8 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 5 Sep 2020 12:42:08 +0200 Subject: [PATCH 08/29] skip more json tests + to_csv with to_native_types --- pandas/core/internals/managers.py | 10 ++++++++ pandas/io/formats/csvs.py | 25 +++++++++++++------ pandas/tests/io/formats/test_printing.py | 3 +++ pandas/tests/io/json/test_compression.py | 4 +-- .../tests/io/json/test_deprecated_kwargs.py | 6 ++--- .../tests/io/json/test_json_table_schema.py | 6 ++--- 
pandas/tests/io/json/test_normalize.py | 7 +++--- pandas/tests/io/json/test_pandas.py | 4 +-- pandas/tests/io/json/test_readlines.py | 6 ++--- pandas/tests/io/json/test_ujson.py | 6 ++--- pandas/tests/io/test_common.py | 8 +++++- pandas/tests/io/test_compression.py | 4 +++ pandas/util/_test_decorators.py | 7 ++++++ 13 files changed, 64 insertions(+), 32 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 5d97dbdc171a4..07057de612a3c 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -832,6 +832,16 @@ def _make_na_array(self, fill_value=None): values.fill(fill_value) return values + def to_native_types(self, **kwargs): + result_arrays = [] + + for i, array in enumerate(self.arrays): + block = make_block(np.atleast_2d(array), placement=slice(0, 1, 1), ndim=2) + res = block.to_native_types(**kwargs) + result_arrays.append(res[0, :]) + + return result_arrays + # TODO # equals # unstack diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 270caec022fef..b212b405c9924 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -326,11 +326,9 @@ def _save_chunk(self, start_i: int, end_i: int) -> None: slicer = slice(start_i, end_i) df = self.obj.iloc[slicer] - blocks = df._mgr.blocks - for i in range(len(blocks)): - b = blocks[i] - d = b.to_native_types( + if hasattr(df._mgr, "arrays"): + self.data = df._mgr.to_native_types( na_rep=self.na_rep, float_format=self.float_format, decimal=self.decimal, @@ -338,9 +336,22 @@ def _save_chunk(self, start_i: int, end_i: int) -> None: quoting=self.quoting, ) - for col_loc, col in zip(b.mgr_locs, d): - # self.data is a preallocated list - self.data[col_loc] = col + else: + blocks = df._mgr.blocks + + for i in range(len(blocks)): + b = blocks[i] + d = b.to_native_types( + na_rep=self.na_rep, + float_format=self.float_format, + decimal=self.decimal, + date_format=self.date_format, + quoting=self.quoting, + ) + + for col_loc, col in zip(b.mgr_locs, d): + # self.data is a preallocated list + self.data[col_loc] = col ix = data_index.to_native_types( slicer=slicer, diff --git a/pandas/tests/io/formats/test_printing.py b/pandas/tests/io/formats/test_printing.py index f0d5ef19c4468..2339e21288bb5 100644 --- a/pandas/tests/io/formats/test_printing.py +++ b/pandas/tests/io/formats/test_printing.py @@ -3,6 +3,8 @@ import pandas._config.config as cf +import pandas.util._test_decorators as td + import pandas as pd import pandas.io.formats.format as fmt @@ -119,6 +121,7 @@ def test_ambiguous_width(self): assert adjoined == expected +@td.skip_array_manager_not_yet_implemented class TestTableSchemaRepr: @classmethod def setup_class(cls): diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py index 8ffbac13103aa..d08ecb3e99812 100644 --- a/pandas/tests/io/json/test_compression.py +++ b/pandas/tests/io/json/test_compression.py @@ -5,9 +5,7 @@ import pandas as pd import pandas._testing as tm -pytestmark = pytest.mark.skipif( - pd.get_option("mode.data_manager") == "array", reason="JSON C code relies on Blocks" -) +pytestmark = td.skip_array_manager_not_yet_implemented def test_compression_roundtrip(compression): diff --git a/pandas/tests/io/json/test_deprecated_kwargs.py b/pandas/tests/io/json/test_deprecated_kwargs.py index c09d754444d83..7367aaefb1c1e 100644 --- a/pandas/tests/io/json/test_deprecated_kwargs.py +++ b/pandas/tests/io/json/test_deprecated_kwargs.py @@ -2,16 +2,14 @@ Tests for the deprecated keyword 
arguments for `read_json`. """ -import pytest +import pandas.util._test_decorators as td import pandas as pd import pandas._testing as tm from pandas.io.json import read_json -pytestmark = pytest.mark.skipif( - pd.get_option("mode.data_manager") == "array", reason="JSON C code relies on Blocks" -) +pytestmark = td.skip_array_manager_not_yet_implemented def test_deprecated_kwargs(): diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index c8274c498e2d1..afb29e84d7346 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -6,6 +6,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas.core.dtypes.dtypes import CategoricalDtype, DatetimeTZDtype, PeriodDtype import pandas as pd @@ -20,9 +22,7 @@ set_default_names, ) -pytestmark = pytest.mark.skipif( - pd.get_option("mode.data_manager") == "array", reason="JSON C code relies on Blocks" -) +pytestmark = td.skip_array_manager_not_yet_implemented class TestBuildSchema: diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index 1caa2ed4eb694..0d6b10441e582 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -3,15 +3,14 @@ import numpy as np import pytest -import pandas as pd +import pandas.util._test_decorators as td + from pandas import DataFrame, Index, Series, json_normalize import pandas._testing as tm from pandas.io.json._normalize import nested_to_record -pytestmark = pytest.mark.skipif( - pd.get_option("mode.data_manager") == "array", reason="JSON C code relies on Blocks" -) +pytestmark = td.skip_array_manager_not_yet_implemented @pytest.fixture diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index d9ece1095c092..44a1fb1457c19 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -16,9 +16,7 @@ from pandas import DataFrame, DatetimeIndex, Series, Timestamp, compat, read_json import pandas._testing as tm -pytestmark = pytest.mark.skipif( - pd.get_option("mode.data_manager") == "array", reason="JSON C code relies on Blocks" -) +pytestmark = td.skip_array_manager_not_yet_implemented _seriesd = tm.getSeriesData() diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py index 9c28621e4406f..48ad621ec96ad 100644 --- a/pandas/tests/io/json/test_readlines.py +++ b/pandas/tests/io/json/test_readlines.py @@ -3,15 +3,15 @@ import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import DataFrame, read_json import pandas._testing as tm from pandas.io.json._json import JsonReader -pytestmark = pytest.mark.skipif( - pd.get_option("mode.data_manager") == "array", reason="JSON C code relies on Blocks" -) +pytestmark = td.skip_array_manager_not_yet_implemented @pytest.fixture diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index 32d04a59da908..30173d7953689 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -16,14 +16,12 @@ import pandas._libs.json as ujson from pandas._libs.tslib import Timestamp from pandas.compat import IS64, is_platform_windows +import pandas.util._test_decorators as td -import pandas as pd from pandas import DataFrame, DatetimeIndex, Index, NaT, Series, Timedelta, date_range import pandas._testing as tm -pytestmark = pytest.mark.skipif( - pd.get_option("mode.data_manager") 
== "array", reason="JSON C code relies on Blocks" -) +pytestmark = td.skip_array_manager_not_yet_implemented def _clean_dict(d): diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 85a12a13d19fb..c600293ad2011 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -271,7 +271,13 @@ def test_read_fspath_all(self, reader, module, path, datapath): ("to_excel", {"engine": "xlwt"}, "xlwt"), ("to_feather", {}, "pyarrow"), ("to_html", {}, "os"), - ("to_json", {}, "os"), + ( + pytest.param( + "to_json", marks=td.skip_array_manager_not_yet_implemented + ), + {}, + "os", + ), ("to_latex", {}, "os"), ("to_pickle", {}, "os"), ("to_stata", {"time_stamp": pd.to_datetime("2019-01-01 00:00")}, "os"), diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index 31e9ad4cf4416..d65512074199b 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -8,11 +8,15 @@ import pytest +import pandas.util._test_decorators as td + import pandas as pd import pandas._testing as tm import pandas.io.common as icom +pytestmark = td.skip_array_manager_not_yet_implemented + @pytest.mark.parametrize( "obj", diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index 78facd6694635..f145ed17ca304 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -31,6 +31,8 @@ def test_foo(): import numpy as np import pytest +from pandas._config import get_option + from pandas.compat import IS64, is_platform_windows from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import _np_version @@ -277,3 +279,8 @@ def async_mark(): async_mark = pytest.mark.skip(reason="Missing dependency pytest-asyncio") return async_mark + + +skip_array_manager_not_yet_implemented = pytest.mark.skipif( + get_option("mode.data_manager") == "array", reason="JSON C code relies on Blocks" +) From 47c3ee3004d5a5b9773b1719b5a2d6552a7e5b75 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 17 Sep 2020 08:57:01 +0200 Subject: [PATCH 09/29] support both ndarrays and ExtensionArrays --- pandas/core/internals/concat.py | 2 +- pandas/core/internals/managers.py | 101 +++++++++++++++++++++--------- 2 files changed, 71 insertions(+), 32 deletions(-) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 54e0262711ad7..15151a1ee3c57 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -53,7 +53,7 @@ def concatenate_block_managers( # TODO for now only fastpath without indexers mgrs = [t[0] for t in mgrs_indexers] arrays = [ - np.concatenate([mgrs[i].arrays[j] for i in range(len(mgrs))]) + concat_compat([mgrs[i].arrays[j] for i in range(len(mgrs))], axis=0) for j in range(len(mgrs[0].arrays)) ] return ArrayManager(arrays, [axes[1], axes[0]]) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index df8c383c53556..7d3046fb71088 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -26,6 +26,7 @@ ) from pandas.core.dtypes.common import ( DT64NS_DTYPE, + is_bool_dtype, is_dtype_equal, is_extension_array_dtype, is_list_like, @@ -37,7 +38,7 @@ from pandas.core.dtypes.missing import array_equals, isna import pandas.core.algorithms as algos -from pandas.core.arrays import ExtensionArray +from pandas.core.arrays import ExtensionArray, PandasDtype from pandas.core.arrays.sparse import SparseDtype from pandas.core.base import 
PandasObject from pandas.core.construction import extract_array @@ -116,12 +117,12 @@ class ArrayManager(DataManager): "arrays", ] - arrays: List[np.ndarray] + arrays: List[Union[np.ndarray, ExtensionArray]] axes: Sequence[Index] def __init__( self, - arrays: List[np.ndarray], + arrays: List[Union[np.ndarray, ExtensionArray]], axes: Sequence[Index], do_integrity_check: bool = True, ): @@ -212,11 +213,16 @@ def _verify_integrity(self) -> None: "Number of passed arrays must equal the size of the column Index: " f"{len(self.arrays)} arrays vs {n_columns} columns." ) - for array in self.arrays: - if not len(array) == n_rows: + for arr in self.arrays: + if not len(arr) == n_rows: raise ValueError( "Passed arrays should have the same length as the rows Index: " - f"{len(array)} vs {n_rows} rows" + f"{len(arr)} vs {n_rows} rows" + ) + if not isinstance(arr, (np.ndarray, ExtensionArray)): + raise ValueError( + "Passed arrays should be np.ndarray or ExtensionArray instances, " + f"got {type(arr)} instead" ) def reduce(self: T, func) -> T: @@ -224,8 +230,8 @@ def reduce(self: T, func) -> T: assert self.ndim == 2 res_arrays = [] - for array in self.arrays: - res = func(array) + for arr in self.arrays: + res = func(arr) res_arrays.append(np.array([res])) index = Index([0]) # placeholder @@ -290,7 +296,7 @@ def apply( else: kwargs[k] = obj.iloc[:, i]._values else: - # otherwise we have an ndarray + # otherwise we have an array-like kwargs[k] = obj[i] try: @@ -302,6 +308,9 @@ def apply( if not ignore_failures: raise continue + # if not isinstance(applied, ExtensionArray): + # # TODO not all EA operations return new EAs (eg astype) + # applied = array(applied) result_arrays.append(applied) result_indices.append(i) @@ -323,10 +332,9 @@ def apply_with_block(self: T, f, align_keys=None, **kwargs) -> T: result_arrays = [] - for i, array in enumerate(self.arrays): + for i, arr in enumerate(self.arrays): if aligned_args: - for k, obj in aligned_args.items(): if isinstance(obj, (ABCSeries, ABCDataFrame)): # The caller is responsible for ensuring that @@ -339,13 +347,17 @@ def apply_with_block(self: T, f, align_keys=None, **kwargs) -> T: # otherwise we have an ndarray kwargs[k] = obj[[i]] - block = make_block(np.atleast_2d(array), placement=slice(0, 1, 1), ndim=2) + if isinstance(arr, np.ndarray): + arr = np.atleast_2d(arr) + block = make_block(arr, placement=slice(0, 1, 1), ndim=2) applied = getattr(block, f)(**kwargs) while isinstance(applied, list): # ObjectBlock gives double nested result?, some functions give no list applied = applied[0] - applied_array = applied.values[0, :] - result_arrays.append(applied_array) + arr = applied.values + if isinstance(arr, np.ndarray): + arr = arr[0, :] + result_arrays.append(arr) return type(self)(result_arrays, self._axes) @@ -419,7 +431,7 @@ def array_fillna(array, value, limit, inplace): mask = isna(array) if limit is not None: - limit = libalgos._validate_limit(None, limit=limit) + limit = libalgos.validate_limit(None, limit=limit) mask[mask.cumsum() > limit] = False # if not self._can_hold_na: @@ -430,7 +442,10 @@ def array_fillna(array, value, limit, inplace): if not inplace: array = array.copy() - np.putmask(array, mask, value) + # np.putmask(array, mask, value) + if np.any(mask): + # TODO allow invalid value if there is nothing to fill? 
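# Note on the assignment just below: ``array[mask] = value`` goes through
# ``__setitem__`` with a boolean mask, which np.ndarray and ExtensionArray
# both support, whereas ``np.putmask`` only accepts a real ndarray.
# Rough illustration (assuming a pandas build that includes this series):
#
#     arr = pd.array([1, None, 3], dtype="Int64")   # ExtensionArray
#     arr[arr.isna()] = 0                           # fills the missing value
#     # np.putmask(arr, arr.isna(), 0) would raise TypeError instead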
+ array[mask] = value return array return self.apply(array_fillna, value=value, limit=limit, inplace=inplace) @@ -510,7 +525,7 @@ def get_bool_data(self, copy: bool = False) -> "BlockManager": copy : bool, default False Whether to copy the blocks """ - mask = self.get_dtypes() == np.dtype("bool") + mask = np.array([is_bool_dtype(t) for t in self.get_dtypes()], dtype="object") arrays = [self.arrays[i] for i in np.nonzero(mask)[0]] # TODO copy? new_axes = [self._axes[0], self._axes[1][mask]] @@ -598,6 +613,15 @@ def as_array( if not dtype: dtype = _interleaved_dtype(self.arrays) + if isinstance(dtype, SparseDtype): + dtype = dtype.subtype + elif isinstance(dtype, PandasDtype): + dtype = dtype.numpy_dtype + elif is_extension_array_dtype(dtype): + dtype = "object" + elif is_dtype_equal(dtype, str): + dtype = "object" + result = np.empty(self.shape_proper, dtype=dtype) for i, arr in enumerate(self.arrays): @@ -636,7 +660,22 @@ def fast_xs(self, loc: int) -> ArrayLike: np.ndarray or ExtensionArray """ dtype = _interleaved_dtype(self.arrays) - return np.array([a[loc] for a in self.arrays], dtype=dtype) + + if isinstance(dtype, SparseDtype): + temp_dtype = dtype.subtype + elif isinstance(dtype, PandasDtype): + temp_dtype = dtype.numpy_dtype + elif is_extension_array_dtype(dtype): + temp_dtype = "object" + elif is_dtype_equal(dtype, str): + temp_dtype = "object" + else: + temp_dtype = dtype + + result = np.array([arr[loc] for arr in self.arrays], dtype=temp_dtype) + if isinstance(dtype, ExtensionDtype): + result = dtype.construct_array_type()._from_sequence(result, dtype=dtype) + return result def iget(self, i: int) -> "SingleBlockManager": """ @@ -669,15 +708,14 @@ def iset(self, loc: Union[int, slice, np.ndarray], value): contained in the current set of items """ if lib.is_integer(loc): - # TODO normalize array -> this should in theory not be needed - if isinstance(value, ExtensionArray): - import pytest + # TODO normalize array -> this should in theory not be needed? 
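# ``extract_array(value, extract_numpy=True)`` on the next line unwraps a
# Series/Index to its underlying ndarray or ExtensionArray (and a
# PandasArray down to the plain ndarray), so ``self.arrays`` only ever
# holds bare arrays.  Rough illustration (assuming this series):
#
#     from pandas.core.construction import extract_array
#     extract_array(pd.Series([1, 2, 3]), extract_numpy=True)
#     # -> array([1, 2, 3])
#     extract_array(pd.Series([1, 2, 3], dtype="Int64"), extract_numpy=True)
#     # -> IntegerArray [1, 2, 3]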
+ value = extract_array(value, extract_numpy=True) + if isinstance(value, np.ndarray) and value.ndim == 2: + value = value[0, :] - pytest.skip() - value = np.asarray(value) + assert isinstance(value, (np.ndarray, ExtensionArray)) + # value = np.asarray(value) # assert isinstance(value, np.ndarray) - if value.ndim == 2: - value = value[0, :] assert len(value) == len(self._axes[0]) self.arrays[loc] = value return @@ -708,6 +746,7 @@ def insert(self, loc: int, item: Label, value, allow_duplicates: bool = False): # insert to the axis; this could possibly raise a TypeError new_axis = self.items.insert(loc, item) + value = extract_array(value, extract_numpy=True) if value.ndim == 2: value = value[0, :] # TODO self.arrays can be empty @@ -784,13 +823,13 @@ def _reindex_indexer( else: new_arrays = [ algos.take( - array, + arr, indexer, allow_fill=True, fill_value=fill_value, # if fill_value is not None else blk.fill_value ) - for array in self.arrays + for arr in self.arrays ] new_axes = list(self._axes) @@ -835,8 +874,8 @@ def _make_na_array(self, fill_value=None): def to_native_types(self, **kwargs): result_arrays = [] - for i, array in enumerate(self.arrays): - block = make_block(np.atleast_2d(array), placement=slice(0, 1, 1), ndim=2) + for i, arr in enumerate(self.arrays): + block = make_block(np.atleast_2d(arr), placement=slice(0, 1, 1), ndim=2) res = block.to_native_types(**kwargs) result_arrays.append(res[0, :]) @@ -2321,8 +2360,8 @@ def get_slice(self, slobj: slice, axis: int = 0) -> "SingleBlockManager": raise IndexError("Requested axis not found in manager") blk = self._block - array = blk._slice(slobj) - block = blk.make_block_same_class(array, placement=slice(0, len(array))) + arr = blk._slice(slobj) + block = blk.make_block_same_class(arr, placement=slice(0, len(arr))) return type(self)(block, self.index[slobj]) @property From f36e395d9936098659e14e3828b2fd02ade5b679 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 17 Sep 2020 09:23:49 +0200 Subject: [PATCH 10/29] add unstack --- pandas/core/internals/managers.py | 34 ++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 4df73bf831054..10d465254381e 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -881,9 +881,41 @@ def to_native_types(self, **kwargs): return result_arrays + def unstack(self, unstacker, fill_value) -> "ArrayManager": + """ + Return a BlockManager with all blocks unstacked.. + + Parameters + ---------- + unstacker : reshape._Unstacker + fill_value : Any + fill_value for newly introduced missing values. 
+ + Returns + ------- + unstacked : BlockManager + """ + indexer, _ = unstacker._indexer_and_to_sort + new_indexer = np.full(unstacker.mask.shape, -1) + new_indexer[unstacker.mask] = indexer + new_indexer2D = new_indexer.reshape(*unstacker.full_shape) + + new_arrays = [] + for arr in self.arrays: + for i in range(unstacker.full_shape[1]): + new_arr = algos.take( + arr, new_indexer2D[:, i], allow_fill=True, fill_value=fill_value + ) + new_arrays.append(new_arr) + + new_index = unstacker.new_index + new_columns = unstacker.get_new_columns(self._axes[1]) + new_axes = [new_index, new_columns] + + return type(self)(new_arrays, new_axes, do_integrity_check=False) + # TODO # equals - # unstack # to_dict # quantile From be20816ae16e1799c5f1c0a711a78ed9ba3d39c7 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 17 Sep 2020 09:57:47 +0200 Subject: [PATCH 11/29] fix native types, skip quantile, hdf, stata tests --- pandas/core/frame.py | 2 +- pandas/core/internals/managers.py | 22 +++++++++++--------- pandas/tests/frame/methods/test_describe.py | 5 +++++ pandas/tests/frame/methods/test_quantile.py | 4 ++++ pandas/tests/frame/test_api.py | 2 +- pandas/tests/io/pytables/test_complex.py | 4 ++++ pandas/tests/io/pytables/test_store.py | 4 ++++ pandas/tests/io/pytables/test_timezones.py | 3 +++ pandas/tests/io/test_stata.py | 5 +++++ pandas/tests/series/methods/test_describe.py | 5 +++++ pandas/tests/series/methods/test_quantile.py | 4 ++++ pandas/util/_test_decorators.py | 5 +++++ 12 files changed, 53 insertions(+), 12 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3480f5b471293..b95ed90b6dad2 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5368,7 +5368,7 @@ def sort_values( # type: ignore[override] ) if ignore_index: - new_data.axes[1] = ibase.default_index(len(indexer)) + new_data.set_axis(1, ibase.default_index(len(indexer))) result = self._constructor(new_data) if inplace: diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 10d465254381e..b669a0dbb834b 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -347,6 +347,9 @@ def apply_with_block(self: T, f, align_keys=None, **kwargs) -> T: # otherwise we have an ndarray kwargs[k] = obj[[i]] + if hasattr(arr, "tz") and arr.tz is None: + # DatetimeArray needs to be converted to ndarray for DatetimeBlock + arr = arr._data if isinstance(arr, np.ndarray): arr = np.atleast_2d(arr) block = make_block(arr, placement=slice(0, 1, 1), ndim=2) @@ -499,6 +502,9 @@ def replace_list( regex=regex, ) + def to_native_types(self, **kwargs): + return self.apply_with_block("to_native_types", **kwargs) + @property def is_mixed_type(self) -> bool: return True @@ -518,6 +524,12 @@ def is_view(self) -> bool: # TODO what is this used for? return False + @property + def _is_single_block(self) -> bool: + # TODO should we avoid using it from outside the blockmanager since + # it is a private property? (eg use is_mixed_type instead?) 
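# ``to_native_types`` above is another use of the ``apply_with_block``
# bridge: each 1D column array is temporarily wrapped in a 2D Block so the
# existing Block method can be reused, and the result is unwrapped back to
# a 1D array.  Simplified sketch of that round-trip (list/ExtensionArray
# handling omitted; ``name`` and ``kwargs`` stand for the method name and
# its keywords):
#
#     block = make_block(np.atleast_2d(arr), placement=slice(0, 1, 1), ndim=2)
#     applied = getattr(block, name)(**kwargs)   # reuse the Block method
#     new_arr = applied.values[0, :]             # back to a 1D column array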
+ return False + def get_bool_data(self, copy: bool = False) -> "BlockManager": """ Parameters @@ -871,16 +883,6 @@ def _make_na_array(self, fill_value=None): values.fill(fill_value) return values - def to_native_types(self, **kwargs): - result_arrays = [] - - for i, arr in enumerate(self.arrays): - block = make_block(np.atleast_2d(arr), placement=slice(0, 1, 1), ndim=2) - res = block.to_native_types(**kwargs) - result_arrays.append(res[0, :]) - - return result_arrays - def unstack(self, unstacker, fill_value) -> "ArrayManager": """ Return a BlockManager with all blocks unstacked.. diff --git a/pandas/tests/frame/methods/test_describe.py b/pandas/tests/frame/methods/test_describe.py index 0b70bead375da..7c3ac98431ef1 100644 --- a/pandas/tests/frame/methods/test_describe.py +++ b/pandas/tests/frame/methods/test_describe.py @@ -1,9 +1,14 @@ import numpy as np +import pandas.util._test_decorators as td + import pandas as pd from pandas import Categorical, DataFrame, Series, Timestamp, date_range import pandas._testing as tm +# TODO(ArrayManager) quantile is needed for describe() +pytestmark = td.skip_array_manager_not_yet_implemented + class TestDataFrameDescribe: def test_describe_bool_in_mixed_frame(self): diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py index 0b8f1e0495155..7e8ae42c59759 100644 --- a/pandas/tests/frame/methods/test_quantile.py +++ b/pandas/tests/frame/methods/test_quantile.py @@ -1,10 +1,14 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import DataFrame, Series, Timestamp import pandas._testing as tm +pytestmark = td.skip_array_manager_not_yet_implemented + class TestDataFrameQuantile: @pytest.mark.parametrize( diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 21f737d3bc4c0..5c381f94ec4d9 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -447,7 +447,7 @@ def test_with_datetimelikes(self): expected = Series({np.dtype("object"): 10}) tm.assert_series_equal(result, expected) - @pytest.mark.skip + @td.skip_array_manager_invalid_test def test_values(self, float_frame): float_frame.values[:, 0] = 5.0 assert (float_frame.values[:, 0] == 5).all() diff --git a/pandas/tests/io/pytables/test_complex.py b/pandas/tests/io/pytables/test_complex.py index 543940e674dba..0c896f116ce25 100644 --- a/pandas/tests/io/pytables/test_complex.py +++ b/pandas/tests/io/pytables/test_complex.py @@ -12,6 +12,10 @@ from pandas.io.pytables import read_hdf +# TODO(ArrayManager) HDFStore relies on accessing the blocks +pytestmark = td.skip_array_manager_not_yet_implemented + + # GH10447 diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index 0942c79837e7c..f9be264100c46 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -56,6 +56,10 @@ from pandas.io.pytables import TableIterator # noqa: E402 isort:skip +# TODO(ArrayManager) HDFStore relies on accessing the blocks +pytestmark = td.skip_array_manager_not_yet_implemented + + _default_compressor = "blosc" ignore_natural_naming_warning = pytest.mark.filterwarnings( "ignore:object name:tables.exceptions.NaturalNameWarning" diff --git a/pandas/tests/io/pytables/test_timezones.py b/pandas/tests/io/pytables/test_timezones.py index 1c29928991cde..f439e184b1144 100644 --- a/pandas/tests/io/pytables/test_timezones.py +++ b/pandas/tests/io/pytables/test_timezones.py @@ -14,6 +14,9 @@ 
ensure_clean_store, ) +# TODO(ArrayManager) HDFStore relies on accessing the blocks +pytestmark = td.skip_array_manager_not_yet_implemented + def _compare_with_tz(a, b): tm.assert_frame_equal(a, b) diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 88f61390957a6..fdd4b22ec0028 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -12,6 +12,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas.core.dtypes.common import is_categorical_dtype import pandas as pd @@ -29,6 +31,9 @@ read_stata, ) +# TODO(ArrayManager) the stata code relies on BlockManager internals (eg blknos) +pytestmark = td.skip_array_manager_not_yet_implemented + @pytest.fixture() def mixed_frame(): diff --git a/pandas/tests/series/methods/test_describe.py b/pandas/tests/series/methods/test_describe.py index a15dc0751aa7d..e479e5c1416db 100644 --- a/pandas/tests/series/methods/test_describe.py +++ b/pandas/tests/series/methods/test_describe.py @@ -1,8 +1,13 @@ import numpy as np +import pandas.util._test_decorators as td + from pandas import Period, Series, Timedelta, Timestamp, date_range import pandas._testing as tm +# TODO(ArrayManager) quantile is needed for describe() +pytestmark = td.skip_array_manager_not_yet_implemented + class TestSeriesDescribe: def test_describe(self): diff --git a/pandas/tests/series/methods/test_quantile.py b/pandas/tests/series/methods/test_quantile.py index 79f50afca658f..e69f1dd09c537 100644 --- a/pandas/tests/series/methods/test_quantile.py +++ b/pandas/tests/series/methods/test_quantile.py @@ -1,6 +1,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas.core.dtypes.common import is_integer import pandas as pd @@ -8,6 +10,8 @@ import pandas._testing as tm from pandas.core.indexes.datetimes import Timestamp +pytestmark = td.skip_array_manager_not_yet_implemented + class TestSeriesQuantile: def test_quantile(self, datetime_series): diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index f259c90d7b8b1..22af25f94c8db 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -285,3 +285,8 @@ def async_mark(): skip_array_manager_not_yet_implemented = pytest.mark.skipif( get_option("mode.data_manager") == "array", reason="JSON C code relies on Blocks" ) + +skip_array_manager_invalid_test = pytest.mark.skipif( + get_option("mode.data_manager") == "array", + reason="Test that relies on BlockManager internals or specific behaviour", +) From 8b7cc8157a3a8959f48c007f808a6198927ea9b3 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 17 Sep 2020 10:25:33 +0200 Subject: [PATCH 12/29] remove skip in the benchmarks --- asv_bench/benchmarks/stat_ops.py | 3 --- pandas/core/config_init.py | 2 ++ pandas/core/frame.py | 4 ++-- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py index 74a1fe7295273..5639d6702a92c 100644 --- a/asv_bench/benchmarks/stat_ops.py +++ b/asv_bench/benchmarks/stat_ops.py @@ -11,9 +11,6 @@ class FrameOps: param_names = ["op", "dtype", "axis"] def setup(self, op, dtype, axis): - if dtype == "Int64": - # XXX only dealing with numpy arrays in ArrayManager right now - raise NotImplementedError if op == "mad" and dtype == "Int64": # GH-33036, GH#33600 raise NotImplementedError diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index bd557783b27e7..71ef5c28b11d9 100644 --- 
a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -484,6 +484,8 @@ def use_inf_as_na_cb(key): ) cf.register_option( "data_manager", + # TODO switch back to default of "block" before merging + # "block", "array", "internal manager type", validator=is_one_of_factory(["block", "array"]), diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b95ed90b6dad2..aaa844a3b304f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -437,8 +437,8 @@ def __init__( columns: Optional[Axes] = None, dtype: Optional[Dtype] = None, copy: bool = False, - # TODO setting default to "array" for testing purposes (the actual default - # needs to stay "block" initially of course for backwards compatibility) + # TODO do we want to keep this as a keyword as well? (I think it can be handy) + # can we somehow make it a "private" keyword? (`_manager` ?) manager: Optional[str] = None, ): if data is None: From 55d38be16538607fb6d0b5ac3aa01797355d0173 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 16 Oct 2020 09:56:27 +0200 Subject: [PATCH 13/29] remove manager keyword from DataFrame constructor, add _as_manager instead --- pandas/core/frame.py | 46 ++++++++++++++++++++----- pandas/tests/frame/test_api.py | 3 +- pandas/tests/internals/test_managers.py | 38 ++++++++++++++++++++ 3 files changed, 78 insertions(+), 9 deletions(-) create mode 100644 pandas/tests/internals/test_managers.py diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a517d855dc3ce..ed11545e60d37 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -442,9 +442,6 @@ def __init__( columns: Optional[Axes] = None, dtype: Optional[Dtype] = None, copy: bool = False, - # TODO do we want to keep this as a keyword as well? (I think it can be handy) - # can we somehow make it a "private" keyword? (`_manager` ?) - manager: Optional[str] = None, ): if data is None: data = {} @@ -561,18 +558,51 @@ def __init__( values, index, columns, dtype=values.dtype, copy=False ) - if manager is None: - manager = get_option("mode.data_manager") + manager = get_option("mode.data_manager") if manager == "array" and not isinstance(mgr, ArrayManager): # TODO proper initialization - df = DataFrame(mgr, manager="block") - arrays = [arr.copy() for arr in df._iter_column_arrays()] - mgr = ArrayManager(arrays, [mgr.axes[1], mgr.axes[0]]) + df = DataFrame(mgr) + mgr = df._as_manager("array")._mgr # TODO check for case of manager="block" but mgr is ArrayManager NDFrame.__init__(self, mgr) + def _as_manager(self, typ): + """ + Private helper function to create a DataFrame with specific manager. + + Parameters + ---------- + mgr : {"block", "array"} + + Returns + ------- + DataFrame + New DataFrame using specified manager type. Is not guaranteed + to be a copy or not. 
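# Rough usage sketch of the new option together with this helper (assumes
# a build that includes this series):
#
#     with pd.option_context("mode.data_manager", "array"):
#         df = pd.DataFrame({"a": [1, 2, 3]})   # backed by ArrayManager
#     df_block = df._as_manager("block")        # BlockManager-backed DataFrame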
+ """ + mgr = self._mgr + if typ == "block": + if isinstance(mgr, BlockManager): + new_mgr = mgr + else: + new_mgr = arrays_to_mgr( + mgr.arrays, mgr.axes[0], mgr.axes[1], mgr.axes[0], dtype=None + ) + elif typ == "array": + if isinstance(mgr, ArrayManager): + new_mgr = mgr + else: + arrays = [arr.copy() for arr in self._iter_column_arrays()] + new_mgr = ArrayManager(arrays, [mgr.axes[1], mgr.axes[0]]) + else: + raise ValueError( + f"'typ' needs to be one of {{'block', 'array'}}, got '{type}'" + ) + # fastpath of passing a manager doesn't check the option/manager class + return DataFrame(new_mgr) + # ---------------------------------------------------------------------- @property diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 7b8fbbfbe8952..d1fc31953b6d2 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -355,7 +355,8 @@ def test_to_numpy_dtype(self): def test_to_numpy_copy(self): arr = np.random.randn(4, 3) - df = pd.DataFrame(arr, manager="block") + with pd.option_context("mode.data_manager", "block"): + df = pd.DataFrame(arr) assert df.values.base is arr assert df.to_numpy(copy=False).base is arr assert df.to_numpy(copy=True).base is not arr diff --git a/pandas/tests/internals/test_managers.py b/pandas/tests/internals/test_managers.py new file mode 100644 index 0000000000000..9c9ca950b4af9 --- /dev/null +++ b/pandas/tests/internals/test_managers.py @@ -0,0 +1,38 @@ +""" +Testing interaction between the different managers (BlockManager, ArrayManager) +""" +import pandas as pd +import pandas._testing as tm +from pandas.core.internals import ArrayManager, BlockManager + + +def test_dataframe_creation(): + + with pd.option_context("mode.data_manager", "block"): + df_block = pd.DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "c": [4, 5, 6]}) + assert isinstance(df_block._mgr, BlockManager) + + with pd.option_context("mode.data_manager", "array"): + df_array = pd.DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "c": [4, 5, 6]}) + assert isinstance(df_array._mgr, ArrayManager) + + # also ensure both are seen as equal + tm.assert_frame_equal(df_block, df_array) + + # conversion from one manager to the other + result = df_block._as_manager("block") + assert isinstance(result._mgr, BlockManager) + result = df_block._as_manager("array") + assert isinstance(result._mgr, ArrayManager) + tm.assert_frame_equal(result, df_block) + assert all( + tm.array_equivalent(left, right) + for left, right in zip(result._mgr.arrays, df_array._mgr.arrays) + ) + + result = df_array._as_manager("array") + assert isinstance(result._mgr, ArrayManager) + result = df_array._as_manager("block") + assert isinstance(result._mgr, BlockManager) + tm.assert_frame_equal(result, df_array) + assert len(result._mgr.blocks) == 2 From 3dea0d7efb916c737994406dae27f62ed1dbb20b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 16 Oct 2020 10:22:15 +0200 Subject: [PATCH 14/29] move new ArrayManager code to separate file --- pandas/core/internals/__init__.py | 2 +- pandas/core/internals/array_manager.py | 884 +++++++++++++++++++++++++ pandas/core/internals/base.py | 39 ++ pandas/core/internals/concat.py | 3 +- pandas/core/internals/managers.py | 871 +----------------------- 5 files changed, 930 insertions(+), 869 deletions(-) create mode 100644 pandas/core/internals/array_manager.py create mode 100644 pandas/core/internals/base.py diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py index 209ccfc4a4b81..9b09344871e98 100644 
--- a/pandas/core/internals/__init__.py +++ b/pandas/core/internals/__init__.py @@ -1,3 +1,4 @@ +from pandas.core.internals.array_manager import ArrayManager from pandas.core.internals.blocks import ( # io.pytables, io.packers Block, BoolBlock, @@ -15,7 +16,6 @@ ) from pandas.core.internals.concat import concatenate_block_managers from pandas.core.internals.managers import ( - ArrayManager, BlockManager, SingleBlockManager, create_block_manager_from_arrays, diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py new file mode 100644 index 0000000000000..75723206ecde6 --- /dev/null +++ b/pandas/core/internals/array_manager.py @@ -0,0 +1,884 @@ +""" +Experimental manager based on storing a collection of 1D arrays +""" +from typing import TYPE_CHECKING, Any, List, Optional, Sequence, Tuple, TypeVar, Union + +import numpy as np + +from pandas._libs import algos as libalgos, lib +from pandas._typing import ArrayLike, DtypeObj, Label +from pandas.util._validators import validate_bool_kwarg + +from pandas.core.dtypes.cast import find_common_type, infer_dtype_from_scalar +from pandas.core.dtypes.common import ( + is_bool_dtype, + is_dtype_equal, + is_extension_array_dtype, + is_numeric_dtype, +) +from pandas.core.dtypes.dtypes import ExtensionDtype +from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries +from pandas.core.dtypes.missing import isna + +import pandas.core.algorithms as algos +from pandas.core.arrays import ExtensionArray, PandasDtype +from pandas.core.arrays.sparse import SparseDtype +from pandas.core.construction import extract_array +from pandas.core.indexers import maybe_convert_indices +from pandas.core.indexes.api import Index, ensure_index +from pandas.core.internals.base import DataManager +from pandas.core.internals.blocks import make_block + +if TYPE_CHECKING: + from pandas.core.internals.managers import SingleBlockManager + + +T = TypeVar("T", bound="ArrayManager") + + +class ArrayManager(DataManager): + """ + Core internal data structure to implement DataFrame and Series. + + Alternative to the BlockManager, storing a list of 1D arrays instead of + Blocks. 
+ + This is *not* a public API class + + Parameters + ---------- + arrays : Sequence of arrays + axes : Sequence of Index + do_integrity_check : bool, default True + + """ + + __slots__ = [ + "_axes", + "arrays", + ] + + arrays: List[Union[np.ndarray, ExtensionArray]] + axes: Sequence[Index] + + def __init__( + self, + arrays: List[Union[np.ndarray, ExtensionArray]], + axes: Sequence[Index], + do_integrity_check: bool = True, + ): + # Note: we are storing the axes in "_axes" in the (row, columns) order + # which contrasts the order how it is stored in BlockManager + self._axes = axes + self.arrays = arrays + + if do_integrity_check: + self._axes = [ensure_index(ax) for ax in axes] + self._verify_integrity() + + def make_empty(self: T, axes=None) -> T: + """Return an empty ArrayManager with the items axis of len 0 (no columns)""" + if axes is None: + axes = [self.axes[1:], Index([])] + + arrays = [] + return type(self)(arrays, axes) + + @property + def items(self) -> Index: + return self._axes[1] + + @property + def axes(self) -> Sequence[Index]: + """Axes is BlockManager-compatible order (columns, rows)""" + return [self._axes[1], self._axes[0]] + + @property + def shape(self) -> Tuple[int, ...]: + # this still gives the BlockManager-compatible transposed shape + return tuple(len(ax) for ax in self.axes) + + @property + def shape_proper(self) -> Tuple[int, ...]: + # this returns (n_rows, n_columns) + return tuple(len(ax) for ax in self._axes) + + @staticmethod + def _normalize_axis(axis): + # switch axis + axis = 1 if axis == 0 else 0 + return axis + + # TODO can be shared + def set_axis(self, axis: int, new_labels: Index) -> None: + # Caller is responsible for ensuring we have an Index object. + axis = self._normalize_axis(axis) + old_len = len(self._axes[axis]) + new_len = len(new_labels) + + if new_len != old_len: + raise ValueError( + f"Length mismatch: Expected axis has {old_len} elements, new " + f"values have {new_len} elements" + ) + + self._axes[axis] = new_labels + + def consolidate(self) -> "ArrayManager": + return self + + def is_consolidated(self) -> bool: + return True + + def _consolidate_inplace(self) -> None: + pass + + def get_dtypes(self): + return np.array([arr.dtype for arr in self.arrays], dtype="object") + + # TODO setstate getstate + + def __repr__(self) -> str: + output = type(self).__name__ + output += f"\nIndex: {self._axes[0]}" + output += f"\nColumns: {self._axes[1]}" + output += f"\n{len(self.arrays)} arrays:" + for arr in self.arrays: + output += f"\n{arr.dtype}" + return output + + def _verify_integrity(self) -> None: + n_rows, n_columns = self.shape_proper + if not len(self.arrays) == n_columns: + raise ValueError( + "Number of passed arrays must equal the size of the column Index: " + f"{len(self.arrays)} arrays vs {n_columns} columns." 
+ ) + for arr in self.arrays: + if not len(arr) == n_rows: + raise ValueError( + "Passed arrays should have the same length as the rows Index: " + f"{len(arr)} vs {n_rows} rows" + ) + if not isinstance(arr, (np.ndarray, ExtensionArray)): + raise ValueError( + "Passed arrays should be np.ndarray or ExtensionArray instances, " + f"got {type(arr)} instead" + ) + + def reduce(self: T, func) -> T: + # TODO this still fails because `func` assumes to work on 2D arrays + assert self.ndim == 2 + + res_arrays = [] + for arr in self.arrays: + res = func(arr) + res_arrays.append(np.array([res])) + + index = Index([0]) # placeholder + new_mgr = type(self)(res_arrays, [index, self.items]) + return new_mgr + + def operate_blockwise(self, other: "ArrayManager", array_op) -> "ArrayManager": + """ + Apply array_op blockwise with another (aligned) BlockManager. + """ + # TODO what if `other` is BlockManager ? + left_arrays = self.arrays + right_arrays = other.arrays + result_arrays = [array_op(l, r) for l, r in zip(left_arrays, right_arrays)] + return type(self)(result_arrays, self._axes) + + def apply( + self: T, + f, + align_keys: Optional[List[str]] = None, + ignore_failures: bool = False, + **kwargs, + ) -> T: + """ + Iterate over the arrays, collect and create a new ArrayManager. + + Parameters + ---------- + f : str or callable + Name of the Array method to apply. + align_keys: List[str] or None, default None + ignore_failures: bool, default False + **kwargs + Keywords to pass to `f` + + Returns + ------- + ArrayManager + """ + assert "filter" not in kwargs + + align_keys = align_keys or [] + result_arrays: List[np.ndarray] = [] + result_indices: List[int] = [] + # fillna: Series/DataFrame is responsible for making sure value is aligned + + aligned_args = {k: kwargs[k] for k in align_keys} + + if f == "apply": + f = kwargs.pop("func") + + for i, arr in enumerate(self.arrays): + + if aligned_args: + + for k, obj in aligned_args.items(): + if isinstance(obj, (ABCSeries, ABCDataFrame)): + # The caller is responsible for ensuring that + # obj.axes[-1].equals(self.items) + if obj.ndim == 1: + kwargs[k] = obj.iloc[i] + else: + kwargs[k] = obj.iloc[:, i]._values + else: + # otherwise we have an array-like + kwargs[k] = obj[i] + + try: + if callable(f): + applied = f(arr, **kwargs) + else: + applied = getattr(arr, f)(**kwargs) + except (TypeError, NotImplementedError): + if not ignore_failures: + raise + continue + # if not isinstance(applied, ExtensionArray): + # # TODO not all EA operations return new EAs (eg astype) + # applied = array(applied) + result_arrays.append(applied) + result_indices.append(i) + + if ignore_failures: + # TODO copy? 
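# ``apply`` maps ``f`` (an array method name or a callable) over the
# per-column arrays one at a time; with ``ignore_failures=True`` the
# columns whose call raised TypeError/NotImplementedError are dropped and
# the column Index is subset below via ``result_indices``, mirroring what
# BlockManager.apply does block-wise.  For example,
# ``mgr.apply(lambda arr: arr[::-1])`` returns a new ArrayManager with
# every column array reversed.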
+ new_axes = [self._axes[0], self._axes[1][result_indices]] + else: + new_axes = self._axes + + if len(result_arrays) == 0: + return self.make_empty(new_axes) + + return type(self)(result_arrays, new_axes) + + def apply_with_block(self: T, f, align_keys=None, **kwargs) -> T: + + align_keys = align_keys or [] + aligned_args = {k: kwargs[k] for k in align_keys} + + result_arrays = [] + + for i, arr in enumerate(self.arrays): + + if aligned_args: + for k, obj in aligned_args.items(): + if isinstance(obj, (ABCSeries, ABCDataFrame)): + # The caller is responsible for ensuring that + # obj.axes[-1].equals(self.items) + if obj.ndim == 1: + kwargs[k] = obj.iloc[[i]] + else: + kwargs[k] = obj.iloc[:, [i]]._values + else: + # otherwise we have an ndarray + kwargs[k] = obj[[i]] + + if hasattr(arr, "tz") and arr.tz is None: + # DatetimeArray needs to be converted to ndarray for DatetimeBlock + arr = arr._data + if isinstance(arr, np.ndarray): + arr = np.atleast_2d(arr) + block = make_block(arr, placement=slice(0, 1, 1), ndim=2) + applied = getattr(block, f)(**kwargs) + while isinstance(applied, list): + # ObjectBlock gives double nested result?, some functions give no list + applied = applied[0] + arr = applied.values + if isinstance(arr, np.ndarray): + arr = arr[0, :] + result_arrays.append(arr) + + return type(self)(result_arrays, self._axes) + + # TODO quantile + + def isna(self, func) -> "ArrayManager": + return self.apply("apply", func=func) + + def where( + self, other, cond, align: bool, errors: str, try_cast: bool, axis: int + ) -> "ArrayManager": + if align: + align_keys = ["other", "cond"] + else: + align_keys = ["cond"] + other = extract_array(other, extract_numpy=True) + + return self.apply_with_block( + "where", + align_keys=align_keys, + other=other, + cond=cond, + errors=errors, + try_cast=try_cast, + axis=axis, + ) + + # TODO what is this used for? + # def setitem(self, indexer, value) -> "ArrayManager": + # return self.apply_with_block("setitem", indexer=indexer, value=value) + + def putmask(self, mask, new, align: bool = True, axis: int = 0): + transpose = self.ndim == 2 + + if align: + align_keys = ["new", "mask"] + else: + align_keys = ["mask"] + new = extract_array(new, extract_numpy=True) + + return self.apply_with_block( + "putmask", + align_keys=align_keys, + mask=mask, + new=new, + inplace=True, + axis=axis, + transpose=transpose, + ) + + def diff(self, n: int, axis: int) -> "ArrayManager": + return self.apply_with_block("diff", n=n, axis=axis) + + def interpolate(self, **kwargs) -> "ArrayManager": + return self.apply_with_block("interpolate", **kwargs) + + def shift(self, periods: int, axis: int, fill_value) -> "ArrayManager": + if axis == 0 and self.ndim == 2: + # TODO column-wise shift + raise NotImplementedError + + return self.apply_with_block( + "shift", periods=periods, axis=axis, fill_value=fill_value + ) + + def fillna(self, value, limit, inplace: bool, downcast) -> "ArrayManager": + + inplace = validate_bool_kwarg(inplace, "inplace") + + def array_fillna(array, value, limit, inplace): + + mask = isna(array) + if limit is not None: + limit = libalgos.validate_limit(None, limit=limit) + mask[mask.cumsum() > limit] = False + + # if not self._can_hold_na: + # if inplace: + # return [self] + # else: + # return [self.copy()] + if not inplace: + array = array.copy() + + # np.putmask(array, mask, value) + if np.any(mask): + # TODO allow invalid value if there is nothing to fill? 
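# The ``limit`` handling above keeps only the first ``limit`` missing
# positions per column: once the running count of True values in ``mask``
# exceeds ``limit``, the remaining positions are switched off.  Worked
# example:
#
#     mask  = np.array([True, False, True, True])
#     limit = 2
#     mask.cumsum() > limit   # -> [False, False, False, True]
#     # mask becomes [True, False, True, False], so only the first two
#     # missing values are filled by the assignment just below.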
+ array[mask] = value + return array + + return self.apply(array_fillna, value=value, limit=limit, inplace=inplace) + + def downcast(self) -> "ArrayManager": + return self.apply_with_block("downcast") + + def astype( + self, dtype, copy: bool = False, errors: str = "raise" + ) -> "ArrayManager": + return self.apply("astype", dtype=dtype, copy=copy) # , errors=errors) + + def convert( + self, + copy: bool = True, + datetime: bool = True, + numeric: bool = True, + timedelta: bool = True, + coerce: bool = False, + ) -> "ArrayManager": + return self.apply_with_block( + "convert", + copy=copy, + datetime=datetime, + numeric=numeric, + timedelta=timedelta, + coerce=coerce, + ) + + def replace(self, value, **kwargs) -> "ArrayManager": + assert np.ndim(value) == 0, value + # TODO "replace" is right now implemented on the blocks, we should move + # it to general array algos so it can be reused here + return self.apply_with_block("replace", value=value, **kwargs) + + def replace_list( + self: T, + src_list: List[Any], + dest_list: List[Any], + inplace: bool = False, + regex: bool = False, + ) -> T: + """ do a list replace """ + inplace = validate_bool_kwarg(inplace, "inplace") + + return self.apply_with_block( + "_replace_list", + src_list=src_list, + dest_list=dest_list, + inplace=inplace, + regex=regex, + ) + + def to_native_types(self, **kwargs): + return self.apply_with_block("to_native_types", **kwargs) + + @property + def is_mixed_type(self) -> bool: + return True + + @property + def is_numeric_mixed_type(self) -> bool: + return False + + @property + def any_extension_types(self) -> bool: + """Whether any of the blocks in this manager are extension blocks""" + return False # any(block.is_extension for block in self.blocks) + + @property + def is_view(self) -> bool: + """ return a boolean if we are a single block and are a view """ + # TODO what is this used for? + return False + + @property + def _is_single_block(self) -> bool: + # TODO should we avoid using it from outside the blockmanager since + # it is a private property? (eg use is_mixed_type instead?) + return False + + def get_bool_data(self, copy: bool = False) -> "ArrayManager": + """ + Parameters + ---------- + copy : bool, default False + Whether to copy the blocks + """ + mask = np.array([is_bool_dtype(t) for t in self.get_dtypes()], dtype="object") + arrays = [self.arrays[i] for i in np.nonzero(mask)[0]] + # TODO copy? + new_axes = [self._axes[0], self._axes[1][mask]] + return type(self)(arrays, new_axes) + + def get_numeric_data(self, copy: bool = False) -> "ArrayManager": + """ + Parameters + ---------- + copy : bool, default False + Whether to copy the blocks + """ + mask = np.array([is_numeric_dtype(t) for t in self.get_dtypes()]) + arrays = [self.arrays[i] for i in np.nonzero(mask)[0]] + # TODO copy? + new_axes = [self._axes[0], self._axes[1][mask]] + return type(self)(arrays, new_axes) + + def copy(self: T, deep=True) -> T: + """ + Make deep or shallow copy of ArrayManager + + Parameters + ---------- + deep : bool or string, default True + If False, return shallow copy (do not copy data) + If 'all', copy data and a deep copy of the index + + Returns + ------- + BlockManager + """ + # this preserves the notion of view copying of axes + if deep: + # hit in e.g. 
tests.io.json.test_pandas + + def copy_func(ax): + return ax.copy(deep=True) if deep == "all" else ax.view() + + new_axes = [copy_func(ax) for ax in self._axes] + else: + new_axes = list(self._axes) + + if deep: + new_arrays = [arr.copy() for arr in self.arrays] + else: + new_arrays = self.arrays + return type(self)(new_arrays, new_axes) + + def as_array( + self, + transpose: bool = False, + dtype=None, + copy: bool = False, + na_value=lib.no_default, + ) -> np.ndarray: + """ + Convert the blockmanager data into an numpy array. + + Parameters + ---------- + transpose : bool, default False + If True, transpose the return array. + dtype : object, default None + Data type of the return array. + copy : bool, default False + If True then guarantee that a copy is returned. A value of + False does not guarantee that the underlying data is not + copied. + na_value : object, default lib.no_default + Value to be used as the missing value sentinel. + + Returns + ------- + arr : ndarray + """ + if len(self.arrays) == 0: + arr = np.empty(self.shape, dtype=float) + return arr.transpose() if transpose else arr + + # We want to copy when na_value is provided to avoid + # mutating the original object + copy = copy or na_value is not lib.no_default + + if not dtype: + dtype = _interleaved_dtype(self.arrays) + + if isinstance(dtype, SparseDtype): + dtype = dtype.subtype + elif isinstance(dtype, PandasDtype): + dtype = dtype.numpy_dtype + elif is_extension_array_dtype(dtype): + dtype = "object" + elif is_dtype_equal(dtype, str): + dtype = "object" + + result = np.empty(self.shape_proper, dtype=dtype) + + for i, arr in enumerate(self.arrays): + arr = arr.astype(dtype, copy=copy) + result[:, i] = arr + + if na_value is not lib.no_default: + result[isna(result)] = na_value + + return result + # return arr.transpose() if transpose else arr + + def get_slice(self, slobj: slice, axis: int = 0) -> "ArrayManager": + axis = self._normalize_axis(axis) + + if axis == 0: + arrays = [arr[slobj] for arr in self.arrays] + elif axis == 1: + arrays = self.arrays[slobj] + + new_axes = list(self._axes) + new_axes[axis] = new_axes[axis][slobj] + + return type(self)(arrays, new_axes, do_integrity_check=False) + + def fast_xs(self, loc: int) -> ArrayLike: + """ + Return the array corresponding to `frame.iloc[loc]`. + + Parameters + ---------- + loc : int + + Returns + ------- + np.ndarray or ExtensionArray + """ + dtype = _interleaved_dtype(self.arrays) + + if isinstance(dtype, SparseDtype): + temp_dtype = dtype.subtype + elif isinstance(dtype, PandasDtype): + temp_dtype = dtype.numpy_dtype + elif is_extension_array_dtype(dtype): + temp_dtype = "object" + elif is_dtype_equal(dtype, str): + temp_dtype = "object" + else: + temp_dtype = dtype + + result = np.array([arr[loc] for arr in self.arrays], dtype=temp_dtype) + if isinstance(dtype, ExtensionDtype): + result = dtype.construct_array_type()._from_sequence(result, dtype=dtype) + return result + + def iget(self, i: int) -> "SingleBlockManager": + """ + Return the data as a SingleBlockManager. + """ + from pandas.core.internals.managers import SingleBlockManager + + values = self.arrays[i] + block = make_block(values, placement=slice(0, len(values)), ndim=1) + + return SingleBlockManager(block, self._axes[0]) + + def iget_values(self, i: int) -> ArrayLike: + """ + Return the data for column i as the values (ndarray or ExtensionArray). 
+ """ + return self.arrays[i] + + def idelete(self, indexer): + """ + Delete selected locations in-place (new block and array, same BlockManager) + """ + to_keep = np.ones(self.shape[0], dtype=np.bool_) + to_keep[indexer] = False + + self.arrays = [self.arrays[i] for i in np.nonzero(to_keep)[0]] + self._axes = [self._axes[0], self._axes[1][to_keep]] + + def iset(self, loc: Union[int, slice, np.ndarray], value): + """ + Set new item in-place. Does not consolidate. Adds new Block if not + contained in the current set of items + """ + if lib.is_integer(loc): + # TODO normalize array -> this should in theory not be needed? + value = extract_array(value, extract_numpy=True) + if isinstance(value, np.ndarray) and value.ndim == 2: + value = value[0, :] + + assert isinstance(value, (np.ndarray, ExtensionArray)) + # value = np.asarray(value) + # assert isinstance(value, np.ndarray) + assert len(value) == len(self._axes[0]) + self.arrays[loc] = value + return + + # TODO + raise Exception + + def insert(self, loc: int, item: Label, value, allow_duplicates: bool = False): + """ + Insert item at selected position. + + Parameters + ---------- + loc : int + item : hashable + value : array_like + allow_duplicates: bool + If False, trying to insert non-unique item will raise + + """ + if not allow_duplicates and item in self.items: + # Should this be a different kind of error?? + raise ValueError(f"cannot insert {item}, already exists") + + if not isinstance(loc, int): + raise TypeError("loc must be int") + + # insert to the axis; this could possibly raise a TypeError + new_axis = self.items.insert(loc, item) + + value = extract_array(value, extract_numpy=True) + if value.ndim == 2: + value = value[0, :] + # TODO self.arrays can be empty + # assert len(value) == len(self.arrays[0]) + + # TODO is this copy needed? + arrays = self.arrays.copy() + arrays.insert(loc, value) + + self.arrays = arrays + self._axes[1] = new_axis + + def reindex_indexer( + self: T, + new_axis, + indexer, + axis: int, + fill_value=None, + allow_dups: bool = False, + copy: bool = True, + ) -> T: + axis = self._normalize_axis(axis) + return self._reindex_indexer( + new_axis, indexer, axis, fill_value, allow_dups, copy + ) + + def _reindex_indexer( + self: T, + new_axis, + indexer, + axis: int, + fill_value=None, + allow_dups: bool = False, + copy: bool = True, + ) -> T: + """ + Parameters + ---------- + new_axis : Index + indexer : ndarray of int64 or None + axis : int + fill_value : object, default None + allow_dups : bool, default False + copy : bool, default True + + + pandas-indexer with -1's only. 
+ """ + if indexer is None: + if new_axis is self._axes[axis] and not copy: + return self + + result = self.copy(deep=copy) + result._axes = list(self._axes) + result._axes[axis] = new_axis + return result + + # some axes don't allow reindexing with dups + if not allow_dups: + self._axes[axis]._can_reindex(indexer) + + # if axis >= self.ndim: + # raise IndexError("Requested axis not found in manager") + + if axis == 1: + new_arrays = [] + for i in indexer: + if i == -1: + arr = self._make_na_array(fill_value=fill_value) + else: + arr = self.arrays[i] + new_arrays.append(arr) + + else: + new_arrays = [ + algos.take( + arr, + indexer, + allow_fill=True, + fill_value=fill_value, + # if fill_value is not None else blk.fill_value + ) + for arr in self.arrays + ] + + new_axes = list(self._axes) + new_axes[axis] = new_axis + + return type(self)(new_arrays, new_axes) + + def take(self, indexer, axis: int = 1, verify: bool = True, convert: bool = True): + """ + Take items along any axis. + """ + axis = self._normalize_axis(axis) + + indexer = ( + np.arange(indexer.start, indexer.stop, indexer.step, dtype="int64") + if isinstance(indexer, slice) + else np.asanyarray(indexer, dtype="int64") + ) + + n = self.shape_proper[axis] + if convert: + indexer = maybe_convert_indices(indexer, n) + + if verify: + if ((indexer == -1) | (indexer >= n)).any(): + raise Exception("Indices must be nonzero and less than the axis length") + + new_labels = self._axes[axis].take(indexer) + return self._reindex_indexer( + new_axis=new_labels, indexer=indexer, axis=axis, allow_dups=True + ) + + def _make_na_array(self, fill_value=None): + if fill_value is None: + fill_value = np.nan + + dtype, fill_value = infer_dtype_from_scalar(fill_value) + values = np.empty(self.shape_proper[0], dtype=dtype) + values.fill(fill_value) + return values + + def unstack(self, unstacker, fill_value) -> "ArrayManager": + """ + Return a BlockManager with all blocks unstacked.. + + Parameters + ---------- + unstacker : reshape._Unstacker + fill_value : Any + fill_value for newly introduced missing values. + + Returns + ------- + unstacked : BlockManager + """ + indexer, _ = unstacker._indexer_and_to_sort + new_indexer = np.full(unstacker.mask.shape, -1) + new_indexer[unstacker.mask] = indexer + new_indexer2D = new_indexer.reshape(*unstacker.full_shape) + + new_arrays = [] + for arr in self.arrays: + for i in range(unstacker.full_shape[1]): + new_arr = algos.take( + arr, new_indexer2D[:, i], allow_fill=True, fill_value=fill_value + ) + new_arrays.append(new_arr) + + new_index = unstacker.new_index + new_columns = unstacker.get_new_columns(self._axes[1]) + new_axes = [new_index, new_columns] + + return type(self)(new_arrays, new_axes, do_integrity_check=False) + + # TODO + # equals + # to_dict + # quantile + + +def _interleaved_dtype(blocks) -> Optional[DtypeObj]: + """ + Find the common dtype for `blocks`. + + Parameters + ---------- + blocks : List[Block] + + Returns + ------- + dtype : np.dtype, ExtensionDtype, or None + None is returned when `blocks` is empty. + """ + if not len(blocks): + return None + + return find_common_type([b.dtype for b in blocks]) diff --git a/pandas/core/internals/base.py b/pandas/core/internals/base.py new file mode 100644 index 0000000000000..94ba46ddfba45 --- /dev/null +++ b/pandas/core/internals/base.py @@ -0,0 +1,39 @@ +""" +Base class for the internal managers. Both BlockManager and ArrayManager +inherit from this class. 
+""" +from pandas.core.base import PandasObject +from pandas.core.indexes.api import ensure_index + + +class DataManager(PandasObject): + + # TODO share more methods/attributes + + def __len__(self) -> int: + return len(self.items) + + @property + def ndim(self) -> int: + return len(self.axes) + + def reindex_axis( + self, + new_index, + axis: int, + method=None, + limit=None, + fill_value=None, + copy: bool = True, + ): + """ + Conform block manager to new index. + """ + new_index = ensure_index(new_index) + new_index, indexer = self.axes[axis].reindex( + new_index, method=method, limit=limit + ) + + return self.reindex_indexer( + new_index, indexer, axis=axis, fill_value=fill_value, copy=copy + ) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 460fc3c7f758f..e3cd644a152ce 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -26,8 +26,9 @@ import pandas.core.algorithms as algos from pandas.core.arrays import DatetimeArray, ExtensionArray +from pandas.core.internals.array_manager import ArrayManager from pandas.core.internals.blocks import make_block -from pandas.core.internals.managers import ArrayManager, BlockManager +from pandas.core.internals.managers import BlockManager if TYPE_CHECKING: from pandas.core.arrays.sparse.dtype import SparseDtype diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index eea27857a58d8..24caf0efc129d 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -16,7 +16,7 @@ import numpy as np -from pandas._libs import algos as libalgos, internals as libinternals, lib +from pandas._libs import internals as libinternals, lib from pandas._typing import ArrayLike, DtypeObj, Label from pandas.util._validators import validate_bool_kwarg @@ -27,11 +27,9 @@ ) from pandas.core.dtypes.common import ( DT64NS_DTYPE, - is_bool_dtype, is_dtype_equal, is_extension_array_dtype, is_list_like, - is_numeric_dtype, ) from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.dtypes import ExtensionDtype @@ -39,12 +37,11 @@ from pandas.core.dtypes.missing import array_equals, isna import pandas.core.algorithms as algos -from pandas.core.arrays import ExtensionArray, PandasDtype from pandas.core.arrays.sparse import SparseDtype -from pandas.core.base import PandasObject from pandas.core.construction import extract_array from pandas.core.indexers import maybe_convert_indices from pandas.core.indexes.api import Index, ensure_index +from pandas.core.internals.base import DataManager from pandas.core.internals.blocks import ( Block, CategoricalBlock, @@ -63,866 +60,6 @@ T = TypeVar("T", bound="BlockManager") -class DataManager(PandasObject): - - # TODO share more methods/attributes - - def __len__(self) -> int: - return len(self.items) - - @property - def ndim(self) -> int: - return len(self.axes) - - def reindex_axis( - self, - new_index, - axis: int, - method=None, - limit=None, - fill_value=None, - copy: bool = True, - ): - """ - Conform block manager to new index. - """ - new_index = ensure_index(new_index) - new_index, indexer = self.axes[axis].reindex( - new_index, method=method, limit=limit - ) - - return self.reindex_indexer( - new_index, indexer, axis=axis, fill_value=fill_value, copy=copy - ) - - -class ArrayManager(DataManager): - """ - Core internal data structure to implement DataFrame and Series. - - Alternative to the BlockManager, storing a list of 1D arrays instead of - Blocks. 
- - This is *not* a public API class - - Parameters - ---------- - arrays : Sequence of arrays - axes : Sequence of Index - do_integrity_check : bool, default True - - """ - - __slots__ = [ - "_axes", - "arrays", - ] - - arrays: List[Union[np.ndarray, ExtensionArray]] - axes: Sequence[Index] - - def __init__( - self, - arrays: List[Union[np.ndarray, ExtensionArray]], - axes: Sequence[Index], - do_integrity_check: bool = True, - ): - # Note: we are storing the axes in "_axes" in the (row, columns) order - # which contrasts the order how it is stored in BlockManager - self._axes = axes - self.arrays = arrays - - if do_integrity_check: - self._axes = [ensure_index(ax) for ax in axes] - self._verify_integrity() - - def make_empty(self: T, axes=None) -> T: - """Return an empty ArrayManager with the items axis of len 0 (no columns)""" - if axes is None: - axes = [self.axes[1:], Index([])] - - arrays = [] - return type(self)(arrays, axes) - - @property - def items(self) -> Index: - return self._axes[1] - - @property - def axes(self) -> Sequence[Index]: - """Axes is BlockManager-compatible order (columns, rows)""" - return [self._axes[1], self._axes[0]] - - @property - def shape(self) -> Tuple[int, ...]: - # this still gives the BlockManager-compatible transposed shape - return tuple(len(ax) for ax in self.axes) - - @property - def shape_proper(self) -> Tuple[int, ...]: - # this returns (n_rows, n_columns) - return tuple(len(ax) for ax in self._axes) - - @staticmethod - def _normalize_axis(axis): - # switch axis - axis = 1 if axis == 0 else 0 - return axis - - # TODO can be shared - def set_axis(self, axis: int, new_labels: Index) -> None: - # Caller is responsible for ensuring we have an Index object. - axis = self._normalize_axis(axis) - old_len = len(self._axes[axis]) - new_len = len(new_labels) - - if new_len != old_len: - raise ValueError( - f"Length mismatch: Expected axis has {old_len} elements, new " - f"values have {new_len} elements" - ) - - self._axes[axis] = new_labels - - def consolidate(self) -> "ArrayManager": - return self - - def is_consolidated(self) -> bool: - return True - - def _consolidate_inplace(self) -> None: - pass - - def get_dtypes(self): - return np.array([arr.dtype for arr in self.arrays], dtype="object") - - # TODO setstate getstate - - def __repr__(self) -> str: - output = type(self).__name__ - output += f"\nIndex: {self._axes[0]}" - output += f"\nColumns: {self._axes[1]}" - output += f"\n{len(self.arrays)} arrays:" - for arr in self.arrays: - output += f"\n{arr.dtype}" - return output - - def _verify_integrity(self) -> None: - n_rows, n_columns = self.shape_proper - if not len(self.arrays) == n_columns: - raise ValueError( - "Number of passed arrays must equal the size of the column Index: " - f"{len(self.arrays)} arrays vs {n_columns} columns." 
- ) - for arr in self.arrays: - if not len(arr) == n_rows: - raise ValueError( - "Passed arrays should have the same length as the rows Index: " - f"{len(arr)} vs {n_rows} rows" - ) - if not isinstance(arr, (np.ndarray, ExtensionArray)): - raise ValueError( - "Passed arrays should be np.ndarray or ExtensionArray instances, " - f"got {type(arr)} instead" - ) - - def reduce(self: T, func) -> T: - # TODO this still fails because `func` assumes to work on 2D arrays - assert self.ndim == 2 - - res_arrays = [] - for arr in self.arrays: - res = func(arr) - res_arrays.append(np.array([res])) - - index = Index([0]) # placeholder - new_mgr = type(self)(res_arrays, [index, self.items]) - return new_mgr - - def operate_blockwise(self, other: "ArrayManager", array_op) -> "ArrayManager": - """ - Apply array_op blockwise with another (aligned) BlockManager. - """ - # TODO what if `other` is BlockManager ? - left_arrays = self.arrays - right_arrays = other.arrays - result_arrays = [array_op(l, r) for l, r in zip(left_arrays, right_arrays)] - return type(self)(result_arrays, self._axes) - - def apply( - self: T, - f, - align_keys: Optional[List[str]] = None, - ignore_failures: bool = False, - **kwargs, - ) -> T: - """ - Iterate over the arrays, collect and create a new ArrayManager. - - Parameters - ---------- - f : str or callable - Name of the Array method to apply. - align_keys: List[str] or None, default None - ignore_failures: bool, default False - **kwargs - Keywords to pass to `f` - - Returns - ------- - ArrayManager - """ - assert "filter" not in kwargs - - align_keys = align_keys or [] - result_arrays: List[np.ndarray] = [] - result_indices: List[int] = [] - # fillna: Series/DataFrame is responsible for making sure value is aligned - - aligned_args = {k: kwargs[k] for k in align_keys} - - if f == "apply": - f = kwargs.pop("func") - - for i, arr in enumerate(self.arrays): - - if aligned_args: - - for k, obj in aligned_args.items(): - if isinstance(obj, (ABCSeries, ABCDataFrame)): - # The caller is responsible for ensuring that - # obj.axes[-1].equals(self.items) - if obj.ndim == 1: - kwargs[k] = obj.iloc[i] - else: - kwargs[k] = obj.iloc[:, i]._values - else: - # otherwise we have an array-like - kwargs[k] = obj[i] - - try: - if callable(f): - applied = f(arr, **kwargs) - else: - applied = getattr(arr, f)(**kwargs) - except (TypeError, NotImplementedError): - if not ignore_failures: - raise - continue - # if not isinstance(applied, ExtensionArray): - # # TODO not all EA operations return new EAs (eg astype) - # applied = array(applied) - result_arrays.append(applied) - result_indices.append(i) - - if ignore_failures: - # TODO copy? 
- new_axes = [self._axes[0], self._axes[1][result_indices]] - else: - new_axes = self._axes - - if len(result_arrays) == 0: - return self.make_empty(new_axes) - - return type(self)(result_arrays, new_axes) - - def apply_with_block(self: T, f, align_keys=None, **kwargs) -> T: - - align_keys = align_keys or [] - aligned_args = {k: kwargs[k] for k in align_keys} - - result_arrays = [] - - for i, arr in enumerate(self.arrays): - - if aligned_args: - for k, obj in aligned_args.items(): - if isinstance(obj, (ABCSeries, ABCDataFrame)): - # The caller is responsible for ensuring that - # obj.axes[-1].equals(self.items) - if obj.ndim == 1: - kwargs[k] = obj.iloc[[i]] - else: - kwargs[k] = obj.iloc[:, [i]]._values - else: - # otherwise we have an ndarray - kwargs[k] = obj[[i]] - - if hasattr(arr, "tz") and arr.tz is None: - # DatetimeArray needs to be converted to ndarray for DatetimeBlock - arr = arr._data - if isinstance(arr, np.ndarray): - arr = np.atleast_2d(arr) - block = make_block(arr, placement=slice(0, 1, 1), ndim=2) - applied = getattr(block, f)(**kwargs) - while isinstance(applied, list): - # ObjectBlock gives double nested result?, some functions give no list - applied = applied[0] - arr = applied.values - if isinstance(arr, np.ndarray): - arr = arr[0, :] - result_arrays.append(arr) - - return type(self)(result_arrays, self._axes) - - # TODO quantile - - def isna(self, func) -> "ArrayManager": - return self.apply("apply", func=func) - - def where( - self, other, cond, align: bool, errors: str, try_cast: bool, axis: int - ) -> "ArrayManager": - if align: - align_keys = ["other", "cond"] - else: - align_keys = ["cond"] - other = extract_array(other, extract_numpy=True) - - return self.apply_with_block( - "where", - align_keys=align_keys, - other=other, - cond=cond, - errors=errors, - try_cast=try_cast, - axis=axis, - ) - - # TODO what is this used for? - # def setitem(self, indexer, value) -> "ArrayManager": - # return self.apply_with_block("setitem", indexer=indexer, value=value) - - def putmask(self, mask, new, align: bool = True, axis: int = 0): - transpose = self.ndim == 2 - - if align: - align_keys = ["new", "mask"] - else: - align_keys = ["mask"] - new = extract_array(new, extract_numpy=True) - - return self.apply_with_block( - "putmask", - align_keys=align_keys, - mask=mask, - new=new, - inplace=True, - axis=axis, - transpose=transpose, - ) - - def diff(self, n: int, axis: int) -> "ArrayManager": - return self.apply_with_block("diff", n=n, axis=axis) - - def interpolate(self, **kwargs) -> "ArrayManager": - return self.apply_with_block("interpolate", **kwargs) - - def shift(self, periods: int, axis: int, fill_value) -> "ArrayManager": - if axis == 0 and self.ndim == 2: - # TODO column-wise shift - raise NotImplementedError - - return self.apply_with_block( - "shift", periods=periods, axis=axis, fill_value=fill_value - ) - - def fillna(self, value, limit, inplace: bool, downcast) -> "ArrayManager": - - inplace = validate_bool_kwarg(inplace, "inplace") - - def array_fillna(array, value, limit, inplace): - - mask = isna(array) - if limit is not None: - limit = libalgos.validate_limit(None, limit=limit) - mask[mask.cumsum() > limit] = False - - # if not self._can_hold_na: - # if inplace: - # return [self] - # else: - # return [self.copy()] - if not inplace: - array = array.copy() - - # np.putmask(array, mask, value) - if np.any(mask): - # TODO allow invalid value if there is nothing to fill? 
- array[mask] = value - return array - - return self.apply(array_fillna, value=value, limit=limit, inplace=inplace) - - def downcast(self) -> "ArrayManager": - return self.apply_with_block("downcast") - - def astype( - self, dtype, copy: bool = False, errors: str = "raise" - ) -> "ArrayManager": - return self.apply("astype", dtype=dtype, copy=copy) # , errors=errors) - - def convert( - self, - copy: bool = True, - datetime: bool = True, - numeric: bool = True, - timedelta: bool = True, - coerce: bool = False, - ) -> "ArrayManager": - return self.apply_with_block( - "convert", - copy=copy, - datetime=datetime, - numeric=numeric, - timedelta=timedelta, - coerce=coerce, - ) - - def replace(self, value, **kwargs) -> "ArrayManager": - assert np.ndim(value) == 0, value - # TODO "replace" is right now implemented on the blocks, we should move - # it to general array algos so it can be reused here - return self.apply_with_block("replace", value=value, **kwargs) - - def replace_list( - self: T, - src_list: List[Any], - dest_list: List[Any], - inplace: bool = False, - regex: bool = False, - ) -> T: - """ do a list replace """ - inplace = validate_bool_kwarg(inplace, "inplace") - - return self.apply_with_block( - "_replace_list", - src_list=src_list, - dest_list=dest_list, - inplace=inplace, - regex=regex, - ) - - def to_native_types(self, **kwargs): - return self.apply_with_block("to_native_types", **kwargs) - - @property - def is_mixed_type(self) -> bool: - return True - - @property - def is_numeric_mixed_type(self) -> bool: - return False - - @property - def any_extension_types(self) -> bool: - """Whether any of the blocks in this manager are extension blocks""" - return False # any(block.is_extension for block in self.blocks) - - @property - def is_view(self) -> bool: - """ return a boolean if we are a single block and are a view """ - # TODO what is this used for? - return False - - @property - def _is_single_block(self) -> bool: - # TODO should we avoid using it from outside the blockmanager since - # it is a private property? (eg use is_mixed_type instead?) - return False - - def get_bool_data(self, copy: bool = False) -> "BlockManager": - """ - Parameters - ---------- - copy : bool, default False - Whether to copy the blocks - """ - mask = np.array([is_bool_dtype(t) for t in self.get_dtypes()], dtype="object") - arrays = [self.arrays[i] for i in np.nonzero(mask)[0]] - # TODO copy? - new_axes = [self._axes[0], self._axes[1][mask]] - return type(self)(arrays, new_axes) - - def get_numeric_data(self, copy: bool = False) -> "BlockManager": - """ - Parameters - ---------- - copy : bool, default False - Whether to copy the blocks - """ - mask = np.array([is_numeric_dtype(t) for t in self.get_dtypes()]) - arrays = [self.arrays[i] for i in np.nonzero(mask)[0]] - # TODO copy? - new_axes = [self._axes[0], self._axes[1][mask]] - return type(self)(arrays, new_axes) - - def copy(self: T, deep=True) -> T: - """ - Make deep or shallow copy of ArrayManager - - Parameters - ---------- - deep : bool or string, default True - If False, return shallow copy (do not copy data) - If 'all', copy data and a deep copy of the index - - Returns - ------- - BlockManager - """ - # this preserves the notion of view copying of axes - if deep: - # hit in e.g. 
tests.io.json.test_pandas - - def copy_func(ax): - return ax.copy(deep=True) if deep == "all" else ax.view() - - new_axes = [copy_func(ax) for ax in self._axes] - else: - new_axes = list(self._axes) - - if deep: - new_arrays = [arr.copy() for arr in self.arrays] - else: - new_arrays = self.arrays - return type(self)(new_arrays, new_axes) - - def as_array( - self, - transpose: bool = False, - dtype=None, - copy: bool = False, - na_value=lib.no_default, - ) -> np.ndarray: - """ - Convert the blockmanager data into an numpy array. - - Parameters - ---------- - transpose : bool, default False - If True, transpose the return array. - dtype : object, default None - Data type of the return array. - copy : bool, default False - If True then guarantee that a copy is returned. A value of - False does not guarantee that the underlying data is not - copied. - na_value : object, default lib.no_default - Value to be used as the missing value sentinel. - - Returns - ------- - arr : ndarray - """ - if len(self.arrays) == 0: - arr = np.empty(self.shape, dtype=float) - return arr.transpose() if transpose else arr - - # We want to copy when na_value is provided to avoid - # mutating the original object - copy = copy or na_value is not lib.no_default - - if not dtype: - dtype = _interleaved_dtype(self.arrays) - - if isinstance(dtype, SparseDtype): - dtype = dtype.subtype - elif isinstance(dtype, PandasDtype): - dtype = dtype.numpy_dtype - elif is_extension_array_dtype(dtype): - dtype = "object" - elif is_dtype_equal(dtype, str): - dtype = "object" - - result = np.empty(self.shape_proper, dtype=dtype) - - for i, arr in enumerate(self.arrays): - arr = arr.astype(dtype, copy=copy) - result[:, i] = arr - - if na_value is not lib.no_default: - result[isna(result)] = na_value - - return result - # return arr.transpose() if transpose else arr - - def get_slice(self, slobj: slice, axis: int = 0) -> "BlockManager": - axis = self._normalize_axis(axis) - - if axis == 0: - arrays = [arr[slobj] for arr in self.arrays] - elif axis == 1: - arrays = self.arrays[slobj] - - new_axes = list(self._axes) - new_axes[axis] = new_axes[axis][slobj] - - return type(self)(arrays, new_axes, do_integrity_check=False) - - def fast_xs(self, loc: int) -> ArrayLike: - """ - Return the array corresponding to `frame.iloc[loc]`. - - Parameters - ---------- - loc : int - - Returns - ------- - np.ndarray or ExtensionArray - """ - dtype = _interleaved_dtype(self.arrays) - - if isinstance(dtype, SparseDtype): - temp_dtype = dtype.subtype - elif isinstance(dtype, PandasDtype): - temp_dtype = dtype.numpy_dtype - elif is_extension_array_dtype(dtype): - temp_dtype = "object" - elif is_dtype_equal(dtype, str): - temp_dtype = "object" - else: - temp_dtype = dtype - - result = np.array([arr[loc] for arr in self.arrays], dtype=temp_dtype) - if isinstance(dtype, ExtensionDtype): - result = dtype.construct_array_type()._from_sequence(result, dtype=dtype) - return result - - def iget(self, i: int) -> "SingleBlockManager": - """ - Return the data as a SingleBlockManager. - """ - values = self.arrays[i] - block = make_block(values, placement=slice(0, len(values)), ndim=1) - - return SingleBlockManager(block, self._axes[0]) - - def iget_values(self, i: int) -> ArrayLike: - """ - Return the data for column i as the values (ndarray or ExtensionArray). 
- """ - return self.arrays[i] - - def idelete(self, indexer): - """ - Delete selected locations in-place (new block and array, same BlockManager) - """ - to_keep = np.ones(self.shape[0], dtype=np.bool_) - to_keep[indexer] = False - - self.arrays = [self.arrays[i] for i in np.nonzero(to_keep)[0]] - self._axes = [self._axes[0], self._axes[1][to_keep]] - - def iset(self, loc: Union[int, slice, np.ndarray], value): - """ - Set new item in-place. Does not consolidate. Adds new Block if not - contained in the current set of items - """ - if lib.is_integer(loc): - # TODO normalize array -> this should in theory not be needed? - value = extract_array(value, extract_numpy=True) - if isinstance(value, np.ndarray) and value.ndim == 2: - value = value[0, :] - - assert isinstance(value, (np.ndarray, ExtensionArray)) - # value = np.asarray(value) - # assert isinstance(value, np.ndarray) - assert len(value) == len(self._axes[0]) - self.arrays[loc] = value - return - - # TODO - raise Exception - - def insert(self, loc: int, item: Label, value, allow_duplicates: bool = False): - """ - Insert item at selected position. - - Parameters - ---------- - loc : int - item : hashable - value : array_like - allow_duplicates: bool - If False, trying to insert non-unique item will raise - - """ - if not allow_duplicates and item in self.items: - # Should this be a different kind of error?? - raise ValueError(f"cannot insert {item}, already exists") - - if not isinstance(loc, int): - raise TypeError("loc must be int") - - # insert to the axis; this could possibly raise a TypeError - new_axis = self.items.insert(loc, item) - - value = extract_array(value, extract_numpy=True) - if value.ndim == 2: - value = value[0, :] - # TODO self.arrays can be empty - # assert len(value) == len(self.arrays[0]) - - # TODO is this copy needed? - arrays = self.arrays.copy() - arrays.insert(loc, value) - - self.arrays = arrays - self._axes[1] = new_axis - - def reindex_indexer( - self: T, - new_axis, - indexer, - axis: int, - fill_value=None, - allow_dups: bool = False, - copy: bool = True, - ) -> T: - axis = self._normalize_axis(axis) - return self._reindex_indexer( - new_axis, indexer, axis, fill_value, allow_dups, copy - ) - - def _reindex_indexer( - self: T, - new_axis, - indexer, - axis: int, - fill_value=None, - allow_dups: bool = False, - copy: bool = True, - ) -> T: - """ - Parameters - ---------- - new_axis : Index - indexer : ndarray of int64 or None - axis : int - fill_value : object, default None - allow_dups : bool, default False - copy : bool, default True - - - pandas-indexer with -1's only. 
- """ - if indexer is None: - if new_axis is self._axes[axis] and not copy: - return self - - result = self.copy(deep=copy) - result._axes = list(self._axes) - result._axes[axis] = new_axis - return result - - # some axes don't allow reindexing with dups - if not allow_dups: - self._axes[axis]._can_reindex(indexer) - - # if axis >= self.ndim: - # raise IndexError("Requested axis not found in manager") - - if axis == 1: - new_arrays = [] - for i in indexer: - if i == -1: - arr = self._make_na_array(fill_value=fill_value) - else: - arr = self.arrays[i] - new_arrays.append(arr) - - else: - new_arrays = [ - algos.take( - arr, - indexer, - allow_fill=True, - fill_value=fill_value, - # if fill_value is not None else blk.fill_value - ) - for arr in self.arrays - ] - - new_axes = list(self._axes) - new_axes[axis] = new_axis - - return type(self)(new_arrays, new_axes) - - def take(self, indexer, axis: int = 1, verify: bool = True, convert: bool = True): - """ - Take items along any axis. - """ - axis = self._normalize_axis(axis) - - indexer = ( - np.arange(indexer.start, indexer.stop, indexer.step, dtype="int64") - if isinstance(indexer, slice) - else np.asanyarray(indexer, dtype="int64") - ) - - n = self.shape_proper[axis] - if convert: - indexer = maybe_convert_indices(indexer, n) - - if verify: - if ((indexer == -1) | (indexer >= n)).any(): - raise Exception("Indices must be nonzero and less than the axis length") - - new_labels = self._axes[axis].take(indexer) - return self._reindex_indexer( - new_axis=new_labels, indexer=indexer, axis=axis, allow_dups=True - ) - - def _make_na_array(self, fill_value=None): - if fill_value is None: - fill_value = np.nan - - dtype, fill_value = infer_dtype_from_scalar(fill_value) - values = np.empty(self.shape_proper[0], dtype=dtype) - values.fill(fill_value) - return values - - def unstack(self, unstacker, fill_value) -> "ArrayManager": - """ - Return a BlockManager with all blocks unstacked.. - - Parameters - ---------- - unstacker : reshape._Unstacker - fill_value : Any - fill_value for newly introduced missing values. - - Returns - ------- - unstacked : BlockManager - """ - indexer, _ = unstacker._indexer_and_to_sort - new_indexer = np.full(unstacker.mask.shape, -1) - new_indexer[unstacker.mask] = indexer - new_indexer2D = new_indexer.reshape(*unstacker.full_shape) - - new_arrays = [] - for arr in self.arrays: - for i in range(unstacker.full_shape[1]): - new_arr = algos.take( - arr, new_indexer2D[:, i], allow_fill=True, fill_value=fill_value - ) - new_arrays.append(new_arr) - - new_index = unstacker.new_index - new_columns = unstacker.get_new_columns(self._axes[1]) - new_axes = [new_index, new_columns] - - return type(self)(new_arrays, new_axes, do_integrity_check=False) - - # TODO - # equals - # to_dict - # quantile - - class BlockManager(DataManager): """ Core internal data structure to implement DataFrame, Series, etc. 
@@ -2430,8 +1567,8 @@ def get_slice(self, slobj: slice, axis: int = 0) -> "SingleBlockManager": raise IndexError("Requested axis not found in manager") blk = self._block - arr = blk._slice(slobj) - block = blk.make_block_same_class(arr, placement=slice(0, len(arr))) + array = blk._slice(slobj) + block = blk.make_block_same_class(array, placement=slice(0, len(array))) return type(self)(block, self.index[slobj]) @property From 9751d33bab871ecc1d73943378adae7a72a8af9b Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 10 Nov 2020 10:48:33 -0800 Subject: [PATCH 15/29] de-privatize --- pandas/core/internals/array_manager.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 75723206ecde6..b9dd35ee62a58 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -466,9 +466,7 @@ def is_view(self) -> bool: return False @property - def _is_single_block(self) -> bool: - # TODO should we avoid using it from outside the blockmanager since - # it is a private property? (eg use is_mixed_type instead?) + def is_single_block(self) -> bool: return False def get_bool_data(self, copy: bool = False) -> "ArrayManager": From 3749c7dcf28365db94edca94f755d45a33e945fc Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 11 Dec 2020 21:01:11 +0100 Subject: [PATCH 16/29] try fix up typing --- pandas/core/frame.py | 6 ++++- pandas/core/generic.py | 6 +++-- pandas/core/groupby/generic.py | 6 +++-- pandas/core/internals/array_manager.py | 36 +++++++++++++++++++------- pandas/core/internals/base.py | 29 +++++++++++++++++++-- pandas/core/internals/concat.py | 4 +-- 6 files changed, 68 insertions(+), 19 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 305a691efdf26..3685d0cb930fa 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -645,6 +645,7 @@ def _as_manager(self, typ): New DataFrame using specified manager type. Is not guaranteed to be a copy or not. """ + new_mgr: Union[BlockManager, ArrayManager] mgr = self._mgr if typ == "block": if isinstance(mgr, BlockManager): @@ -6087,7 +6088,10 @@ def _dispatch_frame_op(self, right, func, axis: Optional[int] = None): # fails in cases with empty columns reached via # _frame_arith_method_with_reindex - bm = self._mgr.operate_blockwise(right._mgr, array_op) + # TODO operate_blockwise expects a manager of the same type + bm = self._mgr.operate_blockwise( + right._mgr, array_op # type: ignore[arg-type] + ) return type(self)(bm) elif isinstance(right, Series) and axis == 1: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 8ab29dad335c3..d376c3bbd2644 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5700,11 +5700,13 @@ def _to_dict_of_blocks(self, copy: bool_t = True): Return a dict of dtype -> Constructor Types that each is a homogeneous dtype. - Internal ONLY + Internal ONLY - only works for BlockManager """ + mgr = self._mgr + mgr = cast(BlockManager, mgr) return { k: self._constructor(v).__finalize__(self) - for k, v, in self._mgr.to_dict(copy=copy).items() + for k, v, in mgr.to_dict(copy=copy).items() } def astype( diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 07ffb881495fa..d71df1ed57511 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1078,10 +1078,12 @@ def py_fallback(bvalues: ArrayLike) -> ArrayLike: # in the operation. We un-split here. 
result = result._consolidate() assert isinstance(result, (Series, DataFrame)) # for mypy - assert len(result._mgr.blocks) == 1 + mgr = result._mgr + assert isinstance(mgr, BlockManager) + assert len(mgr.blocks) == 1 # unwrap DataFrame to get array - result = result._mgr.blocks[0].values + result = mgr.blocks[0].values return result def blk_func(bvalues: ArrayLike) -> ArrayLike: diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index b9dd35ee62a58..b7fadc54e04f6 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -1,7 +1,7 @@ """ Experimental manager based on storing a collection of 1D arrays """ -from typing import TYPE_CHECKING, Any, List, Optional, Sequence, Tuple, TypeVar, Union +from typing import TYPE_CHECKING, Any, Callable, List, Optional, Tuple, TypeVar, Union import numpy as np @@ -59,12 +59,12 @@ class ArrayManager(DataManager): ] arrays: List[Union[np.ndarray, ExtensionArray]] - axes: Sequence[Index] + _axes: List[Index] def __init__( self, arrays: List[Union[np.ndarray, ExtensionArray]], - axes: Sequence[Index], + axes: List[Index], do_integrity_check: bool = True, ): # Note: we are storing the axes in "_axes" in the (row, columns) order @@ -81,7 +81,7 @@ def make_empty(self: T, axes=None) -> T: if axes is None: axes = [self.axes[1:], Index([])] - arrays = [] + arrays: List[Union[np.ndarray, ExtensionArray]] = [] return type(self)(arrays, axes) @property @@ -89,7 +89,9 @@ def items(self) -> Index: return self._axes[1] @property - def axes(self) -> Sequence[Index]: + def axes(self) -> List[Index]: # type: ignore[override] + # mypy doesn't work to override attribute with property + # see https://github.com/python/mypy/issues/4125 """Axes is BlockManager-compatible order (columns, rows)""" return [self._axes[1], self._axes[0]] @@ -166,8 +168,11 @@ def _verify_integrity(self) -> None: f"got {type(arr)} instead" ) - def reduce(self: T, func) -> T: + def reduce( + self: T, func: Callable, ignore_failures: bool = False + ) -> Tuple[T, np.ndarray]: # TODO this still fails because `func` assumes to work on 2D arrays + # TODO implement ignore_failures assert self.ndim == 2 res_arrays = [] @@ -177,7 +182,8 @@ def reduce(self: T, func) -> T: index = Index([0]) # placeholder new_mgr = type(self)(res_arrays, [index, self.items]) - return new_mgr + indexer = np.arange(self.shape[0]) + return new_mgr, indexer def operate_blockwise(self, other: "ArrayManager", array_op) -> "ArrayManager": """ @@ -186,7 +192,9 @@ def operate_blockwise(self, other: "ArrayManager", array_op) -> "ArrayManager": # TODO what if `other` is BlockManager ? left_arrays = self.arrays right_arrays = other.arrays - result_arrays = [array_op(l, r) for l, r in zip(left_arrays, right_arrays)] + result_arrays = [ + array_op(left, right) for left, right in zip(left_arrays, right_arrays) + ] return type(self)(result_arrays, self._axes) def apply( @@ -255,6 +263,7 @@ def apply( result_arrays.append(applied) result_indices.append(i) + new_axes: List[Index] if ignore_failures: # TODO copy? 
new_axes = [self._axes[0], self._axes[1][result_indices]] @@ -288,9 +297,9 @@ def apply_with_block(self: T, f, align_keys=None, **kwargs) -> T: # otherwise we have an ndarray kwargs[k] = obj[[i]] - if hasattr(arr, "tz") and arr.tz is None: + if hasattr(arr, "tz") and arr.tz is None: # type: ignore[union-attr] # DatetimeArray needs to be converted to ndarray for DatetimeBlock - arr = arr._data + arr = arr._data # type: ignore[union-attr] if isinstance(arr, np.ndarray): arr = np.atleast_2d(arr) block = make_block(arr, placement=slice(0, 1, 1), ndim=2) @@ -720,6 +729,9 @@ def reindex_indexer( fill_value=None, allow_dups: bool = False, copy: bool = True, + # ignored keywords + consolidate: bool = True, + only_slice: bool = False, ) -> T: axis = self._normalize_axis(axis) return self._reindex_indexer( @@ -824,6 +836,10 @@ def _make_na_array(self, fill_value=None): values.fill(fill_value) return values + def equals(self, other: object) -> bool: + # TODO + raise NotImplementedError + def unstack(self, unstacker, fill_value) -> "ArrayManager": """ Return a BlockManager with all blocks unstacked.. diff --git a/pandas/core/internals/base.py b/pandas/core/internals/base.py index 94ba46ddfba45..ed07f81b0078c 100644 --- a/pandas/core/internals/base.py +++ b/pandas/core/internals/base.py @@ -2,14 +2,26 @@ Base class for the internal managers. Both BlockManager and ArrayManager inherit from this class. """ +from typing import List, TypeVar + +from pandas.errors import AbstractMethodError + from pandas.core.base import PandasObject -from pandas.core.indexes.api import ensure_index +from pandas.core.indexes.api import Index, ensure_index + +T = TypeVar("T", bound="DataManager") class DataManager(PandasObject): # TODO share more methods/attributes + axes: List[Index] + + @property + def items(self) -> Index: + raise AbstractMethodError(self) + def __len__(self) -> int: return len(self.items) @@ -17,6 +29,19 @@ def __len__(self) -> int: def ndim(self) -> int: return len(self.axes) + def reindex_indexer( + self: T, + new_axis, + indexer, + axis: int, + fill_value=None, + allow_dups: bool = False, + copy: bool = True, + consolidate: bool = True, + only_slice: bool = False, + ) -> T: + raise AbstractMethodError(self) + def reindex_axis( self, new_index, @@ -27,7 +52,7 @@ def reindex_axis( copy: bool = True, ): """ - Conform block manager to new index. + Conform data manager to new index. """ new_index = ensure_index(new_index) new_index, indexer = self.axes[axis].reindex( diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 49d59d2c94ada..d0613522083c0 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -1,7 +1,7 @@ from collections import defaultdict import copy import itertools -from typing import TYPE_CHECKING, Any, Dict, List, Sequence, Tuple, cast +from typing import TYPE_CHECKING, Any, Dict, List, Sequence, Tuple, Union, cast import numpy as np @@ -36,7 +36,7 @@ def concatenate_block_managers( mgrs_indexers, axes, concat_axis: int, copy: bool -) -> BlockManager: +) -> Union[ArrayManager, BlockManager]: """ Concatenate block managers into one. 
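
The patches above and below all build on the same storage idea: each column lives in its own 1D array, the manager keeps its axes in (rows, columns) order (the reverse of the BlockManager convention), and column-wise operations loop over the list of arrays instead of over 2D blocks, so there is nothing to consolidate. Below is a minimal, self-contained sketch of that layout for orientation only; ToyArrayManager and its methods are invented for illustration and are not pandas API, nor the actual ArrayManager implementation from these patches.

    from typing import Callable, List

    import numpy as np


    class ToyArrayManager:
        """Toy columnar store: one 1D numpy array per column (illustration only)."""

        def __init__(self, arrays: List[np.ndarray], index: np.ndarray, columns: List[str]):
            # axes are kept in (rows, columns) order
            assert len(arrays) == len(columns)
            assert all(len(arr) == len(index) for arr in arrays)
            self.arrays = arrays      # one 1D array per column
            self.index = index        # row labels
            self.columns = columns    # column labels

        def apply(self, func: Callable[[np.ndarray], np.ndarray]) -> "ToyArrayManager":
            # column-wise apply: iterate over the arrays, no 2D blocks involved
            return ToyArrayManager(
                [func(arr) for arr in self.arrays], self.index, self.columns
            )

        def take_columns(self, indexer: List[int]) -> "ToyArrayManager":
            # selecting columns is plain list indexing; no block splitting needed
            return ToyArrayManager(
                [self.arrays[i] for i in indexer],
                self.index,
                [self.columns[i] for i in indexer],
            )


    mgr = ToyArrayManager(
        [np.array([1.0, 2.0, 3.0]), np.array([10, 20, 30])],
        index=np.arange(3),
        columns=["a", "b"],
    )
    print(mgr.apply(lambda arr: arr * 2).arrays)   # [array([2., 4., 6.]), array([20, 40, 60])]
    print(mgr.take_columns([1]).columns)           # ['b']

Keeping columns as separate arrays makes inserting or deleting a column a simple list operation and removes the consolidation step entirely (ArrayManager.consolidate() just returns self), at the cost of row-wise operations having to touch every array, which is why reduce and apply in the real manager iterate column by column.
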
From af5304069fd7e2d5f7170f9cb0202f9d47685980 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 11 Dec 2020 21:20:43 +0100 Subject: [PATCH 17/29] add pytest option + add one github actions build to run them --- .github/workflows/ci.yml | 19 +++++++++++++++++++ pandas/conftest.py | 7 +++++++ pandas/core/config_init.py | 6 ++---- 3 files changed, 28 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2848437a76a16..edbfaeb4ae97d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -136,3 +136,22 @@ jobs: - name: Upload dev docs run: rsync -az --delete doc/build/html/ docs@${{ secrets.server_ip }}:/usr/share/nginx/pandas/pandas-docs/dev if: github.event_name == 'push' + + data_manager: + name: Test experimental data manager + runs-on: ubuntu-latest + steps: + + - name: Setting conda path + run: echo "${HOME}/miniconda3/bin" >> $GITHUB_PATH + + - name: Checkout + uses: actions/checkout@v1 + + - name: Setup environment and build pandas + run: ci/setup_env.sh + + - name: Run tests + run: | + source activate pandas-dev + pytest pandas --array-manager diff --git a/pandas/conftest.py b/pandas/conftest.py index 2bac2ed198789..76d5fc3fec8ec 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -75,6 +75,11 @@ def pytest_addoption(parser): action="store_true", help="Fail if a test is skipped for missing data file.", ) + parser.addoption( + "--array-manager", + action="store_true", + help="Use the experimental ArrayManager as default data manager.", + ) def pytest_runtest_setup(item): @@ -94,6 +99,8 @@ def pytest_runtest_setup(item): "--run-high-memory" ): pytest.skip("skipping high memory test since --run-high-memory was not set") + if item.config.getoption("--array-manager"): + pd.options.mode.data_manager = "array" # Hypothesis diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 7a0e98be5ac15..72e41815a0ab1 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -485,10 +485,8 @@ def use_inf_as_na_cb(key): ) cf.register_option( "data_manager", - # TODO switch back to default of "block" before merging - # "block", - "array", - "internal manager type", + "block", + "Internal data manager type", validator=is_one_of_factory(["block", "array"]), ) From cc45673f9911804287182f6ac2fa995637219b61 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 12 Dec 2020 14:21:33 +0100 Subject: [PATCH 18/29] fix pytest marks for skipping when using array-manager --- pandas/tests/io/test_common.py | 8 ++------ pandas/util/_test_decorators.py | 7 ++++++- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 60d1ccc6351d3..4d9dd5ca6af0e 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -265,12 +265,8 @@ def test_read_fspath_all(self, reader, module, path, datapath): ("to_excel", {"engine": "xlwt"}, "xlwt"), ("to_feather", {}, "pyarrow"), ("to_html", {}, "os"), - ( - pytest.param( - "to_json", marks=td.skip_array_manager_not_yet_implemented - ), - {}, - "os", + pytest.param( + "to_json", {}, "os", marks=td.skip_array_manager_not_yet_implemented ), ("to_latex", {}, "os"), ("to_pickle", {}, "os"), diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index ce260609160ac..1c7a1f1d79543 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -278,8 +278,13 @@ def async_mark(): return async_mark +# Note: we are using a 
string as condition (and not for example +# `get_option("mode.data_manager") == "array"`) because this needs to be +# evaluated at test time (otherwise this boolean condition gets evaluated +# at import time, when the pd.options.mode.data_manager has not yet been set) + skip_array_manager_not_yet_implemented = pytest.mark.skipif( - get_option("mode.data_manager") == "array", reason="JSON C code relies on Blocks" + "config.getvalue('--array-manager')", reason="JSON C code relies on Blocks" ) skip_array_manager_invalid_test = pytest.mark.skipif( From 27cf215aae9553ce4cc92cba217a1c824b3f94ab Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 12 Dec 2020 18:11:33 +0100 Subject: [PATCH 19/29] several fixes - get tests/frame/methods tests passing --- pandas/conftest.py | 18 ++++++++++-- pandas/core/frame.py | 4 +-- pandas/core/internals/array_manager.py | 14 +++++---- pandas/core/internals/base.py | 10 ++++++- pandas/core/internals/managers.py | 29 ------------------- pandas/tests/frame/methods/test_append.py | 10 +++++++ pandas/tests/frame/methods/test_astype.py | 10 +++++++ pandas/tests/frame/methods/test_count.py | 3 ++ pandas/tests/frame/methods/test_cov_corr.py | 5 ++-- pandas/tests/frame/methods/test_drop.py | 2 ++ pandas/tests/frame/methods/test_equals.py | 5 ++++ pandas/tests/frame/methods/test_explode.py | 5 ++++ pandas/tests/frame/methods/test_fillna.py | 13 ++++++--- .../tests/frame/methods/test_interpolate.py | 1 + .../methods/test_is_homogeneous_dtype.py | 5 ++++ pandas/tests/frame/methods/test_join.py | 5 ++++ pandas/tests/frame/methods/test_rank.py | 1 + .../tests/frame/methods/test_reset_index.py | 4 +++ pandas/tests/frame/methods/test_shift.py | 11 +++++-- pandas/tests/frame/methods/test_sort_index.py | 3 ++ .../tests/frame/methods/test_sort_values.py | 5 ++-- .../frame/methods/test_to_dict_of_blocks.py | 4 +++ pandas/tests/frame/methods/test_transpose.py | 3 ++ pandas/util/_test_decorators.py | 4 +-- 24 files changed, 121 insertions(+), 53 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 76d5fc3fec8ec..42d1be546d2ea 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -77,11 +77,19 @@ def pytest_addoption(parser): ) parser.addoption( "--array-manager", + "--am", action="store_true", help="Use the experimental ArrayManager as default data manager.", ) +def pytest_sessionstart(session): + # Note: we need to set the option here and not in pytest_runtest_setup below + # to ensure this is run before creating fixture data + if session.config.getoption("--array-manager"): + pd.options.mode.data_manager = "array" + + def pytest_runtest_setup(item): if "slow" in item.keywords and item.config.getoption("--skip-slow"): pytest.skip("skipping due to --skip-slow") @@ -99,8 +107,6 @@ def pytest_runtest_setup(item): "--run-high-memory" ): pytest.skip("skipping high memory test since --run-high-memory was not set") - if item.config.getoption("--array-manager"): - pd.options.mode.data_manager = "array" # Hypothesis @@ -1453,3 +1459,11 @@ def names(request): A 3-tuple of names, the first two for operands, the last for a result. """ return request.param + + +@pytest.fixture +def using_array_manager(request): + """ + Fixture to check if the array manager is being used. 
+ """ + return pd.options.mode.data_manager == "array" diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3685d0cb930fa..fa0e222b9813c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8891,11 +8891,11 @@ def func(values: np.ndarray): # We only use this in the case that operates on self.values return op(values, axis=axis, skipna=skipna, **kwds) - def blk_func(values): + def blk_func(values, axis=1): if isinstance(values, ExtensionArray): return values._reduce(name, skipna=skipna, **kwds) else: - return op(values, axis=1, skipna=skipna, **kwds) + return op(values, axis=axis, skipna=skipna, **kwds) def _get_data() -> DataFrame: if filter_type is None: diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index b7fadc54e04f6..6030d9ad5b0b0 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -177,10 +177,10 @@ def reduce( res_arrays = [] for arr in self.arrays: - res = func(arr) + res = func(arr, axis=0) res_arrays.append(np.array([res])) - index = Index([0]) # placeholder + index = Index([None]) # placeholder new_mgr = type(self)(res_arrays, [index, self.items]) indexer = np.arange(self.shape[0]) return new_mgr, indexer @@ -300,6 +300,9 @@ def apply_with_block(self: T, f, align_keys=None, **kwargs) -> T: if hasattr(arr, "tz") and arr.tz is None: # type: ignore[union-attr] # DatetimeArray needs to be converted to ndarray for DatetimeBlock arr = arr._data # type: ignore[union-attr] + elif arr.dtype.kind == "m": + # TimedeltaArray needs to be converted to ndarray for TimedeltaBlock + arr = arr._data if isinstance(arr, np.ndarray): arr = np.atleast_2d(arr) block = make_block(arr, placement=slice(0, 1, 1), ndim=2) @@ -368,6 +371,9 @@ def interpolate(self, **kwargs) -> "ArrayManager": return self.apply_with_block("interpolate", **kwargs) def shift(self, periods: int, axis: int, fill_value) -> "ArrayManager": + if fill_value is lib.no_default: + fill_value = None + if axis == 0 and self.ndim == 2: # TODO column-wise shift raise NotImplementedError @@ -377,7 +383,7 @@ def shift(self, periods: int, axis: int, fill_value) -> "ArrayManager": ) def fillna(self, value, limit, inplace: bool, downcast) -> "ArrayManager": - + # TODO implement downcast inplace = validate_bool_kwarg(inplace, "inplace") def array_fillna(array, value, limit, inplace): @@ -417,7 +423,6 @@ def convert( datetime: bool = True, numeric: bool = True, timedelta: bool = True, - coerce: bool = False, ) -> "ArrayManager": return self.apply_with_block( "convert", @@ -425,7 +430,6 @@ def convert( datetime=datetime, numeric=numeric, timedelta=timedelta, - coerce=coerce, ) def replace(self, value, **kwargs) -> "ArrayManager": diff --git a/pandas/core/internals/base.py b/pandas/core/internals/base.py index ed07f81b0078c..2295e3f2c41b2 100644 --- a/pandas/core/internals/base.py +++ b/pandas/core/internals/base.py @@ -50,6 +50,8 @@ def reindex_axis( limit=None, fill_value=None, copy: bool = True, + consolidate: bool = True, + only_slice: bool = False, ): """ Conform data manager to new index. 
@@ -60,5 +62,11 @@ def reindex_axis( ) return self.reindex_indexer( - new_index, indexer, axis=axis, fill_value=fill_value, copy=copy + new_index, + indexer, + axis=axis, + fill_value=fill_value, + copy=copy, + consolidate=consolidate, + only_slice=only_slice, ) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 57be9c5c76ffc..f36a07816a396 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1232,35 +1232,6 @@ def insert(self, loc: int, item: Label, value, allow_duplicates: bool = False): stacklevel=5, ) - def reindex_axis( - self, - new_index, - axis: int, - method=None, - limit=None, - fill_value=None, - copy: bool = True, - consolidate: bool = True, - only_slice: bool = False, - ): - """ - Conform block manager to new index. - """ - new_index = ensure_index(new_index) - new_index, indexer = self.axes[axis].reindex( - new_index, method=method, limit=limit - ) - - return self.reindex_indexer( - new_index, - indexer, - axis=axis, - fill_value=fill_value, - copy=copy, - consolidate=consolidate, - only_slice=only_slice, - ) - def reindex_indexer( self: T, new_axis, diff --git a/pandas/tests/frame/methods/test_append.py b/pandas/tests/frame/methods/test_append.py index 38b5c150630fe..fdf71b7340a0e 100644 --- a/pandas/tests/frame/methods/test_append.py +++ b/pandas/tests/frame/methods/test_append.py @@ -1,10 +1,15 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import DataFrame, Series, Timestamp import pandas._testing as tm +# TODO td.skip_array_manager_not_yet_implemented +# appending with reindexing not yet working + class TestDataFrameAppend: @pytest.mark.parametrize("klass", [Series, DataFrame]) @@ -33,6 +38,7 @@ def test_append_empty_list(self): tm.assert_frame_equal(result, expected) assert result is not df # .append() should return a new object + @td.skip_array_manager_not_yet_implemented def test_append_series_dict(self): df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"]) @@ -73,6 +79,7 @@ def test_append_series_dict(self): expected = df.append(df[-1:], ignore_index=True) tm.assert_frame_equal(result, expected) + @td.skip_array_manager_not_yet_implemented def test_append_list_of_series_dicts(self): df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"]) @@ -91,6 +98,7 @@ def test_append_list_of_series_dicts(self): expected = df.append(DataFrame(dicts), ignore_index=True, sort=True) tm.assert_frame_equal(result, expected) + @td.skip_array_manager_not_yet_implemented def test_append_missing_cols(self): # GH22252 # exercise the conditional branch in append method where the data @@ -135,6 +143,7 @@ def test_append_empty_dataframe(self): expected = df1.copy() tm.assert_frame_equal(result, expected) + @td.skip_array_manager_not_yet_implemented def test_append_dtypes(self): # GH 5754 @@ -194,6 +203,7 @@ def test_append_timestamps_aware_or_naive(self, tz_naive_fixture, timestamp): expected = Series(Timestamp(timestamp, tz=tz), name=0) tm.assert_series_equal(result, expected) + @td.skip_array_manager_not_yet_implemented @pytest.mark.parametrize( "data, dtype", [ diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index d79969eac0323..64fea426acbbf 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -3,6 +3,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import ( 
Categorical, CategoricalDtype, @@ -90,6 +92,7 @@ def test_astype_mixed_type(self, mixed_type_frame): casted = mn.astype("O") _check_cast(casted, "object") + @td.skip_array_manager_not_yet_implemented def test_astype_with_exclude_string(self, float_frame): df = float_frame.copy() expected = float_frame.astype(int) @@ -124,6 +127,7 @@ def test_astype_with_view_mixed_float(self, mixed_float_frame): casted = tf.astype(np.int64) casted = tf.astype(np.float32) # noqa + @td.skip_array_manager_not_yet_implemented @pytest.mark.parametrize("dtype", [np.int32, np.int64]) @pytest.mark.parametrize("val", [np.nan, np.inf]) def test_astype_cast_nan_inf_int(self, val, dtype): @@ -382,6 +386,7 @@ def test_astype_to_datetimelike_unit(self, arr_dtype, dtype, unit): tm.assert_frame_equal(result, expected) + @td.skip_array_manager_not_yet_implemented @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"]) def test_astype_to_datetime_unit(self, unit): # tests all units from datetime origination @@ -406,6 +411,7 @@ def test_astype_to_timedelta_unit_ns(self, unit): tm.assert_frame_equal(result, expected) + @td.skip_array_manager_not_yet_implemented @pytest.mark.parametrize("unit", ["us", "ms", "s", "h", "m", "D"]) def test_astype_to_timedelta_unit(self, unit): # coerce to float @@ -429,6 +435,7 @@ def test_astype_to_incorrect_datetimelike(self, unit): msg = ( fr"cannot astype a datetimelike from \[datetime64\[ns\]\] to " fr"\[timedelta64\[{unit}\]\]" + fr"|(Cannot cast DatetimeArray to dtype timedelta64\[{unit}\])" ) with pytest.raises(TypeError, match=msg): df.astype(other) @@ -436,11 +443,13 @@ def test_astype_to_incorrect_datetimelike(self, unit): msg = ( fr"cannot astype a timedelta from \[timedelta64\[ns\]\] to " fr"\[datetime64\[{unit}\]\]" + fr"|(Cannot cast TimedeltaArray to dtype datetime64\[{unit}\])" ) df = DataFrame(np.array([[1, 2, 3]], dtype=other)) with pytest.raises(TypeError, match=msg): df.astype(dtype) + @td.skip_array_manager_not_yet_implemented def test_astype_arg_for_errors(self): # GH#14878 @@ -567,6 +576,7 @@ def test_astype_empty_dtype_dict(self): tm.assert_frame_equal(result, df) assert result is not df + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) ignore keyword @pytest.mark.parametrize( "df", [ diff --git a/pandas/tests/frame/methods/test_count.py b/pandas/tests/frame/methods/test_count.py index d738c7139093c..1727a76c191ee 100644 --- a/pandas/tests/frame/methods/test_count.py +++ b/pandas/tests/frame/methods/test_count.py @@ -1,6 +1,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import DataFrame, Index, Series import pandas._testing as tm @@ -103,6 +105,7 @@ def test_count_index_with_nan(self): ) tm.assert_frame_equal(res, expected) + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) groupby def test_count_level( self, multiindex_year_month_day_dataframe_random_data, diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index 6cea5abcac6d0..f8d729a215ba8 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -191,14 +191,15 @@ def test_corr_nullable_integer(self, nullable_column, other_column, method): expected = DataFrame(np.ones((2, 2)), columns=["a", "b"], index=["a", "b"]) tm.assert_frame_equal(result, expected) - def test_corr_item_cache(self): + def test_corr_item_cache(self, using_array_manager): # Check that corr does not lead to incorrect entries in item_cache df = 
DataFrame({"A": range(10)}) df["B"] = range(10)[::-1] ser = df["A"] # populate item_cache - assert len(df._mgr.blocks) == 2 + if not using_array_manager: + assert len(df._mgr.blocks) == 2 _ = df.corr() diff --git a/pandas/tests/frame/methods/test_drop.py b/pandas/tests/frame/methods/test_drop.py index eb5bc31f3aa8f..178c43b7bd8f2 100644 --- a/pandas/tests/frame/methods/test_drop.py +++ b/pandas/tests/frame/methods/test_drop.py @@ -4,6 +4,7 @@ import pytest from pandas.errors import PerformanceWarning +import pandas.util._test_decorators as td import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, Timestamp @@ -154,6 +155,7 @@ def test_drop(self): assert return_value is None tm.assert_frame_equal(df, expected) + @td.skip_array_manager_not_yet_implemented def test_drop_multiindex_not_lexsorted(self): # GH#11640 diff --git a/pandas/tests/frame/methods/test_equals.py b/pandas/tests/frame/methods/test_equals.py index de2509ed91be2..dc45c9eb97ae4 100644 --- a/pandas/tests/frame/methods/test_equals.py +++ b/pandas/tests/frame/methods/test_equals.py @@ -1,8 +1,13 @@ import numpy as np +import pandas.util._test_decorators as td + from pandas import DataFrame, date_range import pandas._testing as tm +# TODO(ArrayManager) implement equals +pytestmark = td.skip_array_manager_not_yet_implemented + class TestEquals: def test_dataframe_not_equal(self): diff --git a/pandas/tests/frame/methods/test_explode.py b/pandas/tests/frame/methods/test_explode.py index bd0901387eeed..be80dd49ff1fb 100644 --- a/pandas/tests/frame/methods/test_explode.py +++ b/pandas/tests/frame/methods/test_explode.py @@ -1,9 +1,14 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd import pandas._testing as tm +# TODO(ArrayManager) concat with reindexing +pytestmark = td.skip_array_manager_not_yet_implemented + def test_error(): df = pd.DataFrame( diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index b427611099be3..58016be82c405 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -1,6 +1,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import ( Categorical, DataFrame, @@ -230,6 +232,7 @@ def test_fillna_categorical_nan(self): df = DataFrame({"a": Categorical(idx)}) tm.assert_frame_equal(df.fillna(value=NaT), df) + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) implement downcast def test_fillna_downcast(self): # GH#15277 # infer int64 from float64 @@ -244,6 +247,7 @@ def test_fillna_downcast(self): expected = DataFrame({"a": [1, 0]}) tm.assert_frame_equal(result, expected) + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) object upcasting def test_fillna_dtype_conversion(self): # make sure that fillna on an empty frame works df = DataFrame(index=["A", "B", "C"], columns=[1, 2, 3, 4, 5]) @@ -268,6 +272,7 @@ def test_fillna_dtype_conversion(self): result = df.fillna(v) tm.assert_frame_equal(result, expected) + @td.skip_array_manager_invalid_test def test_fillna_datetime_columns(self): # GH#7095 df = DataFrame( @@ -335,13 +340,13 @@ def test_frame_pad_backfill_limit(self): result = df[:2].reindex(index, method="pad", limit=5) expected = df[:2].reindex(index).fillna(method="pad") - expected.values[-3:] = np.nan + expected.iloc[-3:] = np.nan tm.assert_frame_equal(result, expected) result = df[-2:].reindex(index, method="backfill", limit=5) expected = 
df[-2:].reindex(index).fillna(method="backfill") - expected.values[:3] = np.nan + expected.iloc[:3] = np.nan tm.assert_frame_equal(result, expected) def test_frame_fillna_limit(self): @@ -352,14 +357,14 @@ def test_frame_fillna_limit(self): result = result.fillna(method="pad", limit=5) expected = df[:2].reindex(index).fillna(method="pad") - expected.values[-3:] = np.nan + expected.iloc[-3:] = np.nan tm.assert_frame_equal(result, expected) result = df[-2:].reindex(index) result = result.fillna(method="backfill", limit=5) expected = df[-2:].reindex(index).fillna(method="backfill") - expected.values[:3] = np.nan + expected.iloc[:3] = np.nan tm.assert_frame_equal(result, expected) def test_fillna_skip_certain_blocks(self): diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py index 6b86a13fcf1b9..2477ad79d8a2c 100644 --- a/pandas/tests/frame/methods/test_interpolate.py +++ b/pandas/tests/frame/methods/test_interpolate.py @@ -324,6 +324,7 @@ def test_interp_string_axis(self, axis_name, axis_number): expected = df.interpolate(method="linear", axis=axis_number) tm.assert_frame_equal(result, expected) + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) support axis=1 @pytest.mark.parametrize("method", ["ffill", "bfill", "pad"]) def test_interp_fillna_methods(self, axis, method): # GH 12918 diff --git a/pandas/tests/frame/methods/test_is_homogeneous_dtype.py b/pandas/tests/frame/methods/test_is_homogeneous_dtype.py index 0fca4e988b775..126c78a657c58 100644 --- a/pandas/tests/frame/methods/test_is_homogeneous_dtype.py +++ b/pandas/tests/frame/methods/test_is_homogeneous_dtype.py @@ -1,8 +1,13 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import Categorical, DataFrame +# _is_homogeneous_type always returns True for ArrayManager +pytestmark = td.skip_array_manager_invalid_test + @pytest.mark.parametrize( "data, expected", diff --git a/pandas/tests/frame/methods/test_join.py b/pandas/tests/frame/methods/test_join.py index eba92cc71a6d0..42694dc3ff37c 100644 --- a/pandas/tests/frame/methods/test_join.py +++ b/pandas/tests/frame/methods/test_join.py @@ -3,10 +3,15 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import DataFrame, Index, MultiIndex, date_range, period_range import pandas._testing as tm +# TODO(ArrayManager) concat with reindexing +pytestmark = td.skip_array_manager_not_yet_implemented + @pytest.fixture def frame_with_period_index(): diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py index bab2db3192b4a..9aab745e5b89a 100644 --- a/pandas/tests/frame/methods/test_rank.py +++ b/pandas/tests/frame/methods/test_rank.py @@ -236,6 +236,7 @@ def test_rank_methods_frame(self): expected = DataFrame(sprank, columns=cols).astype("float64") tm.assert_frame_equal(result, expected) + @td.skip_array_manager_not_yet_implemented @pytest.mark.parametrize("dtype", ["O", "f8", "i8"]) def test_rank_descending(self, method, dtype): diff --git a/pandas/tests/frame/methods/test_reset_index.py b/pandas/tests/frame/methods/test_reset_index.py index 00d4a4277a42f..e43eb3fb47b7e 100644 --- a/pandas/tests/frame/methods/test_reset_index.py +++ b/pandas/tests/frame/methods/test_reset_index.py @@ -4,6 +4,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas.core.dtypes.common import is_float_dtype, is_integer_dtype import pandas as pd @@ -518,6 +520,7 @@ def 
test_reset_index_delevel_infer_dtype(self): assert is_integer_dtype(deleveled["prm1"]) assert is_float_dtype(deleveled["prm2"]) + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) groupby def test_reset_index_with_drop( self, multiindex_year_month_day_dataframe_random_data ): @@ -616,6 +619,7 @@ def test_reset_index_empty_frame_with_datetime64_multiindex(): tm.assert_frame_equal(result, expected) +@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) groupby def test_reset_index_empty_frame_with_datetime64_multiindex_from_groupby(): # https://github.com/pandas-dev/pandas/issues/35657 df = DataFrame({"c1": [10.0], "c2": ["a"], "c3": pd.to_datetime("2020-01-01")}) diff --git a/pandas/tests/frame/methods/test_shift.py b/pandas/tests/frame/methods/test_shift.py index 2e21ce8ec2256..f57459a320350 100644 --- a/pandas/tests/frame/methods/test_shift.py +++ b/pandas/tests/frame/methods/test_shift.py @@ -1,6 +1,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import DataFrame, Index, Series, date_range, offsets import pandas._testing as tm @@ -145,12 +147,13 @@ def test_shift_duplicate_columns(self): tm.assert_frame_equal(shifted[0], shifted[1]) tm.assert_frame_equal(shifted[0], shifted[2]) - def test_shift_axis1_multiple_blocks(self): + def test_shift_axis1_multiple_blocks(self, using_array_manager): # GH#35488 df1 = DataFrame(np.random.randint(1000, size=(5, 3))) df2 = DataFrame(np.random.randint(1000, size=(5, 2))) df3 = pd.concat([df1, df2], axis=1) - assert len(df3._mgr.blocks) == 2 + if not using_array_manager: + assert len(df3._mgr.blocks) == 2 result = df3.shift(2, axis=1) @@ -163,7 +166,8 @@ def test_shift_axis1_multiple_blocks(self): # Case with periods < 0 # rebuild df3 because `take` call above consolidated df3 = pd.concat([df1, df2], axis=1) - assert len(df3._mgr.blocks) == 2 + if not using_array_manager: + assert len(df3._mgr.blocks) == 2 result = df3.shift(-2, axis=1) expected = df3.take([2, 3, 4, -1, -1], axis=1) @@ -272,6 +276,7 @@ def test_datetime_frame_shift_with_freq_error(self, datetime_frame): with pytest.raises(ValueError, match=msg): no_freq.shift(freq="infer") + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) axis=1 support def test_shift_dt64values_int_fill_deprecated(self): # GH#31971 ser = Series([pd.Timestamp("2020-01-01"), pd.Timestamp("2020-01-02")]) diff --git a/pandas/tests/frame/methods/test_sort_index.py b/pandas/tests/frame/methods/test_sort_index.py index de847c12723b2..973cb149a801f 100644 --- a/pandas/tests/frame/methods/test_sort_index.py +++ b/pandas/tests/frame/methods/test_sort_index.py @@ -1,6 +1,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( CategoricalDtype, @@ -373,6 +375,7 @@ def test_sort_index_multiindex(self, level): result = df.sort_index(level=level, sort_remaining=False) tm.assert_frame_equal(result, expected) + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) groupby def test_sort_index_intervalindex(self): # this is a de-facto sort via unstack # confirming that we sort in the order of the bins diff --git a/pandas/tests/frame/methods/test_sort_values.py b/pandas/tests/frame/methods/test_sort_values.py index 1bb969956e074..741a2c61cac83 100644 --- a/pandas/tests/frame/methods/test_sort_values.py +++ b/pandas/tests/frame/methods/test_sort_values.py @@ -544,12 +544,13 @@ def test_sort_values_nat_na_position_default(self): result = 
expected.sort_values(["A", "date"]) tm.assert_frame_equal(result, expected) - def test_sort_values_item_cache(self): + def test_sort_values_item_cache(self, using_array_manager): # previous behavior incorrect retained an invalid _item_cache entry df = DataFrame(np.random.randn(4, 3), columns=["A", "B", "C"]) df["D"] = df["A"] * 2 ser = df["A"] - assert len(df._mgr.blocks) == 2 + if not using_array_manager: + assert len(df._mgr.blocks) == 2 df.sort_values(by="A") ser.values[0] = 99 diff --git a/pandas/tests/frame/methods/test_to_dict_of_blocks.py b/pandas/tests/frame/methods/test_to_dict_of_blocks.py index 0257a5d43170f..8de47cb17d7d3 100644 --- a/pandas/tests/frame/methods/test_to_dict_of_blocks.py +++ b/pandas/tests/frame/methods/test_to_dict_of_blocks.py @@ -1,9 +1,13 @@ import numpy as np +import pandas.util._test_decorators as td + from pandas import DataFrame, MultiIndex import pandas._testing as tm from pandas.core.arrays import PandasArray +pytestmark = td.skip_array_manager_invalid_test + class TestToDictOfBlocks: def test_copy_blocks(self, float_frame): diff --git a/pandas/tests/frame/methods/test_transpose.py b/pandas/tests/frame/methods/test_transpose.py index 8635168f1eb03..548842e653a63 100644 --- a/pandas/tests/frame/methods/test_transpose.py +++ b/pandas/tests/frame/methods/test_transpose.py @@ -1,6 +1,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import DataFrame, date_range import pandas._testing as tm @@ -79,6 +81,7 @@ def test_transpose_float(self, float_frame): for col, s in mixed_T.items(): assert s.dtype == np.object_ + @td.skip_array_manager_invalid_test def test_transpose_get_view(self, float_frame): dft = float_frame.T dft.values[:, 5:10] = 5 diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index 1c7a1f1d79543..95ef2f6c00fe8 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -32,8 +32,6 @@ def test_foo(): import numpy as np import pytest -from pandas._config import get_option - from pandas.compat import IS64, is_platform_windows from pandas.compat._optional import import_optional_dependency @@ -288,6 +286,6 @@ def async_mark(): ) skip_array_manager_invalid_test = pytest.mark.skipif( - get_option("mode.data_manager") == "array", + "config.getvalue('--array-manager')", reason="Test that relies on BlockManager internals or specific behaviour", ) From f6a97dfcf1fdb0b00220f3b644b9ed30363a0476 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 12 Dec 2020 18:12:46 +0100 Subject: [PATCH 20/29] ci - only run the tests/frame/methods tests --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index edbfaeb4ae97d..ada0ec141154d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -154,4 +154,4 @@ jobs: - name: Run tests run: | source activate pandas-dev - pytest pandas --array-manager + pytest pandas/tests/frame/methods --array-manager From 670ed7602892b1f75e1135ab2f93f27aa3c156a6 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 12 Dec 2020 20:02:40 +0100 Subject: [PATCH 21/29] mypy fix --- pandas/core/internals/array_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 6030d9ad5b0b0..29ec63e1e4c68 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -302,7 +302,7 @@ def 
apply_with_block(self: T, f, align_keys=None, **kwargs) -> T: arr = arr._data # type: ignore[union-attr] elif arr.dtype.kind == "m": # TimedeltaArray needs to be converted to ndarray for TimedeltaBlock - arr = arr._data + arr = arr._data # type: ignore[union-attr] if isinstance(arr, np.ndarray): arr = np.atleast_2d(arr) block = make_block(arr, placement=slice(0, 1, 1), ndim=2) From a9a8c2daa19984b5427e4048717ba4d8c340154b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 8 Jan 2021 17:00:39 +0100 Subject: [PATCH 22/29] move to internals/construction.py --- pandas/core/frame.py | 32 +++++--------------------- pandas/core/internals/__init__.py | 2 ++ pandas/core/internals/array_manager.py | 2 +- pandas/core/internals/construction.py | 27 ++++++++++++++++++++++ 4 files changed, 36 insertions(+), 27 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 585f841436208..971c1c99b6ffd 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -145,6 +145,7 @@ init_dict, init_ndarray, masked_rec_array_to_mgr, + mgr_to_mgr, nested_data_to_arrays, reorder_arrays, sanitize_index, @@ -602,23 +603,19 @@ def __init__( values, index, columns, dtype=values.dtype, copy=False ) + # ensure correct Manager type according to settings manager = get_option("mode.data_manager") - - if manager == "array" and not isinstance(mgr, ArrayManager): - # TODO proper initialization - df = DataFrame(mgr) - mgr = df._as_manager("array")._mgr - # TODO check for case of manager="block" but mgr is ArrayManager + mgr = mgr_to_mgr(mgr, typ=manager) NDFrame.__init__(self, mgr) - def _as_manager(self, typ): + def _as_manager(self, typ: str) -> DataFrame: """ Private helper function to create a DataFrame with specific manager. Parameters ---------- - mgr : {"block", "array"} + typ : {"block", "array"} Returns ------- @@ -627,24 +624,7 @@ def _as_manager(self, typ): to be a copy or not. 
""" new_mgr: Union[BlockManager, ArrayManager] - mgr = self._mgr - if typ == "block": - if isinstance(mgr, BlockManager): - new_mgr = mgr - else: - new_mgr = arrays_to_mgr( - mgr.arrays, mgr.axes[0], mgr.axes[1], mgr.axes[0], dtype=None - ) - elif typ == "array": - if isinstance(mgr, ArrayManager): - new_mgr = mgr - else: - arrays = [arr.copy() for arr in self._iter_column_arrays()] - new_mgr = ArrayManager(arrays, [mgr.axes[1], mgr.axes[0]]) - else: - raise ValueError( - f"'typ' needs to be one of {{'block', 'array'}}, got '{type}'" - ) + new_mgr = mgr_to_mgr(self._mgr, typ=typ) # fastpath of passing a manager doesn't check the option/manager class return DataFrame(new_mgr) diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py index 9b09344871e98..e71143224556b 100644 --- a/pandas/core/internals/__init__.py +++ b/pandas/core/internals/__init__.py @@ -1,4 +1,5 @@ from pandas.core.internals.array_manager import ArrayManager +from pandas.core.internals.base import DataManager from pandas.core.internals.blocks import ( # io.pytables, io.packers Block, BoolBlock, @@ -36,6 +37,7 @@ "TimeDeltaBlock", "safe_reshape", "make_block", + "DataManager", "ArrayManager", "BlockManager", "SingleBlockManager", diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 29ec63e1e4c68..0bbcc52e90e0c 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -54,7 +54,7 @@ class ArrayManager(DataManager): """ __slots__ = [ - "_axes", + "_axes", # private attribute, because 'axes' has different order, see below "arrays", ] diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index f1cd221bae15c..eadb61894a990 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -139,6 +139,33 @@ def masked_rec_array_to_mgr( return mgr +def mgr_to_mgr(mgr, typ: str): + """ + Convert to specific type of Manager. Does not copy if the type is already + correct. Does not guarantee a copy otherwise. 
+ """ + from pandas.core.internals import ArrayManager, BlockManager + + new_mgr: Union[ArrayManager, BlockManager] + + if typ == "block": + if isinstance(mgr, BlockManager): + new_mgr = mgr + else: + new_mgr = arrays_to_mgr( + mgr.arrays, mgr.axes[0], mgr.axes[1], mgr.axes[0], dtype=None + ) + elif typ == "array": + if isinstance(mgr, ArrayManager): + new_mgr = mgr + else: + arrays = [mgr.iget_values(i).copy() for i in range(len(mgr.axes[0]))] + new_mgr = ArrayManager(arrays, [mgr.axes[1], mgr.axes[0]]) + else: + raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{type}'") + return new_mgr + + # --------------------------------------------------------------------- # DataFrame Constructor Interface From c7898fbdc4f1a83a959003ed8db34a972a9488cc Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 8 Jan 2021 17:17:08 +0100 Subject: [PATCH 23/29] update for latest changes - fix tests/mypy --- pandas/core/internals/array_manager.py | 8 +------- pandas/io/pytables.py | 12 ++++++++---- pandas/tests/frame/methods/test_reorder_levels.py | 3 +++ pandas/tests/frame/methods/test_select_dtypes.py | 3 +++ 4 files changed, 15 insertions(+), 11 deletions(-) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 0bbcc52e90e0c..4f70621be6cdc 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -322,9 +322,7 @@ def apply_with_block(self: T, f, align_keys=None, **kwargs) -> T: def isna(self, func) -> "ArrayManager": return self.apply("apply", func=func) - def where( - self, other, cond, align: bool, errors: str, try_cast: bool, axis: int - ) -> "ArrayManager": + def where(self, other, cond, align: bool, errors: str, axis: int) -> "ArrayManager": if align: align_keys = ["other", "cond"] else: @@ -337,7 +335,6 @@ def where( other=other, cond=cond, errors=errors, - try_cast=try_cast, axis=axis, ) @@ -346,7 +343,6 @@ def where( # return self.apply_with_block("setitem", indexer=indexer, value=value) def putmask(self, mask, new, align: bool = True, axis: int = 0): - transpose = self.ndim == 2 if align: align_keys = ["new", "mask"] @@ -359,9 +355,7 @@ def putmask(self, mask, new, align: bool = True, axis: int = 0): align_keys=align_keys, mask=mask, new=new, - inplace=True, axis=axis, - transpose=transpose, ) def diff(self, n: int, axis: int) -> "ArrayManager": diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index d2b02038f8b78..165910492d0b8 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -20,6 +20,7 @@ Tuple, Type, Union, + cast, ) import warnings @@ -73,6 +74,7 @@ from pandas.core.computation.pytables import PyTablesExpr, maybe_expression from pandas.core.construction import extract_array from pandas.core.indexes.api import ensure_index +from pandas.core.internals import BlockManager from pandas.io.common import stringify_path from pandas.io.formats.printing import adjoin, pprint_thing @@ -3989,19 +3991,21 @@ def _get_blocks_and_items( def get_blk_items(mgr): return [mgr.items.take(blk.mgr_locs) for blk in mgr.blocks] - blocks: List["Block"] = list(frame._mgr.blocks) - blk_items: List[Index] = get_blk_items(frame._mgr) + mgr = frame._mgr + mgr = cast(BlockManager, mgr) + blocks: List["Block"] = list(mgr.blocks) + blk_items: List[Index] = get_blk_items(mgr) if len(data_columns): axis, axis_labels = new_non_index_axes[0] new_labels = Index(axis_labels).difference(Index(data_columns)) mgr = frame.reindex(new_labels, axis=axis)._mgr - blocks = list(mgr.blocks) + blocks 
= list(mgr.blocks) # type: ignore[union-attr] blk_items = get_blk_items(mgr) for c in data_columns: mgr = frame.reindex([c], axis=axis)._mgr - blocks.extend(mgr.blocks) + blocks.extend(mgr.blocks) # type: ignore[union-attr] blk_items.extend(get_blk_items(mgr)) # reorder the blocks in the same order as the existing table if we can diff --git a/pandas/tests/frame/methods/test_reorder_levels.py b/pandas/tests/frame/methods/test_reorder_levels.py index 6bfbf089a6108..451fc9a5cf717 100644 --- a/pandas/tests/frame/methods/test_reorder_levels.py +++ b/pandas/tests/frame/methods/test_reorder_levels.py @@ -1,6 +1,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import DataFrame, MultiIndex import pandas._testing as tm @@ -47,6 +49,7 @@ def test_reorder_levels(self, frame_or_series): result = obj.reorder_levels(["L0", "L0", "L0"]) tm.assert_equal(result, expected) + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) groupby def test_reorder_levels_swaplevel_equivalence( self, multiindex_year_month_day_dataframe_random_data ): diff --git a/pandas/tests/frame/methods/test_select_dtypes.py b/pandas/tests/frame/methods/test_select_dtypes.py index f2dbe4a799a17..434df5ccccaf7 100644 --- a/pandas/tests/frame/methods/test_select_dtypes.py +++ b/pandas/tests/frame/methods/test_select_dtypes.py @@ -42,6 +42,9 @@ def __len__(self) -> int: def __getitem__(self, item): pass + def copy(self): + return self + class TestSelectDtypes: def test_select_dtypes_include_using_list_like(self): From 3430307a69de009db74507ce32e737e7902fd910 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 8 Jan 2021 17:38:50 +0100 Subject: [PATCH 24/29] fix todo --- pandas/core/generic.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 8e64a713fd7a0..ab2db0b735dcf 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -225,9 +225,14 @@ def _init_mgr( mgr = mgr.copy() if dtype is not None: # avoid further copies if we can - # TODO - # if len(mgr.blocks) > 1 or mgr.blocks[0].values.dtype != dtype: - mgr = mgr.astype(dtype=dtype) + if ( + isinstance(mgr, BlockManager) + and len(mgr.blocks) == 1 + and mgr.blocks[0].values.dtype == dtype + ): + pass + else: + mgr = mgr.astype(dtype=dtype) return mgr # ---------------------------------------------------------------------- From 1a3001364cade395c7939e1c93e259adc7dabf41 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 8 Jan 2021 18:27:58 +0100 Subject: [PATCH 25/29] fix import in tests --- pandas/tests/internals/test_managers.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/internals/test_managers.py b/pandas/tests/internals/test_managers.py index 9c9ca950b4af9..333455875904a 100644 --- a/pandas/tests/internals/test_managers.py +++ b/pandas/tests/internals/test_managers.py @@ -1,6 +1,8 @@ """ Testing interaction between the different managers (BlockManager, ArrayManager) """ +from pandas.core.dtypes.missing import array_equivalent + import pandas as pd import pandas._testing as tm from pandas.core.internals import ArrayManager, BlockManager @@ -26,7 +28,7 @@ def test_dataframe_creation(): assert isinstance(result._mgr, ArrayManager) tm.assert_frame_equal(result, df_block) assert all( - tm.array_equivalent(left, right) + array_equivalent(left, right) for left, right in zip(result._mgr.arrays, df_array._mgr.arrays) ) From c5548d9a15e67497d09c40ad6bee08408f221255 Mon Sep 17 00:00:00 
2001 From: Joris Van den Bossche Date: Sun, 10 Jan 2021 22:25:50 +0100 Subject: [PATCH 26/29] add union alias to typing --- pandas/_typing.py | 4 ++++ pandas/core/frame.py | 3 ++- pandas/core/generic.py | 7 ++++--- pandas/core/internals/concat.py | 6 +++--- pandas/core/internals/construction.py | 4 ++-- 5 files changed, 15 insertions(+), 9 deletions(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index 0b50dd69f7abb..3f7ae7f2f1bed 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -39,6 +39,7 @@ from pandas.core.generic import NDFrame # noqa: F401 from pandas.core.groupby.generic import DataFrameGroupBy, SeriesGroupBy from pandas.core.indexes.base import Index + from pandas.core.internals import ArrayManager, BlockManager from pandas.core.resample import Resampler from pandas.core.series import Series from pandas.core.window.rolling import BaseWindow @@ -160,3 +161,6 @@ ColspaceArgType = Union[ str, int, Sequence[Union[str, int]], Mapping[Label, Union[str, int]] ] + +# internals +Manager = Union["ArrayManager", "BlockManager"] diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 971c1c99b6ffd..6697b12d9882a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -63,6 +63,7 @@ IndexLabel, Label, Level, + Manager, PythonFuncType, Renamer, StorageOptions, @@ -623,7 +624,7 @@ def _as_manager(self, typ: str) -> DataFrame: New DataFrame using specified manager type. Is not guaranteed to be a copy or not. """ - new_mgr: Union[BlockManager, ArrayManager] + new_mgr: Manager new_mgr = mgr_to_mgr(self._mgr, typ=typ) # fastpath of passing a manager doesn't check the option/manager class return DataFrame(new_mgr) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index a9e26b8cf65e7..a25eade4fb46e 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -46,6 +46,7 @@ JSONSerializable, Label, Level, + Manager, NpDtype, Renamer, StorageOptions, @@ -187,7 +188,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): ) _metadata: List[str] = [] _is_copy = None - _mgr: Union[BlockManager, ArrayManager] + _mgr: Manager _attrs: Dict[Optional[Hashable], Any] _typ: str @@ -196,7 +197,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): def __init__( self, - data: Union[BlockManager, ArrayManager], + data: Manager, copy: bool = False, attrs: Optional[Mapping[Optional[Hashable], Any]] = None, ): @@ -215,7 +216,7 @@ def __init__( @classmethod def _init_mgr( cls, mgr, axes, dtype: Optional[Dtype] = None, copy: bool = False - ) -> Union[BlockManager, ArrayManager]: + ) -> Manager: """ passed a manager and a axes dict """ for a, axe in axes.items(): if axe is not None: diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 1105cfea7bfed..32b6f9d64dd8d 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -1,12 +1,12 @@ from collections import defaultdict import copy import itertools -from typing import TYPE_CHECKING, Any, Dict, List, Sequence, Tuple, Union, cast +from typing import TYPE_CHECKING, Any, Dict, List, Sequence, Tuple, cast import numpy as np from pandas._libs import NaT, internals as libinternals -from pandas._typing import ArrayLike, DtypeObj, Shape +from pandas._typing import ArrayLike, DtypeObj, Manager, Shape from pandas.util._decorators import cache_readonly from pandas.core.dtypes.cast import maybe_promote @@ -37,7 +37,7 @@ def concatenate_block_managers( mgrs_indexers, axes: List["Index"], concat_axis: int, copy: bool -) -> Union[ArrayManager, 
BlockManager]: +) -> Manager: """ Concatenate block managers into one. diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index eadb61894a990..9376b9a001e29 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -9,7 +9,7 @@ import numpy.ma as ma from pandas._libs import lib -from pandas._typing import Axis, DtypeObj, Label, Scalar +from pandas._typing import Axis, DtypeObj, Label, Manager, Scalar from pandas.core.dtypes.cast import ( construct_1d_arraylike_from_scalar, @@ -146,7 +146,7 @@ def mgr_to_mgr(mgr, typ: str): """ from pandas.core.internals import ArrayManager, BlockManager - new_mgr: Union[ArrayManager, BlockManager] + new_mgr: Manager if typ == "block": if isinstance(mgr, BlockManager): From afe8f80675235fd199d4459ebf291ae3bdadb9f5 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 10 Jan 2021 22:44:59 +0100 Subject: [PATCH 27/29] updates based on review --- pandas/core/frame.py | 2 +- pandas/core/internals/array_manager.py | 10 +++------- pandas/tests/frame/methods/test_to_numpy.py | 8 +++++--- 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6697b12d9882a..96792dda60f42 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -702,7 +702,7 @@ def _is_homogeneous_type(self) -> bool: False """ if isinstance(self._mgr, ArrayManager): - return False + return len({arr.dtype for arr in self._mgr.arrays}) == 1 if self._mgr.any_extension_types: return len({block.dtype for block in self._mgr.blocks}) == 1 else: diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 4f70621be6cdc..dcf9fdcab6376 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -307,8 +307,7 @@ def apply_with_block(self: T, f, align_keys=None, **kwargs) -> T: arr = np.atleast_2d(arr) block = make_block(arr, placement=slice(0, 1, 1), ndim=2) applied = getattr(block, f)(**kwargs) - while isinstance(applied, list): - # ObjectBlock gives double nested result?, some functions give no list + if isinstance(applied, list): applied = applied[0] arr = applied.values if isinstance(arr, np.ndarray): @@ -387,11 +386,8 @@ def array_fillna(array, value, limit, inplace): limit = libalgos.validate_limit(None, limit=limit) mask[mask.cumsum() > limit] = False - # if not self._can_hold_na: - # if inplace: - # return [self] - # else: - # return [self.copy()] + # TODO could optimize for arrays that cannot hold NAs + # (like _can_hold_na on Blocks) if not inplace: array = array.copy() diff --git a/pandas/tests/frame/methods/test_to_numpy.py b/pandas/tests/frame/methods/test_to_numpy.py index 960c6405ab4b3..0682989294457 100644 --- a/pandas/tests/frame/methods/test_to_numpy.py +++ b/pandas/tests/frame/methods/test_to_numpy.py @@ -1,6 +1,8 @@ import numpy as np -from pandas import DataFrame, Timestamp, option_context +import pandas.util._test_decorators as td + +from pandas import DataFrame, Timestamp import pandas._testing as tm @@ -17,10 +19,10 @@ def test_to_numpy_dtype(self): result = df.to_numpy(dtype="int64") tm.assert_numpy_array_equal(result, expected) + @td.skip_array_manager_invalid_test def test_to_numpy_copy(self): arr = np.random.randn(4, 3) - with option_context("mode.data_manager", "block"): - df = DataFrame(arr) + df = DataFrame(arr) assert df.values.base is arr assert df.to_numpy(copy=False).base is arr assert df.to_numpy(copy=True).base is not arr From 
b88c7573a8a5297b5d9178231b0f36749eeccf30 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 10 Jan 2021 23:01:18 +0100 Subject: [PATCH 28/29] skip json tests to avoid segfaults --- pandas/tests/io/test_fsspec.py | 1 + pandas/tests/io/test_user_agent.py | 34 ++++++++++++++++++++++++++---- 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index b1038b6d28083..d9575a6ad81e5 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -247,6 +247,7 @@ def test_pickle_options(fsspectest): tm.assert_frame_equal(df, out) +@td.skip_array_manager_not_yet_implemented def test_json_options(fsspectest): df = DataFrame({"a": [0]}) df.to_json("testmem://afile", storage_options={"test": "json_write"}) diff --git a/pandas/tests/io/test_user_agent.py b/pandas/tests/io/test_user_agent.py index 32399c7de7a68..fd3ca3919d416 100644 --- a/pandas/tests/io/test_user_agent.py +++ b/pandas/tests/io/test_user_agent.py @@ -8,6 +8,8 @@ import pytest +import pandas.util._test_decorators as td + import pandas as pd import pandas._testing as tm @@ -180,13 +182,25 @@ def do_GET(self): "responder, read_method, port, parquet_engine", [ (CSVUserAgentResponder, pd.read_csv, 34259, None), - (JSONUserAgentResponder, pd.read_json, 34260, None), + pytest.param( + JSONUserAgentResponder, + pd.read_json, + 34260, + None, + marks=td.skip_array_manager_not_yet_implemented, + ), (ParquetPyArrowUserAgentResponder, pd.read_parquet, 34268, "pyarrow"), (ParquetFastParquetUserAgentResponder, pd.read_parquet, 34273, "fastparquet"), (PickleUserAgentResponder, pd.read_pickle, 34271, None), (StataUserAgentResponder, pd.read_stata, 34272, None), (GzippedCSVUserAgentResponder, pd.read_csv, 34261, None), - (GzippedJSONUserAgentResponder, pd.read_json, 34262, None), + pytest.param( + GzippedJSONUserAgentResponder, + pd.read_json, + 34262, + None, + marks=td.skip_array_manager_not_yet_implemented, + ), ], ) def test_server_and_default_headers(responder, read_method, port, parquet_engine): @@ -212,13 +226,25 @@ def test_server_and_default_headers(responder, read_method, port, parquet_engine "responder, read_method, port, parquet_engine", [ (CSVUserAgentResponder, pd.read_csv, 34263, None), - (JSONUserAgentResponder, pd.read_json, 34264, None), + pytest.param( + JSONUserAgentResponder, + pd.read_json, + 34264, + None, + marks=td.skip_array_manager_not_yet_implemented, + ), (ParquetPyArrowUserAgentResponder, pd.read_parquet, 34270, "pyarrow"), (ParquetFastParquetUserAgentResponder, pd.read_parquet, 34275, "fastparquet"), (PickleUserAgentResponder, pd.read_pickle, 34273, None), (StataUserAgentResponder, pd.read_stata, 34274, None), (GzippedCSVUserAgentResponder, pd.read_csv, 34265, None), - (GzippedJSONUserAgentResponder, pd.read_json, 34266, None), + pytest.param( + GzippedJSONUserAgentResponder, + pd.read_json, + 34266, + None, + marks=td.skip_array_manager_not_yet_implemented, + ), ], ) def test_server_and_custom_headers(responder, read_method, port, parquet_engine): From 9dc5600d38f5cfb97052b7c612a161018610316c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 12 Jan 2021 12:52:05 +0100 Subject: [PATCH 29/29] fix for Label -> Hashable change in master --- pandas/core/internals/array_manager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index dcf9fdcab6376..134bf59ed7f9c 100644 --- 
a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -6,7 +6,7 @@ import numpy as np from pandas._libs import algos as libalgos, lib -from pandas._typing import ArrayLike, DtypeObj, Label +from pandas._typing import ArrayLike, DtypeObj, Hashable from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.cast import find_common_type, infer_dtype_from_scalar @@ -679,7 +679,7 @@ def iset(self, loc: Union[int, slice, np.ndarray], value): # TODO raise Exception - def insert(self, loc: int, item: Label, value, allow_duplicates: bool = False): + def insert(self, loc: int, item: Hashable, value, allow_duplicates: bool = False): """ Insert item at selected position.
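
For reference, a minimal usage sketch of the machinery this series introduces. This is an illustrative example, not part of the patches themselves: it assumes a pandas build from this branch, where the "mode.data_manager" option, DataFrame._as_manager and the ArrayManager class exist (none of these are in released pandas at this point).

    import pandas as pd
    from pandas.core.internals import ArrayManager, BlockManager

    # Opt in to the array-based manager for newly created DataFrames
    # (option name as registered in this branch).
    pd.set_option("mode.data_manager", "array")

    df = pd.DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]})
    assert isinstance(df._mgr, ArrayManager)

    # Convert an existing frame to the other manager type explicitly,
    # independently of the global option; _as_manager returns a new DataFrame.
    df_block = df._as_manager("block")
    assert isinstance(df_block._mgr, BlockManager)

    # Switch back to the block-based manager for subsequently created frames.
    pd.set_option("mode.data_manager", "block")

With the pytest flag added in the CI patch, the same code path is exercised across the test suite, e.g. `pytest pandas/tests/frame/methods --array-manager`.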