Skip to content

Commit 78e3ba7

Browse files
committed
Merge pull request #4836 from jreback/mra
ENH: DataFrame constructor now accepts a numpy masked record array (GH3478)
2 parents 62c882e + 3b0b184 commit 78e3ba7

File tree

4 files changed

+111
-15
lines changed

4 files changed

+111
-15
lines changed

doc/source/release.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,7 @@ Improvements to existing features
115115
its ``DataFrame``'s ``to_excel()`` methods. (:issue:`4750`)
116116
- allow DataFrame constructor to accept more list-like objects, e.g. list of
117117
``collections.Sequence`` and ``array.Array`` objects (:issue:`3783`,:issue:`42971`)
118+
- DataFrame constructor now accepts a numpy masked record array (:issue:`3478`)
118119

119120
API Changes
120121
~~~~~~~~~~~

doc/source/v0.13.0.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -266,6 +266,7 @@ Enhancements
266266
``ind``, passed to scipy.stats.gaussian_kde() (for scipy >= 0.11.0) to set
267267
the bandwidth, and to gkde.evaluate() to specify the indices at which it
268268
is evaluated, respectively. See scipy docs.
269+
- DataFrame constructor now accepts a numpy masked record array (:issue:`3478`)
269270

270271
.. _whatsnew_0130.refactoring:
271272

pandas/core/frame.py

Lines changed: 62 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -394,14 +394,22 @@ def __init__(self, data=None, index=None, columns=None, dtype=None,
394394
elif isinstance(data, dict):
395395
mgr = self._init_dict(data, index, columns, dtype=dtype)
396396
elif isinstance(data, ma.MaskedArray):
397-
mask = ma.getmaskarray(data)
398-
if mask.any():
399-
data, fill_value = _maybe_upcast(data, copy=True)
400-
data[mask] = fill_value
397+
398+
# masked recarray
399+
if isinstance(data, ma.mrecords.MaskedRecords):
400+
mgr = _masked_rec_array_to_mgr(data, index, columns, dtype, copy)
401+
402+
# a masked array
401403
else:
402-
data = data.copy()
403-
mgr = self._init_ndarray(data, index, columns, dtype=dtype,
404-
copy=copy)
404+
mask = ma.getmaskarray(data)
405+
if mask.any():
406+
data, fill_value = _maybe_upcast(data, copy=True)
407+
data[mask] = fill_value
408+
else:
409+
data = data.copy()
410+
mgr = self._init_ndarray(data, index, columns, dtype=dtype,
411+
copy=copy)
412+
405413
elif isinstance(data, (np.ndarray, Series)):
406414
if data.dtype.names:
407415
data_columns = list(data.dtype.names)
@@ -1009,13 +1017,7 @@ def from_records(cls, data, index=None, exclude=None, columns=None,
10091017
arr_columns.append(k)
10101018
arrays.append(v)
10111019

1012-
# reorder according to the columns
1013-
if len(columns) and len(arr_columns):
1014-
indexer = _ensure_index(
1015-
arr_columns).get_indexer(columns)
1016-
arr_columns = _ensure_index(
1017-
[arr_columns[i] for i in indexer])
1018-
arrays = [arrays[i] for i in indexer]
1020+
arrays, arr_columns = _reorder_arrays(arrays, arr_columns, columns)
10191021

10201022
elif isinstance(data, (np.ndarray, DataFrame)):
10211023
arrays, columns = _to_arrays(data, columns)
@@ -4817,6 +4819,52 @@ def _to_arrays(data, columns, coerce_float=False, dtype=None):
48174819
dtype=dtype)
48184820

48194821

4822+
def _masked_rec_array_to_mgr(data, index, columns, dtype, copy):
    """
    Build the manager for a frame from a numpy masked record array.

    The underlying (unmasked) data is handled like an ordinary record
    array; afterwards every extracted array whose field has at least one
    masked entry is upcast via ``_maybe_upcast`` and has its masked
    positions overwritten with the resulting fill value.

    Parameters
    ----------
    data : masked record array
        Must expose ``fill_value`` and support lookup by field name.
    index : index-like or None
        Row labels. When None they come from ``_get_names_from_index``
        applied to the unmasked data, falling back to
        ``_default_index(len(data))``.
    columns : index-like or None
        Requested columns; when None the columns extracted from the data
        are used.
    dtype : object
        Accepted for signature parity with the other constructors; it is
        not used inside this function.
    copy : bool
        When truthy the finished manager is copied before being returned.

    Returns
    -------
    The manager produced by ``_arrays_to_mgr``.
    """
    field_fills = data.fill_value
    raw = ma.getdata(data)

    # resolve the row labels: explicit index, then names found in the
    # data, then a default index
    if index is None:
        index = _get_names_from_index(raw)
    if index is None:
        index = _default_index(len(data))
    index = _ensure_index(index)

    if columns is not None:
        columns = _ensure_index(columns)
    arrays, arr_columns = _to_arrays(raw, columns)

    def _fill_masked(values, name, fill):
        # leave fully unmasked fields untouched; otherwise upcast a copy
        # so that the fill value can be stored at the masked positions
        mask = ma.getmaskarray(data[name])
        if not mask.any():
            return values
        values, fill = _maybe_upcast(values, fill_value=fill, copy=True)
        values[mask] = fill
        return values

    # fill values, arrays and column names are walked in lockstep
    # (assumes they share the same field order -- TODO confirm)
    filled = [_fill_masked(arr, col, fv)
              for fv, arr, col in zip(field_fills, arrays, arr_columns)]

    # create the manager
    arrays, arr_columns = _reorder_arrays(filled, arr_columns, columns)
    if columns is None:
        columns = arr_columns

    mgr = _arrays_to_mgr(arrays, arr_columns, index, columns)
    return mgr.copy() if copy else mgr
4857+
4858+
def _reorder_arrays(arrays, arr_columns, columns):
    """
    Reorder ``arrays`` and ``arr_columns`` to follow ``columns``.

    Nothing is reordered when either ``columns`` or ``arr_columns`` is
    None or empty; the inputs are then returned as they were passed in.

    Parameters
    ----------
    arrays : list
        One array per entry of ``arr_columns``.
    arr_columns : sequence or None
        Labels currently associated with ``arrays``.
    columns : sequence or None
        Desired column order.

    Returns
    -------
    tuple
        ``(arrays, arr_columns)``, reordered when applicable.
    """
    if columns is None or not len(columns):
        return arrays, arr_columns
    if arr_columns is None or not len(arr_columns):
        return arrays, arr_columns

    # position of every requested column within the existing labels
    # NOTE(review): get_indexer presumably reports -1 for labels missing
    # from arr_columns, which would select the last array here -- confirm
    # that callers never request such labels.
    positions = _ensure_index(arr_columns).get_indexer(columns)
    reordered_columns = _ensure_index(
        [arr_columns[pos] for pos in positions])
    reordered_arrays = [arrays[pos] for pos in positions]
    return reordered_arrays, reordered_columns
4867+
48204868
def _list_to_arrays(data, columns, coerce_float=False, dtype=None):
48214869
if len(data) > 0 and isinstance(data[0], tuple):
48224870
content = list(lib.to_object_array_tuples(data).T)

pandas/tests/test_frame.py

Lines changed: 47 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,8 @@
99
import csv
1010
import unittest
1111
import nose
12-
12+
import functools
13+
import itertools
1314
from pandas.compat import(
1415
map, zip, range, long, lrange, lmap, lzip,
1516
OrderedDict, cPickle as pickle, u, StringIO
@@ -21,6 +22,7 @@
2122
import numpy as np
2223
import numpy.ma as ma
2324
from numpy.testing import assert_array_equal
25+
import numpy.ma.mrecords as mrecords
2426

2527
import pandas as pan
2628
import pandas.core.nanops as nanops
@@ -2510,6 +2512,50 @@ def test_constructor_maskedarray_nonfloat(self):
25102512
self.assertEqual(True, frame['A'][1])
25112513
self.assertEqual(False, frame['C'][2])
25122514

2515+
def test_constructor_mrecarray(self):
    """
    A masked record array must give the same frame as a dict built from
    the equivalent (filled) arrays; from GH3479.
    """
    # strict comparison: index, column and frame types must match too
    strict_equal = functools.partial(assert_frame_equal,
                                     check_index_type=True,
                                     check_column_type=True,
                                     check_frame_type=True)

    plain = [
        ('float', np.array([1.5, 2.0])),
        ('int', np.array([1, 2])),
        ('str', np.array(['abc', 'def'])),
    ]
    # the same data with the second element masked
    half_masked = [
        ('masked1_' + label,
         np.ma.masked_array(values, mask=[False, True]))
        for label, values in plain
    ]
    extremes = [
        ('masked_all', np.ma.masked_all((2,))),
        ('masked_none', np.ma.masked_array([1.0, 2.5], mask=False)),
    ]
    arrays = plain + half_masked + extremes

    # compare the frames for every selection of 3 arrays
    for comb in itertools.combinations(arrays, 3):
        names, data = zip(*comb)
        mrecs = mrecords.fromarrays(data, names=names)

        # replace masked arrays by their filled counterparts
        filled = {}
        for key, values in comb:
            if hasattr(values, 'filled'):
                values = values.filled()
            filled[key] = values

        # default columns and index
        strict_equal(DataFrame(mrecs),
                     DataFrame(filled, columns=names))

        # specify columns
        flipped = names[::-1]
        strict_equal(DataFrame(mrecs, columns=flipped),
                     DataFrame(filled, columns=flipped))

        # specify index
        strict_equal(DataFrame(mrecs, index=[1, 2]),
                     DataFrame(filled, columns=names, index=[1, 2]))
2558+
25132559
def test_constructor_corner(self):
25142560
df = DataFrame(index=[])
25152561
self.assertEqual(df.values.shape, (0, 0))

0 commit comments

Comments
 (0)