Description
While exploring the maybe_convert_objects
I've got quite a few proposals/questions but in the end I've come to a conclusion that the logic of function itself is far from optimal.
Motivation
- The main argument here is that we are not checking for inconsistency between parts of the already-seen data. E.g. if we have seen
`datetime`
and `float`,
we will still process the rest of the array, and only after seeing all the elements do we check the `Seen()`
object's state in order to determine the result dtype. But in the given example one may already conclude that the result dtype must be left as `object`
.
In other words, there is a need to do result-type checking on the fly if we want to preserve the main course of the written function. - Even if the data is consistent, we are still using a redundant bunch of checks. E.g. if the dtype is
`datetime`
for all seen values, we will still try to check for `float`
, `bool`,
etc. But all these checks could be reduced to a single `if is_datetime(val)... else return object`
.
I.e. for each seen dtype there should be its own checking algorithm for the other array elements.
These are only the main points. So instead of trying to bloat the current implementation with (maybe redundant) checks, I am suggesting rewriting maybe_convert_objects
.
Realization
The main idea of the following code is to perform type probing
on the first element of the array and then pass processing to the corresponding function; during execution, processing may be handed off to another (more 'narrow' in its checks) function if a new dtype is discovered.
NOTE this is working copy so I left questions arisen in code comments (marked with # QUEST:
)
# First declare checking functions for corresponding dtype got in probing
# QUEST: What is the correct way to wrap these functions? Maybe another .pxd?
# QUEST: Can here and in other cdefs 'nogil' param be used?
cdef str check_complex(bint safe,
                       object[:] objects,
                       complex128_t[:] complexes,
                       int n = 0):
    """
    Verify that objects[n:] contains only complex-compatible values,
    filling the `complexes` buffer along the way.

    Returns 'complex' if every remaining element fits, 'object' otherwise.
    None maps to NaN; integers are accepted (cast) only when not `safe`
    and within the representable [int64.min, uint64.max] range.
    """
    cdef:
        float64_t fnan = np.nan
        Py_ssize_t i
    for i in range(n, objects.shape[0]):
        val = objects[i]
        if util.is_complex_object(val) or util.is_float_object(val):
            complexes[i] = val
        elif val is None:
            # Missing value -> NaN
            complexes[i] = fnan
        elif util.is_integer_object(val):
            # is_integer_object also matches timedelta64; reject it with the
            # same narrow check used in check_float/check_integer (inside this
            # branch only a timedelta64 can match, so the broader is_timedelta
            # call was both inconsistent and unnecessarily expensive).
            if util.is_timedelta64_object(val):
                return 'object'
            if not safe and oINT64_MIN <= val and val <= oUINT64_MAX:
                complexes[i] = <complex128_t>val
            else:
                return 'object'
        else:
            return 'object'
    return 'complex'
cdef str check_float(bint safe,
                     object[:] objects,
                     float64_t[:] floats,
                     complex128_t[:] complexes,
                     int n = 0):
    """
    Scan objects[n:] assuming a float result dtype, filling both the float
    and complex buffers (so a later widening needs no rescan).

    Escalates to check_complex when a complex value appears; returns
    'object' on anything incompatible, 'float' otherwise.
    """
    cdef:
        float64_t missing = np.nan
        Py_ssize_t idx
    for idx in range(n, objects.shape[0]):
        val = objects[idx]
        if val is None:
            # Missing value -> NaN in both candidate buffers
            floats[idx] = complexes[idx] = missing
        elif util.is_float_object(val):
            floats[idx] = complexes[idx] = val
        elif util.is_complex_object(val):
            # Widen: hand the remainder of the scan to the complex checker
            return check_complex(safe, objects, complexes, idx)
        elif util.is_integer_object(val):
            # is_integer_object also matches timedelta64 -> not numeric here
            if util.is_timedelta64_object(val):
                return 'object'
            if safe or not (oINT64_MIN <= val <= oUINT64_MAX):
                return 'object'
            floats[idx] = <float64_t>val
            complexes[idx] = <complex128_t>val
        else:
            return 'object'
    return 'float'
# Scan objects[n:] assuming an integer result dtype.
# A Seen() instance tracks whether signed-negative / above-int64 values have
# occurred; both int64 and uint64 buffers are filled until the sign is
# decided. When not 'safe', the float/complex buffers are pre-filled too so
# a later widening to check_float/check_complex needs no rescan of [0:i].
# Returns 'sint' / 'uint' on success, 'object' on incompatible data.
cdef str check_integer(bint safe,
object[:] objects,
int64_t[:] ints,
uint64_t[:] uints,
float64_t[:] floats,
complex128_t[:] complexes,
int n = 0):
cdef:
Seen seen = Seen()
Py_ssize_t i
for i in range(n, objects.shape[0]):
val = objects[i]
if util.is_integer_object(val):
# is_integer hits timedelta64
# QUEST: maybe check timedelta64 in 'is_integer_object' directly?
# if so - remove from here and from check_float/complex
if util.is_timedelta64_object(val):
return 'object'
if not safe:
# Pre-fill wider buffers so escalation does not rescan [0:i].
floats[i] = <float64_t>val
complexes[i] = <complex128_t>val
val = int(val)
seen.saw_int(val)
# QUEST: do val not in [in64.min, uint64.max] range should be
# dropped to object immediatelly?
# Mixed signed+unsigned, or outside [int64.min, uint64.max]:
# no single integer dtype can represent the data.
if ((seen.uint_ and seen.sint_) or
val > oUINT64_MAX or val < oINT64_MIN):
return 'object'
if seen.uint_:
uints[i] = val
elif seen.sint_:
ints[i] = val
else:
# Sign still undecided: keep both candidate buffers filled.
uints[i] = val
ints[i] = val
elif safe:
# 'safe' forbids any widening conversion of non-integers.
return 'object'
elif util.is_float_object(val):
return check_float(safe, objects, floats, complexes, i)
elif val is None:
# None implies NaN, which needs a float result -> widen.
return check_float(safe, objects, floats, complexes, i)
elif util.is_complex_object(val):
return check_complex(safe, objects, complexes, i)
else:
return 'object'
if seen.uint_ and seen.sint_:
# Default for intersected sint uint data
# QUEST: should be 'sint'?
# NOTE(review): this branch appears unreachable - the in-loop check
# above already returns 'object' as soon as both flags are set.
return 'sint'
elif seen.uint_:
return 'uint'
else:
return 'sint'
cdef str check_bool(object[:] objects,
                    uint8_t[:] bools,
                    int n = 0):
    """
    Confirm objects[n:] holds booleans only, copying them into `bools`.

    Returns 'bool' on success, 'object' at the first non-bool element.
    """
    cdef Py_ssize_t pos
    for pos in range(n, objects.shape[0]):
        item = objects[pos]
        if not util.is_bool_object(item):
            return 'object'
        bools[pos] = item
    return 'bool'
# Scan objects[n:] assuming a datetime64[ns] result, writing i8 values into
# `idatetimes` (the caller's int64 view over its M8[ns] buffer). NaT maps to
# NPY_NAT. A tz-aware value switches the result to 'datetime_tz' so the
# caller can build a DatetimeIndex instead of a plain ndarray.
cdef str check_datetime(object[:] objects,
int64_t[:] idatetimes,
int n = 0):
cdef Py_ssize_t i
for i in range(n, objects.shape[0]):
val = objects[i]
if is_datetime(val):
# QUEST: If one val has attr 'tzinfo' - must others have one?
# Or if other have not - 'object' should be returned direcrtly?
if getattr(val, 'tzinfo', None) is not None:
# Returns immediately: remaining elements are only validated
# later, by is_datetime_with_singletz_array in the caller.
return 'datetime_tz'
idatetimes[i] = convert_to_tsobject(
val, None, None, 0, 0).value
elif val is NaT:
idatetimes[i] = NPY_NAT
else:
return 'object'
return 'datetime'
cdef str check_timedelta(object[:] objects,
                         int64_t[:] itimedeltas,
                         int n = 0):
    """
    Scan objects[n:] assuming a timedelta64[ns] result, writing i8 values
    into `itimedeltas` (the caller's int64 view over its m8[ns] buffer).

    NaT maps to NPY_NAT. Returns 'timedelta' if all remaining values are
    timedelta-like, 'object' otherwise.
    """
    cdef Py_ssize_t i
    for i in range(n, objects.shape[0]):
        val = objects[i]
        if is_timedelta(val):
            # convert_to_timedelta64 returns an np.timedelta64 scalar;
            # reinterpret it as int64 before storing into the i8 view (a
            # bare m8 scalar does not coerce into an int64_t slot).
            itimedeltas[i] = convert_to_timedelta64(val, 'ns').view('i8')
        elif val is NaT:
            itimedeltas[i] = NPY_NAT
        else:
            return 'object'
    return 'timedelta'
cdef str check_NaT(object[:] objects,
                   int64_t[:] idatetimes,
                   int64_t[:] itimedeltas,
                   int n = 0):
    """
    Handle an array whose probed first element is NaT.

    Keeps filling both the datetime and timedelta i8 buffers with NPY_NAT
    until a concrete datetime or timedelta value decides the dtype, then
    delegates to the matching checker; anything else yields 'object'.
    """
    cdef Py_ssize_t pos
    for pos in range(n, objects.shape[0]):
        item = objects[pos]
        if item is NaT:
            idatetimes[pos] = itimedeltas[pos] = NPY_NAT
        elif is_datetime(item):
            return check_datetime(objects, idatetimes, pos)
        elif is_timedelta(item):
            return check_timedelta(objects, itimedeltas, pos)
        else:
            return 'object'
    # Default for arrays of NaT will be datetime64?
    return 'datetime'
cdef str check_None(bint safe,
                    object[:] objects,
                    float64_t[:] floats,
                    complex128_t[:] complexes,
                    int n = 0):
    """
    Handle an array whose probed first element is None.

    Writes NaN into the float/complex buffers until a float or complex
    value fixes the dtype, then delegates to the matching checker. An
    array consisting entirely of None stays 'object'.
    """
    cdef:
        float64_t missing = np.nan
        Py_ssize_t pos
    for pos in range(n, objects.shape[0]):
        item = objects[pos]
        if item is None:
            floats[pos] = complexes[pos] = missing
        elif util.is_float_object(item):
            return check_float(safe, objects, floats, complexes, pos)
        elif util.is_complex_object(item):
            return check_complex(safe, objects, complexes, pos)
        else:
            return 'object'
    # QUEST: array full of None will be left as object - correct?
    # QUEST: at the same time [None, np.nan] converted to float.
    # How about [None, NaT] then? or [None, bool]. Need consistency here.
    # Same - [None, 1] is *float* now..
    return 'object'
# QUEST: Disable directives for now caues unstable. Enable?
# @cython.boundscheck(False)
# @cython.wraparound(False)
def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
bint safe=0, bint convert_datetime=0,
bint convert_timedelta=0):
"""
Type inference function-- convert object array to proper dtype

Probes the type of the first element, then dispatches to the matching
check_* helper, which scans the remainder of the array and may itself
delegate to a 'wider' checker. Returns a freshly typed ndarray (or a
DatetimeIndex for single-tz aware datetimes), or the original object
array when no consistent dtype is found.
"""
# Declaring output templates
# QUEST: doesn't it use excessing memory for unnecessary types?
# so it should be initialized after dtype probing?
# QUEST: do this arrs really needed? Why refilling even one of them -
# maybe use objects.view or something as result after checkings?
cdef:
int n = objects.shape[0]
ndarray[uint8_t] bools = np.empty(n, dtype = np.uint8)
ndarray[int64_t] ints = np.empty(n, dtype = np.int64)
ndarray[uint64_t] uints = np.empty(n, dtype = np.uint64)
ndarray[float64_t] floats = np.empty(n, dtype = np.float64)
ndarray[complex128_t] complexes = np.empty(n, dtype = np.complex128)
# The i-prefixed names are int64 views over the M8/m8 buffers, letting
# the check_* helpers write raw i8 values in place.
if convert_datetime:
datetimes = np.empty(n, dtype='M8[ns]')
idatetimes = datetimes.view(np.int64)
if convert_timedelta:
timedeltas = np.empty(n, dtype='m8[ns]')
itimedeltas = timedeltas.view(np.int64)
# Ensure we are able do probing at all
# NOTE(review): the buffers above are allocated even when n == 0 - this
# guard could move before the allocations.
if n == 0:
return objects
# Type probing - determine type of first element
# and call corresponding check
cdef:
str convert_dtype = 'object'
object val = objects[0]
if val is None:
# QUEST: Or should here 'object' be returned immediatelly?
# cause floats should use np.nan instead
convert_dtype = check_None(safe, objects, floats, complexes)
elif val is NaT:
# NaT is ambiguous: it fits both datetime64 and timedelta64, so the
# dispatch depends on which conversions the caller enabled.
if convert_datetime and convert_timedelta:
convert_dtype = check_NaT(objects, idatetimes, itimedeltas)
elif convert_datetime:
convert_dtype = check_datetime(objects, idatetimes)
elif convert_timedelta:
convert_dtype = check_timedelta(objects, itimedeltas)
elif is_timedelta(val):
if convert_timedelta:
convert_dtype = check_timedelta(objects, itimedeltas)
elif is_datetime(val):
if convert_datetime:
convert_dtype = check_datetime(objects, idatetimes)
elif util.is_bool_object(val):
convert_dtype = check_bool(objects, bools)
elif util.is_integer_object(val):
convert_dtype = check_integer(safe, objects,
ints, uints, floats, complexes)
elif util.is_float_object(val):
convert_dtype = check_float(safe, objects, floats, complexes)
elif util.is_complex_object(val):
convert_dtype = check_complex(safe, objects, complexes)
# QUEST: Why 'try_float' - not 'try_decimal' as e.g. try_timedelta?
elif try_float and not isinstance(val, str):
# Intentionally unimplemented for now - see TODO below; falls through
# with convert_dtype left as 'object'.
pass
# TODO: not done yet. It is not "safe" way to just try float(val)
# # this will convert Decimal objects
# try:
# convert_dtype = check_float(safe, objects, floats, complexes)
# except Exception:
# convert_dtype = 'object'
# OUTPUT
# 'datetimes'/'timedeltas' are only referenced below when the matching
# convert_* flag was set, since only those dispatch paths can produce
# the corresponding convert_dtype values.
if convert_dtype == 'bool':
# QUEST: is view here needed? need to recheck this
return bools.view(np.bool_)
elif convert_dtype == 'sint':
return ints
elif convert_dtype == 'uint':
return uints
elif convert_dtype == 'float':
return floats
elif convert_dtype == 'complex':
return complexes
elif convert_dtype == 'datetime':
return datetimes
elif convert_dtype == 'timedelta':
return timedeltas
elif convert_dtype == 'datetime_tz':
# we try to coerce datetime w/tz but must all have the same tz
# QUEST: havn't check speed of this - can it be done faster?
# QUEST: DateTimeIndex is returned instead np.ndarray.
if is_datetime_with_singletz_array(objects):
from pandas import DatetimeIndex
return DatetimeIndex(objects)
# Fallback: no conversion applied; hand back the original object array.
return objects
Code logic tests
I've done initial testing with pytest in order to compare outputs of the old and new functions with this test suite:
import pytest
import itertools
import numpy as np
import pandas as pd
import pandas._libs.lib as lib
import pandas.util.testing as tm
from decimal import Decimal
from datetime import datetime, timezone, timedelta
def data_gen(types):
    """Yield every length-2 object ndarray whose entries are drawn (with
    repetition) from *types*, in itertools.product order."""
    size = 2
    buf = np.array([0] * size, dtype='object')
    for combo in itertools.product(range(len(types)), repeat=size):
        for pos in range(buf.shape[0]):
            buf[pos] = types[combo[pos]]
        # Copy so the caller can keep the array; buf is reused next round.
        yield buf.copy()
# Probe values covering every dtype branch of maybe_convert_objects:
# missing markers, bool, ints straddling the int64/uint64 limits, floats,
# complexes, naive/aware/numpy datetimes, timedeltas, and a plain str as
# the 'object' fallback. Decimal stays commented out pending try_float.
check_types = [None, np.nan, pd.NaT,
True,
np.iinfo(np.int64).min-1, -1, 1, np.iinfo(np.int64).max+1,
np.iinfo(np.uint64).max, np.iinfo(np.uint64).max+1,
1.1, np.float_(1.1),
1j, np.complex_(1j),
datetime(2000, 1, 1),
datetime(2000, 1, 1, 0, 0, 0, 0, timezone.utc),
np.datetime64('2000-01-01'),
timedelta(365),
np.timedelta64(1, 's'),
# Decimal(1.1),
'object'
]
# Materialize all length-2 combinations once so pytest can parametrize.
datas = list(data_gen(check_types))
class Tests:
    """Compare outputs of the old and new maybe_convert_objects over every
    generated two-element array, in both safe and unsafe modes."""

    @pytest.mark.parametrize("data", datas)
    @pytest.mark.parametrize("safe", [0, 1])
    def test_maybe_convert_objects(self, data, safe):
        # Run both implementations with every conversion flag enabled.
        expected = lib.maybe_convert_objects_OLD(data, 1, safe, 1, 1)
        result = lib.maybe_convert_objects(data, 1, safe, 1, 1)
        print('data:', data, data.dtype,
              '\nold:', expected, expected.dtype,
              '\nnew:', result, result.dtype)
        # Non-ndarray results (e.g. DatetimeIndex) need the index comparator.
        if not isinstance(expected, np.ndarray):
            tm.assert_index_equal(expected, result)
        else:
            tm.assert_numpy_array_equal(expected, result)
770/800 tests passed
For now it simply creates arrays with 2 values for all the combinations from check_dtypes
. I'm in the process of resolving the remaining 30 failures and will then increase N
.
Performance tests
Asv benchmarks:
# asv-style benchmark: times maybe_convert_objects over 1000-element arrays
# of a single base dtype with two "polluting" values spliced in (at the
# start and at the middle), across all flag combinations.
class MaybeConvertObjects:
# One representative scalar per result-dtype branch.
dtypes = {"bool": True, "sint": -1, "uint": np.iinfo(np.uint64).max,
"float": 1.1, "complex": 1j,
"datetime": datetime(2000, 1, 1), "timedelta": timedelta(365),
"datetime_tz": datetime(2000, 1, 1, 0, 0, 0, 0, timezone.utc),
# "decimal": Decimal(1.1),
"object": "object"}
# Values inserted into the homogeneous arrays (plus missing markers).
dtypes_to_insert = list(dtypes.values()) + [None, pd.NaT]
# Homogeneous 10**3-element object arrays, one per base dtype.
datas = {k: np.array([v] * 10**3, dtype='object')
for k, v in dtypes.items()}
params = [[0, 1], [0, 1], [0, 1],
list(dtypes.keys()),
list(itertools.product(list(range(len(dtypes_to_insert))),
repeat=2))]
param_names = ["safe", "convert_datetime", "convert_timedelta",
"dtype", "insrt_inds"]
def setup(self, safe, convert_datetime, convert_timedelta,
dtype, insrt_inds):
# NOTE(review): self.datas[dtype] is shared across parametrizations
# and mutated in place here, so insertions from earlier runs persist
# into later ones.
data = self.datas[dtype]
data[0] = self.dtypes_to_insert[insrt_inds[0]]
data[len(data) // 2] = self.dtypes_to_insert[insrt_inds[1]]
# data[2 * len(data) // 3] = self.dtypes_to_insert[insrt_inds[2]]
self.data = data
def time_maybe_convert_objects(self, safe, convert_datetime,
convert_timedelta, dtype, insrt_inds):
lib.maybe_convert_objects(self.data, try_float=0, safe=safe,
convert_datetime=convert_datetime,
convert_timedelta=convert_timedelta)
For 4100/4400 of the tests with changed timings, performance increased (what is the correct way to paste the output of these here? Or is it redundant?). I'm also in the process of resolving the remaining 300.
In order to continue working on this I need to know if this will be even considered or not.