-
Notifications
You must be signed in to change notification settings - Fork 6
Description
I tried to find a more structural solution for doing regression.
Currently we have a nice solution more or less copy/pasted in different projects, but there are several shortcomings:
a) it is not integrated in larray itself
b) it is only OLS (and curve_fit)
c) each of our users wants different statistics included in the result in addition to the "raw" coefficients (r2, r2adj, pvalues, ...)
So, I have worked on an experimental wrapper that I called LazyObjectArray so that users can retrieve whatever they want from the results. It mostly works but is very experimental/proof-of-concepty and needs more work to finalize. Besides I am unsure this is the right direction, because, as can be seen from the experiment below we would need to wrap large parts of statsmodels. This shouldn't be too complex but very time consuming and version dependent.
Offering our own "complete" api would not suffer from that problem but would not be a smaller task, but then we would once again make it harder for our users to find help online. We could also offer a "light" api which does just the most common stuff (ols/curve fit) but then our users will be frustrated by the lack of descriptive/validation stats.
import numpy as np
import statsmodels.api as sm
from larray import *
from larray.random import uniform
def get_axes(value):
return value.axes if isinstance(value, Array) else AxisCollection([])
def expand_to_axes_union(values, min_axes=None):
all_axes = AxisCollection.union(*[get_axes(v) for v in values])
if min_axes is not None:
if not isinstance(min_axes, AxisCollection):
min_axes = AxisCollection(min_axes)
all_axes = min_axes | all_axes
return [v.expand(all_axes, readonly=True) if isinstance(v, Array) else v
for v in values], all_axes
def expand_args_kwargs_to_axes_union(args, kwargs=None, min_axes=None):
values = (args + tuple(kwargs.values())) if kwargs is not None else args
first_kw = len(args)
expanded_values, res_axes = expand_to_axes_union(values, min_axes=min_axes)
expanded_args = expanded_values[:first_kw]
expanded_kwargs = dict(zip(kwargs.keys(), expanded_values[first_kw:]))
return expanded_args, expanded_kwargs, res_axes
class LazyObjectArray(Array):
def __init__(self, calls, axes, subset=None, meta=None):
# calls = [(func, args, kwargs)]
self.calls = calls
self.axes = axes
self.subset = subset
self._data = None
self.meta = meta
def __getitem__(self, key, collapse_slices=False, translate_key=True):
raw_broadcasted_key, res_axes, transpose_indices = self.axes._key_to_raw_and_axes(key, collapse_slices,
translate_key)
if res_axes:
return LazyObjectArray(self.calls, res_axes, raw_broadcasted_key)
else:
return self.eval(key)
def _get_subset(self, value, key):
if isinstance(value, Array):
subset = self.subset
subsetted = value.i[subset] if subset is not None else value
return subsetted[key] if key is not None else subsetted
else:
return value
def eval(self, key):
obj = None
for (func, args, kwargs) in self.calls:
# FIXME: use _getsubset instead of arg[key]
args = [arg[key] if isinstance(arg, Array) else arg
for arg in args]
# if obj is not None:
# args = [obj] + args
kwargs = {k: v[key] if isinstance(v, Array) else v
for k, v in kwargs.items()}
if func == '__getattr__':
assert len(kwargs) == 0
assert 1 <= len(args) <= 2
obj = getattr(obj, *args)
else:
if isinstance(func, str):
func = getattr(obj, func)
obj = func(*args, **kwargs)
return obj
def __getattr__(self, key):
# support for inspect.signature
# TODO: these should trigger evaluation (as it probably means we are in an interactive console)
if key in {'__signature__', '__wrapped__'}:
raise AttributeError(key)
if key in {'__array_struct__', '__array_interface__'}:
raise AttributeError(key)
calls = self.calls + [('__getattr__', (key,), {})]
return LazyObjectArray(calls, self.axes)
def __call__(self, *args, **kwargs):
calls = self.calls + [('__call__', args, kwargs)]
return LazyObjectArray(calls, self.axes)
@property
def data(self):
if self._data is None:
# trying to access .data triggers evaluation
eval_ = self.eval
axes = self.axes
# FIXME: not always object dtype
# FIXME: we should use or share code with Array.apply so that we can detect the type properly and
# return an Array instead of an Array of Array when the "wrapped" function returns an Array
res = np.empty(axes.size, dtype=object)
try:
res[:] = [eval_(key) for key in axes.iter_labels()]
except Exception as e:
print("failed to compute .data", str(e))
self._data = res.reshape(axes.shape)
assert isinstance(self._data, np.ndarray)
return self._data
@property
def shape(self):
return self.axes.shape
@property
def ndim(self):
return self.axes.ndim
@property
def size(self):
return self.axes.size
def __len__(self):
return len(self.axes[0])
def __dir__(self):
axis_names = set(axis.name for axis in self.axes if axis.name is not None)
attributes = self.__slots__
object_attributes = set(attr for attr in dir(self.data.flat[0]) if not attr.startswith('_'))
return list(set(dir(self.__class__)) | set(attributes) | axis_names | object_attributes)
def __iter__(self):
# FIXME: do something here
print("__iter__")
return super().__iter__()
def wrap_func(func):
def wrapped(*args, axes=None, func_axes=None, **kwargs):
# TODO: expand should be done just before evaluating (but we still need to compute res_axes early) to know the
# array axes
args, kwargs, res_axes = expand_args_kwargs_to_axes_union(args, kwargs, axes)
if axes is None:
axes = res_axes - func_axes if func_axes is not None else res_axes
else:
# check they are compatible
raise NotImplementedError("yada")
return LazyObjectArray([(func, args, kwargs)], axes)
return wrapped
class WrappedSMRegressionResult(object):
def __init__(self, result, params_axis=None, sample_axis=None):
assert isinstance(result, sm.regression.linear_model.RegressionResultsWrapper), type(result)
self.result = result
self.params_axis = params_axis
self.sample_axis = sample_axis
@property
def params(self):
return Array(self.result.params, self.params_axis)
def predict(self, *args, **kwargs):
return Array(self.result.predict(*args, **kwargs), self.sample_axis)
def __getattr__(self, key):
return getattr(self.result, key)
class WrappedSMRegressionModel(object):
def __init__(self, model, params_axis=None, sample_axis=None):
assert isinstance(model, sm.regression.linear_model.RegressionModel)
self.model = model
self.params_axis = params_axis
self.sample_axis = sample_axis
def fit(self, *args, **kwargs):
return WrappedSMRegressionResult(self.model.fit(*args, **kwargs),
self.params_axis, self.sample_axis)
def __getattr__(self, key):
return getattr(self.model, key)
def ols1d(endog, exog=None, add_intercept=True, coef_axis='coef'):
assert endog.ndim == 1
sample_axis = endog.axes[0]
if exog is None:
exog = labels_array(sample_axis)
# 2d exog is for multiple features/variables (and thus coefficients)
assert 1 <= exog.ndim <= 2
if coef_axis in exog.axes:
if add_intercept:
# XXX: use const instead of intercept
exog = exog.prepend(coef_axis, value=1, label='intercept')
else:
assert exog.ndim == 1
# TODO: add support for Axis object
assert isinstance(coef_axis, str)
# FIXME: change names (c1 vs intercept!)
data = {'c1': 1, 'c2': exog} if add_intercept else {'c1': exog}
exog = stack(data, coef_axis)
coef_axis = exog.axes[coef_axis]
res = sm.OLS(endog.data, exog.data)
return WrappedSMRegressionModel(res, coef_axis, sample_axis)
ols = wrap_func(ols1d)
time = Axis('time=2000..2019')
test1 = uniform(low=-1, high=1, axes=(time, 'serie=s0,s1,s2')).cumsum(time)
single_model = ols1d(test1['s0'])
single_result = single_model.fit()
print("single params", single_result.params)
print("single r2", single_result.rsquared_adj)
model = ols(test1, func_axes="time")
result = model.fit()
params = result.params
r2 = result.rsquared_adj
print(r2)
print("mean r2", r2.mean())
summary = result.summary()
print("summary", summary)
print("summary s0")
print(summary['s0'])
# this is effectively an (Lazy)Array of Arrays when we want a single Array
print("params", params)
print(params['s0'])
print(params['s1'])
print(params['s2'])
print("predict")
print(result.predict())
print("info\n", r2.info)
print("fit\n", model.fit())