Skip to content

implement a structural solution for doing regressions/machine learning #884

@gdementen

Description

@gdementen

I tried to find a more structural solution for doing regression.

Currently we have a nice solution more or less copy/pasted in different projects, but there are several shortcomings:
a) it is not integrated in larray itself
b) it is only OLS (and curve_fit)
c) each of our users wants different statistics included in the result in addition to the "raw" coefficients (r2, r2adj, pvalues, ...)

So, I have worked on an experimental wrapper that I called LazyObjectArray so that users can retrieve whatever they want from the results. It mostly works but is very experimental/proof-of-concepty and needs more work to finalize. Besides I am unsure this is the right direction, because, as can be seen from the experiment below we would need to wrap large parts of statsmodels. This shouldn't be too complex but very time consuming and version dependent.

Offering our own "complete" api would not suffer from that problem but would not be a smaller task, but then we would once again make it harder for our users to find help online. We could also offer a "light" api which does just the most common stuff (ols/curve fit) but then our users will be frustrated by the lack of descriptive/validation stats.

import numpy as np
import statsmodels.api as sm
from larray import *
from larray.random import uniform


def get_axes(value):
    return value.axes if isinstance(value, Array) else AxisCollection([])


def expand_to_axes_union(values, min_axes=None):
    all_axes = AxisCollection.union(*[get_axes(v) for v in values])
    if min_axes is not None:
        if not isinstance(min_axes, AxisCollection):
            min_axes = AxisCollection(min_axes)
        all_axes = min_axes | all_axes
    return [v.expand(all_axes, readonly=True) if isinstance(v, Array) else v
            for v in values], all_axes


def expand_args_kwargs_to_axes_union(args, kwargs=None, min_axes=None):
    values = (args + tuple(kwargs.values())) if kwargs is not None else args
    first_kw = len(args)
    expanded_values, res_axes = expand_to_axes_union(values, min_axes=min_axes)
    expanded_args = expanded_values[:first_kw]
    expanded_kwargs = dict(zip(kwargs.keys(), expanded_values[first_kw:]))
    return expanded_args, expanded_kwargs, res_axes


class LazyObjectArray(Array):
    def __init__(self, calls, axes, subset=None, meta=None):
        # calls = [(func, args, kwargs)]
        self.calls = calls
        self.axes = axes
        self.subset = subset
        self._data = None
        self.meta = meta

    def __getitem__(self, key, collapse_slices=False, translate_key=True):
        raw_broadcasted_key, res_axes, transpose_indices = self.axes._key_to_raw_and_axes(key, collapse_slices,
                                                                                          translate_key)
        if res_axes:
            return LazyObjectArray(self.calls, res_axes, raw_broadcasted_key)
        else:
            return self.eval(key)

    def _get_subset(self, value, key):
        if isinstance(value, Array):
            subset = self.subset
            subsetted = value.i[subset] if subset is not None else value
            return subsetted[key] if key is not None else subsetted
        else:
            return value

    def eval(self, key):
        obj = None
        for (func, args, kwargs) in self.calls:
            # FIXME: use _getsubset instead of arg[key]
            args = [arg[key] if isinstance(arg, Array) else arg
                    for arg in args]
            # if obj is not None:
            #     args = [obj] + args
            kwargs = {k: v[key] if isinstance(v, Array) else v
                      for k, v in kwargs.items()}
            if func == '__getattr__':
                assert len(kwargs) == 0
                assert 1 <= len(args) <= 2
                obj = getattr(obj, *args)
            else:
                if isinstance(func, str):
                    func = getattr(obj, func)
                obj = func(*args, **kwargs)
        return obj

    def __getattr__(self, key):
        # support for inspect.signature
        # TODO: these should trigger evaluation (as it probably means we are in an interactive console)
        if key in {'__signature__', '__wrapped__'}:
            raise AttributeError(key)
        if key in {'__array_struct__', '__array_interface__'}:
            raise AttributeError(key)
        calls = self.calls + [('__getattr__', (key,), {})]
        return LazyObjectArray(calls, self.axes)

    def __call__(self, *args, **kwargs):
        calls = self.calls + [('__call__', args, kwargs)]
        return LazyObjectArray(calls, self.axes)

    @property
    def data(self):
        if self._data is None:
            # trying to access .data triggers evaluation
            eval_ = self.eval
            axes = self.axes
            # FIXME: not always object dtype
            # FIXME: we should use or share code with Array.apply so that we can detect the type properly and
            #             return an Array instead of an Array of Array when the "wrapped" function returns an Array
            res = np.empty(axes.size, dtype=object)
            try:
                res[:] = [eval_(key) for key in axes.iter_labels()]
            except Exception as e:
                print("failed to compute .data", str(e))
            self._data = res.reshape(axes.shape)
        assert isinstance(self._data, np.ndarray)
        return self._data

    @property
    def shape(self):
        return self.axes.shape

    @property
    def ndim(self):
        return self.axes.ndim

    @property
    def size(self):
        return self.axes.size

    def __len__(self):
        return len(self.axes[0])

    def __dir__(self):
        axis_names = set(axis.name for axis in self.axes if axis.name is not None)
        attributes = self.__slots__
        object_attributes = set(attr for attr in dir(self.data.flat[0]) if not attr.startswith('_'))
        return list(set(dir(self.__class__)) | set(attributes) | axis_names | object_attributes)

    def __iter__(self):
        # FIXME: do something here
        print("__iter__")
        return super().__iter__()

def wrap_func(func):
    def wrapped(*args, axes=None, func_axes=None, **kwargs):
        # TODO: expand should be done just before evaluating (but we still need to compute res_axes early) to know the
        #       array axes
        args, kwargs, res_axes = expand_args_kwargs_to_axes_union(args, kwargs, axes)
        if axes is None:
            axes = res_axes - func_axes if func_axes is not None else res_axes
        else:
            # check they are compatible
            raise NotImplementedError("yada")
        return LazyObjectArray([(func, args, kwargs)], axes)
    return wrapped


class WrappedSMRegressionResult(object):
    def __init__(self, result, params_axis=None, sample_axis=None):
        assert isinstance(result, sm.regression.linear_model.RegressionResultsWrapper), type(result)
        self.result = result
        self.params_axis = params_axis
        self.sample_axis = sample_axis

    @property
    def params(self):
        return Array(self.result.params, self.params_axis)

    def predict(self, *args, **kwargs):
        return Array(self.result.predict(*args, **kwargs), self.sample_axis)

    def __getattr__(self, key):
        return getattr(self.result, key)


class WrappedSMRegressionModel(object):
    def __init__(self, model, params_axis=None, sample_axis=None):
        assert isinstance(model, sm.regression.linear_model.RegressionModel)
        self.model = model
        self.params_axis = params_axis
        self.sample_axis = sample_axis

    def fit(self, *args, **kwargs):
        return WrappedSMRegressionResult(self.model.fit(*args, **kwargs),
                                         self.params_axis, self.sample_axis)

    def __getattr__(self, key):
        return getattr(self.model, key)


def ols1d(endog, exog=None, add_intercept=True, coef_axis='coef'):
    assert endog.ndim == 1
    sample_axis = endog.axes[0]
    if exog is None:
        exog = labels_array(sample_axis)

    # 2d exog is for multiple features/variables (and thus coefficients)
    assert 1 <= exog.ndim <= 2

    if coef_axis in exog.axes:
        if add_intercept:
            # XXX: use const instead of intercept
            exog = exog.prepend(coef_axis, value=1, label='intercept')
    else:
        assert exog.ndim == 1
        # TODO: add support for Axis object
        assert isinstance(coef_axis, str)
        # FIXME: change names (c1 vs intercept!)
        data = {'c1': 1, 'c2': exog} if add_intercept else {'c1': exog}
        exog = stack(data, coef_axis)
    coef_axis = exog.axes[coef_axis]
    res = sm.OLS(endog.data, exog.data)
    return WrappedSMRegressionModel(res, coef_axis, sample_axis)

ols = wrap_func(ols1d)

time = Axis('time=2000..2019')
test1 = uniform(low=-1, high=1, axes=(time, 'serie=s0,s1,s2')).cumsum(time)
single_model = ols1d(test1['s0'])
single_result = single_model.fit()
print("single params", single_result.params)
print("single r2", single_result.rsquared_adj)

model = ols(test1, func_axes="time")
result = model.fit()
params = result.params
r2 = result.rsquared_adj
print(r2)
print("mean r2", r2.mean())

summary = result.summary()
print("summary", summary)

print("summary s0")
print(summary['s0'])

# this is effectively an (Lazy)Array of Arrays when we want a single Array
print("params", params)
print(params['s0'])
print(params['s1'])
print(params['s2'])

print("predict")
print(result.predict())

print("info\n", r2.info)
print("fit\n", model.fit())

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions