Skip to content

Meta learners #170

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 62 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
62 commits
Select commit Hold shift + click to select a range
2c0d551
implemented frequentist S, T and X learners
Feb 24, 2023
ab999b6
Reformatted. Added bootstrapping. Added DRLearner.
Feb 26, 2023
9791b9b
Fixed doc-string for DRLearner
Feb 26, 2023
100f8d7
renamed meta_learners.py to skl_meta_learners.py
Feb 26, 2023
5874281
imported skl_meta_learners
Feb 26, 2023
df90a52
minor code style fixes
Feb 27, 2023
b8a3dff
mostly stylistic changes
Feb 27, 2023
020a65f
fixed an import
Feb 27, 2023
667d3b4
bootstraping does not overwrite self.models anymore
matekadlicsko Feb 28, 2023
d05c156
fixed a citation in docstring
matekadlicsko Mar 1, 2023
542e129
added _fit function to reduce boilerplate code
matekadlicsko Mar 1, 2023
5f8a62f
refactored
matekadlicsko Mar 1, 2023
759b9e2
added BARTModel
matekadlicsko Mar 1, 2023
8c03319
outlined pymc meta-learners
matekadlicsko Mar 1, 2023
18baff5
minor changes helping pymc integration
matekadlicsko Mar 2, 2023
f9d9817
minor changes
matekadlicsko Mar 2, 2023
9917a83
continuing to integrate pymc models
matekadlicsko Mar 2, 2023
a8d6467
bugfix
matekadlicsko Mar 2, 2023
55b43df
more minor bugfixes
matekadlicsko Mar 2, 2023
9d5bb61
added logistic regression
matekadlicsko Mar 2, 2023
3f77e76
added bayesian DRLearner
matekadlicsko Mar 4, 2023
faf0db5
fixed some issues with X and DR learners
matekadlicsko Mar 5, 2023
c1bbf33
small bugfixes
matekadlicsko Mar 6, 2023
2f689dd
added (incomplete) notebook explaining meta-learners
matekadlicsko Mar 6, 2023
b57e31a
wrote section on X-learner
matekadlicsko Mar 7, 2023
483d55b
fixed major error in DRLearner implementation
matekadlicsko Mar 7, 2023
d62eb18
minor changes
matekadlicsko Mar 8, 2023
95e010e
implemented cross_fitting option for DR-learner
matekadlicsko Mar 9, 2023
3e1182d
wrote subsection on DR-learner
matekadlicsko Mar 9, 2023
806cd0f
added docstring + some small changes suggested by @juanitorduz
matekadlicsko Mar 10, 2023
21d0b15
fixed a dependency
matekadlicsko Mar 12, 2023
c4f124b
improvements on LogisticRegression
matekadlicsko Mar 12, 2023
90fddd7
several improvements
matekadlicsko Mar 12, 2023
917216c
BayesianDR now works
matekadlicsko Mar 15, 2023
bb588b9
BayesianXLearner now works
matekadlicsko Mar 15, 2023
f39b856
removed redundant _compute_cate function
matekadlicsko Mar 15, 2023
2ca0ebd
formatting
matekadlicsko Mar 15, 2023
48c8105
added score method
matekadlicsko Mar 16, 2023
ddaebb4
formatting
matekadlicsko Mar 16, 2023
3bb16fe
reworded introduction + included some suggestions by @juanitorduz
matekadlicsko Mar 16, 2023
0d98c53
minor changes
matekadlicsko Mar 16, 2023
02b78e1
formatting
matekadlicsko Mar 17, 2023
3e845bf
added correct docstring
matekadlicsko Mar 17, 2023
d4830cc
added aesera to list of dependencies
matekadlicsko Mar 22, 2023
02d592c
improved docstrings.
matekadlicsko Mar 27, 2023
2007685
XLearner computations were wrong
matekadlicsko Mar 27, 2023
a936306
added summary file
matekadlicsko Mar 29, 2023
e682b27
summary now returns a summary object
matekadlicsko Mar 29, 2023
4751aeb
minor fix
matekadlicsko Mar 29, 2023
aba9255
new summary objects are displayed
matekadlicsko Mar 29, 2023
5fe6c53
changed plot method
matekadlicsko Apr 2, 2023
8fd71ec
Added some docstrings
matekadlicsko Apr 2, 2023
14fac30
fixed pymc-bart import
matekadlicsko Apr 9, 2023
1cbe477
summary now performs bootstrapping only once
matekadlicsko Apr 9, 2023
46a33d2
added summary
matekadlicsko Apr 9, 2023
d88472c
imported summary
matekadlicsko Apr 9, 2023
c154979
Merge branch 'pymc-labs:main' into meta-learners
matekadlicsko Apr 13, 2023
18b6934
made notebook a bit more clear
matekadlicsko Apr 17, 2023
1beda78
Merge branch 'meta-learners' of https://github.com/matekadlicsko/Caus…
matekadlicsko Apr 17, 2023
b43752e
Merge branch 'pymc-labs:main' into meta-learners
matekadlicsko Apr 20, 2023
92b655d
Merge branch 'pymc-labs:main' into meta-learners
matekadlicsko May 10, 2023
9d26c40
Merge branch 'pymc-labs:main' into meta-learners
matekadlicsko Jun 8, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions causalpy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import causalpy.skl_experiments
import causalpy.skl_models
from causalpy.version import __version__
import causalpy.skl_meta_learners

from .data import load_data

Expand Down
374 changes: 374 additions & 0 deletions causalpy/skl_meta_learners.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,374 @@
import pandas as pd
import numpy as np
from sklearn.utils import check_consistent_length
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression
from causalpy.utils import _is_variable_dummy_coded


class MetaLearner:
    """
    Base class for meta-learners.

    Validates and stores the training data; subclasses implement ``fit`` and
    ``predict_cate``.

    Parameters
    ----------
    X : pd.DataFrame
        Covariates.
    y : pd.Series
        Outcome variable.
    treated : pd.Series
        Dummy-coded (0/1) treatment indicator.
    """

    def __init__(self, X: pd.DataFrame, y: pd.Series, treated: pd.Series) -> None:
        # Check whether input is appropriate
        check_consistent_length(X, y, treated)
        if not _is_variable_dummy_coded(treated):
            raise ValueError('Treatment variable is not dummy coded.')

        self.treated = treated
        self.X = X
        self.y = y

    def predict_cate(self, X: pd.DataFrame) -> np.array:
        """Predict conditional average treatment effect for given input X."""
        raise NotImplementedError()

    def predict_ate(self, X: pd.DataFrame) -> np.float64:
        """Predict average treatment effect for given input X (mean of CATEs)."""
        return self.predict_cate(X).mean()

    def bootstrap(
        self,
        X_ins: pd.DataFrame,
        y: pd.Series,
        treated: pd.Series,
        X: pd.DataFrame = None,
        frac_samples: float = None,
        n_samples: int = 1000,
        n_iter: int = 1000,
    ) -> np.array:
        """
        Run the bootstrap ``n_iter`` times: refit on a resample of
        ``(X_ins, y, treated)`` and predict the CATE on ``X`` (defaults to
        ``X_ins``). Returns an array of shape ``(n_iter, len(X))``.

        BUG FIX: ``DataFrame.sample`` raises when both ``frac`` and ``n`` are
        given, so ``frac_samples`` now takes precedence over ``n_samples``
        when supplied.

        NOTE(review): each iteration refits the learner in place, so the
        learner's fitted models reflect the last bootstrap sample afterwards.
        """
        if X is None:
            X = X_ins

        results = []
        for _ in range(n_iter):
            if frac_samples is not None:
                X_bs = X_ins.sample(frac=frac_samples, replace=True)
            else:
                X_bs = X_ins.sample(n=n_samples, replace=True)
            y_bs = y.loc[X_bs.index].reset_index(drop=True)
            t_bs = treated.loc[X_bs.index].reset_index(drop=True)

            self.fit(X_bs.reset_index(drop=True), y_bs, t_bs)
            results.append(self.predict_cate(X))

        return np.array(results)

    def ate_confidence_interval(
        self,
        X_ins: pd.DataFrame,
        y: pd.Series,
        treated: pd.Series,
        X: pd.DataFrame = None,
        q: float = .95,
        frac_samples: float = None,
        n_samples: int = 1000,
        n_iter: int = 1000,
    ):
        """
        Estimate a bootstrap confidence interval for the ATE on X.

        Returns ``(lower, upper)``: the ``1 - q`` and ``q`` quantiles of the
        per-iteration bootstrap ATEs.

        BUG FIX: the ATE of each bootstrap draw is now the mean of that
        draw's CATE predictions (previously quantiles were taken over the
        pooled CATE values of all draws), and the bounds are returned in
        (lower, upper) order.
        """
        cates = self.bootstrap(
            X_ins, y, treated, X, frac_samples, n_samples, n_iter
        )
        # One ATE per bootstrap iteration: average the CATEs within a draw.
        ates = cates.mean(axis=1)
        return np.quantile(ates, q=1 - q), np.quantile(ates, q=q)

    def cate_confidence_interval(
        self,
        X_ins: pd.DataFrame,
        y: pd.Series,
        treated: pd.Series,
        X: pd.DataFrame = None,
        q: float = .95,
        frac_samples: float = None,
        n_samples: int = 1000,
        n_iter: int = 1000,
    ):
        """
        Estimate pointwise bootstrap confidence intervals for the CATE on X.

        Returns an array of shape ``(len(X), 2)`` whose columns are the
        ``1 - q`` and ``q`` quantiles (lower, upper) across iterations.
        """
        cates = self.bootstrap(
            X_ins, y, treated, X, frac_samples, n_samples, n_iter
        )
        lower = np.quantile(cates, 1 - q, axis=0).reshape(-1, 1)
        upper = np.quantile(cates, q, axis=0).reshape(-1, 1)
        return np.append(lower, upper, axis=1)

    def fit(self, X: pd.DataFrame, y: pd.Series, treated: pd.Series):
        """Fit the learner's model(s). Implemented by subclasses."""
        raise NotImplementedError()

    def summary(self):
        """Prints summary. Content is undecided yet."""
        raise NotImplementedError()

    def plot(self):
        """Plots results. Content is undecided yet."""
        raise NotImplementedError()


class SLearner(MetaLearner):
    """
    S-learner as described in [1]: a single model is fit on the covariates
    augmented with the treatment indicator, and the CATE is the difference
    between its predictions with treatment switched on and off.

    [1] Künzel, Sören R., Jasjeet S. Sekhon, Peter J. Bickel, and Bin Yu.
    Metalearners for estimating heterogeneous treatment effects using machine learning.
    Proceedings of the national academy of sciences 116, no. 10 (2019): 4156-4165.

    """

    def __init__(
        self, X: pd.DataFrame, y: pd.Series, treated: pd.Series, model
    ) -> None:
        super().__init__(X=X, y=y, treated=treated)
        self.model = model
        self.fit(X, y, treated)
        self.cate = self.predict_cate(X)

    def fit(self, X: pd.DataFrame, y: pd.Series, treated: pd.Series):
        """Fit the single model on X augmented with the treatment column."""
        augmented = X.assign(treatment=treated)
        self.model = self.model.fit(augmented, y)
        return self

    def predict_cate(self, X: pd.DataFrame) -> np.array:
        """CATE = f(X, t=1) - f(X, t=0) from the single fitted model."""
        with_treatment = X.assign(treatment=1)
        without_treatment = X.assign(treatment=0)
        return (
            self.model.predict(with_treatment)
            - self.model.predict(without_treatment)
        )


class TLearner(MetaLearner):
    """
    T-learner as described in [1]: fits two separate outcome models, one on
    the treated units and one on the untreated, and estimates the conditional
    average treatment effect as the difference of their predictions.

    Exactly one of the following must be supplied:
    - ``model``: a prototype estimator that is cloned for each group, or
    - both ``treated_model`` and ``untreated_model``.

    [1] Künzel, Sören R., Jasjeet S. Sekhon, Peter J. Bickel, and Bin Yu.
    Metalearners for estimating heterogeneous treatment effects using machine learning.
    Proceedings of the national academy of sciences 116, no. 10 (2019): 4156-4165.

    """

    def __init__(self,
                 X: pd.DataFrame,
                 y: pd.Series,
                 treated: pd.Series,
                 model=None,
                 treated_model=None,
                 untreated_model=None
                 ) -> None:
        super().__init__(X=X, y=y, treated=treated)

        if model is None and (untreated_model is None or treated_model is None):
            raise ValueError(
                "Either model or both of treated_model and untreated_model "
                "have to be specified."
            )
        if model is not None and not (
            treated_model is None and untreated_model is None
        ):
            # BUG FIX: this over-specified case previously raised the same
            # "have to be specified" message as the under-specified case, and
            # a partially supplied pair was silently discarded.
            raise ValueError(
                "Specify either model or both of treated_model and "
                "untreated_model, not both."
            )

        if model is not None:
            untreated_model = clone(model)
            treated_model = clone(model)

        self.models = {'treated': treated_model,
                       'untreated': untreated_model}

        self.fit(X, y, treated)
        self.cate = self.predict_cate(X)

    def fit(self, X: pd.DataFrame,
            y: pd.Series,
            treated: pd.Series):
        """Fit each group's model on its own subset of the data."""
        self.models['treated'].fit(X[treated == 1], y[treated == 1])
        self.models['untreated'].fit(X[treated == 0], y[treated == 0])
        return self

    def predict_cate(self, X: pd.DataFrame) -> np.array:
        """CATE: treated-model prediction minus untreated-model prediction."""
        treated_model = self.models['treated']
        untreated_model = self.models['untreated']
        return treated_model.predict(X) - untreated_model.predict(X)


class XLearner(MetaLearner):
    """
    X-learner as introduced in [1]: estimates the conditional average
    treatment effect with five models — two outcome models, two CATE
    estimators fit on imputed treatment effects, and a propensity model
    whose scores weight the two CATE estimates.

    Exactly one of the following must be supplied:
    - ``model``: a prototype estimator that is cloned for all four roles, or
    - all of ``treated_model``, ``untreated_model``,
      ``treated_cate_estimator``, ``untreated_cate_estimator``.

    [1] Künzel, Sören R., Jasjeet S. Sekhon, Peter J. Bickel, and Bin Yu.
    Metalearners for estimating heterogeneous treatment effects using machine learning.
    Proceedings of the national academy of sciences 116, no. 10 (2019): 4156-4165.

    """

    def __init__(self,
                 X,
                 y,
                 treated,
                 model=None,
                 treated_model=None,
                 untreated_model=None,
                 treated_cate_estimator=None,
                 untreated_cate_estimator=None,
                 propensity_score_model=None
                 ):
        super().__init__(X=X, y=y, treated=treated)

        submodels = (
            treated_model,
            untreated_model,
            treated_cate_estimator,
            untreated_cate_estimator,
        )
        # BUG FIX: the old check never looked at the CATE estimators even
        # though the message required them.
        if model is None and any(m is None for m in submodels):
            raise ValueError(
                "Either model or each of treated_model, untreated_model, "
                "treated_cate_estimator, untreated_cate_estimator has to be "
                "specified."
            )
        if model is not None and any(m is not None for m in submodels):
            raise ValueError(
                "Specify either model or the individual component models, "
                "not both."
            )

        if propensity_score_model is None:
            propensity_score_model = LogisticRegression(penalty=None)

        if model is not None:
            treated_model = clone(model)
            untreated_model = clone(model)
            treated_cate_estimator = clone(model)
            untreated_cate_estimator = clone(model)

        self.models = {'treated': treated_model,
                       'untreated': untreated_model,
                       'treated_cate': treated_cate_estimator,
                       'untreated_cate': untreated_cate_estimator,
                       'propensity': propensity_score_model
                       }

        self.fit(X, y, treated)

        # BUG FIX: the in-sample CATE previously used the *treated* CATE
        # estimator for both arms and hard class predictions for g; delegate
        # to predict_cate, which weights both estimators by predict_proba.
        self.cate = self.predict_cate(X)

    def fit(self, X: pd.DataFrame,
            y: pd.Series,
            treated: pd.Series):
        """Fit the five component models of the X-learner."""
        treated_model = self.models['treated']
        untreated_model = self.models['untreated']
        treated_cate_estimator = self.models['treated_cate']
        untreated_cate_estimator = self.models['untreated_cate']
        propensity_score_model = self.models['propensity']

        # Split data to treated and untreated subsets
        X_t, y_t = X[treated == 1], y[treated == 1]
        X_u, y_u = X[treated == 0], y[treated == 0]

        # Stage 1: estimate the response functions per group
        treated_model.fit(X_t, y_t)
        untreated_model.fit(X_u, y_u)

        # Stage 2: impute individual treatment effects
        tau_t = y_t - untreated_model.predict(X_t)
        tau_u = treated_model.predict(X_u) - y_u

        # Stage 3: estimate CATE separately on treated and untreated subsets
        treated_cate_estimator.fit(X_t, tau_t)
        untreated_cate_estimator.fit(X_u, tau_u)

        # Fit propensity score model for the weighting function g(x)
        propensity_score_model.fit(X, treated)
        return self

    def predict_cate(self, X):
        """Propensity-weighted combination of the two CATE estimates."""
        cate_estimate_treated = self.models['treated_cate'].predict(X)
        cate_estimate_untreated = self.models['untreated_cate'].predict(X)

        g = self.models['propensity'].predict_proba(X)[:, 1]

        return g * cate_estimate_untreated + (1 - g) * cate_estimate_treated


class DRLearner(MetaLearner):
    """
    DR-learner, also known as the doubly robust learner: builds the AIPW
    (augmented inverse-propensity-weighted) pseudo-outcome from two outcome
    models and a propensity model.

    Presumably follows Kennedy, "Optimal doubly robust estimation of
    heterogeneous causal effects" (2020) — TODO confirm intended citation;
    the original docstring's reference was left blank.

    Exactly one of the following must be supplied:
    - ``model``: a prototype estimator that is cloned for both groups, or
    - both ``treated_model`` and ``untreated_model``.

    """

    def __init__(self,
                 X,
                 y,
                 treated,
                 model=None,
                 treated_model=None,
                 untreated_model=None,
                 propensity_score_model=None
                 ):
        super().__init__(X=X, y=y, treated=treated)

        # BUG FIX: the old messages referred to treated_cate_estimator /
        # untreated_cate_estimator, parameters the DR-learner does not take.
        if model is None and (untreated_model is None or treated_model is None):
            raise ValueError(
                "Either model or both of treated_model and untreated_model "
                "have to be specified."
            )
        if model is not None and not (
            treated_model is None and untreated_model is None
        ):
            raise ValueError(
                "Specify either model or both of treated_model and "
                "untreated_model, not both."
            )

        if propensity_score_model is None:
            propensity_score_model = LogisticRegression(penalty=None)

        if model is not None:
            treated_model = clone(model)
            untreated_model = clone(model)

        self.models = {'treated': treated_model,
                       'untreated': untreated_model,
                       'propensity': propensity_score_model}

        self.fit(X, y, treated)

        # AIPW (doubly robust) pseudo-outcome evaluated on the training data.
        g = self.models['propensity'].predict_proba(X)[:, 1]
        m0 = self.models['untreated'].predict(X)
        m1 = self.models['treated'].predict(X)

        self.cate = (treated * (y - m1) / g + m1
                     - ((1 - treated) * (y - m0) / (1 - g) + m0))

    def fit(self, X: pd.DataFrame,
            y: pd.Series,
            treated: pd.Series):
        """Fit outcome models on their subsets and the propensity model on all data."""
        # Split data to treated and untreated subsets
        X_t, y_t = X[treated == 1], y[treated == 1]
        X_u, y_u = X[treated == 0], y[treated == 0]

        # Estimate response functions
        self.models['treated'].fit(X_t, y_t)
        self.models['untreated'].fit(X_u, y_u)

        # Fit propensity score model
        self.models['propensity'].fit(X, treated)

        return self

    def predict_cate(self, X):
        """
        Return the stored in-sample pseudo-outcome CATE.

        NOTE(review): ``X`` is ignored — without a final-stage regression this
        learner can only report the training-data estimate, so ``bootstrap``
        on out-of-sample ``X`` does not reflect ``X``. Confirm whether a
        second-stage model was intended here.
        """
        return self.cate