class glm:
"""
Convenience helpers for fitting and summarizing Gaussian linear models.
Examples
--------
>>> df = tp.tibble({'y': [1.0, 2.0, 3.0], 'x': [0.0, 1.0, 2.0]})
>>> results = glm.estimate_gaussian('y ~ x', data=df, se_robust='HC1')
>>> summary = glm.estimate_gaussian_summarize(results)
>>> summary['fit_stats']['N.obs']
3
"""
# Regression: Gaussian -----------------------------------------------
def linear(formula, data, se_cluster=None, se_robust=None,
weights=1, *args, **kws):
"""
Fit a Gaussian linear model (OLS) with optional robust or clustered SEs.
Parameters
----------
formula : str
Patsy-style formula, e.g., 'y ~ x1 + x2'.
data : pandas.DataFrame
Data containing all variables in the formula.
se_cluster : str or array-like, optional
Column name in `data` (or array-like of group IDs) for clustered SEs.
If provided, cluster-robust SEs are used.
Ignored if se_robust is provided
se_robust : None or str, optional (default None)
String with the the type (HC1, HC2, etc.)
Returns
-------
statsmodels.regression.linear_model.RegressionResultsWrapper
Fitted model results; call .summary() to view.
"""
assert se_cluster is None or isinstance(se_cluster, str), (
"'se_cluster 'must be None or a string")
# weights
if isinstance(weights, str):
assert weights in df.names, f"'weights' ({weights}) not found."
weights = np.array(df.pull(weights))
# remove NAs
variables = ut.parse_formula(formula)
data = data.select(variables['lhs'], variables['terms'], se_cluster).drop_null().to_pandas()
# Fit vanilla OLS
res = lm(formula, data=data, weights=weights)
# Use heteroskedasticity-robust (HC1)
if se_robust is not None:
res = res.fit(cov_type=se_robust)
# Clustered SE takes precedence if provided
elif se_cluster is not None:
if isinstance(se_cluster, str):
groups = data[se_cluster]
else:
groups = np.asarray(se_cluster)
res = res.fit(cov_type="cluster", cov_kwds={'groups': groups})
# classic SE
else:
res = res.fit()
# Default: conventional OLS SEs
return res
def estimate_gaussian_summarize(model):
# """
# Extract tidy summary results and fit statistics from a statsmodels OLS result.
# Parameters
# ----------
# model : statsmodels.regression.linear_model.RegressionResultsWrapper
# Fitted model object from `estimate_gaussian`.
# Returns
# -------
# dict
# Dictionary with:
# - 'coefficients': tidy table with coef, std err, t, pval
# - 'fit_statistics': R2, Adj R2, AIC, BIC, nobs, df_resid, df_model
# """
# Coefficients table
coef_df = model.summary2().tables[1].reset_index()
coef_df.columns = ["term", "estimate", "se", "t", "pvalue", "lo", "hi"]
coef_df = tp.tibble(coef_df)
se_type = model.cov_type
if se_type=="nonrobust":
se = 'Classic'
elif se_type=='cluster':
se = f"Clustered ({model.cov_kwds['groups'].name})"
elif bool(re.search(pattern="^H", string=se_type)):
se = f"Robust"
se_msg = model.cov_kwds['description']
# Fit statistics
fit_stats = {
'Std.Error': se,
"N.obs": int(model.nobs),
"RMSE": ut.compute_rmse(residuals=model.resid),
"AIC": model.aic,
"BIC": model.bic,
"R2": model.rsquared,
"R2 (adj)": model.rsquared_adj,
"DF (resid)": int(model.df_resid),
"DF (model)": int(model.df_model),
}
return {
"tidy": coef_df.select(FIT_STATS_CORE),
"fit_stats": fit_stats,
"se": {'type':se, 'description':se_msg},
'options':{}
}