Skip to content

glm

Signature/Parameters

class glm

Convenience helpers for fitting and summarizing Gaussian linear models.

Examples:

>>> df = tp.tibble({'y': [1.0, 2.0, 3.0], 'x': [0.0, 1.0, 2.0]})
>>> results = glm.estimate_gaussian('y ~ x', data=df, se_robust='HC1')
>>> summary = glm.estimate_gaussian_summarize(results)
>>> summary['fit_stats']['N.obs']
3
Source code in causalinf/models.py
class glm:
    """
    Convenience helpers for fitting and summarizing Gaussian linear models.

    Examples
    --------
    >>> df = tp.tibble({'y': [1.0, 2.0, 3.0], 'x': [0.0, 1.0, 2.0]})
    >>> results = glm.estimate_gaussian('y ~ x', data=df, se_robust='HC1')
    >>> summary = glm.estimate_gaussian_summarize(results)
    >>> summary['fit_stats']['N.obs']
    3
    """

    # Regression: Gaussian -----------------------------------------------
    def linear(formula, data, se_cluster=None, se_robust=None,
               weights=1, *args, **kws):
        """
        Fit a Gaussian linear model (OLS) with optional robust or clustered SEs.

        Parameters
        ----------
        formula : str
            Patsy-style formula, e.g., 'y ~ x1 + x2'.
        data : pandas.DataFrame
            Data containing all variables in the formula.
        se_cluster : str or array-like, optional
            Column name in `data` (or array-like of group IDs) for clustered SEs.
            If provided, cluster-robust SEs are used.
            Ignored if se_robust is provided
        se_robust : None or str, optional (default None)
            String with the the type (HC1, HC2, etc.)

        Returns
        -------
        statsmodels.regression.linear_model.RegressionResultsWrapper
            Fitted model results; call .summary() to view.
        """
        assert se_cluster is None or isinstance(se_cluster, str), (
            "'se_cluster 'must be None or a string")

        # weights
        if isinstance(weights, str):
            assert weights in df.names, f"'weights' ({weights}) not found."
            weights = np.array(df.pull(weights))

        # remove NAs
        variables = ut.parse_formula(formula)
        data = data.select(variables['lhs'], variables['terms'], se_cluster).drop_null().to_pandas()

        # Fit vanilla OLS
        res = lm(formula, data=data, weights=weights)

        # Use heteroskedasticity-robust (HC1)
        if se_robust is not None:
            res = res.fit(cov_type=se_robust)

        # Clustered SE takes precedence if provided
        elif se_cluster is not None:
            if isinstance(se_cluster, str):
                groups = data[se_cluster]
            else:
                groups = np.asarray(se_cluster)
            res = res.fit(cov_type="cluster", cov_kwds={'groups': groups})

        # classic SE
        else:
            res = res.fit()

        # Default: conventional OLS SEs
        return res

    def estimate_gaussian_summarize(model):
        # """
        # Extract tidy summary results and fit statistics from a statsmodels OLS result.

        # Parameters
        # ----------
        # model : statsmodels.regression.linear_model.RegressionResultsWrapper
        #     Fitted model object from `estimate_gaussian`.

        # Returns
        # -------
        # dict
        #     Dictionary with:
        #     - 'coefficients': tidy table with coef, std err, t, pval
        #     - 'fit_statistics': R2, Adj R2, AIC, BIC, nobs, df_resid, df_model
        # """
        # Coefficients table
        coef_df = model.summary2().tables[1].reset_index()
        coef_df.columns = ["term", "estimate", "se", "t", "pvalue", "lo", "hi"]
        coef_df = tp.tibble(coef_df)

        se_type = model.cov_type
        if se_type=="nonrobust":
            se = 'Classic'
        elif se_type=='cluster':
            se = f"Clustered ({model.cov_kwds['groups'].name})"
        elif bool(re.search(pattern="^H", string=se_type)):
            se = f"Robust"
        se_msg = model.cov_kwds['description']

        # Fit statistics
        fit_stats = {
            'Std.Error': se,
            "N.obs": int(model.nobs),
            "RMSE": ut.compute_rmse(residuals=model.resid),
            "AIC": model.aic,
            "BIC": model.bic,
            "R2": model.rsquared,
            "R2 (adj)": model.rsquared_adj,
            "DF (resid)": int(model.df_resid),
            "DF (model)": int(model.df_model),
        }

        return {
            "tidy": coef_df.select(FIT_STATS_CORE),
            "fit_stats": fit_stats,
            "se": {'type':se, 'description':se_msg},
            'options':{}
        }