Skip to content

linear

Signature/Parameters

def linear(formula, data, se_cluster = None, se_robust = None, weights = 1, *args, **kws)

Fit a Gaussian linear model (OLS) with optional robust or clustered SEs.

Parameters:

Name Type Description Default
formula str

Patsy-style formula, e.g., ‘y ~ x1 + x2’.

required
data DataFrame

Data containing all variables in the formula.

required
se_cluster str or array-like

Column name in data (or array-like of group IDs) for clustered SEs. If provided, cluster-robust SEs are used. Ignored if se_robust is provided.

None
se_robust None or str, optional (default None)

String with the type of robust covariance estimator (HC1, HC2, etc.).

None

Returns:

Type Description
RegressionResultsWrapper

Fitted model results; call .summary() to view.

Source code in causalinf/models.py
def linear(formula, data, se_cluster=None, se_robust=None,
           weights=1, *args, **kws):
    """
    Fit a linear model with optional robust or clustered standard errors.

    Rows with missing values in any used column (formula variables, the
    cluster column and the weights column) are dropped before fitting.

    Parameters
    ----------
    formula : str
        Patsy-style formula, e.g., 'y ~ x1 + x2'.
    data : DataFrame
        Data containing all variables in the formula. This function calls
        ``names``, ``select(...)``, ``drop_null()`` and ``to_pandas()`` on
        it. NOTE(review): earlier docs said pandas.DataFrame, but these
        are not pandas methods -- confirm the expected type.
    se_cluster : str, optional
        Name of the column in `data` holding the group IDs for clustered
        SEs. If provided, cluster-robust SEs are used. Ignored if
        `se_robust` is provided. Non-string values are rejected.
    se_robust : None or str, optional (default None)
        Covariance type forwarded as ``cov_type`` to ``fit`` (HC1, HC2,
        etc.). Takes precedence over `se_cluster`.
    weights : str or other, optional (default 1)
        If a string, the name of a column in `data` holding the weights;
        the values are read after rows with missing values are removed so
        they stay aligned with the fitted rows. Any other value is passed
        to the model unchanged and is NOT re-aligned when rows are
        dropped.
    *args, **kws
        Accepted for call compatibility; currently unused.

    Returns
    -------
    statsmodels.regression.linear_model.RegressionResultsWrapper
        Fitted model results; call .summary() to view.

    Raises
    ------
    AssertionError
        If `se_cluster` is neither None nor a string, or if `weights` is a
        string that is not a column of `data`.
    """
    # Raised explicitly (rather than via `assert`) so the checks survive
    # `python -O` while callers still see the same exception type.
    if not (se_cluster is None or isinstance(se_cluster, str)):
        raise AssertionError("'se_cluster' must be None or a string")

    weights_col = weights if isinstance(weights, str) else None
    if weights_col is not None and weights_col not in data.names:
        raise AssertionError(f"'weights' ({weights}) not found.")

    # Keep only the columns that are used, then drop incomplete rows.
    # The optional columns are added only when given (no None is handed
    # to `select`) and only once if both names coincide.
    variables = ut.parse_formula(formula)
    extra_cols = list(dict.fromkeys(
        col for col in (se_cluster, weights_col) if col is not None))
    data = (data
            .select(variables['lhs'], variables['terms'], *extra_cols)
            .drop_null()
            .to_pandas())

    # Read the weights from the filtered frame so that their length and
    # order match the rows that are actually fitted.
    if weights_col is not None:
        weights = np.array(data[weights_col])

    model = lm(formula, data=data, weights=weights)

    # An explicit robust covariance type wins over clustering.
    if se_robust is not None:
        return model.fit(cov_type=se_robust)

    # Cluster-robust SEs, grouped by the requested column.
    if se_cluster is not None:
        return model.fit(cov_type="cluster",
                         cov_kwds={'groups': data[se_cluster]})

    # Default: conventional SEs.
    return model.fit()