Skip to content

lsem

Signature/Parameters

class lsem
def __init__(self, formula, data, estimator = 'auto', ordinal = 'auto', ordinal_auto_ncat = 5, se = None, se_cluster = None, weights = None, silent = False, *args, **kws)

Interface to estimate linear structural equation models via lavaan.

Parameters:

Name Type Description Default
formula str

lavaan-compatible model specification passed to :func:lavaan.sem.

required
data DataFrame

Dataset containing all variables referenced in formula.

required
estimator str

The name of the estimator to use. If ‘auto’, it uses maximum likelihood (‘ML’) if estimating models with continuous outcomes an diagonally weighted least squares (‘DWLS’) if there are ordinal endogenous variables. For other options, see lavaan lavOptions documentation.

'auto'
ordinal str, list, or None

Used to indicate binary and ordinal endogenous variables in the data A string (list) must be the name(s) of the binary and ordinal endogenous variable(s). If ‘auto’: (Default), auto-detect the type and set the estimator type accordingly (see ‘estimator’) If None, use all variables as continuous.

'auto'
ordinal_auto_ncat int

Threshold for categorizing variables as ordinal when using ordinal='auto'. Defaults to 5.

5
se str, bool, or None

Specification for classical, robust, or bootstrap standard errors. Options: - None: use classical standard errors - “robust.huber.white”: standard errors are computed based on the ‘mlr’ (aka pseudo ML, Huber-White) approach. - “robust.sem”: conventional robust standard errors are computed. - “robust”: either “robust.sem” or”robust.huber.white” is used depending on the estimator, the mimic option, and whether the data are complete or not. - “boot” or “bootstrap”: bootstrap standard errors using standard bootstrapping

None
se_cluster str or None

Column name indicating clustering groups for standard errors. If None no clustering is requested.

None
weights str or None

Optional column name specifying sampling weights.

None
silent bool

Suppress estimation progress messages when True. Defaults to False.

False
*args

Additional positional arguments passed directly to lavaan.sem.

()
**kws

Additional keyword arguments passed directly to lavaan.sem.

{}

Examples:

>>> df = tp.tibble({'X': [0, 1, 0], 'Y': [1.0, 2.3, 1.5]})
>>> model = lsem("Y ~ X", data=df, silent=True)
>>> round(model.est.fit['AIC'], 2)
0.0
Source code in causalinf/models.py
class lsem:
    """
    Interface to estimate linear structural equation models via ``lavaan``.

    Parameters
    ----------
    formula : str
        ``lavaan``-compatible model specification passed to :func:`lavaan.sem`.

    data : tidypolars4sci.DataFrame
        Dataset containing all variables referenced in ``formula``.

    estimator : str
        The name of the estimator to use. 
        If 'auto', it uses maximum likelihood ('ML') if estimating
        models with continuous outcomes an diagonally weighted least squares
        ('DWLS') if there are ordinal endogenous variables.
        For other options, see lavaan lavOptions documentation.

    ordinal : str, list, or None
        Used to indicate binary and ordinal endogenous variables in the data 
        A string (list) must be the name(s) of the binary and ordinal
        endogenous variable(s). If 'auto': (Default), auto-detect the
        type and set the estimator type accordingly (see 'estimator')
        If None, use all variables as continuous.

    ordinal_auto_ncat : int, optional
        Threshold for categorizing variables as ordinal when using
        ``ordinal='auto'``. Defaults to ``5``.

    se : str, bool, or None
        Specification for classical, robust, or bootstrap standard errors. Options:
        - None: use classical standard errors 
        - "robust.huber.white": standard errors are computed based on the 'mlr'
           (aka pseudo ML, Huber-White) approach.
        - "robust.sem": conventional robust standard errors are computed.
        - "robust": either "robust.sem" or"robust.huber.white" is used
           depending on the estimator, the mimic option, and whether
          the data are complete or not.
        - "boot" or "bootstrap": bootstrap standard errors using
           standard bootstrapping 

    se_cluster : str or None
        Column name indicating clustering groups for standard errors. If ``None``
        no clustering is requested.

    weights : str or None, optional
        Optional column name specifying sampling weights.

    silent : bool, optional
        Suppress estimation progress messages when ``True``. Defaults to
        ``False``.

    *args :
        Additional positional arguments passed directly to ``lavaan.sem``.
    **kws :
        Additional keyword arguments passed directly to ``lavaan.sem``.

    Examples
    --------
    >>> df = tp.tibble({'X': [0, 1, 0], 'Y': [1.0, 2.3, 1.5]})
    >>> model = lsem("Y ~ X", data=df, silent=True)
    >>> round(model.est.fit['AIC'], 2)
    0.0
    """
    def __init__(self, formula, data,
                 estimator='auto',
                 ordinal='auto',
                 ordinal_auto_ncat=5,
                 se=None,
                 se_cluster=None,
                 weights=None,
                 silent=False, *args, **kws):
        assert kws.get("ordered", None) is None, "Unknown argument: 'ordered'. Do you mean ordinal?"
        assert se_cluster is None or isinstance(se_cluster, str), (
            "'se_cluster 'must be None or a string")

        self.formula=formula
        self.ordinal_auto_ncat = ordinal_auto_ncat
        self.get_endogenous_types(data)
        self.get_ordered(ordinal) # run before self.get_estimator()
        self.get_estimator(estimator)
        se = 'standard' if se is None and se_cluster is None else se
        se = 'robust' if se is None and se_cluster is not None else se
        se_cluster = NULL if se_cluster is None else se_cluster

        # estimation
        # ----------
        if not silent:
            print('Estimating LSEM...', end='')
            if se=='bootstrap':
                print('(using bootstrap std. errors; it may take a while)...')
        self.fit = lavaan.sem(formula, data=convert().tp2tibble(data),
                              estimator=self.estimator,
                              ordered=StrVector(self.ordinal) if isinstance(self.ordinal, list) else self.ordinal,
                              se = se,
                              cluster = se_cluster,
                              *args, **kws
                              )
        print('done!') if not silent else None

        # summary 
        # -------
        self.est = self.summarize()

        # check 
        # -----
        self.check_fit()

    def summarize(self, alpha=0.05, *args, **kws):
        parameters = convert().rtibble2tp(lavaan.parameterEstimates(self.fit))
        parameters = (parameters
                      .rename({'est':'estimate',
                               'z':'statistic',
                               'ci.lower':'lo',
                               'ci.upper':'hi'})
                      .mutate(sig = tp.map(['pvalue'], lambda p: ut.get_stars(*[p])))
                      .unite("term", ['lhs', 'op', 'rhs'], sep=' ')
                      .mutate(term = tp.str_replace_all('term', '~1', '~ 1'))
                      .mutate(term = tp.str_trim('term'))
                      )

        options = convert().rlist2dict(lavaan.lavInspect(self.fit, what='options'))

        se = options['se']
        se = 'classic' if se == 'standard' else se

        fit_all = convert().rvec2dict(lavaan.fitMeasures(self.fit))
        fit = fit_all | {
            "Model"       : "(footnote)",
            "Outcome_type": "(footnote)",
            'Estimator'   : options.get("estimator", 'NA'),
            'Std_Error'   : se,
            "N_obs"       : int(lavaan.lavInspect(self.fit, what='nobs')[0]),
            "RMSE"        : fit_all.get("rmsea", None), # average RMSE
            "AIC"         : fit_all.get("aic", None),
            "BIC"         : fit_all.get("bic", None),
            "R2"          : fit_all.get("r2", None),
            "R2_adj"      : fit_all.get("r2.adjust", None),
            "DF_resid"    : None,
            "DF_model"    : int(fit_all['df']),
        }

        res = FitSummary(parameters=parameters,
                         fit=fit,
                         se = {'type':se, 'description': f'Standard errors: {se}'},
                         info = (self.get_info_endo_vartype() + "; "+
                                 self.get_info_models(parameters)),
                         options = options
                         )
        return res

    def check_fit(self):
        se = list(self.est.parameters.pull('se'))
        assert se[0] is not None, (
            "Standard errors not computed."+
            " Try to set se='bootstrap'")

    def get_endogenous_types(self, data):
        formulas = self.formula.split('\n')
        contain_tilda = [bool(re.search(pattern='~', string=s)) for s in formulas]
        formulas = [f for keep, f in zip(contain_tilda, formulas) if keep]

        self.endo = [endo.split('~')[0].strip() for endo in formulas]
        self.endo_types = ut.detect_variable_type(data=data,
                                                  variables=self.endo,
                                                  ncats_threshold=self.ordinal_auto_ncat)
        categorical = [var for var, type in self.endo_types.items()  if type=='categorical']
        assert not len(categorical)>0, (
            "Endogeneous variables must be ordinal, binary, or continuous."+
            f" Categorical variables: {', '.join(categorical)}"
        )

    def get_ordered(self, ordinal):
        if ordinal == 'auto':
            ordinal = [endo for endo, endo_type in self.endo_types.items()
                       if endo_type != 'continuous']
        elif isinstance(ordinal, str):
            ordinal = [ordinal]

        if ordinal is None or len(ordinal)==0:
            ordinal = NULL

        self.ordinal = ordinal

    def get_estimator(self, estimator):
        if estimator=='auto':
            if isinstance(self.ordinal, list):
                estimator = 'DWLS'
            elif self.ordinal==NULL:
                estimator = 'ML'
        self.estimator=estimator

    def get_info_endo_vartype(self):
        endo_vars = ut.invert_dict(self.endo_types)
        var_types = ', '.join([f"{type.title()} ({', '.join(vars)})" 
                               for type, vars  in endo_vars.items()])
        txt = f"Endogenous variable types: {var_types}"
        return txt

    def get_info_models(self, parameters):
        # probit
        dep_output_pattern = {
            'Probit': {"sep"  : " | ",
                       "regex": '\\| ?t[0-9]+$'}, # Y | t1, etc.
            "Linear": { "sep"  : " ~ ", # Y ~ 1
                        'regex': " ~ "}
        }
        #
        models = {}
        for model, pattern in dep_output_pattern.items():
            sep = pattern['sep']
            contains = pattern['regex']
            models[model] = list(
                parameters
                .filter(tp.col("term").str.contains(contains))
                .separate('term', into=['dep', 'indep'], sep=sep)
                .mutate(dep = tp.str_trim('dep'),
                        indep = tp.str_trim('indep'),
                        model = 'Probit',
                        )
                .filter(tp.col("dep").is_in(self.endo_types.keys()))
                .distinct('dep')
                .pull('dep')
            )
        # remove doubled counted as Linear (b/c pattern Y~1 appears also in Probit models)
        models['Linear'] = [v for v in models['Linear'] if v not in models['Probit']]
        txt = [f"{model} ({', '.join(vars)})" for model, vars in models.items() if len(vars)>0]
        txt = f"Models: {', '.join(txt)}"
        return txt