Source code for respy.pre_processing.process_covariates

"""This module comprises all functions which process the definition of covariates."""
import copy


[docs]def remove_irrelevant_covariates(options, params):
    """Identify the relevant covariates.

    We try to make every model as sparse as possible which means discarding covariates
    which are irrelevant. The immediate benefit is that memory consumption and start-up
    costs are reduced.

    An advantage further downstream is that the number of lagged choices is inferred
    from covariates. Eliminating irrelevant covariates might reduce the number of
    implemented lags.

    The function catches all relevant "high-level" covariates by looking at the `"name"`
    index in `params`. "Low-level" covariates which are relevant but not included in the
    index are recursively found by checking whether covariates are used in the formula
    of relevant covariates.

    See also
    --------
    separate_covariates_into_core_dense_mixed

    """
    options = copy.deepcopy(options)
    covariates = options["covariates"]

    # Collect initial relevant covariates from params.
    relevant_covs = {}
    for cov in covariates:
        if cov in params.index.get_level_values("name"):
            relevant_covs[cov] = covariates[cov]

    # Start by iterating over initial covariates and add variables which are used to
    # compute them and repeat the process.
    n_relevant_covariates_changed = True
    while n_relevant_covariates_changed:
        n_relevant_covariates = len(relevant_covs)

        for cov in covariates:
            for relevant_cov in relevant_covs:
                if cov in relevant_covs[relevant_cov]:
                    # Append the covariate to the front such that nested covariates are
                    # created in the beginning.
                    relevant_covs = {cov: covariates[cov], **relevant_covs}

        n_relevant_covariates_changed = n_relevant_covariates != len(relevant_covs)

    options["covariates"] = relevant_covs

    return options


[docs]def separate_covariates_into_core_dense_mixed(options, optim_paras):
    """Separate covariates into distinct groups.

    Covariates are separated into three groups.

    1. Covariates which use only information from the core state space.
    2. Covariates which use only information from the dense state space.
    3. Covariates which use information from the core and the dense state space.

    Parameters
    ----------
    options : dict
        Contains among other information covariates and their formulas.
    optim_paras : dict
        Contains information to separate the core and dense state space.

    Returns
    -------
    options : dict
        Contains three new covariate categories.

    """
    options = copy.deepcopy(options)
    covariates = options["covariates"]

    # Define two sets with default covariates for the core and dense state space.
    core_covs = set(
        ["period"]
        + [f"exp_{choice}" for choice in optim_paras["choices_w_exp"]]
        + [f"lagged_choice_{i}" for i in range(1, optim_paras["n_lagged_choices"] + 1)]
    )
    dense_covs = set(optim_paras["observables"])
    if optim_paras["n_types"] >= 2:
        dense_covs |= set(
            ["type"] + [f"type_{i}" for i in range(2, optim_paras["n_types"] + 1)]
        )

    detailed_covariates = {
        cov: {"formula": covariates[cov], "depends_on": set()} for cov in covariates
    }

    # Loop over all covariates and add them two the sets if the formula contains
    # covariates from the sets. If both lengths of the sets do not change anymore, stop.
    n_core_covs_changed = True
    n_dense_covs_changed = True
    while n_core_covs_changed or n_dense_covs_changed:
        n_core_covs = len(core_covs)
        n_dense_covs = len(dense_covs)

        for cov, formula in covariates.items():
            matches_core = [i for i in core_covs if i in formula]
            if matches_core:
                core_covs.update([cov])

            matches_dense = [i for i in dense_covs if i in formula]
            if matches_dense:
                dense_covs.update([cov])

            detailed_covariates[cov]["depends_on"] |= set(matches_core + matches_dense)

        n_core_covs_changed = n_core_covs != len(core_covs)
        n_dense_covs_changed = n_dense_covs != len(dense_covs)

    only_core_covs = core_covs - dense_covs
    only_dense_covs = dense_covs - core_covs
    independent_covs = set(covariates) - core_covs - dense_covs

    options["covariates_core"] = {
        cov: detailed_covariates[cov]
        for cov in only_core_covs | independent_covs
        if cov in detailed_covariates
    }
    options["covariates_dense"] = {
        cov: detailed_covariates[cov]
        for cov in only_dense_covs
        if cov in detailed_covariates
    }
    options["covariates_mixed"] = {
        cov: detailed_covariates[cov] for cov in core_covs & dense_covs
    }
    # We cannot overwrite `options["covariates"]`.
    options["covariates_all"] = detailed_covariates

    return options


[docs]def identify_necessary_covariates(dependents, definitions):
    """Identify covariates necessary to compute `dependents`.

    This function can be used if only a specific subset of covariates is necessary and
    not all covariates.

    See also
    --------
    respy.likelihood._compute_x_beta_for_type_probabilities

    """
    dependents = {dependents} if isinstance(dependents, str) else set(dependents)
    new_dependents = dependents.copy()

    while new_dependents:
        deps = list(new_dependents)
        new_dependents = set()
        for dependent in deps:
            if dependent in definitions and definitions[dependent]["depends_on"]:
                dependents |= definitions[dependent]["depends_on"]
                new_dependents |= definitions[dependent]["depends_on"]
            else:
                dependents.remove(dependent)

    covariates = {dep: definitions[dep] for dep in dependents}

    return covariates