Source code for respy.tests.random_model

"""This module contains the functions for the generation of random requests."""
import collections.abc
import copy

import numpy as np
import pandas as pd
from estimagic.optimization.utilities import cov_matrix_to_params
from estimagic.optimization.utilities import cov_matrix_to_sdcorr_params
from estimagic.optimization.utilities import number_of_triangular_elements_to_dimension

from respy.config import DEFAULT_OPTIONS
from respy.config import ROOT_DIR
from respy.pre_processing.model_processing import process_params_and_options
from respy.pre_processing.specification_helpers import csv_template
from respy.pre_processing.specification_helpers import (
    initial_and_max_experience_template,
)
from respy.pre_processing.specification_helpers import (
    lagged_choices_covariates_template,
)
from respy.pre_processing.specification_helpers import lagged_choices_probs_template
from respy.pre_processing.specification_helpers import observable_coeffs_template
from respy.pre_processing.specification_helpers import observable_prob_template
from respy.shared import generate_column_dtype_dict_for_estimation
from respy.shared import normalize_probabilities
from respy.simulate import _random_choice
from respy.simulate import get_simulate_func


_BASE_COVARIATES = {
    "not_any_exp_a": "exp_a == 0",
    "not_any_exp_b": "exp_b == 0",
    "any_exp_a": "exp_a > 0",
    "any_exp_b": "exp_b > 0",
    "hs_graduate": "exp_edu >= 12",
    "co_graduate": "exp_edu >= 16",
    "is_minor": "period < 2",
    "is_young_adult": "2 <= period <= 4",
    "is_adult": "5 <= period",
    "constant": "1",
    "exp_a_square": "exp_a ** 2 / 100",
    "exp_b_square": "exp_b ** 2 / 100",
    "up_to_nine_years_edu": "exp_edu <= 9",
    "at_least_ten_years_edu": "exp_edu >= 10",
}
"""dict: Dictionary containing specification of covariates.

.. deprecated:: x.x.x

    This variable must be removed if ``generate_random_model`` is rewritten such that
    functions for each replicable paper are written.

"""

def generate_random_model(
    point_constr=None,
    bound_constr=None,
    n_types=None,
    n_type_covariates=None,
    myopic=False,
):
    """Generate a random model specification.

    Parameters
    ----------
    point_constr : dict
        A full or partial options specification. Elements that are specified here are
        not drawn randomly.
    bound_constr : dict
        Upper bounds for some options to keep computation time reasonable. Can have the
        keys ["max_types", "max_periods", "max_edu_start", "max_agents", "max_draws"].
    n_types : int
        Number of unobserved types.
    n_type_covariates : int
        Number of covariates to calculate type probabilities.
    myopic : bool
        Indicator for myopic agents, meaning the discount factor is set to zero.

    """
    point_constr = {} if point_constr is None else copy.deepcopy(point_constr)
    bound_constr = {} if bound_constr is None else copy.deepcopy(bound_constr)

    for constr in point_constr, bound_constr:
        assert isinstance(constr, dict)

    bound_constr = _consolidate_bound_constraints(bound_constr)

    if n_types is None:
        n_types = np.random.randint(1, bound_constr["max_types"] + 1)
    if n_type_covariates is None:
        n_type_covariates = np.random.randint(2, 4)

    params = csv_template(
        n_types=n_types, n_type_covariates=n_type_covariates, initialize_coeffs=False
    )
    params["value"] = np.random.uniform(low=-0.05, high=0.05, size=len(params))

    params.loc["delta", "value"] = 1 - np.random.uniform() if myopic is False else 0

    n_shock_coeffs = len(params.loc["shocks_sdcorr"])
    dim = number_of_triangular_elements_to_dimension(n_shock_coeffs)
    helper = np.eye(dim) * 0.5
    helper[np.tril_indices(dim, k=-1)] = np.random.uniform(
        -0.05, 0.2, size=(n_shock_coeffs - dim)
    )
    cov = helper.dot(helper.T)
    params.loc["shocks_sdcorr", "value"] = cov_matrix_to_sdcorr_params(cov)

    params.loc["meas_error", "value"] = np.random.uniform(
        low=0.001, high=0.1, size=len(params.loc["meas_error"])
    )

    n_edu_start = np.random.randint(1, bound_constr["max_edu_start"] + 1)
    edu_starts = point_constr.get(
        "edu_start", np.random.choice(np.arange(1, 15), size=n_edu_start, replace=False)
    )
    edu_shares = point_constr.get("edu_share", _get_initial_shares(n_edu_start))
    edu_max = point_constr.get("edu_max", np.random.randint(max(edu_starts) + 1, 30))
    params = pd.concat(
        [params, initial_and_max_experience_template(edu_starts, edu_shares, edu_max)],
        axis=0,
        sort=False,
    )

    n_lagged_choices = point_constr.get("n_lagged_choices", np.random.choice(2))
    if n_lagged_choices:
        choices = ["a", "b", "edu", "home"]
        lc_probs_params = lagged_choices_probs_template(n_lagged_choices, choices)
        lc_params = pd.read_csv(
            ROOT_DIR / "pre_processing" / "lagged_choice_params.csv"
        )
        lc_params.set_index(["category", "name"], inplace=True)
        params = pd.concat([params, lc_probs_params, lc_params], axis=0, sort=False)
        lc_covariates = lagged_choices_covariates_template()
    else:
        lc_covariates = {}

    observables = point_constr.pop("observables", None)
    if observables is None:
        n_observables = np.random.randint(0, 3)
        # Do not sample observables with 1 level!
        observables = (
            np.random.randint(2, 4, size=n_observables) if n_observables else False
        )

    if observables is not False:
        to_concat = [
            params,
            observable_prob_template(observables),
            observable_coeffs_template(observables, params),
        ]
        params = pd.concat(to_concat, axis="rows", sort=False)

        indices = (
            params.index.get_level_values("category")
            .str.extract(r"observable_([a-z0-9_]+)", expand=False)
            .dropna()
            .unique()
        )
        observable_covs = {x: "{} == {}".format(*x.rsplit("_", 1)) for x in indices}
    else:
        observable_covs = {}

    options = {
        "simulation_agents": np.random.randint(3, bound_constr["max_agents"] + 1),
        "simulation_seed": np.random.randint(1, 1_000),
        "n_periods": np.random.randint(1, bound_constr["max_periods"]),
        "solution_draws": np.random.randint(1, bound_constr["max_draws"]),
        "solution_seed": np.random.randint(1, 10_000),
        "estimation_draws": np.random.randint(1, bound_constr["max_draws"]),
        "estimation_seed": np.random.randint(1, 10_000),
        "estimation_tau": np.random.uniform(100, 500),
        "interpolation_points": -1,
    }

    options = {
        **DEFAULT_OPTIONS,
        **options,
        "covariates": {**_BASE_COVARIATES, **lc_covariates, **observable_covs},
    }

    options = _update_nested_dictionary(options, point_constr)

    return params, options

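# Illustrative usage sketch (not part of the original module): draw a random model and
# pin a subset of the options via ``point_constr``; the option names below mirror the
# keys set inside ``generate_random_model`` and are merged by ``_update_nested_dictionary``.
#
#     params, options = generate_random_model(
#         point_constr={"n_periods": 3, "simulation_agents": 100},
#         n_types=2,
#     )
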
def _consolidate_bound_constraints(bound_constr):
    constr = {
        "max_types": 3,
        "max_periods": 3,
        "max_edu_start": 3,
        "max_agents": 1000,
        "max_draws": 100,
    }
    constr.update(bound_constr)

    return constr

def _get_initial_shares(num_groups):
    """We simply need a valid request for the shares of types summing to one."""
    shares = np.random.uniform(size=num_groups)
    shares = normalize_probabilities(shares)

    return shares

def simulate_truncated_data(params, options, is_missings=True):
    """Simulate a (truncated) dataset.

    The data can have two additional properties: truncated histories and missing
    wages.

    """
    optim_paras, _ = process_params_and_options(params, options)

    simulate = get_simulate_func(params, options)
    df = simulate(params)

    np.random.seed(options["simulation_seed"])

    if is_missings:
        # Truncate the histories of agents. This mimics the effect of attrition.
        # Histories can be truncated after the first period or not at all. So, all
        # individuals have at least one observation.
        period_of_truncation = (  # noqa: F841
            df.reset_index()
            .groupby("Identifier")
            .Period.transform(lambda x: np.random.choice(x.max() + 1) + 1)
            .to_numpy()
        )
        data_subset = df.query("Period < @period_of_truncation").copy()

        # Add some missings to wage data.
        is_working = data_subset["Choice"].isin(optim_paras["choices_w_wage"])
        num_drop_wages = int(is_working.sum() * np.random.uniform(high=0.5))

        if num_drop_wages > 0:
            indices = data_subset["Wage"][is_working].index
            index_missing = np.random.choice(indices, num_drop_wages, replace=False)

            data_subset.loc[index_missing, "Wage"] = np.nan
        else:
            pass
    else:
        data_subset = df

    # We can restrict the information to observed entities only.
    col_dtype = generate_column_dtype_dict_for_estimation(optim_paras)
    data_subset = data_subset[list(col_dtype)[2:]]

    return data_subset

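# Illustrative usage sketch (not part of the original module): combine the two helpers
# to obtain an estimation-ready dataset with truncated histories and missing wages.
#
#     params, options = generate_random_model(n_types=1)
#     df = simulate_truncated_data(params, options, is_missings=True)
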
def _update_nested_dictionary(dict_, other):
    """Update a nested dictionary with another dictionary.

    The basic ``.update()`` method of dictionaries adds non-existing keys or replaces
    existing keys, which works fine for unnested dictionaries. For nested dictionaries,
    levels under the current level are not updated but overwritten. This function
    recursively loops over keys and values and inserts the value if it is not a
    dictionary. If it is a dictionary, it applies the same process again.

    """
    for key, value in other.items():
        if isinstance(value, collections.abc.Mapping):
            dict_[key] = _update_nested_dictionary(dict_.get(key, {}), value)
        else:
            dict_[key] = value

    return dict_

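# Minimal example (illustrative, not part of the original module) of why the recursive
# update is needed: a plain ``dict.update`` would replace the whole "covariates" entry
# and drop the "constant" key, while the recursive update only overrides the inner key.
#
#     options = {"covariates": {"constant": "1", "is_adult": "5 <= period"}}
#     point_constr = {"covariates": {"is_adult": "6 <= period"}}
#     _update_nested_dictionary(options, point_constr)
#     # -> {"covariates": {"constant": "1", "is_adult": "6 <= period"}}
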
def add_noise_to_params(
    params,
    options,
    delta_low_high=(-0.2, 0.2),
    wages_percent_absolute=(0, 0.2),
    wages_low_high=None,
    wages_null_low_high=(-0.2, 0.2),
    nonpecs_percent_absolute=(0, 0.2),
    nonpecs_low_high=None,
    nonpecs_null_low_high=(-5000, 5000),
    cholesky_low_high=(-0.5, 0.5),
    meas_sd_low_high=(1e-6, 0.5),
    ic_probabilities_low_high=False,
    ic_logit_low_high=False,
    seed=None,
):
    """Add noise to parameters.

    The function allows the noise to vary with the absolute value of non-zero
    parameters or to simply add noise drawn from bounded random variables.

    The function ensures that special parameters are valid:

    - Probabilities are between 0 and 1.
    - Correlations are between -1 and 1.
    - Diagonal elements of the Cholesky factor have 1e-6 as the lower bound.
    - The standard deviations of the measurement error have 1e-6 as the lower bound.

    Parameters
    ----------
    params : pandas.DataFrame
        The parameters in a DataFrame.
    options : dict
        The options of the model.
    delta_low_high : tuple[float]
        Lower and upper bound for the shock to the discount factor.
    wages_percent_absolute : float or tuple[float]
        The deviation in percentages of the absolute value of a non-zero parameter is
        either a constant percentage for all parameters or a random percentage between
        lower and upper bounds.
    wages_low_high : tuple[float]
        The deviation for a non-zero parameter value is between a lower and upper
        bound.
    wages_null_low_high : tuple[float]
        The deviation for a parameter with value zero is between the lower and upper
        bound.
    nonpecs_percent_absolute : float or tuple[float]
        The deviation in percentages of the absolute value of a non-zero parameter is
        either a constant percentage for all parameters or a random percentage between
        lower and upper bounds.
    nonpecs_low_high : tuple[float]
        The deviation for a non-zero parameter value is between a lower and upper
        bound.
    nonpecs_null_low_high : tuple[float]
        The deviation for a parameter with value zero is between the lower and upper
        bound.
    cholesky_low_high : tuple[float]
        Lower and upper bound for a shock applied to the Cholesky factor of the shock
        matrix. To ensure proper scaling, the shock is multiplied with the square root
        of the product of diagonal elements for this entry. The shock for the diagonal
        elements is between zero and the upper bound, and the resulting diagonal
        element in the Cholesky factor has 1e-6 as the lower bound.
    meas_sd_low_high : tuple[float]
        Lower and upper bound for the shock to the standard deviations of the
        measurement error.
    ic_probabilities_low_high : tuple[float]
        Lower and upper bound for shocks to the probabilities in the initial
        conditions.
    ic_logit_low_high : tuple[float]
        Lower and upper bound for shocks to the logit coefficients in the initial
        conditions.
    seed : int or None
        Seed to replicate the perturbation.

    Returns
    -------
    params : pandas.DataFrame
        The new parameters.

    """
    if wages_percent_absolute is not None and wages_low_high is not None:
        raise ValueError(
            "Cannot use 'wages_percent_absolute' and 'wages_low_high' at the same "
            "time."
        )
    if nonpecs_percent_absolute is not None and nonpecs_low_high is not None:
        raise ValueError(
            "Cannot use 'nonpecs_percent_absolute' and 'nonpecs_low_high' at the same "
            "time."
        )

    optim_paras, options = process_params_and_options(params, options)

    np.random.seed(seed)

    # Change discount factor.
    delta = params.filter(like="delta", axis=0).copy()
    delta["value"] = np.clip(delta["value"] + np.random.uniform(*delta_low_high), 0, 1)

    # Change non-zero reward parameters.
    wages = params.filter(regex=r"wage_", axis=0).copy()
    nonpecs = params.filter(regex=r"nonpec_", axis=0).copy()
    for rewards, percent_absolute, low_high, null_low_high in zip(
        [wages, nonpecs],
        [wages_percent_absolute, nonpecs_percent_absolute],
        [wages_low_high, nonpecs_low_high],
        [wages_null_low_high, nonpecs_null_low_high],
    ):
        not_zero = ~rewards["value"].eq(0)
        if percent_absolute is not None:
            rewards = _add_percentage_of_absolute_value_as_shock(
                rewards, percent_absolute
            )
        elif low_high is not None:
            low, high = low_high
            rewards.loc[not_zero, "value"] += np.random.uniform(
                low, high, not_zero.sum()
            )

        # Change parameters with value zero.
        if null_low_high is not None:
            low, high = null_low_high
            rewards.loc[~not_zero, "value"] += np.random.uniform(
                low, high, (~not_zero).sum()
            )

    # Change the parameters of the shock matrix.
    shocks = params.filter(regex=r"shocks_(sdcorr|cov|chol)", axis=0).copy()
    if cholesky_low_high:
        low, high = cholesky_low_high
        # Add a random shock to the Cholesky factor of the shock matrix.
        chol = optim_paras["shocks_cholesky"]

        # Create matrix for scaling.
        diag = np.sqrt(np.diag(chol))
        scaling_factor = np.outer(diag, diag)

        # Add random shock to lower triangular.
        idx = np.tril_indices_from(chol, k=-1)
        chol[idx] += (
            np.random.uniform(low, high, size=len(idx[0])) * scaling_factor[idx]
        )

        # Add random shock to diagonal and ensure non-zero elements.
        idx = np.diag_indices_from(chol)
        chol[idx] += np.random.uniform(0, high, size=len(chol)) * scaling_factor[idx]
        chol[idx] = np.clip(chol[idx], 1e-6, None)

        if "shocks_sdcorr" in shocks.index:
            shocks["value"] = cov_matrix_to_sdcorr_params(chol.dot(chol.T))
        elif "shocks_cov" in shocks.index:
            shocks["value"] = cov_matrix_to_params(chol.dot(chol.T))
        elif "shocks_chol" in shocks.index:
            shocks["value"] = cov_matrix_to_params(chol)
        else:
            raise NotImplementedError

    # Change measurement errors.
    meas_sds = params.filter(regex=r"meas_error", axis=0).copy()
    if meas_sd_low_high:
        meas_sds["value"] += np.random.uniform(*meas_sd_low_high, size=len(meas_sds))
        meas_sds["value"] = np.clip(meas_sds["value"], 1e-6, None)

    # Change the parameters of the initial conditions.
    initial_conditions = params.loc[
        params.index.get_level_values("category").str.contains(
            r"initial_exp|lagged_choice|observable|type"
        )
    ].copy()
    is_prob = initial_conditions.index.get_level_values("name") == "probability"

    if ic_probabilities_low_high and not initial_conditions.empty:
        initial_conditions.loc[is_prob, "value"] += np.random.uniform(
            *ic_probabilities_low_high, size=is_prob.sum()
        )

        # Correct probabilities.
        initial_conditions.loc[is_prob, "value"] = initial_conditions.loc[
            is_prob, "value"
        ].clip(0, 1)

    if ic_logit_low_high and not initial_conditions.empty:
        initial_conditions.loc[~is_prob, "value"] += np.random.uniform(
            *ic_logit_low_high, size=(~is_prob).sum()
        )

    maximum_exps = params.query("category == 'maximum_exp'")

    params = pd.concat(
        [delta, wages, nonpecs, shocks, meas_sds, initial_conditions, maximum_exps]
    ).reindex(index=params.index)

    return params

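# Illustrative usage sketch (not part of the original module): perturb an existing
# parameter vector, e.g. to create a starting point for an estimation exercise. The
# bounds below are arbitrary example values.
#
#     params, options = generate_random_model(n_types=2)
#     noisy_params = add_noise_to_params(
#         params, options, delta_low_high=(-0.1, 0.1), seed=0
#     )
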
def _add_percentage_of_absolute_value_as_shock(rewards, rewards_percent_absolute):
    """Add percentage of value as the shock to rewards."""
    not_zero = ~rewards["value"].eq(0)

    if isinstance(rewards_percent_absolute, tuple):
        low, high = rewards_percent_absolute
        rewards_percent_absolute = np.random.uniform(
            low=low, high=high, size=not_zero.sum()
        )

    possible_signs = np.tile(np.array([-1, 1]), (not_zero.sum(), 1))
    sign = _random_choice(possible_signs)

    rewards.loc[not_zero, "value"] += (
        sign * rewards_percent_absolute * rewards.loc[not_zero, "value"].abs()
    )

    return rewards
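
# Worked example (illustrative): with a parameter value of 2.0 and a drawn percentage
# of 0.1, the shock is ``sign * 0.1 * |2.0| = ±0.2``, so the perturbed value is either
# 1.8 or 2.2. Zero-valued parameters are left untouched by this helper.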