Source code for respy.data

"""Everything related to the original data from Keane and Wolpin (1997)."""
import numpy as np
import pandas as pd

from respy import shared as rp_shared
from respy.config import TEST_RESOURCES_DIR
from respy.pre_processing.model_processing import process_params_and_options


[docs]def _create_working_experience(df, optim_paras):
    for choice in optim_paras["choices_w_wage"]:
        df[f"Experience_{choice.title()}"] = df.Choice.eq(choice)
        df[f"Experience_{choice.title()}"] = (
            df.groupby("Identifier")[f"Experience_{choice.title()}"]
            .shift()
            .fillna(0)
            .astype(np.uint8)
        )
        df[f"Experience_{choice.title()}"] = df.groupby("Identifier")[
            f"Experience_{choice.title()}"
        ].cumsum()

    return df


def create_kw_97(params, options):
    """Create data for Keane and Wolpin (1997).

    The data includes individuals' labor market histories and accumulated experiences
    in white-collar and blue-collar occupations, the military, and schooling.

    Parameters
    ----------
    params, options
        Model specification. Both are passed unchanged to
        ``process_params_and_options``; only the resulting ``optim_paras`` is used.

    Returns
    -------
    df : pandas.DataFrame
        Observations with ``Age >= 16``, indexed by ``["Identifier", "Period"]`` and
        restricted to the columns listed by
        ``generate_column_dtype_dict_for_estimation``.

    """
    optim_paras, _ = process_params_and_options(params, options)

    dtypes = {
        "Identifier": int,
        "Age": int,
        "Experience_School": np.uint8,
        "Choice": "category",
        "Wage": float,
    }

    df = pd.read_csv(
        TEST_RESOURCES_DIR / "kw_97_data.csv", dtype=dtypes, float_precision="high"
    )

    # Replace the original identifiers by consecutive group numbers.
    df.Identifier = df.groupby("Identifier").ngroup().astype(np.uint16)

    # Raw choice codes are read as strings because the column is parsed as category.
    codes_to_choices = {
        "3": "white_collar",
        "4": "blue_collar",
        "5": "military",
        "1": "school",
        "2": "home",
    }
    # First fix the category order to the order of the mapping above (passing the
    # keys explicitly as a list), then replace the codes by readable labels.
    df.Choice = df.Choice.cat.set_categories(
        list(codes_to_choices)
    ).cat.rename_categories(codes_to_choices)

    df = _create_working_experience(df, optim_paras)

    # The lagged choice is computed before younger observations are dropped so that
    # the first retained row of an individual can still see its predecessor.
    df["Lagged_Choice_1"] = df.groupby("Identifier").Choice.shift(1)
    # Age 16 corresponds to period zero.
    df["Period"] = df.Age - 16
    df = df.query("Age >= 16")

    cd_dict = rp_shared.generate_column_dtype_dict_for_estimation(optim_paras)

    # Select columns with an explicit list of labels. ``cd_dict`` is presumably a
    # mapping from column label to dtype (TODO confirm); newer pandas versions
    # refuse a dict as an indexer, whereas a list of its keys always works.
    df = df[list(cd_dict)].set_index(["Identifier", "Period"])

    return df