# Source code for respy.data
"""Everything related to the original data from Keane and Wolpin (1997)."""
import numpy as np
import pandas as pd
from respy import shared as rp_shared
from respy.config import TEST_RESOURCES_DIR
from respy.pre_processing.model_processing import process_params_and_options
[docs]def _create_working_experience(df, optim_paras):
for choice in optim_paras["choices_w_wage"]:
df[f"Experience_{choice.title()}"] = df.Choice.eq(choice)
df[f"Experience_{choice.title()}"] = (
df.groupby("Identifier")[f"Experience_{choice.title()}"]
.shift()
.fillna(0)
.astype(np.uint8)
)
df[f"Experience_{choice.title()}"] = df.groupby("Identifier")[
f"Experience_{choice.title()}"
].cumsum()
return df
def create_kw_97(params, options):
    """Create data for Keane and Wolpin (1997).

    The data includes individuals' labor market history and accumulated
    experiences in white-collar, blue-collar occupations, military and
    schooling.

    Parameters
    ----------
    params, options
        Model specification. Both are forwarded unchanged to
        ``process_params_and_options``; only the resulting ``optim_paras`` is
        used here.

    Returns
    -------
    pandas.DataFrame
        The rows of ``kw_97_data.csv`` with ``Age >= 16``, indexed by
        ``("Identifier", "Period")`` where ``Period = Age - 16``, restricted
        to the columns named by
        ``generate_column_dtype_dict_for_estimation(optim_paras)``.

    """
    optim_paras, _ = process_params_and_options(params, options)

    dtypes = {
        "Identifier": int,
        "Age": int,
        "Experience_School": np.uint8,
        "Choice": "category",
        "Wage": float,
    }

    df = pd.read_csv(
        TEST_RESOURCES_DIR / "kw_97_data.csv", dtype=dtypes, float_precision="high"
    )

    # Replace the raw identifiers by consecutive group numbers.
    df["Identifier"] = df.groupby("Identifier").ngroup().astype(np.uint16)

    # Raw choice codes (read as strings by the categorical dtype) and their
    # labels. The insertion order fixes the order of the categories.
    codes_to_choices = {
        "3": "white_collar",
        "4": "blue_collar",
        "5": "military",
        "1": "school",
        "2": "home",
    }
    # Pass the codes as an explicit list instead of relying on the dict being
    # iterated as its keys.
    df["Choice"] = (
        df["Choice"]
        .cat.set_categories(list(codes_to_choices))
        .cat.rename_categories(codes_to_choices)
    )

    # Experience and the lagged choice are derived before the age filter
    # below, so rows with Age < 16 (if the file contains any) still feed into
    # them.
    df = _create_working_experience(df, optim_paras)
    df["Lagged_Choice_1"] = df.groupby("Identifier").Choice.shift(1)

    df["Period"] = df.Age - 16
    df = df.query("Age >= 16")

    # NOTE(review): the helper's name suggests it returns a dict keyed by
    # column name -- confirm. pandas >= 2.0 raises a TypeError for dict
    # indexers, so select with an explicit list of its keys.
    cd_dict = rp_shared.generate_column_dtype_dict_for_estimation(optim_paras)
    df = df[list(cd_dict)].set_index(["Identifier", "Period"])

    return df