# Source code for respy.pre_processing.data_checking

"""Test functions to ensure the validity of data."""
import numpy as np

from respy.shared import generate_column_dtype_dict_for_estimation


def check_estimation_data(df, optim_paras):
    """Check data for estimation.

    Parameters
    ----------
    df : pandas.DataFrame
        Data for estimation. The index is reset and only the columns named by
        ``generate_column_dtype_dict_for_estimation(optim_paras)`` are inspected. The
        caller's frame is not modified.
    optim_paras : dict
        Dictionary containing model optim_paras. The keys read here are
        ``"n_periods"``, ``"choices"``, ``"n_lagged_choices"``, ``"observables"`` and
        ``"choices_w_exp"``.

    Raises
    ------
    AssertionError
        If data has not the expected format.
    KeyError
        If one of the required columns is missing from ``df``.

    """
    # Make sure all columns are available. The selection is passed as a list of
    # column names because recent pandas versions reject a dict as an indexer.
    col_dtype = generate_column_dtype_dict_for_estimation(optim_paras)
    df = df.reset_index()[list(col_dtype)]

    n_periods = optim_paras["n_periods"]
    n_lagged_choices = optim_paras["n_lagged_choices"]

    # 1. Identifier.
    # It is assumed in the likelihood function that Identifier starts at 0 and
    # increments in steps of one.
    unique = df["Identifier"].unique()
    assert (
        unique == np.arange(len(unique))
    ).all(), "Identifier must start at 0 and increase in steps of one."

    # 2. Period.
    assert df["Period"].le(n_periods - 1).all(), "Period exceeds n_periods - 1."

    # 3. Choice.
    assert df["Choice"].isin(optim_paras["choices"]).all(), "Unknown value in Choice."

    # 4. Wage. Missing wages are allowed here, observed wages must be positive.
    assert df["Wage"].fillna(1).gt(0).all(), "Observed wages must be positive."

    # 8. Lagged_Choice.
    for i in range(1, n_lagged_choices + 1):
        assert (
            df[f"Lagged_Choice_{i}"].isin(optim_paras["choices"]).all()
        ), f"Unknown value in Lagged_Choice_{i}."

    if n_periods > 1 and n_lagged_choices > 0:
        choices = ["Choice"] + [
            f"Lagged_Choice_{i}" for i in range(1, n_lagged_choices + 1)
        ][:-1]

        # Compare each column with the within-individual shift of its predecessor,
        # restricted to rows with Period > i.
        # NOTE(review): because of the ``[:-1]`` slice combined with
        # ``range(len(choices) - 1)``, the last lagged choice column is never
        # compared (and nothing is compared for a single lag) - confirm that this
        # is intended.
        for i in range(len(choices) - 1):
            lc = df.groupby("Identifier")[choices[i]].transform("shift").dropna()
            assert (
                df[choices[i + 1]].loc[~df["Period"].le(i)].cat.codes == lc.cat.codes
            ).all(), f"{choices[i + 1]} does not match the shifted {choices[i]}."

    # Observable characteristics.
    for observable, levels in optim_paras["observables"].items():
        assert df[observable.title()].nunique() <= len(
            levels
        ), f"Too many distinct values for observable '{observable}'."

    # Others.
    assert (
        df.drop(columns="Wage").notna().all().all()
    ), "Only Wage may contain missing values."

    # We check individual state variables against the recorded choices: experience
    # plus one if the choice was taken must equal next period's experience. The
    # intermediate result is kept in a local Series instead of a scratch column.
    for choice in optim_paras["choices_w_exp"]:
        exp_col = f"Experience_{choice.title()}"
        exp_after_choice = df[exp_col] + df["Choice"].eq(choice)
        shifted_exp = (
            exp_after_choice.groupby(df["Identifier"]).shift().dropna().astype(int)
        )
        assert shifted_exp.eq(
            df.loc[shifted_exp.index, exp_col]
        ).all(), f"{exp_col} is inconsistent with the recorded choices."

    # Check that there are no duplicated observations for any period by agent.
    # ``not`` is used instead of ``~`` so the check cannot silently pass if the
    # reduction yields a plain Python bool (``~True == -2`` is truthy).
    assert not df.duplicated(
        subset=["Identifier", "Period"]
    ).any(), "Duplicated (Identifier, Period) observations."

    # Check that we observe the whole sequence of observations.
    periods_per_ind = df.groupby("Identifier")["Period"]
    max_periods_per_ind = periods_per_ind.max() + 1
    n_obs_per_ind = periods_per_ind.size()
    assert (
        max_periods_per_ind == n_obs_per_ind
    ).all(), "Observation sequences must be complete for every individual."


def check_simulated_data(optim_paras, df):
    """Check simulated data.

    Runs consistency checks on a simulated dataset. All checks for estimation data
    are applied first, followed by restrictions that only hold for simulated data:
    type information, and wages that are observed exactly when a wage-paying choice
    was made.

    Parameters
    ----------
    optim_paras : dict
        Dictionary containing model optim_paras. The keys read here are
        ``"n_types"`` and ``"choices_w_wage"``.
    df : pandas.DataFrame
        Simulated data. The checks run on a copy.

    Raises
    ------
    AssertionError
        If one of the checks fails.

    """
    simulated = df.copy()

    n_types = optim_paras["n_types"]

    # Everything required of estimation data is required of simulated data, too.
    check_estimation_data(simulated, optim_paras)

    # 9. Types. Only relevant if the model has more than one type.
    if n_types >= 2:
        types = simulated["Type"]
        assert types.max() <= n_types - 1
        assert types.notna().all()
        # The type of an individual does not change over time.
        types_per_individual = simulated.groupby("Identifier")["Type"].nunique()
        assert types_per_individual.eq(1).all()

    # A wage has to be observed for every working agent and must be missing for
    # everyone else.
    wages = simulated["Wage"]
    has_wage_choice = simulated["Choice"].isin(optim_paras["choices_w_wage"])
    assert wages[has_wage_choice].notna().all()
    assert wages[~has_wage_choice].isna().all()