CDC SVI Index: Additions to data-pipeline and comparison tool (#1096)

* wip

* working

* working

* rename

* documentation

* add link

* add readme

* update fieldnames

* typo

* add comparison tool

* revise wording

* variable change for FIPS

* wording

* wording in readme

* cleanup wording

* update comparison tool

* final tune up

* grammar and punctuation in the documentation

* period

* cleanup comments

* added revisions

* parallelism

* PR feedback from Lucas

* remove extraneous fields from comparison tool

* style

* updates

* remove themes

* formatting

* remove references to percentile rank

* remove references to percentile rank

* typo in fieldnames

* updates based on feedback from Lucas

* fieldnames formatting

* fix broken markdown link

Co-authored-by: lucasmbrown-usds <lucas.m.brown@omb.eop.gov>
This commit is contained in:
Saran Ahluwalia 2022-01-14 14:52:37 -05:00 committed by GitHub
commit 87e08f5fe1
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 337 additions and 36 deletions

View file

@ -50,7 +50,9 @@
}
],
"source": [
"df_raw = pd.read_csv('master_raw_data.csv') #file from the 'Data collection.ipynb'\n",
"df_raw = pd.read_csv(\n",
" \"master_raw_data.csv\"\n",
") # file from the 'Data collection.ipynb'\n",
"print(df_raw.shape)\n",
"print(df_raw.columns)"
]
@ -296,8 +298,8 @@
}
],
"source": [
"#More advanced imputation method\n",
"df_imputed_x = pd.read_excel('20200420_input_final.xlsx')\n",
"# More advanced imputation method\n",
"df_imputed_x = pd.read_excel(\"20200420_input_final.xlsx\")\n",
"df_imputed_x.head()"
]
},
@ -389,22 +391,48 @@
}
],
"source": [
"data = df_imputed_x.copy() #data\n",
"model1 = Model(data) #Initializing class variable\n",
"data = df_imputed_x.copy() # data\n",
"model1 = Model(data) # Initializing class variable\n",
"\n",
"# variables list to include in our regression model.\n",
"columns_regress = ['HE_FOOD', 'HE_WALK', 'HE_VACANCY', 'HE_SUPRFND','HE_HLTHINS','BINGE_CrudePrev',\n",
" 'CHECKUP_CrudePrev','BPHIGH_CrudePrev','SLEEP_CrudePrev', 'STROKE_CrudePrev', 'Drug Test', 'Pedalcyclist']\n",
"columns_regress = [\n",
" \"HE_FOOD\",\n",
" \"HE_WALK\",\n",
" \"HE_VACANCY\",\n",
" \"HE_SUPRFND\",\n",
" \"HE_HLTHINS\",\n",
" \"BINGE_CrudePrev\",\n",
" \"CHECKUP_CrudePrev\",\n",
" \"BPHIGH_CrudePrev\",\n",
" \"SLEEP_CrudePrev\",\n",
" \"STROKE_CrudePrev\",\n",
" \"Drug Test\",\n",
" \"Pedalcyclist\",\n",
"]\n",
"\n",
"# multipliers for each variable to rescale the variables as higher the value better for the health score.\n",
"multiply_cols = {'HE_FOOD': -1,'HE_WALK': 1,'HE_VACANCY': -1,'HE_SUPRFND':-1 , 'HE_HLTHINS': 1 ,'BINGE_CrudePrev': -1 , \n",
" 'CHECKUP_CrudePrev': 1,'BPHIGH_CrudePrev': -1,'SLEEP_CrudePrev': -1,'STROKE_CrudePrev' : -1, 'Drug Test' : -1, 'Pedalcyclist' : -1,}\n",
"multiply_cols = {\n",
" \"HE_FOOD\": -1,\n",
" \"HE_WALK\": 1,\n",
" \"HE_VACANCY\": -1,\n",
" \"HE_SUPRFND\": -1,\n",
" \"HE_HLTHINS\": 1,\n",
" \"BINGE_CrudePrev\": -1,\n",
" \"CHECKUP_CrudePrev\": 1,\n",
" \"BPHIGH_CrudePrev\": -1,\n",
" \"SLEEP_CrudePrev\": -1,\n",
" \"STROKE_CrudePrev\": -1,\n",
" \"Drug Test\": -1,\n",
" \"Pedalcyclist\": -1,\n",
"}\n",
"\n",
"#target variable\n",
"target = 'life expectancy'\n",
"# target variable\n",
"target = \"life expectancy\"\n",
"\n",
"#storing the data and model weights to calculate health score.\n",
"multiplied_zscore_data_le, params_le = model1.model_output(columns_regress,target,multiply_cols)"
"# storing the data and model weights to calculate health score.\n",
"multiplied_zscore_data_le, params_le = model1.model_output(\n",
" columns_regress, target, multiply_cols\n",
")"
]
},
{
@ -456,16 +484,42 @@
}
],
"source": [
"columns_regress = ['HE_FOOD', 'HE_WALK', 'HE_VACANCY', 'HE_SUPRFND','HE_HLTHINS','BINGE_CrudePrev',\n",
" 'CHECKUP_CrudePrev','BPHIGH_CrudePrev','SLEEP_CrudePrev', 'STROKE_CrudePrev', 'Drug Test', 'Pedalcyclist']\n",
"columns_regress = [\n",
" \"HE_FOOD\",\n",
" \"HE_WALK\",\n",
" \"HE_VACANCY\",\n",
" \"HE_SUPRFND\",\n",
" \"HE_HLTHINS\",\n",
" \"BINGE_CrudePrev\",\n",
" \"CHECKUP_CrudePrev\",\n",
" \"BPHIGH_CrudePrev\",\n",
" \"SLEEP_CrudePrev\",\n",
" \"STROKE_CrudePrev\",\n",
" \"Drug Test\",\n",
" \"Pedalcyclist\",\n",
"]\n",
"\n",
"multiply_cols = {'HE_FOOD': -1,'HE_WALK': 1,'HE_VACANCY': -1,'HE_SUPRFND':-1 , 'HE_HLTHINS': 1 ,'BINGE_CrudePrev': -1 , \n",
" 'CHECKUP_CrudePrev': 1,'BPHIGH_CrudePrev': -1,'SLEEP_CrudePrev': -1,'STROKE_CrudePrev' : -1, 'Drug Test' : -1, 'Pedalcyclist' : -1,}\n",
"multiply_cols = {\n",
" \"HE_FOOD\": -1,\n",
" \"HE_WALK\": 1,\n",
" \"HE_VACANCY\": -1,\n",
" \"HE_SUPRFND\": -1,\n",
" \"HE_HLTHINS\": 1,\n",
" \"BINGE_CrudePrev\": -1,\n",
" \"CHECKUP_CrudePrev\": 1,\n",
" \"BPHIGH_CrudePrev\": -1,\n",
" \"SLEEP_CrudePrev\": -1,\n",
" \"STROKE_CrudePrev\": -1,\n",
" \"Drug Test\": -1,\n",
" \"Pedalcyclist\": -1,\n",
"}\n",
"\n",
"target = 'PHLTH_CrudePrev'\n",
"target = \"PHLTH_CrudePrev\"\n",
"target_multiplier = -1\n",
"\n",
"multiplied_zscore_data_1, params_1 = model1.model_output(columns_regress,target,multiply_cols,target_multiplier = target_multiplier)"
"multiplied_zscore_data_1, params_1 = model1.model_output(\n",
" columns_regress, target, multiply_cols, target_multiplier=target_multiplier\n",
")"
]
},
{
@ -517,20 +571,46 @@
}
],
"source": [
"#regression of MHLTH_CrudePrev \n",
"# regression of MHLTH_CrudePrev\n",
"\n",
"\n",
"columns_regress = ['HE_FOOD', 'HE_WALK', 'HE_VACANCY', 'HE_SUPRFND','HE_HLTHINS','BINGE_CrudePrev',\n",
" 'CHECKUP_CrudePrev','BPHIGH_CrudePrev','SLEEP_CrudePrev', 'STROKE_CrudePrev', 'Drug Test', 'Pedalcyclist']\n",
"columns_regress = [\n",
" \"HE_FOOD\",\n",
" \"HE_WALK\",\n",
" \"HE_VACANCY\",\n",
" \"HE_SUPRFND\",\n",
" \"HE_HLTHINS\",\n",
" \"BINGE_CrudePrev\",\n",
" \"CHECKUP_CrudePrev\",\n",
" \"BPHIGH_CrudePrev\",\n",
" \"SLEEP_CrudePrev\",\n",
" \"STROKE_CrudePrev\",\n",
" \"Drug Test\",\n",
" \"Pedalcyclist\",\n",
"]\n",
"\n",
"multiply_cols = {'HE_FOOD': -1,'HE_WALK': 1,'HE_VACANCY': -1,'HE_SUPRFND':-1 , 'HE_HLTHINS': 1 ,'BINGE_CrudePrev': -1 , \n",
" 'CHECKUP_CrudePrev': 1,'BPHIGH_CrudePrev': -1,'SLEEP_CrudePrev': -1,'STROKE_CrudePrev' : -1, 'Drug Test' : -1, 'Pedalcyclist' : -1,}\n",
"multiply_cols = {\n",
" \"HE_FOOD\": -1,\n",
" \"HE_WALK\": 1,\n",
" \"HE_VACANCY\": -1,\n",
" \"HE_SUPRFND\": -1,\n",
" \"HE_HLTHINS\": 1,\n",
" \"BINGE_CrudePrev\": -1,\n",
" \"CHECKUP_CrudePrev\": 1,\n",
" \"BPHIGH_CrudePrev\": -1,\n",
" \"SLEEP_CrudePrev\": -1,\n",
" \"STROKE_CrudePrev\": -1,\n",
" \"Drug Test\": -1,\n",
" \"Pedalcyclist\": -1,\n",
"}\n",
"\n",
"\n",
"target = 'MHLTH_CrudePrev'\n",
"target = \"MHLTH_CrudePrev\"\n",
"target_multiplier = -1\n",
"\n",
"multiplied_zscore_data_2, params_2 = model1.model_output(columns_regress,target,multiply_cols,target_multiplier = target_multiplier)"
"multiplied_zscore_data_2, params_2 = model1.model_output(\n",
" columns_regress, target, multiply_cols, target_multiplier=target_multiplier\n",
")"
]
},
{
@ -579,10 +659,12 @@
}
],
"source": [
"health_scores1 = HealthScores(params_1,params_2,params_le,multiplied_zscore_data_1,data['geoid'])\n",
"final_data,weights_table = health_scores1.final_scaled_data()\n",
"health_scores1 = HealthScores(\n",
" params_1, params_2, params_le, multiplied_zscore_data_1, data[\"geoid\"]\n",
")\n",
"final_data, weights_table = health_scores1.final_scaled_data()\n",
"\n",
"health_scores1.weights #Can access the weights directly like this."
"health_scores1.weights # Can access the weights directly like this."
]
},
{

View file

@ -276,6 +276,21 @@
"mapping_inequality_df"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fceb3136",
"metadata": {},
"outputs": [],
"source": [
"cdc_svi_index_path = DATA_DIR / \"dataset\" / \"cdc_svi_index\" / \"usa.csv\"\n",
"cdc_svi_index_df = pd.read_csv(\n",
" cdc_svi_index_path,\n",
" dtype={ExtractTransformLoad.GEOID_TRACT_FIELD_NAME: \"string\"},\n",
")\n",
"cdc_svi_index_df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
@ -302,7 +317,7 @@
"metadata": {},
"outputs": [],
"source": [
"# Load alternative energy-related definition \n",
"# Load alternative energy-related definition\n",
"energy_definition_alternative_draft_path = (\n",
" DATA_DIR / \"dataset\" / \"energy_definition_alternative_draft\" / \"usa.csv\"\n",
")\n",
@ -370,7 +385,8 @@
" epa_rsei_aggregate_df,\n",
" maryland_ejscreen_df,\n",
" energy_definition_alternative_draft_df,\n",
" michigan_ejscreen_df\n",
" michigan_ejscreen_df,\n",
" cdc_svi_index_df,\n",
"]\n",
"\n",
"merged_df = functools.reduce(\n",
@ -493,8 +509,8 @@
" ),\n",
" Index(\n",
" method_name=\"EPA RSEI Aggregate Microdata\",\n",
" priority_communities_field=field_names.EPA_RSEI_SCORE_THRESHOLD_FIELD\n",
" ), \n",
" priority_communities_field=field_names.EPA_RSEI_SCORE_THRESHOLD_FIELD,\n",
" ),\n",
" Index(\n",
" method_name=\"Persistent Poverty\",\n",
" priority_communities_field=PERSISTENT_POVERTY_TRACT_LEVEL_FIELD,\n",
@ -502,15 +518,19 @@
" Index(\n",
" method_name=\"Maryland EJSCREEN\",\n",
" priority_communities_field=field_names.MARYLAND_EJSCREEN_BURDENED_THRESHOLD_FIELD,\n",
" ), \n",
" ),\n",
" Index(\n",
" method_name=field_names.ENERGY_RELATED_COMMUNITIES_DEFINITION_ALTERNATIVE,\n",
" priority_communities_field=field_names.ENERGY_RELATED_COMMUNITIES_DEFINITION_ALTERNATIVE,\n",
" ),\n",
" Index(\n",
" method_name=\"CDC SVI Index\",\n",
" priority_communities_field=field_names.CDC_SVI_INDEX_THEMES_PRIORITY_COMMUNITY,\n",
" ),\n",
" Index(\n",
" method_name=\"Michigan EJSCREEN\",\n",
" priority_communities_field=field_names.MICHIGAN_EJSCREEN_PRIORITY_COMMUNITY_FIELD,\n",
" ), \n",
" ),\n",
" ]\n",
" # Insert indices for each of the HOLC factors.\n",
" # Note: since these involve no renaming, we write them using list comprehension.\n",