including fraction of state AMI

This commit is contained in:
lucasmbrown-usds 2021-08-09 21:30:41 -05:00
commit ce5e8c5351
4 changed files with 1016 additions and 775 deletions

View file

@ -0,0 +1,53 @@
GEOID2,Median household income (State)
01,50536
02,77640
04,58945
05,47597
06,75235
08,72331
09,78444
10,68287
11,86420
12,55660
13,58700
15,81275
16,55785
17,65886
18,56303
19,60523
20,59597
21,50589
22,49469
23,57918
24,84805
25,81215
26,57144
27,71306
28,45081
29,55461
30,54970
31,61439
32,60365
33,76768
34,82545
35,49754
36,68486
37,54602
38,64894
39,56602
40,52919
41,62818
42,61744
44,67167
45,53199
46,58275
47,53320
48,61874
49,71621
50,61973
51,74222
53,73775
54,46711
55,61747
56,64049
72,20539
1 GEOID2 Median household income (State)
2 01 50536
3 02 77640
4 04 58945
5 05 47597
6 06 75235
7 08 72331
8 09 78444
9 10 68287
10 11 86420
11 12 55660
12 13 58700
13 15 81275
14 16 55785
15 17 65886
16 18 56303
17 19 60523
18 20 59597
19 21 50589
20 22 49469
21 23 57918
22 24 84805
23 25 81215
24 26 57144
25 27 71306
26 28 45081
27 29 55461
28 30 54970
29 31 61439
30 32 60365
31 33 76768
32 34 82545
33 35 49754
34 36 68486
35 37 54602
36 38 64894
37 39 56602
38 40 52919
39 41 62818
40 42 61744
41 44 67167
42 45 53199
43 46 58275
44 47 53320
45 48 61874
46 49 71621
47 50 61973
48 51 74222
49 53 73775
50 54 46711
51 55 61747
52 56 64049
53 72 20539

View file

@ -24,7 +24,20 @@ class CensusACSETL(ExtractTransformLoad):
]
self.MEDIAN_INCOME_FIELD = "B19013_001E"
self.MEDIAN_INCOME_FIELD_NAME = "Median household income in the past 12 months"
self.MEDIAN_INCOME_STATE_FIELD_NAME = "Median household income (State)"
self.MEDIAN_INCOME_AS_PERCENT_OF_STATE_FIELD_NAME = (
"Median household income (% of state median household income)"
)
self.STATE_GEOID_FIELD_NAME = "GEOID2"
self.df: pd.DataFrame
self.state_median_income_df: pd.DataFrame
# TODO: upload this file to S3 and download it from there instead of reading a local copy.
self.STATE_MEDIAN_INCOME_FILE_PATH = (
self.DATA_PATH
/ "needs_to_be_moved_to_s3"
/ "2014_to_2019_state_median_income.csv"
)
def _fips_from_censusdata_censusgeo(self, censusgeo: censusdata.censusgeo) -> str:
"""Create a FIPS code from the proprietary censusgeo index."""
@ -59,12 +72,36 @@ class CensusACSETL(ExtractTransformLoad):
func=self._fips_from_censusdata_censusgeo
)
self.state_median_income_df = pd.read_csv(
# TODO: Replace with reading from S3.
filepath_or_buffer=self.STATE_MEDIAN_INCOME_FILE_PATH,
dtype={self.STATE_GEOID_FIELD_NAME: "string"},
)
def transform(self) -> None:
logger.info("Starting Census ACS Transform")
# Rename median income
self.df[self.MEDIAN_INCOME_FIELD_NAME] = self.df[self.MEDIAN_INCOME_FIELD]
# TODO: handle null values for CBG median income, which are `-666666666`.
# Join state data on CBG data:
self.df[self.STATE_GEOID_FIELD_NAME] = (
self.df[self.GEOID_FIELD_NAME].astype(str).str[0:2]
)
self.df = self.df.merge(
self.state_median_income_df,
how="left",
on=self.STATE_GEOID_FIELD_NAME,
)
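# Calculate the block group's median income as a fraction of the state's median income.
# NOTE: the result is a ratio (not multiplied by 100), even though the column name says "%".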
self.df[self.MEDIAN_INCOME_AS_PERCENT_OF_STATE_FIELD_NAME] = (
self.df[self.MEDIAN_INCOME_FIELD_NAME]
/ self.df[self.MEDIAN_INCOME_STATE_FIELD_NAME]
)
# Calculate percent unemployment.
# TODO: remove small-sample data that should be `None` instead of a high-variance fraction.
self.df[self.UNEMPLOYED_FIELD_NAME] = self.df.B23025_005E / self.df.B23025_003E
@ -98,6 +135,8 @@ class CensusACSETL(ExtractTransformLoad):
self.UNEMPLOYED_FIELD_NAME,
self.LINGUISTIC_ISOLATION_FIELD_NAME,
self.MEDIAN_INCOME_FIELD_NAME,
self.MEDIAN_INCOME_STATE_FIELD_NAME,
self.MEDIAN_INCOME_AS_PERCENT_OF_STATE_FIELD_NAME,
]
self.df[columns_to_include].to_csv(

View file

@ -0,0 +1,136 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "0491828b",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import censusdata\n",
"import csv\n",
"from pathlib import Path\n",
"import os\n",
"import sys\n",
"\n",
"module_path = os.path.abspath(os.path.join(\"../..\"))\n",
"if module_path not in sys.path:\n",
" sys.path.append(module_path)\n",
"\n",
"from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes\n",
"\n",
"\n",
"ACS_YEAR = 2019\n",
"\n",
"DATA_PATH = Path.cwd().parent / \"data\"\n",
"FIPS_CSV_PATH = DATA_PATH / \"fips_states_2010.csv\"\n",
"\n",
"GEOID_FIELD_NAME = \"GEOID10\"\n",
"UNEMPLOYED_FIELD_NAME = \"Unemployed Civilians (fraction)\"\n",
"\n",
"# Some display settings to make pandas outputs more readable.\n",
"pd.set_option(\"display.expand_frame_repr\", False)\n",
"pd.set_option(\"display.precision\", 2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "654f25a1",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Following the tutorial at https://jtleider.github.io/censusdata/example1.html.\n",
"# Full list of fields is at https://www2.census.gov/programs-surveys/acs/summary_file/2019/documentation/user_tools/ACS2019_Table_Shells.xlsx\n",
"censusdata.printtable(censusdata.censustable(src=\"acs5\", year=ACS_YEAR, table=\"B19013\"))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8999cea4",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"def fips_from_censusdata_censusgeo(censusgeo: censusdata.censusgeo) -> str:\n",
" \"\"\"Create a FIPS code from the proprietary censusgeo index.\"\"\"\n",
" fips = \"\".join([value for (key, value) in censusgeo.params()])\n",
" return fips\n",
"\n",
"\n",
"dfs = []\n",
"for fips in get_state_fips_codes(DATA_PATH):\n",
" print(f\"Fetching data for fips {fips}\")\n",
" dfs.append(\n",
" censusdata.download(\n",
" src=\"acs5\",\n",
" year=ACS_YEAR,\n",
" geo=censusdata.censusgeo(\n",
" [\n",
" (\"state\", fips) \n",
" #, (\"county\", \"*\"), (\"block group\", \"*\")\n",
" ]\n",
" ),\n",
" var=[\"B23025_005E\", \"B23025_003E\", \"B19013_001E\"],\n",
" )\n",
" )\n",
"\n",
"df = pd.concat(dfs)\n",
"\n",
"df[GEOID_FIELD_NAME] = df.index.to_series().apply(func=fips_from_censusdata_censusgeo)\n",
"\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2a269bb1",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"columns_to_include = [\"GEOID2\", \"Median household income (State)\"]\n",
"\n",
"df.rename(columns={\"GEOID10\": \"GEOID2\", \"B19013_001E\": \"Median household income (State)\"}, inplace=True)\n",
"\n",
"df[columns_to_include].to_csv(path_or_buf= \"/Users/lucas/Documents/usds/repos/justice40-tool/data/data-pipeline/data_pipeline/data/needs_to_be_moved_to_s3/2014_to_2019_state_median_income.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "91932af5",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}