mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-02-23 01:54:18 -08:00
This ended up being a pretty large task. Here's what this PR does: 1. Pulls in Vincent's data from island areas into the score ETL. This is from the 2010 decennial census, the last census of any kind in the island areas. 2. Grabs a few new fields from the 2010 island areas decennial census. 3. Calculates area median income for island areas. 4. Stops using EJSCREEN as the source of our high school education data and pulls that directly from the census (this was related to this project so I went ahead and fixed it). 5. Grabs a bunch of data from the 2010 ACS in the states/Puerto Rico/DC, so that we can create percentiles comparing apples-to-apples (ish) from the 2010 island areas decennial census data to the 2010 ACS data. This required creating a new class because all the ACS fields are different between 2010 and 2019, so it wasn't as simple as looping over a year parameter. 6. Creates a combined population field of island areas and mainland so we can use those stats in our comparison tool, and updates the comparison tool accordingly.
150 lines
4.1 KiB
Text
150 lines
4.1 KiB
Text
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "4899d2ef",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import pandas as pd\n",
|
|
"import censusdata\n",
|
|
"import csv\n",
|
|
"from pathlib import Path\n",
|
|
"import os\n",
|
|
"import sys\n",
|
|
"\n",
|
|
"module_path = os.path.abspath(os.path.join(\"../..\"))\n",
|
|
"if module_path not in sys.path:\n",
|
|
" sys.path.append(module_path)\n",
|
|
"\n",
|
|
"from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes\n",
|
|
"\n",
|
|
"\n",
|
|
"ACS_YEAR = 2010\n",
|
|
"\n",
|
|
"DATA_PATH = Path.cwd().parent / \"data\"\n",
|
|
"FIPS_CSV_PATH = DATA_PATH / \"fips_states_2010.csv\"\n",
|
|
"\n",
|
|
"GEOID_FIELD_NAME = \"GEOID10\"\n",
|
|
"UNEMPLOYED_FIELD_NAME = \"Unemployed Civilians (fraction)\"\n",
|
|
"\n",
|
|
"# Some display settings to make pandas outputs more readable.\n",
|
|
"pd.set_option(\"display.expand_frame_repr\", False)\n",
|
|
"pd.set_option(\"display.precision\", 2)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "4dd8feec",
|
|
"metadata": {
|
|
"scrolled": false
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Following the tutorial at https://jtleider.github.io/censusdata/example1.html.\n",
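|
|
"# Full list of 2019 fields is at https://www2.census.gov/programs-surveys/acs/summary_file/2019/documentation/user_tools/ACS2019_Table_Shells.xlsx (ACS_YEAR here is 2010, and field IDs can differ between years, so confirm them against that year's tables).\n",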
|
|
"# censusdata.printtable(\n",
|
|
"# censusdata.censustable(src=\"acs5\", year=ACS_YEAR, table=\"B25077\")\n",
|
|
"# )\n",
|
|
"\n",
|
|
"censusdata.search(\n",
|
|
" src=\"acs5\", year=ACS_YEAR, field=\"label\", criterion=\"employment status\"\n",
|
|
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "7b40afd3",
|
|
"metadata": {
|
|
"scrolled": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"def fips_from_censusdata_censusgeo(censusgeo: censusdata.censusgeo) -> str:\n",
|
|
" \"\"\"Create a FIPS code string by concatenating the values from `censusgeo.params()`.\"\"\"\n",
|
|
" fips = \"\".join([value for (key, value) in censusgeo.params()])\n",
|
|
" return fips\n",
|
|
"\n",
|
|
"\n",
|
|
"dfs = []\n",
|
|
"for fips in get_state_fips_codes(DATA_PATH):\n",
|
|
" print(f\"Fetching data for fips {fips}\")\n",
|
|
" dfs.append(\n",
|
|
" censusdata.download(\n",
|
|
" src=\"acs5\",\n",
|
|
" year=ACS_YEAR,\n",
|
|
" geo=censusdata.censusgeo(\n",
|
|
" [\n",
|
|
" (\"state\", fips)\n",
|
|
" # , (\"county\", \"*\"), (\"block group\", \"*\")\n",
|
|
" ]\n",
|
|
" ),\n",
|
|
" var=[\"B23025_005E\", \"B23025_003E\", \"B19013_001E\"],\n",
|
|
" )\n",
|
|
" )\n",
|
|
"\n",
|
|
"df = pd.concat(dfs)\n",
|
|
"\n",
|
|
"df[GEOID_FIELD_NAME] = df.index.to_series().apply(\n",
|
|
" func=fips_from_censusdata_censusgeo\n",
|
|
")\n",
|
|
"\n",
|
|
"df.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "caa0b502",
|
|
"metadata": {
|
|
"scrolled": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"columns_to_include = [\"GEOID2\", \"Median household income (State)\"]\n",
|
|
"\n",
|
|
"df.rename(\n",
|
|
" columns={\n",
|
|
" \"GEOID10\": \"GEOID2\",\n",
|
|
" \"B19013_001E\": \"Median household income (State)\",\n",
|
|
" },\n",
|
|
" inplace=True,\n",
|
|
")\n",
|
|
"\n",
|
|
"# df[columns_to_include].to_csv(path_or_buf= \"/Users/lucas/Documents/usds/repos/justice40-tool/data/data-pipeline/data_pipeline/data/needs_to_be_moved_to_s3/2014_to_2019_state_median_income.csv\", index=False)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "f2bddf6a",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.9.6"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|