diff --git a/data/data-pipeline/data_pipeline/data/needs_to_be_moved_to_s3/2014_to_2019_state_median_income.csv b/data/data-pipeline/data_pipeline/data/needs_to_be_moved_to_s3/2014_to_2019_state_median_income.csv new file mode 100644 index 00000000..6f0cea70 --- /dev/null +++ b/data/data-pipeline/data_pipeline/data/needs_to_be_moved_to_s3/2014_to_2019_state_median_income.csv @@ -0,0 +1,53 @@ +GEOID2,Median household income (State) +01,50536 +02,77640 +04,58945 +05,47597 +06,75235 +08,72331 +09,78444 +10,68287 +11,86420 +12,55660 +13,58700 +15,81275 +16,55785 +17,65886 +18,56303 +19,60523 +20,59597 +21,50589 +22,49469 +23,57918 +24,84805 +25,81215 +26,57144 +27,71306 +28,45081 +29,55461 +30,54970 +31,61439 +32,60365 +33,76768 +34,82545 +35,49754 +36,68486 +37,54602 +38,64894 +39,56602 +40,52919 +41,62818 +42,61744 +44,67167 +45,53199 +46,58275 +47,53320 +48,61874 +49,71621 +50,61973 +51,74222 +53,73775 +54,46711 +55,61747 +56,64049 +72,20539 diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py index 3bee9cb7..103e4572 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py @@ -24,7 +24,20 @@ class CensusACSETL(ExtractTransformLoad): ] self.MEDIAN_INCOME_FIELD = "B19013_001E" self.MEDIAN_INCOME_FIELD_NAME = "Median household income in the past 12 months" + self.MEDIAN_INCOME_STATE_FIELD_NAME = "Median household income (State)" + self.MEDIAN_INCOME_AS_PERCENT_OF_STATE_FIELD_NAME = ( + "Median household income (% of state median household income)" + ) + self.STATE_GEOID_FIELD_NAME = "GEOID2" self.df: pd.DataFrame + self.state_median_income_df: pd.DataFrame + + # TODO: refactor this to put this file on s3 and download it from there + self.STATE_MEDIAN_INCOME_FILE_PATH = ( + self.DATA_PATH + / "needs_to_be_moved_to_s3" + / "2014_to_2019_state_median_income.csv" + ) def _fips_from_censusdata_censusgeo(self, censusgeo: censusdata.censusgeo) -> str: """Create a FIPS code from the proprietary censusgeo index.""" @@ -59,12 +72,36 @@ class CensusACSETL(ExtractTransformLoad): func=self._fips_from_censusdata_censusgeo ) + self.state_median_income_df = pd.read_csv( + # TODO: Replace with reading from S3. + filepath_or_buffer=self.STATE_MEDIAN_INCOME_FILE_PATH, + dtype={self.STATE_GEOID_FIELD_NAME: "string"}, + ) + def transform(self) -> None: logger.info("Starting Census ACS Transform") # Rename median income self.df[self.MEDIAN_INCOME_FIELD_NAME] = self.df[self.MEDIAN_INCOME_FIELD] + # TODO: handle null values for CBG median income, which are `-666666666`. + + # Join state data on CBG data: + self.df[self.STATE_GEOID_FIELD_NAME] = ( + self.df[self.GEOID_FIELD_NAME].astype(str).str[0:2] + ) + self.df = self.df.merge( + self.state_median_income_df, + how="left", + on=self.STATE_GEOID_FIELD_NAME, + ) + + # Calculate the income of the block group as a fraction of the state income: + self.df[self.MEDIAN_INCOME_AS_PERCENT_OF_STATE_FIELD_NAME] = ( + self.df[self.MEDIAN_INCOME_FIELD_NAME] + / self.df[self.MEDIAN_INCOME_STATE_FIELD_NAME] + ) + # Calculate percent unemployment. # TODO: remove small-sample data that should be `None` instead of a high-variance fraction. self.df[self.UNEMPLOYED_FIELD_NAME] = self.df.B23025_005E / self.df.B23025_003E @@ -98,6 +135,8 @@ class CensusACSETL(ExtractTransformLoad): self.UNEMPLOYED_FIELD_NAME, self.LINGUISTIC_ISOLATION_FIELD_NAME, self.MEDIAN_INCOME_FIELD_NAME, + self.MEDIAN_INCOME_STATE_FIELD_NAME, + self.MEDIAN_INCOME_AS_PERCENT_OF_STATE_FIELD_NAME, ] self.df[columns_to_include].to_csv( diff --git a/data/data-pipeline/data_pipeline/ipython/census_explore.ipynb b/data/data-pipeline/data_pipeline/ipython/census_explore.ipynb new file mode 100644 index 00000000..76d22fd7 --- /dev/null +++ b/data/data-pipeline/data_pipeline/ipython/census_explore.ipynb @@ -0,0 +1,136 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "0491828b", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import censusdata\n", + "import csv\n", + "from pathlib import Path\n", + "import os\n", + "import sys\n", + "\n", + "module_path = os.path.abspath(os.path.join(\"../..\"))\n", + "if module_path not in sys.path:\n", + " sys.path.append(module_path)\n", + "\n", + "from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes\n", + "\n", + "\n", + "ACS_YEAR = 2019\n", + "\n", + "DATA_PATH = Path.cwd().parent / \"data\"\n", + "FIPS_CSV_PATH = DATA_PATH / \"fips_states_2010.csv\"\n", + "\n", + "GEOID_FIELD_NAME = \"GEOID10\"\n", + "UNEMPLOYED_FIELD_NAME = \"Unemployed Civilians (fraction)\"\n", + "\n", + "# Some display settings to make pandas outputs more readable.\n", + "pd.set_option(\"display.expand_frame_repr\", False)\n", + "pd.set_option(\"display.precision\", 2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "654f25a1", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# Following the tutorial at https://jtleider.github.io/censusdata/example1.html.\n", + "# Full list of fields is at https://www2.census.gov/programs-surveys/acs/summary_file/2019/documentation/user_tools/ACS2019_Table_Shells.xlsx\n", + "censusdata.printtable(censusdata.censustable(src=\"acs5\", year=ACS_YEAR, table=\"B19013\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8999cea4", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "def fips_from_censusdata_censusgeo(censusgeo: censusdata.censusgeo) -> str:\n", + " \"\"\"Create a FIPS code from the proprietary censusgeo index.\"\"\"\n", + " fips = \"\".join([value for (key, value) in censusgeo.params()])\n", + " return fips\n", + "\n", + "\n", + "dfs = []\n", + "for fips in get_state_fips_codes(DATA_PATH):\n", + " print(f\"Fetching data for fips {fips}\")\n", + " dfs.append(\n", + " censusdata.download(\n", + " src=\"acs5\",\n", + " year=ACS_YEAR,\n", + " geo=censusdata.censusgeo(\n", + " [\n", + " (\"state\", fips) \n", + " #, (\"county\", \"*\"), (\"block group\", \"*\")\n", + " ]\n", + " ),\n", + " var=[\"B23025_005E\", \"B23025_003E\", \"B19013_001E\"],\n", + " )\n", + " )\n", + "\n", + "df = pd.concat(dfs)\n", + "\n", + "df[GEOID_FIELD_NAME] = df.index.to_series().apply(func=fips_from_censusdata_censusgeo)\n", + "\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2a269bb1", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "columns_to_include = [\"GEOID2\", \"Median household income (State)\"]\n", + "\n", + "df.rename(columns={\"GEOID10\": \"GEOID2\", \"B19013_001E\": \"Median household income (State)\"}, inplace=True)\n", + "\n", + "df[columns_to_include].to_csv(path_or_buf= \"/Users/lucas/Documents/usds/repos/justice40-tool/data/data-pipeline/data_pipeline/data/needs_to_be_moved_to_s3/2014_to_2019_state_median_income.csv\", index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "91932af5", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb b/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb index 0e7bbdd1..7eb50ce0 100644 --- a/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb +++ b/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb @@ -3,6 +3,11 @@ { "cell_type": "code", "execution_count": null, + "id": "51412a14", + "metadata": { + "scrolled": true + }, + "outputs": [], "source": [ "import collections\n", "import functools\n", @@ -33,15 +38,16 @@ "\n", "# Turn on TQDM for pandas so that we can have progress bars when running `apply`.\n", "tqdm_notebook.pandas()" - ], - "outputs": [], - "metadata": { - "scrolled": true - } + ] }, { "cell_type": "code", "execution_count": null, + "id": "e3234c61", + "metadata": { + "scrolled": true + }, + "outputs": [], "source": [ "# Suppress scientific notation in pandas (this shows up for census tract IDs)\n", "pd.options.display.float_format = \"{:.2f}\".format\n", @@ -72,97 +78,26 @@ "\n", "# Define some suffixes\n", "POPULATION_SUFFIX = \" (priority population)\"" - ], - "outputs": [], - "metadata": { - "scrolled": true - } + ] }, { "cell_type": "code", "execution_count": null, - "source": [ - "# Load CEJST score data\n", - "cejst_data_path = DATA_DIR / \"score\" / \"csv\" / \"full\" / \"usa.csv\"\n", - "cejst_df = pd.read_csv(cejst_data_path, dtype={GEOID_FIELD_NAME: \"string\"})\n", - "\n", - "# Create the CBG's Census Tract ID by dropping the last number from the FIPS CODE of the CBG.\n", - "# The CBG ID is the last one character.\n", - "# For more information, see https://www.census.gov/programs-surveys/geography/guidance/geo-identifiers.html.\n", - "cejst_df.loc[:, GEOID_TRACT_FIELD_NAME] = (\n", - " cejst_df.loc[:, GEOID_FIELD_NAME].astype(str).str[:-1]\n", - ")\n", - "\n", - "cejst_df.loc[:, GEOID_STATE_FIELD_NAME] = (\n", - " cejst_df.loc[:, GEOID_FIELD_NAME].astype(str).str[0:2]\n", - ")\n", - "\n", - "cejst_df.head()" - ], + "id": "3b1b5ccf", + "metadata": { + "scrolled": false + }, "outputs": [ { - "output_type": "stream", "name": "stderr", + "output_type": "stream", "text": [ "/usr/local/lib/python3.9/site-packages/IPython/core/interactiveshell.py:3169: DtypeWarning: Columns (87,88,90) have mixed types.Specify dtype option on import or set low_memory=False.\n", " has_raised = await self.run_ast_nodes(code_ast.body, cell_name,\n" ] }, { - "output_type": "execute_result", "data": { - "text/plain": [ - " GEOID10 Housing burden (percent) Total population \\\n", - "0 010010201001 0.15 692 \n", - "1 010010201002 0.15 1153 \n", - "2 010010202001 0.25 1020 \n", - "3 010010202002 0.25 1152 \n", - "4 010010203001 0.21 2555 \n", - "\n", - " Air toxics cancer risk Respiratory hazard index \\\n", - "0 49.38 0.79 \n", - "1 49.38 0.79 \n", - "2 50.32 0.81 \n", - "3 50.32 0.81 \n", - "4 50.77 0.82 \n", - "\n", - " Diesel particulate matter Particulate matter (PM2.5) Ozone \\\n", - "0 0.28 10.00 40.12 \n", - "1 0.28 10.00 40.12 \n", - "2 0.30 10.07 40.22 \n", - "3 0.30 10.07 40.22 \n", - "4 0.36 10.12 40.31 \n", - "\n", - " Traffic proximity and volume Proximity to RMP sites ... \\\n", - "0 91.02 0.09 ... \n", - "1 2.62 0.07 ... \n", - "2 4.68 0.08 ... \n", - "3 218.65 0.09 ... \n", - "4 69.64 0.08 ... \n", - "\n", - " Score D (top 25th percentile) Score E (percentile) \\\n", - "0 False 0.35 \n", - "1 False 0.11 \n", - "2 False 0.51 \n", - "3 False 0.59 \n", - "4 False 0.47 \n", - "\n", - " Score E (top 25th percentile) GEOID State Abbreviation County Name \\\n", - "0 False 1001 AL Autauga County \n", - "1 False 1001 AL Baldwin County \n", - "2 False 1001 AL Barbour County \n", - "3 False 1001 AL Bibb County \n", - "4 False 1001 AL Blount County \n", - "\n", - " State Code State Name GEOID10_TRACT GEOID10_STATE \n", - "0 1.00 Alabama 01001020100 01 \n", - "1 2.00 Alaska 01001020100 01 \n", - "2 4.00 Arizona 01001020200 01 \n", - "3 5.00 Arkansas 01001020200 01 \n", - "4 6.00 California 01001020300 01 \n", - "\n", - "[5 rows x 93 columns]" - ], "text/html": [ "
\n", + " | \n", + " | GEOID10_STATE | \n", + "State name | \n", + "Total CBGs in state | \n", + "Total population in state | \n", + "Score A (top 25th percentile) (priority population) | \n", + "Score A (top 25th percentile) (total CBGs) | \n", + "Score A (top 25th percentile) (percent CBGs) | \n", + "Score A (top 25th percentile) (percent population) | \n", + "Score B (top 25th percentile) (priority population) | \n", + "Score B (top 25th percentile) (total CBGs) | \n", + "... | \n", + "Score E (top 25th percentile) (percent CBGs) | \n", + "Score E (top 25th percentile) (percent population) | \n", + "calenviroscreen_priority_community (priority population) | \n", + "calenviroscreen_priority_community (total CBGs) | \n", + "calenviroscreen_priority_community (percent CBGs) | \n", + "calenviroscreen_priority_community (percent population) | \n", + "hud_recap_priority_community (priority population) | \n", + "hud_recap_priority_community (total CBGs) | \n", + "hud_recap_priority_community (percent CBGs) | \n", + "hud_recap_priority_community (percent population) | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
GEOID10_STATE | \n", + "\n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " |
01 | \n", + "0 | \n", + "01 | \n", + "Alabama | \n", + "3438 | \n", + "4850771 | \n", + "1547345 | \n", + "1326 | \n", + "0.39 | \n", + "0.32 | \n", + "1556417 | \n", + "1323 | \n", + "... | \n", + "0.23 | \n", + "0.19 | \n", + "0 | \n", + "0 | \n", + "0.00 | \n", + "0.00 | \n", + "235117 | \n", + "258 | \n", + "0.08 | \n", + "0.05 | \n", + "
02 | \n", + "0 | \n", + "02 | \n", + "Alaska | \n", + "534 | \n", + "738565 | \n", + "63868 | \n", + "57 | \n", + "0.11 | \n", + "0.09 | \n", + "63868 | \n", + "57 | \n", + "... | \n", + "0.14 | \n", + "0.12 | \n", + "0 | \n", + "0 | \n", + "0.00 | \n", + "0.00 | \n", + "6536 | \n", + "8 | \n", + "0.01 | \n", + "0.01 | \n", + "
04 | \n", + "0 | \n", + "04 | \n", + "Arizona | \n", + "4178 | \n", + "6809946 | \n", + "1956052 | \n", + "1230 | \n", + "0.29 | \n", + "0.29 | \n", + "1960856 | \n", + "1231 | \n", + "... | \n", + "0.30 | \n", + "0.30 | \n", + "0 | \n", + "0 | \n", + "0.00 | \n", + "0.00 | \n", + "560353 | \n", + "378 | \n", + "0.09 | \n", + "0.08 | \n", + "
05 | \n", + "0 | \n", + "05 | \n", + "Arkansas | \n", + "2147 | \n", + "2977944 | \n", + "960799 | \n", + "817 | \n", + "0.38 | \n", + "0.32 | \n", + "975780 | \n", + "826 | \n", + "... | \n", + "0.20 | \n", + "0.18 | \n", + "0 | \n", + "0 | \n", + "0.00 | \n", + "0.00 | \n", + "101200 | \n", + "106 | \n", + "0.05 | \n", + "0.03 | \n", + "
06 | \n", + "0 | \n", + "06 | \n", + "California | \n", + "23212 | \n", + "38982847 | \n", + "12610810 | \n", + "7102 | \n", + "0.31 | \n", + "0.32 | \n", + "12556846 | \n", + "7065 | \n", + "... | \n", + "0.40 | \n", + "0.42 | \n", + "9610287 | \n", + "5690 | \n", + "0.25 | \n", + "0.25 | \n", + "1748765 | \n", + "1013 | \n", + "0.04 | \n", + "0.04 | \n", + "
5 rows × 32 columns
\n", + "\n", - " | \n", - " | GEOID10_STATE | \n", - "State name | \n", - "Total CBGs in state | \n", - "Total population in state | \n", - "Score A (top 25th percentile) (priority population) | \n", - "Score A (top 25th percentile) (total CBGs) | \n", - "Score A (top 25th percentile) (percent CBGs) | \n", - "Score A (top 25th percentile) (percent population) | \n", - "Score B (top 25th percentile) (priority population) | \n", - "Score B (top 25th percentile) (total CBGs) | \n", - "... | \n", - "Score E (top 25th percentile) (percent CBGs) | \n", - "Score E (top 25th percentile) (percent population) | \n", - "calenviroscreen_priority_community (priority population) | \n", - "calenviroscreen_priority_community (total CBGs) | \n", - "calenviroscreen_priority_community (percent CBGs) | \n", - "calenviroscreen_priority_community (percent population) | \n", - "hud_recap_priority_community (priority population) | \n", - "hud_recap_priority_community (total CBGs) | \n", - "hud_recap_priority_community (percent CBGs) | \n", - "hud_recap_priority_community (percent population) | \n", - "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
GEOID10_STATE | \n", - "\n", - " | \n", - " | \n", - " | \n", - " | \n", - " | \n", - " | \n", - " | \n", - " | \n", - " | \n", - " | \n", - " | \n", - " | \n", - " | \n", - " | \n", - " | \n", - " | \n", - " | \n", - " | \n", - " | \n", - " | \n", - " | \n", - " |
01 | \n", - "0 | \n", - "01 | \n", - "Alabama | \n", - "3438 | \n", - "4850771 | \n", - "1547345 | \n", - "1326 | \n", - "0.39 | \n", - "0.32 | \n", - "1556417 | \n", - "1323 | \n", - "... | \n", - "0.23 | \n", - "0.19 | \n", - "0 | \n", - "0 | \n", - "0.00 | \n", - "0.00 | \n", - "235117 | \n", - "258 | \n", - "0.08 | \n", - "0.05 | \n", - "
02 | \n", - "0 | \n", - "02 | \n", - "Alaska | \n", - "534 | \n", - "738565 | \n", - "63868 | \n", - "57 | \n", - "0.11 | \n", - "0.09 | \n", - "63868 | \n", - "57 | \n", - "... | \n", - "0.14 | \n", - "0.12 | \n", - "0 | \n", - "0 | \n", - "0.00 | \n", - "0.00 | \n", - "6536 | \n", - "8 | \n", - "0.01 | \n", - "0.01 | \n", - "
04 | \n", - "0 | \n", - "04 | \n", - "Arizona | \n", - "4178 | \n", - "6809946 | \n", - "1956052 | \n", - "1230 | \n", - "0.29 | \n", - "0.29 | \n", - "1960856 | \n", - "1231 | \n", - "... | \n", - "0.30 | \n", - "0.30 | \n", - "0 | \n", - "0 | \n", - "0.00 | \n", - "0.00 | \n", - "560353 | \n", - "378 | \n", - "0.09 | \n", - "0.08 | \n", - "
05 | \n", - "0 | \n", - "05 | \n", - "Arkansas | \n", - "2147 | \n", - "2977944 | \n", - "960799 | \n", - "817 | \n", - "0.38 | \n", - "0.32 | \n", - "975780 | \n", - "826 | \n", - "... | \n", - "0.20 | \n", - "0.18 | \n", - "0 | \n", - "0 | \n", - "0.00 | \n", - "0.00 | \n", - "101200 | \n", - "106 | \n", - "0.05 | \n", - "0.03 | \n", - "
06 | \n", - "0 | \n", - "06 | \n", - "California | \n", - "23212 | \n", - "38982847 | \n", - "12610810 | \n", - "7102 | \n", - "0.31 | \n", - "0.32 | \n", - "12556846 | \n", - "7065 | \n", - "... | \n", - "0.40 | \n", - "0.42 | \n", - "9610287 | \n", - "5690 | \n", - "0.25 | \n", - "0.25 | \n", - "1748765 | \n", - "1013 | \n", - "0.04 | \n", - "0.04 | \n", - "
5 rows × 32 columns
\n", - "