From 05ebf9b48cab92fbe5166ddaaae43a4f0a236907 Mon Sep 17 00:00:00 2001 From: Lucas Merrill Brown Date: Sat, 13 Nov 2021 10:29:23 -0500 Subject: [PATCH] Add median house value to Definition L (#882) * Added house value to ETL * Adding house value to score formula and comp tool --- .../data_pipeline/etl/score/etl_score.py | 1 + .../etl/sources/census_acs/etl.py | 46 ++++++++++------ .../etl/sources/census_decennial/etl.py | 10 ++-- .../ipython/census_explore.ipynb | 53 +++++++++++-------- .../ipython/scoring_comparison.ipynb | 2 + .../data_pipeline/score/field_names.py | 2 + .../data_pipeline/score/score_l.py | 24 +++++++-- 7 files changed, 90 insertions(+), 48 deletions(-) diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score.py b/data/data-pipeline/data_pipeline/etl/score/etl_score.py index 729586f0..8c610fac 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py @@ -312,6 +312,7 @@ class ScoreETL(ExtractTransformLoad): field_names.HIGH_SCHOOL_ED_FIELD, field_names.UNEMPLOYMENT_FIELD, field_names.HT_INDEX_FIELD, + field_names.MEDIAN_HOUSE_VALUE_FIELD, ] non_numeric_columns = [ diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py index 25b1bf06..dda15015 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py @@ -50,6 +50,11 @@ class CensusACSETL(ExtractTransformLoad): "Percent of individuals < 200% Federal Poverty Line" ) + self.MEDIAN_HOUSE_VALUE_FIELD = "B25077_001E" + self.MEDIAN_HOUSE_VALUE_FIELD_NAME = ( + "Median value ($) of owner-occupied housing units" + ) + self.STATE_GEOID_FIELD_NAME = "GEOID2" self.df: pd.DataFrame @@ -78,7 +83,10 @@ class CensusACSETL(ExtractTransformLoad): # Emploment fields "B23025_005E", "B23025_003E", + # Income field self.MEDIAN_INCOME_FIELD, + # House value + self.MEDIAN_HOUSE_VALUE_FIELD, ] + self.LINGUISTIC_ISOLATION_FIELDS + self.POVERTY_FIELDS, @@ -94,22 +102,27 @@ class CensusACSETL(ExtractTransformLoad): def transform(self) -> None: logger.info("Starting Census ACS Transform") - # Rename median income - self.df[self.MEDIAN_INCOME_FIELD_NAME] = self.df[ - self.MEDIAN_INCOME_FIELD - ] + # Rename two fields. + self.df = self.df.rename( + columns={ + self.MEDIAN_HOUSE_VALUE_FIELD: self.MEDIAN_HOUSE_VALUE_FIELD_NAME, + self.MEDIAN_INCOME_FIELD: self.MEDIAN_INCOME_FIELD_NAME, + } + ) - # Handle null values for CBG median income, which are `-666666666`. - missing_value_count = sum( - self.df[self.MEDIAN_INCOME_FIELD_NAME] == -666666666 - ) - logger.info( - f"There are {missing_value_count} ({int(100*missing_value_count/self.df[self.MEDIAN_INCOME_FIELD_NAME].count())}%) values of " - + f"`{self.MEDIAN_INCOME_FIELD_NAME}` being marked as null values." - ) - self.df[self.MEDIAN_INCOME_FIELD_NAME] = self.df[ - self.MEDIAN_INCOME_FIELD_NAME - ].replace(to_replace=-666666666, value=None) + # Handle null values for various fields, which are `-666666666`. + for field in [ + self.MEDIAN_INCOME_FIELD_NAME, + self.MEDIAN_HOUSE_VALUE_FIELD_NAME, + ]: + missing_value_count = sum(self.df[field] == -666666666) + logger.info( + f"There are {missing_value_count} ({int(100*missing_value_count/self.df[field].count())}%) values of " + + f"`{field}` being marked as null values." + ) + self.df[field] = self.df[field].replace( + to_replace=-666666666, value=None + ) # Calculate percent unemployment. # TODO: remove small-sample data that should be `None` instead of a high-variance fraction. @@ -133,8 +146,6 @@ class CensusACSETL(ExtractTransformLoad): / self.df["C16002_001E"] ) - self.df[self.LINGUISTIC_ISOLATION_FIELD_NAME].describe() - # Calculate percent at different poverty thresholds self.df[self.POVERTY_LESS_THAN_100_PERCENT_FPL_FIELD_NAME] = ( self.df["C17002_002E"] + self.df["C17002_003E"] @@ -170,6 +181,7 @@ class CensusACSETL(ExtractTransformLoad): self.POVERTY_LESS_THAN_100_PERCENT_FPL_FIELD_NAME, self.POVERTY_LESS_THAN_150_PERCENT_FPL_FIELD_NAME, self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME, + self.MEDIAN_HOUSE_VALUE_FIELD_NAME, ] self.df[columns_to_include].to_csv( diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py index 6b51a9c7..f3ba33fe 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py @@ -56,13 +56,17 @@ class CensusDecennialETL(ExtractTransformLoad): self.MALE_HIGH_SCHOOL_ED_FIELD = "PBG026005" self.MALE_HIGH_SCHOOL_ED_VI_FIELD = "PCT032011" - self.MALE_HIGH_SCHOOL_ED_FIELD_NAME = "Total!!Male!!High school graduate, GED, or alternative; "\ + self.MALE_HIGH_SCHOOL_ED_FIELD_NAME = ( + "Total!!Male!!High school graduate, GED, or alternative; " "SEX BY EDUCATIONAL ATTAINMENT FOR THE POPULATION 25 YEARS AND OVER" + ) self.FEMALE_HIGH_SCHOOL_ED_FIELD = "PBG026012" self.FEMALE_HIGH_SCHOOL_ED_VI_FIELD = "PCT032028" - self.FEMALE_HIGH_SCHOOL_ED_FIELD_NAME = "Total!!Female!!High school graduate, GED, or alternative; "\ - "SEX BY EDUCATIONAL ATTAINMENT FOR THE POPULATION 25 YEARS AND OVER" + self.FEMALE_HIGH_SCHOOL_ED_FIELD_NAME = ( + "Total!!Female!!High school graduate, GED, or alternative; " + "SEX BY EDUCATIONAL ATTAINMENT FOR THE POPULATION 25 YEARS AND OVER" + ) self.PERCENTAGE_HIGH_SCHOOL_ED_FIELD_NAME = ( "PERCENTAGE_HIGH_SCHOOL_ED_FIELD_NAME" diff --git a/data/data-pipeline/data_pipeline/ipython/census_explore.ipynb b/data/data-pipeline/data_pipeline/ipython/census_explore.ipynb index 731b7371..d29076d2 100644 --- a/data/data-pipeline/data_pipeline/ipython/census_explore.ipynb +++ b/data/data-pipeline/data_pipeline/ipython/census_explore.ipynb @@ -3,6 +3,9 @@ { "cell_type": "code", "execution_count": null, + "id": "4899d2ef", + "metadata": {}, + "outputs": [], "source": [ "import pandas as pd\n", "import censusdata\n", @@ -29,28 +32,34 @@ "# Some display settings to make pandas outputs more readable.\n", "pd.set_option(\"display.expand_frame_repr\", False)\n", "pd.set_option(\"display.precision\", 2)" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": null, + "id": "4dd8feec", + "metadata": { + "scrolled": false + }, + "outputs": [], "source": [ "# Following the tutorial at https://jtleider.github.io/censusdata/example1.html.\n", "# Full list of fields is at https://www2.census.gov/programs-surveys/acs/summary_file/2019/documentation/user_tools/ACS2019_Table_Shells.xlsx\n", "censusdata.printtable(\n", - " censusdata.censustable(src=\"acs5\", year=ACS_YEAR, table=\"B19013\")\n", - ")" - ], - "outputs": [], - "metadata": { - "scrolled": true - } + " censusdata.censustable(src=\"acs5\", year=ACS_YEAR, table=\"B25077\")\n", + ")\n", + "\n", + "# censusdata.search(src=\"acs5\", year=ACS_YEAR, field='label', criterion='Owner-occupied units!!Median')" + ] }, { "cell_type": "code", "execution_count": null, + "id": "7b40afd3", + "metadata": { + "scrolled": true + }, + "outputs": [], "source": [ "def fips_from_censusdata_censusgeo(censusgeo: censusdata.censusgeo) -> str:\n", " \"\"\"Create a FIPS code from the proprietary censusgeo index.\"\"\"\n", @@ -82,15 +91,16 @@ ")\n", "\n", "df.head()" - ], - "outputs": [], - "metadata": { - "scrolled": true - } + ] }, { "cell_type": "code", "execution_count": null, + "id": "caa0b502", + "metadata": { + "scrolled": true + }, + "outputs": [], "source": [ "columns_to_include = [\"GEOID2\", \"Median household income (State)\"]\n", "\n", @@ -103,18 +113,15 @@ ")\n", "\n", "# df[columns_to_include].to_csv(path_or_buf= \"/Users/lucas/Documents/usds/repos/justice40-tool/data/data-pipeline/data_pipeline/data/needs_to_be_moved_to_s3/2014_to_2019_state_median_income.csv\", index=False)" - ], - "outputs": [], - "metadata": { - "scrolled": true - } + ] }, { "cell_type": "code", "execution_count": null, - "source": [], + "id": "f2bddf6a", + "metadata": {}, "outputs": [], - "metadata": {} + "source": [] } ], "metadata": { @@ -138,4 +145,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} diff --git a/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb b/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb index 3beb7a0f..b33ae10d 100644 --- a/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb +++ b/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb @@ -86,6 +86,7 @@ "BAD_HEALTH_FIELD = (\n", " \"Physical health not good for >=14 days among adults aged >=18 years\"\n", ")\n", + "MEDIAN_HOUSE_VALUE_FIELD = \"Median value ($) of owner-occupied housing units\"\n", "\n", "# Define some suffixes\n", "POPULATION_SUFFIX = \" (priority population)\"" @@ -186,6 +187,7 @@ " \"Particulate matter (PM2.5) (percentile)\",\n", " \"Traffic proximity and volume (percentile)\",\n", " \"Percent of individuals < 200% Federal Poverty Line (percentile)\",\n", + " MEDIAN_HOUSE_VALUE_FIELD,\n", "]:\n", " print(f\"\\n~~~~Analysis for field `{field}`~~~~\")\n", " print(cejst_df[field].describe())\n", diff --git a/data/data-pipeline/data_pipeline/score/field_names.py b/data/data-pipeline/data_pipeline/score/field_names.py index 9c7c22d9..f1f30a76 100644 --- a/data/data-pipeline/data_pipeline/score/field_names.py +++ b/data/data-pipeline/data_pipeline/score/field_names.py @@ -114,6 +114,8 @@ OVER_64_FIELD = "Individuals over 64 years old" # Urban Rural Map URBAN_HERUISTIC_FIELD = "Urban Heuristic Flag" +# Housing value +MEDIAN_HOUSE_VALUE_FIELD = "Median value ($) of owner-occupied housing units" # EJSCREEN Areas of Concern EJSCREEN_AREAS_OF_CONCERN_NATIONAL_70TH_PERCENTILE_COMMUNITIES_FIELD_NAME = ( diff --git a/data/data-pipeline/data_pipeline/score/score_l.py b/data/data-pipeline/data_pipeline/score/score_l.py index fa149300..6eba9cb1 100644 --- a/data/data-pipeline/data_pipeline/score/score_l.py +++ b/data/data-pipeline/data_pipeline/score/score_l.py @@ -11,6 +11,7 @@ class ScoreL(Score): def __init__(self, df: pd.DataFrame) -> None: self.LOW_INCOME_THRESHOLD: float = 0.65 self.ENVIRONMENTAL_BURDEN_THRESHOLD: float = 0.90 + self.MEDIAN_HOUSE_VALUE_THRESHOLD: float = 0.90 super().__init__(df) def add_columns(self) -> pd.DataFrame: @@ -135,8 +136,12 @@ class ScoreL(Score): ) & transportation_criteria def _housing_factor(self) -> bool: + # ( # In Xth percentile or above for lead paint (Source: Census's American Community Survey’s # percent of housing units built pre-1960, used as an indicator of potential lead paint exposure in homes) + # AND + # In Yth percentile or below for Median House Value (Source: Census's American Community Survey) + # ) # or # In Xth percentile or above for housing cost burden (Source: HUD's Comprehensive Housing Affordability Strategy dataset # AND @@ -144,11 +149,20 @@ class ScoreL(Score): # of households where household income is less than or equal to twice the federal # poverty level. Source: Census's American Community Survey] housing_criteria = ( - self.df[ - field_names.LEAD_PAINT_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX - ] - > self.ENVIRONMENTAL_BURDEN_THRESHOLD + ( + self.df[ + field_names.LEAD_PAINT_FIELD + + field_names.PERCENTILE_FIELD_SUFFIX + ] + > self.ENVIRONMENTAL_BURDEN_THRESHOLD + ) + & ( + self.df[ + field_names.MEDIAN_HOUSE_VALUE_FIELD + + field_names.PERCENTILE_FIELD_SUFFIX + ] + < self.MEDIAN_HOUSE_VALUE_THRESHOLD + ) ) | ( self.df[ field_names.HOUSING_BURDEN_FIELD