From 1083e953da638a490dda3787339e6da687822caa Mon Sep 17 00:00:00 2001
From: Lucas Merrill Brown
Date: Tue, 14 Sep 2021 09:48:11 -0500
Subject: [PATCH] Prototype G (#672)

* wip
* cleanup
* cleanup 2
* fixing import ordering linter error
* updating backend to use score G
* adding percentile to score output
* update tippecanoe compression

Co-authored-by: Jorge Escobar
---
 .../data_pipeline/etl/score/constants.py     |  2 +
 .../data_pipeline/etl/score/etl_score.py     | 92 ++++++++++++++-----
 .../data_pipeline/etl/score/etl_score_geo.py |  4 +-
 .../etl/sources/census_acs/etl.py            | 10 ++
 .../ipython/scoring_comparison.ipynb         | 52 ++++++++---
 .../data_pipeline/tile/generate.py           |  2 +
 6 files changed, 123 insertions(+), 39 deletions(-)

diff --git a/data/data-pipeline/data_pipeline/etl/score/constants.py b/data/data-pipeline/data_pipeline/etl/score/constants.py
index 3d262f98..9eac120f 100644
--- a/data/data-pipeline/data_pipeline/etl/score/constants.py
+++ b/data/data-pipeline/data_pipeline/etl/score/constants.py
@@ -57,6 +57,8 @@ TILES_SCORE_COLUMNS = [
     "Score D (top 25th percentile)",
     "Score E (percentile)",
     "Score E (top 25th percentile)",
+    "Score G (communities)",
+    "Score G",
     "Poverty (Less than 200% of federal poverty line) (percentile)",
     "Percent individuals age 25 or over with less than high school degree (percentile)",
     "Linguistic isolation (percent) (percentile)",
diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score.py b/data/data-pipeline/data_pipeline/etl/score/etl_score.py
index 1ede5a5a..14f181b7 100644
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py
@@ -1,7 +1,8 @@
 import collections
 import functools
-
+from pathlib import Path
 import pandas as pd
+
 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.utils import get_module_logger
 
@@ -11,11 +12,11 @@ logger = get_module_logger(__name__)
 class ScoreETL(ExtractTransformLoad):
     def __init__(self):
         # Define some global parameters
-        self.BUCKET_SOCIOECONOMIC = "Socioeconomic Factors"
-        self.BUCKET_SENSITIVE = "Sensitive populations"
-        self.BUCKET_ENVIRONMENTAL = "Environmental effects"
-        self.BUCKET_EXPOSURES = "Exposures"
-        self.BUCKETS = [
+        self.BUCKET_SOCIOECONOMIC: str = "Socioeconomic Factors"
+        self.BUCKET_SENSITIVE: str = "Sensitive populations"
+        self.BUCKET_ENVIRONMENTAL: str = "Environmental effects"
+        self.BUCKET_EXPOSURES: str = "Exposures"
+        self.BUCKETS: list = [
             self.BUCKET_SOCIOECONOMIC,
             self.BUCKET_SENSITIVE,
             self.BUCKET_ENVIRONMENTAL,
@@ -24,43 +25,47 @@ class ScoreETL(ExtractTransformLoad):
 
         # A few specific field names
         # TODO: clean this up, I name some fields but not others.
-        self.UNEMPLOYED_FIELD_NAME = "Unemployed civilians (percent)"
-        self.LINGUISTIC_ISOLATION_FIELD_NAME = "Linguistic isolation (percent)"
-        self.HOUSING_BURDEN_FIELD_NAME = "Housing burden (percent)"
-        self.POVERTY_FIELD_NAME = (
+        self.UNEMPLOYED_FIELD_NAME: str = "Unemployed civilians (percent)"
+        self.LINGUISTIC_ISOLATION_FIELD_NAME: str = "Linguistic isolation (percent)"
+        self.HOUSING_BURDEN_FIELD_NAME: str = "Housing burden (percent)"
+        self.POVERTY_FIELD_NAME: str = (
             "Poverty (Less than 200% of federal poverty line)"
         )
-        self.HIGH_SCHOOL_FIELD_NAME = "Percent individuals age 25 or over with less than high school degree"
+        self.HIGH_SCHOOL_FIELD_NAME: str = "Percent individuals age 25 or over with less than high school degree"
         self.STATE_MEDIAN_INCOME_FIELD_NAME: str = (
             "Median household income (State; 2019 inflation-adjusted dollars)"
         )
-        self.MEDIAN_INCOME_FIELD_NAME = (
+        self.MEDIAN_INCOME_FIELD_NAME: str = (
             "Median household income in the past 12 months"
         )
-        self.MEDIAN_INCOME_AS_PERCENT_OF_STATE_FIELD_NAME = (
+        self.MEDIAN_INCOME_AS_PERCENT_OF_STATE_FIELD_NAME: str = (
             "Median household income (% of state median household income)"
         )
+        self.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD_NAME: str = (
+            "Median household income (% of AMI)"
+        )
+        self.AMI_FIELD_NAME: str = "Area Median Income (State or metropolitan)"
 
         # Note: these variable names are slightly different (missing the word `PERCENT`) than those in the source ETL to avoid pylint's duplicate
         # code error. - LMB
-        self.POVERTY_LESS_THAN_100_FPL_FIELD_NAME = (
+        self.POVERTY_LESS_THAN_100_FPL_FIELD_NAME: str = (
             "Percent of individuals < 100% Federal Poverty Line"
         )
-        self.POVERTY_LESS_THAN_150_FPL_FIELD_NAME = (
+        self.POVERTY_LESS_THAN_150_FPL_FIELD_NAME: str = (
             "Percent of individuals < 150% Federal Poverty Line"
        )
-        self.POVERTY_LESS_THAN_200_FPL_FIELD_NAME = (
+        self.POVERTY_LESS_THAN_200_FPL_FIELD_NAME: str = (
             "Percent of individuals < 200% Federal Poverty Line"
         )
 
         # There's another aggregation level (a second level of "buckets").
-        self.AGGREGATION_POLLUTION = "Pollution Burden"
-        self.AGGREGATION_POPULATION = "Population Characteristics"
+        self.AGGREGATION_POLLUTION: str = "Pollution Burden"
+        self.AGGREGATION_POPULATION: str = "Population Characteristics"
 
-        self.PERCENTILE_FIELD_SUFFIX = " (percentile)"
-        self.MIN_MAX_FIELD_SUFFIX = " (min-max normalized)"
+        self.PERCENTILE_FIELD_SUFFIX: str = " (percentile)"
+        self.MIN_MAX_FIELD_SUFFIX: str = " (min-max normalized)"
 
-        self.SCORE_CSV_PATH = self.DATA_PATH / "score" / "csv" / "full"
+        self.SCORE_CSV_PATH: Path = self.DATA_PATH / "score" / "csv" / "full"
 
         # dataframes
         self.df: pd.DataFrame
@@ -146,6 +151,16 @@ class ScoreETL(ExtractTransformLoad):
                 renamed_field=self.POVERTY_LESS_THAN_200_FPL_FIELD_NAME,
                 bucket=None,
             ),
+            DataSet(
+                input_field=self.AMI_FIELD_NAME,
+                renamed_field=self.AMI_FIELD_NAME,
+                bucket=None,
+            ),
+            DataSet(
+                input_field=self.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD_NAME,
+                renamed_field=self.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD_NAME,
+                bucket=None,
+            ),
             # The following data sets have buckets, because they're used in Score C
             DataSet(
                 input_field="CANCER",
@@ -523,7 +538,33 @@ class ScoreETL(ExtractTransformLoad):
 
     def _add_score_g(self, df: pd.DataFrame) -> pd.DataFrame:
         logger.info("Adding Score G")
-        # TODO: add scoring
+
+        high_school_cutoff_threshold = 0.05
+
+        df["Score G (communities)"] = (
+            (df[self.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD_NAME] < 0.7)
+            & (df[self.HIGH_SCHOOL_FIELD_NAME] > high_school_cutoff_threshold)
+        ) | (
+            (df[self.POVERTY_LESS_THAN_200_FPL_FIELD_NAME] > 0.50)
+            & (df[self.HIGH_SCHOOL_FIELD_NAME] > high_school_cutoff_threshold)
+        )
+        df["Score G"] = df["Score G (communities)"].astype(int)
+        df["Score G (percentile)"] = df["Score G"]
+
+        df["NMTC (communities)"] = (
+            (df[self.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD_NAME] < 0.8)
+        ) | (
+            (df[self.POVERTY_LESS_THAN_100_FPL_FIELD_NAME] > 0.20)
+        )
+
+        df["NMTC modified (communities)"] = (
+            (df[self.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD_NAME] < 0.8)
+            & (df[self.HIGH_SCHOOL_FIELD_NAME] > high_school_cutoff_threshold)
+        ) | (
+            (df[self.POVERTY_LESS_THAN_100_FPL_FIELD_NAME] > 0.20)
+            & (df[self.HIGH_SCHOOL_FIELD_NAME] > high_school_cutoff_threshold)
+        )
+
         return df
 
     # TODO Move a lot of this to the ETL part of the pipeline
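Reviewer note on the Score G hunk above: a block group is flagged when it is low-income (median household income below 70% of AMI, or more than half of individuals below 200% of the federal poverty line) and, in either case, more than 5% of adults 25 and over lack a high school degree. The snippet below is a minimal standalone sketch of that boolean logic for sanity-checking outside the pipeline; the three rows of data are invented for illustration.

    import pandas as pd

    # Three hypothetical block groups (values invented for illustration).
    df = pd.DataFrame(
        {
            "Median household income (% of AMI)": [0.65, 0.90, 0.75],
            "Percent of individuals < 200% Federal Poverty Line": [0.30, 0.60, 0.10],
            "Percent individuals age 25 or over with less than high school degree": [
                0.10,
                0.20,
                0.01,
            ],
        }
    )

    high_school_cutoff_threshold = 0.05
    low_hs_attainment = (
        df["Percent individuals age 25 or over with less than high school degree"]
        > high_school_cutoff_threshold
    )

    # Either low-income criterion qualifies, but only in combination with
    # the high-school attainment condition.
    df["Score G (communities)"] = (
        (df["Median household income (% of AMI)"] < 0.7) & low_hs_attainment
    ) | (
        (df["Percent of individuals < 200% Federal Poverty Line"] > 0.50)
        & low_hs_attainment
    )

    print(df["Score G (communities)"].tolist())  # [True, True, False]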
@@ -564,13 +605,15 @@ class ScoreETL(ExtractTransformLoad):
 
         # Calculate median income variables.
         # First, calculate the income of the block group as a fraction of the state income.
-        # TODO: handle null values for CBG median income, which are `-666666666`.
         df[self.MEDIAN_INCOME_AS_PERCENT_OF_STATE_FIELD_NAME] = (
             df[self.MEDIAN_INCOME_FIELD_NAME]
             / df[self.STATE_MEDIAN_INCOME_FIELD_NAME]
         )
 
-        # TODO: Calculate the income of the block group as a fraction of the AMI (either state or metropolitan, depending on reference).
+        # Calculate the income of the block group as a fraction of the AMI (either state or metropolitan, depending on reference).
+        df[self.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD_NAME] = (
+            df[self.MEDIAN_INCOME_FIELD_NAME] / df[self.AMI_FIELD_NAME]
+        )
 
         # TODO Refactor to no longer use the data_sets list and do all renaming in ETL step
         # Rename columns:
@@ -669,4 +712,5 @@ class ScoreETL(ExtractTransformLoad):
     def load(self) -> None:
         logger.info("Saving Score CSV")
         self.SCORE_CSV_PATH.mkdir(parents=True, exist_ok=True)
+
         self.df.to_csv(self.SCORE_CSV_PATH / "usa.csv", index=False)
diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py b/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py
index 50172186..5ded2956 100644
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py
@@ -26,8 +26,8 @@ class GeoScoreETL(ExtractTransformLoad):
             self.DATA_PATH / "census" / "geojson" / "us.json"
         )
 
-        self.TARGET_SCORE_NAME = "Score D (percentile)"
-        self.TARGET_SCORE_RENAME_TO = "D_SCORE"
+        self.TARGET_SCORE_NAME = "Score G"
+        self.TARGET_SCORE_RENAME_TO = "G_SCORE"
 
         self.NUMBER_OF_BUCKETS = 10
 
diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py
index 052a6f36..29a6c9ca 100644
--- a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py
@@ -99,6 +99,16 @@ class CensusACSETL(ExtractTransformLoad):
             self.MEDIAN_INCOME_FIELD
         ]
 
+        # Handle null values for CBG median income, which are `-666666666`.
+        missing_value_count = (self.df[self.MEDIAN_INCOME_FIELD_NAME] == -666666666).sum()
+        logger.info(
+            f"There are {missing_value_count} ({int(100*missing_value_count/self.df[self.MEDIAN_INCOME_FIELD_NAME].count())}%) values of "
+            + f"`{self.MEDIAN_INCOME_FIELD_NAME}` that will be marked as null values."
+        )
+        self.df[self.MEDIAN_INCOME_FIELD_NAME] = self.df[
+            self.MEDIAN_INCOME_FIELD_NAME
+        ].replace(to_replace=-666666666, value=float("nan"))
+
         # Calculate percent unemployment.
         # TODO: remove small-sample data that should be `None` instead of a high-variance fraction.
         self.df[self.UNEMPLOYED_FIELD_NAME] = (
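A pandas subtlety worth flagging for the null-handling hunk above: in the pandas releases current when this patch was written, Series.replace(to_replace=<scalar>, value=None) does not insert nulls. When value is None and to_replace is a scalar, replace falls back to its method parameter (default "pad") and forward-fills matches from the previous row, which is why the hunk passes an explicit NaN instead. A minimal repro of the difference:

    import pandas as pd

    incomes = pd.Series([55000, -666666666, 72000])

    # Pitfall: value=None with a scalar to_replace triggers method="pad",
    # so the sentinel is forward-filled rather than nulled out.
    print(incomes.replace(to_replace=-666666666, value=None).tolist())
    # -> [55000, 55000, 72000]

    # Intended behavior: replace the Census sentinel with an explicit NaN.
    print(incomes.replace(to_replace=-666666666, value=float("nan")).tolist())
    # -> [55000.0, nan, 72000.0]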
diff --git a/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb b/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb
index 22d7b865..01538091 100644
--- a/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb
+++ b/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb
@@ -85,7 +85,7 @@
    "execution_count": null,
    "id": "3b1b5ccf",
    "metadata": {
-    "scrolled": false
+    "scrolled": true
    },
    "outputs": [],
    "source": [
@@ -107,6 +107,21 @@
     "cejst_df.head()"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d9968187",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Analyze one field at a time (useful for setting thresholds)\n",
+    "field = \"Percent of individuals < 200% Federal Poverty Line\"\n",
+    "print(cejst_df[field].describe())\n",
+    "quantile = .8\n",
+    "print(f\"Quantile at {quantile} is {np.nanquantile(a=cejst_df[field], q=quantile)}\")\n",
+    "cejst_df[field].hist()"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -201,8 +216,8 @@
     ")\n",
     "\n",
     "\n",
-    "if len(merged_df) > 220333:\n",
-    "    raise ValueError(\"Too many rows in the join.\")\n",
+    "if len(merged_df) > 220335:\n",
+    "    raise ValueError(f\"Too many rows in the join: {len(merged_df)}\")\n",
     "\n",
     "merged_df.head()\n",
     "\n",
@@ -232,22 +247,33 @@
     "\n",
     "# Define the indices used for CEJST scoring (`census_block_group_indices`) as well as comparison\n",
     "# (`census_tract_indices`).\n",
+    "\n",
     "census_block_group_indices = [\n",
     "    Index(\n",
+    "        method_name=\"Score G\",\n",
+    "        priority_communities_field=\"Score G (communities)\",\n",
+    "        other_census_tract_fields_to_keep=[],\n",
+    "    ),\n",
+    "    Index(\n",
+    "        method_name=\"NMTC\",\n",
+    "        priority_communities_field=\"NMTC (communities)\",\n",
+    "        other_census_tract_fields_to_keep=[],\n",
+    "    ),\n",
+    "    Index(\n",
     "        method_name=\"Score F\",\n",
     "        priority_communities_field=\"Score F (communities)\",\n",
     "        other_census_tract_fields_to_keep=[],\n",
     "    ),\n",
-    "    Index(\n",
-    "        method_name=\"Score F (socioeconomic only)\",\n",
-    "        priority_communities_field=\"Meets socioeconomic criteria\",\n",
-    "        other_census_tract_fields_to_keep=[],\n",
-    "    ),\n",
-    "    Index(\n",
-    "        method_name=\"Score F (burden only)\",\n",
-    "        priority_communities_field=\"Meets burden criteria\",\n",
-    "        other_census_tract_fields_to_keep=[],\n",
-    "    ),\n",
+    "#     Index(\n",
+    "#         method_name=\"Score F (socioeconomic only)\",\n",
+    "#         priority_communities_field=\"Meets socioeconomic criteria\",\n",
+    "#         other_census_tract_fields_to_keep=[],\n",
+    "#     ),\n",
+    "#     Index(\n",
+    "#         method_name=\"Score F (burden only)\",\n",
+    "#         priority_communities_field=\"Meets burden criteria\",\n",
+    "#         other_census_tract_fields_to_keep=[],\n",
+    "#     ),\n",
     "    Index(\n",
     "        method_name=\"Score A\",\n",
     "        priority_communities_field=\"Score A (top 25th percentile)\",\n",
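Side note on the join guard in the notebook diff above: the hard-coded row ceiling (220335) catches join fan-out only after the fact. An alternative worth considering, assuming the geographic side of the merge is expected to be unique per block group, is pandas' validate argument, which raises pandas.errors.MergeError as soon as a key is unexpectedly duplicated. A sketch with hypothetical stand-in frames and a hypothetical GEOID10 key:

    import pandas as pd

    # Hypothetical stand-ins for the notebook's score and geography frames.
    scores = pd.DataFrame(
        {"GEOID10": ["010010201001", "010010201002"], "Score G": [1, 0]}
    )
    geo = pd.DataFrame(
        {"GEOID10": ["010010201001", "010010201002"], "state": ["AL", "AL"]}
    )

    # validate="m:1" asserts that GEOID10 is unique on the right-hand side;
    # a duplicate there would raise pandas.errors.MergeError instead of
    # silently inflating the row count past a magic number.
    merged_df = scores.merge(geo, on="GEOID10", how="left", validate="m:1")
    print(len(merged_df))  # -> 2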
diff --git a/data/data-pipeline/data_pipeline/tile/generate.py b/data/data-pipeline/data_pipeline/tile/generate.py
index 17c18922..328555b0 100644
--- a/data/data-pipeline/data_pipeline/tile/generate.py
+++ b/data/data-pipeline/data_pipeline/tile/generate.py
@@ -38,6 +38,7 @@ def generate_tiles(data_path: Path) -> None:
     logger.info("Generating USA High mvt folders and files")
     cmd = "tippecanoe "
     cmd += f"--minimum-zoom={USA_HIGH_MIN_ZOOM} --maximum-zoom={USA_HIGH_MAX_ZOOM} --no-tile-compression "
+    cmd += "--drop-densest-as-needed "
     cmd += f"--output-to-directory={high_tile_path} --layer=blocks "
     cmd += str(score_geojson_dir / "usa-high.json")
     call(cmd, shell=True)
@@ -54,6 +55,7 @@ def generate_tiles(data_path: Path) -> None:
     logger.info("Generating USA Low mvt folders and files")
     cmd = "tippecanoe "
     cmd += f"--minimum-zoom={USA_LOW_MIN_ZOOM} --maximum-zoom={USA_LOW_MAX_ZOOM} --no-tile-compression "
+    cmd += "--drop-densest-as-needed "
    cmd += f"--output-to-directory={low_tile_path} --layer=blocks "
     cmd += str(score_geojson_dir / "usa-low.json")
     call(cmd, shell=True)
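For context on the tippecanoe change: --drop-densest-as-needed is a stock tippecanoe option that, when a tile would exceed the size limit, thins the most densely packed features first, which helps keep a nationwide block-group tileset within limits without changing the zoom range. After this patch each assembled command looks roughly like the sketch below; the zoom values and paths here are placeholders, not the module's real constants.

    # Illustrative reconstruction only; zooms and paths are placeholders.
    USA_HIGH_MIN_ZOOM, USA_HIGH_MAX_ZOOM = 5, 11
    high_tile_path = "data/score/tiles/high"
    score_geojson = "data/score/geojson/usa-high.json"

    cmd = "tippecanoe "
    cmd += f"--minimum-zoom={USA_HIGH_MIN_ZOOM} --maximum-zoom={USA_HIGH_MAX_ZOOM} --no-tile-compression "
    cmd += "--drop-densest-as-needed "
    cmd += f"--output-to-directory={high_tile_path} --layer=blocks "
    cmd += score_geojson
    print(cmd)
    # tippecanoe --minimum-zoom=5 --maximum-zoom=11 --no-tile-compression --drop-densest-as-needed --output-to-directory=data/score/tiles/high --layer=blocks data/score/geojson/usa-high.json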