Prototype G (#672)

* wip

* cleanup

* cleanup 2

* fixing import ordering linter error

* updating backend to use score G

* adding percentile to score output

* update tippecanoe compression

Co-authored-by: Jorge Escobar <jorge.e.escobar@omb.eop.gov>
Author: Lucas Merrill Brown, 2021-09-14 09:48:11 -05:00 (committed by GitHub)
parent 92d7f40004
commit 1083e953da
6 changed files with 123 additions and 39 deletions

@@ -57,6 +57,8 @@ TILES_SCORE_COLUMNS = [
     "Score D (top 25th percentile)",
     "Score E (percentile)",
     "Score E (top 25th percentile)",
+    "Score G (communities)",
+    "Score G",
     "Poverty (Less than 200% of federal poverty line) (percentile)",
     "Percent individuals age 25 or over with less than high school degree (percentile)",
     "Linguistic isolation (percent) (percentile)",

@@ -1,7 +1,8 @@
 import collections
 import functools
+from pathlib import Path

 import pandas as pd

 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.utils import get_module_logger
@@ -11,11 +12,11 @@ logger = get_module_logger(__name__)
 class ScoreETL(ExtractTransformLoad):
     def __init__(self):
         # Define some global parameters
-        self.BUCKET_SOCIOECONOMIC = "Socioeconomic Factors"
-        self.BUCKET_SENSITIVE = "Sensitive populations"
-        self.BUCKET_ENVIRONMENTAL = "Environmental effects"
-        self.BUCKET_EXPOSURES = "Exposures"
-        self.BUCKETS = [
+        self.BUCKET_SOCIOECONOMIC: str = "Socioeconomic Factors"
+        self.BUCKET_SENSITIVE: str = "Sensitive populations"
+        self.BUCKET_ENVIRONMENTAL: str = "Environmental effects"
+        self.BUCKET_EXPOSURES: str = "Exposures"
+        self.BUCKETS: list = [
             self.BUCKET_SOCIOECONOMIC,
             self.BUCKET_SENSITIVE,
             self.BUCKET_ENVIRONMENTAL,
@@ -24,43 +25,47 @@ class ScoreETL(ExtractTransformLoad):
         # A few specific field names
         # TODO: clean this up, I name some fields but not others.
-        self.UNEMPLOYED_FIELD_NAME = "Unemployed civilians (percent)"
-        self.LINGUISTIC_ISOLATION_FIELD_NAME = "Linguistic isolation (percent)"
-        self.HOUSING_BURDEN_FIELD_NAME = "Housing burden (percent)"
-        self.POVERTY_FIELD_NAME = (
+        self.UNEMPLOYED_FIELD_NAME: str = "Unemployed civilians (percent)"
+        self.LINGUISTIC_ISOLATION_FIELD_NAME: str = "Linguistic isolation (percent)"
+        self.HOUSING_BURDEN_FIELD_NAME: str = "Housing burden (percent)"
+        self.POVERTY_FIELD_NAME: str = (
             "Poverty (Less than 200% of federal poverty line)"
         )
-        self.HIGH_SCHOOL_FIELD_NAME = "Percent individuals age 25 or over with less than high school degree"
+        self.HIGH_SCHOOL_FIELD_NAME: str = "Percent individuals age 25 or over with less than high school degree"
         self.STATE_MEDIAN_INCOME_FIELD_NAME: str = (
             "Median household income (State; 2019 inflation-adjusted dollars)"
         )
-        self.MEDIAN_INCOME_FIELD_NAME = (
+        self.MEDIAN_INCOME_FIELD_NAME: str = (
             "Median household income in the past 12 months"
         )
-        self.MEDIAN_INCOME_AS_PERCENT_OF_STATE_FIELD_NAME = (
+        self.MEDIAN_INCOME_AS_PERCENT_OF_STATE_FIELD_NAME: str = (
             "Median household income (% of state median household income)"
         )
+        self.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD_NAME: str = (
+            "Median household income (% of AMI)"
+        )
+        self.AMI_FIELD_NAME: str = "Area Median Income (State or metropolitan)"
         # Note: these variable names are slightly different (missing the word `PERCENT`) than those in the source ETL to avoid pylint's duplicate
         # code error. - LMB
-        self.POVERTY_LESS_THAN_100_FPL_FIELD_NAME = (
+        self.POVERTY_LESS_THAN_100_FPL_FIELD_NAME: str = (
             "Percent of individuals < 100% Federal Poverty Line"
         )
-        self.POVERTY_LESS_THAN_150_FPL_FIELD_NAME = (
+        self.POVERTY_LESS_THAN_150_FPL_FIELD_NAME: str = (
             "Percent of individuals < 150% Federal Poverty Line"
         )
-        self.POVERTY_LESS_THAN_200_FPL_FIELD_NAME = (
+        self.POVERTY_LESS_THAN_200_FPL_FIELD_NAME: str = (
             "Percent of individuals < 200% Federal Poverty Line"
        )
         # There's another aggregation level (a second level of "buckets").
-        self.AGGREGATION_POLLUTION = "Pollution Burden"
-        self.AGGREGATION_POPULATION = "Population Characteristics"
-        self.PERCENTILE_FIELD_SUFFIX = " (percentile)"
-        self.MIN_MAX_FIELD_SUFFIX = " (min-max normalized)"
-        self.SCORE_CSV_PATH = self.DATA_PATH / "score" / "csv" / "full"
+        self.AGGREGATION_POLLUTION: str = "Pollution Burden"
+        self.AGGREGATION_POPULATION: str = "Population Characteristics"
+        self.PERCENTILE_FIELD_SUFFIX: str = " (percentile)"
+        self.MIN_MAX_FIELD_SUFFIX: str = " (min-max normalized)"
+        self.SCORE_CSV_PATH: Path = self.DATA_PATH / "score" / "csv" / "full"

         # dataframes
         self.df: pd.DataFrame
@@ -146,6 +151,16 @@ class ScoreETL(ExtractTransformLoad):
                 renamed_field=self.POVERTY_LESS_THAN_200_FPL_FIELD_NAME,
                 bucket=None,
             ),
+            DataSet(
+                input_field=self.AMI_FIELD_NAME,
+                renamed_field=self.AMI_FIELD_NAME,
+                bucket=None,
+            ),
+            DataSet(
+                input_field=self.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD_NAME,
+                renamed_field=self.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD_NAME,
+                bucket=None,
+            ),
             # The following data sets have buckets, because they're used in Score C
             DataSet(
                 input_field="CANCER",
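
Note: `DataSet` itself is defined elsewhere in this module; given the `import collections` at the top of the file, a namedtuple is a plausible shape. A minimal compatible sketch, with the field set inferred from the call sites above rather than confirmed:

    import collections

    # Hypothetical reconstruction: an input column, the name it is renamed to,
    # and an optional bucket label used for Score C aggregation.
    DataSet = collections.namedtuple(
        "DataSet", ["input_field", "renamed_field", "bucket"]
    )

    ami = DataSet(
        input_field="Area Median Income (State or metropolitan)",
        renamed_field="Area Median Income (State or metropolitan)",
        bucket=None,
    )
    print(ami.bucket)  # None, i.e. not part of any Score C bucket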
@@ -523,7 +538,33 @@ class ScoreETL(ExtractTransformLoad):
     def _add_score_g(self, df: pd.DataFrame) -> pd.DataFrame:
         logger.info("Adding Score G")
-        # TODO: add scoring
+        high_school_cutoff_threshold = 0.05
+
+        df["Score G (communities)"] = (
+            (df[self.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD_NAME] < 0.7)
+            & (df[self.HIGH_SCHOOL_FIELD_NAME] > high_school_cutoff_threshold)
+        ) | (
+            (df[self.POVERTY_LESS_THAN_200_FPL_FIELD_NAME] > 0.50)
+            & (df[self.HIGH_SCHOOL_FIELD_NAME] > high_school_cutoff_threshold)
+        )
+        df["Score G"] = df["Score G (communities)"].astype(int)
+        df["Score G (percentile)"] = df["Score G"]
+
+        df["NMTC (communities)"] = (
+            df[self.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD_NAME] < 0.8
+        ) | (
+            df[self.POVERTY_LESS_THAN_100_FPL_FIELD_NAME] > 0.20
+        )
+
+        df["NMTC modified (communities)"] = (
+            (df[self.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD_NAME] < 0.8)
+            & (df[self.HIGH_SCHOOL_FIELD_NAME] > high_school_cutoff_threshold)
+        ) | (
+            (df[self.POVERTY_LESS_THAN_100_FPL_FIELD_NAME] > 0.20)
+            & (df[self.HIGH_SCHOOL_FIELD_NAME] > high_school_cutoff_threshold)
+        )
+
         return df

     # TODO Move a lot of this to the ETL part of the pipeline
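
Note: in words, a block group gets Score G if its median income is under 70% of area median income, or more than half of its residents are below 200% of the federal poverty line; either branch applies only when more than 5% of adults 25 or over lack a high-school diploma. Because the score is a 0/1 flag, the `(percentile)` column is just a copy of the flag, not a rank. A runnable toy check of the same boolean structure (column names here are shortened stand-ins):

    import pandas as pd

    # Toy block groups exercising the Score G cutoffs above.
    df = pd.DataFrame(
        {
            "income_as_pct_ami": [0.65, 0.90, 0.75],
            "pct_below_200_fpl": [0.30, 0.60, 0.20],
            "pct_no_hs_diploma": [0.10, 0.08, 0.02],
        }
    )
    high_school_cutoff_threshold = 0.05

    # Same structure as _add_score_g: (low income AND low attainment)
    # OR (high poverty AND low attainment).
    df["score_g"] = (
        (df["income_as_pct_ami"] < 0.7)
        & (df["pct_no_hs_diploma"] > high_school_cutoff_threshold)
    ) | (
        (df["pct_below_200_fpl"] > 0.50)
        & (df["pct_no_hs_diploma"] > high_school_cutoff_threshold)
    )
    print(df["score_g"].astype(int).tolist())  # [1, 1, 0]

A rank-based percentile for a continuous score would instead look like `df["score"].rank(pct=True)`.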
@@ -564,13 +605,15 @@ class ScoreETL(ExtractTransformLoad):

         # Calculate median income variables.
         # First, calculate the income of the block group as a fraction of the state income.
-        # TODO: handle null values for CBG median income, which are `-666666666`.
         df[self.MEDIAN_INCOME_AS_PERCENT_OF_STATE_FIELD_NAME] = (
             df[self.MEDIAN_INCOME_FIELD_NAME]
             / df[self.STATE_MEDIAN_INCOME_FIELD_NAME]
         )

-        # TODO: Calculate the income of the block group as a fraction of the AMI (either state or metropolitan, depending on reference).
+        # Calculate the income of the block group as a fraction of the AMI (either state or metropolitan, depending on reference).
+        df[self.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD_NAME] = (
+            df[self.MEDIAN_INCOME_FIELD_NAME] / df[self.AMI_FIELD_NAME]
+        )

         # TODO Refactor to no longer use the data_sets list and do all renaming in ETL step
         # Rename columns:
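
Note: one edge case worth keeping in mind with these ratio columns is that pandas propagates missing values through division and returns `inf` for zero denominators, and NaN compares False against any threshold, so such rows silently fail checks like `< 0.7`. A small demonstration with illustrative values:

    import numpy as np
    import pandas as pd

    income = pd.Series([50_000.0, np.nan, 40_000.0])
    ami = pd.Series([80_000.0, 75_000.0, 0.0])

    ratio = income / ami
    print(ratio.tolist())  # [0.625, nan, inf]

    # NaN and inf rows never qualify under a "<" threshold:
    print((ratio < 0.7).tolist())  # [True, False, False]

    # inf usually needs masking to NaN before thresholding or plotting:
    ratio = ratio.replace([np.inf, -np.inf], np.nan)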
@@ -669,4 +712,5 @@ class ScoreETL(ExtractTransformLoad):
     def load(self) -> None:
         logger.info("Saving Score CSV")
+
         self.SCORE_CSV_PATH.mkdir(parents=True, exist_ok=True)
         self.df.to_csv(self.SCORE_CSV_PATH / "usa.csv", index=False)

@@ -26,8 +26,8 @@ class GeoScoreETL(ExtractTransformLoad):
             self.DATA_PATH / "census" / "geojson" / "us.json"
         )

-        self.TARGET_SCORE_NAME = "Score D (percentile)"
-        self.TARGET_SCORE_RENAME_TO = "D_SCORE"
+        self.TARGET_SCORE_NAME = "Score G"
+        self.TARGET_SCORE_RENAME_TO = "G_SCORE"

         self.NUMBER_OF_BUCKETS = 10

@@ -99,6 +99,16 @@ class CensusACSETL(ExtractTransformLoad):
             self.MEDIAN_INCOME_FIELD
         ]

+        # Handle null values for CBG median income, which are `-666666666`.
+        missing_value_count = sum(self.df[self.MEDIAN_INCOME_FIELD_NAME] == -666666666)
+        logger.info(
+            f"There are {missing_value_count} ({int(100 * missing_value_count / self.df[self.MEDIAN_INCOME_FIELD_NAME].count())}%) values of "
+            + f"`{self.MEDIAN_INCOME_FIELD_NAME}` marked as null."
+        )
+        self.df[self.MEDIAN_INCOME_FIELD_NAME] = self.df[
+            self.MEDIAN_INCOME_FIELD_NAME
+        ].replace(to_replace=-666666666, value=None)
+
         # Calculate percent unemployment.
         # TODO: remove small-sample data that should be `None` instead of a high-variance fraction.
         self.df[self.UNEMPLOYED_FIELD_NAME] = (
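
Note: one caveat on the `replace` call, hedged since behavior depends on the pandas version in use. With a scalar `to_replace` and `value=None`, pandas 1.x is documented to fall back to its `method` parameter (default `"pad"`, i.e. forward fill) rather than writing missing values; passing `np.nan` explicitly is unambiguous. A quick check:

    import numpy as np
    import pandas as pd

    s = pd.Series([55_000, -666666666, 62_000])

    # On pandas 1.x this forward-fills the sentinel with the previous row's
    # value rather than nulling it out (here: 55000, not NaN):
    print(s.replace(to_replace=-666666666, value=None).tolist())

    # An explicit NaN target produces a real missing value on any version:
    print(s.replace(to_replace=-666666666, value=np.nan).tolist())
    # [55000.0, nan, 62000.0]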

@@ -85,7 +85,7 @@
    "execution_count": null,
    "id": "3b1b5ccf",
    "metadata": {
-    "scrolled": false
+    "scrolled": true
    },
    "outputs": [],
    "source": [
@@ -107,6 +107,21 @@
     "cejst_df.head()"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d9968187",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Analyze one field at a time (useful for setting thresholds)\n",
+    "field = \"Percent of individuals < 200% Federal Poverty Line\"\n",
+    "print(cejst_df[field].describe())\n",
+    "quantile = .8\n",
+    "print(f\"Quantile at {quantile} is {np.nanquantile(a=cejst_df[field], q=quantile)}\")\n",
+    "cejst_df[field].hist()"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -201,8 +216,8 @@
     ")\n",
     "\n",
     "\n",
-    "if len(merged_df) > 220333:\n",
-    "    raise ValueError(\"Too many rows in the join.\")\n",
+    "if len(merged_df) > 220335:\n",
+    "    raise ValueError(f\"Too many rows in the join: {len(merged_df)}\")\n",
     "\n",
     "merged_df.head()\n",
     "\n",
@@ -232,22 +247,33 @@
     "\n",
     "# Define the indices used for CEJST scoring (`census_block_group_indices`) as well as comparison\n",
     "# (`census_tract_indices`).\n",
+    "\n",
     "census_block_group_indices = [\n",
     "    Index(\n",
+    "        method_name=\"Score G\",\n",
+    "        priority_communities_field=\"Score G (communities)\",\n",
+    "        other_census_tract_fields_to_keep=[],\n",
+    "    ),\n",
+    "    Index(\n",
+    "        method_name=\"NMTC\",\n",
+    "        priority_communities_field=\"NMTC (communities)\",\n",
+    "        other_census_tract_fields_to_keep=[],\n",
+    "    ),\n",
+    "    Index(\n",
     "        method_name=\"Score F\",\n",
     "        priority_communities_field=\"Score F (communities)\",\n",
     "        other_census_tract_fields_to_keep=[],\n",
     "    ),\n",
-    "    Index(\n",
-    "        method_name=\"Score F (socioeconomic only)\",\n",
-    "        priority_communities_field=\"Meets socioeconomic criteria\",\n",
-    "        other_census_tract_fields_to_keep=[],\n",
-    "    ),\n",
-    "    Index(\n",
-    "        method_name=\"Score F (burden only)\",\n",
-    "        priority_communities_field=\"Meets burden criteria\",\n",
-    "        other_census_tract_fields_to_keep=[],\n",
-    "    ),\n",
+    "#     Index(\n",
+    "#         method_name=\"Score F (socioeconomic only)\",\n",
+    "#         priority_communities_field=\"Meets socioeconomic criteria\",\n",
+    "#         other_census_tract_fields_to_keep=[],\n",
+    "#     ),\n",
+    "#     Index(\n",
+    "#         method_name=\"Score F (burden only)\",\n",
+    "#         priority_communities_field=\"Meets burden criteria\",\n",
+    "#         other_census_tract_fields_to_keep=[],\n",
+    "#     ),\n",
     "    Index(\n",
     "        method_name=\"Score A\",\n",
     "        priority_communities_field=\"Score A (top 25th percentile)\",\n",

@@ -38,6 +38,7 @@ def generate_tiles(data_path: Path) -> None:
     logger.info("Generating USA High mvt folders and files")
     cmd = "tippecanoe "
     cmd += f"--minimum-zoom={USA_HIGH_MIN_ZOOM} --maximum-zoom={USA_HIGH_MAX_ZOOM} --no-tile-compression "
+    cmd += "--drop-densest-as-needed "
     cmd += f"--output-to-directory={high_tile_path} --layer=blocks "
     cmd += str(score_geojson_dir / "usa-high.json")
     call(cmd, shell=True)

@@ -54,6 +55,7 @@ def generate_tiles(data_path: Path) -> None:
     logger.info("Generating USA Low mvt folders and files")
     cmd = "tippecanoe "
     cmd += f"--minimum-zoom={USA_LOW_MIN_ZOOM} --maximum-zoom={USA_LOW_MAX_ZOOM} --no-tile-compression "
+    cmd += "--drop-densest-as-needed "
     cmd += f"--output-to-directory={low_tile_path} --layer=blocks "
     cmd += str(score_geojson_dir / "usa-low.json")
     call(cmd, shell=True)
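
Note: `--drop-densest-as-needed` tells tippecanoe that when a tile would exceed its size limit, it should thin the most tightly packed features first rather than fail or emit oversized tiles, which suits a nationwide block-group layer whose density varies wildly. For reference, the same invocation can be expressed as an argument list, avoiding `shell=True` string parsing (zoom levels and paths below are illustrative, not the script's actual values):

    import subprocess

    subprocess.run(
        [
            "tippecanoe",
            "--minimum-zoom=5",
            "--maximum-zoom=11",
            "--no-tile-compression",
            "--drop-densest-as-needed",  # thin dense tiles instead of overflowing the size cap
            "--output-to-directory=data/tiles/high",
            "--layer=blocks",
            "data/score/geojson/usa-high.json",
        ],
        check=True,
    )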