Mirror of https://github.com/DOI-DO/j40-cejst-2.git, synced 2025-02-22 17:44:20 -08:00
Prototype G (#672)
* wip
* cleanup
* cleanup 2
* fixing import ordering linter error
* updating backend to use score G
* adding percentile to score output
* update tippecanoe compression

Co-authored-by: Jorge Escobar <jorge.e.escobar@omb.eop.gov>
parent 92d7f40004
commit 1083e953da
6 changed files with 123 additions and 39 deletions
@@ -57,6 +57,8 @@ TILES_SCORE_COLUMNS = [
     "Score D (top 25th percentile)",
     "Score E (percentile)",
     "Score E (top 25th percentile)",
+    "Score G (communities)",
+    "Score G",
     "Poverty (Less than 200% of federal poverty line) (percentile)",
     "Percent individuals age 25 or over with less than high school degree (percentile)",
     "Linguistic isolation (percent) (percentile)",
@@ -1,7 +1,8 @@
 import collections
 import functools

+from pathlib import Path
 import pandas as pd

 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.utils import get_module_logger
@@ -11,11 +12,11 @@ logger = get_module_logger(__name__)
 class ScoreETL(ExtractTransformLoad):
     def __init__(self):
         # Define some global parameters
-        self.BUCKET_SOCIOECONOMIC = "Socioeconomic Factors"
-        self.BUCKET_SENSITIVE = "Sensitive populations"
-        self.BUCKET_ENVIRONMENTAL = "Environmental effects"
-        self.BUCKET_EXPOSURES = "Exposures"
-        self.BUCKETS = [
+        self.BUCKET_SOCIOECONOMIC: str = "Socioeconomic Factors"
+        self.BUCKET_SENSITIVE: str = "Sensitive populations"
+        self.BUCKET_ENVIRONMENTAL: str = "Environmental effects"
+        self.BUCKET_EXPOSURES: str = "Exposures"
+        self.BUCKETS: str = [
             self.BUCKET_SOCIOECONOMIC,
             self.BUCKET_SENSITIVE,
             self.BUCKET_ENVIRONMENTAL,
@@ -24,43 +25,47 @@ class ScoreETL(ExtractTransformLoad):

         # A few specific field names
         # TODO: clean this up, I name some fields but not others.
-        self.UNEMPLOYED_FIELD_NAME = "Unemployed civilians (percent)"
-        self.LINGUISTIC_ISOLATION_FIELD_NAME = "Linguistic isolation (percent)"
-        self.HOUSING_BURDEN_FIELD_NAME = "Housing burden (percent)"
-        self.POVERTY_FIELD_NAME = (
+        self.UNEMPLOYED_FIELD_NAME: str = "Unemployed civilians (percent)"
+        self.LINGUISTIC_ISOLATION_FIELD_NAME: str = "Linguistic isolation (percent)"
+        self.HOUSING_BURDEN_FIELD_NAME: str = "Housing burden (percent)"
+        self.POVERTY_FIELD_NAME: str = (
             "Poverty (Less than 200% of federal poverty line)"
         )
-        self.HIGH_SCHOOL_FIELD_NAME = "Percent individuals age 25 or over with less than high school degree"
-        self.STATE_MEDIAN_INCOME_FIELD_NAME = (
+        self.HIGH_SCHOOL_FIELD_NAME: str = "Percent individuals age 25 or over with less than high school degree"
+        self.STATE_MEDIAN_INCOME_FIELD_NAME: str = (
             "Median household income (State; 2019 inflation-adjusted dollars)"
         )
-        self.MEDIAN_INCOME_FIELD_NAME = (
+        self.MEDIAN_INCOME_FIELD_NAME: str = (
             "Median household income in the past 12 months"
         )
-        self.MEDIAN_INCOME_AS_PERCENT_OF_STATE_FIELD_NAME = (
+        self.MEDIAN_INCOME_AS_PERCENT_OF_STATE_FIELD_NAME: str = (
             "Median household income (% of state median household income)"
         )
+        self.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD_NAME: str = (
+            "Median household income (% of AMI)"
+        )
+        self.AMI_FIELD_NAME: str = "Area Median Income (State or metropolitan)"

         # Note: these variable names are slightly different (missing the word `PERCENT`) than those in the source ETL to avoid pylint's duplicate
         # code error. - LMB
-        self.POVERTY_LESS_THAN_100_FPL_FIELD_NAME = (
+        self.POVERTY_LESS_THAN_100_FPL_FIELD_NAME: str = (
             "Percent of individuals < 100% Federal Poverty Line"
         )
-        self.POVERTY_LESS_THAN_150_FPL_FIELD_NAME = (
+        self.POVERTY_LESS_THAN_150_FPL_FIELD_NAME: str = (
             "Percent of individuals < 150% Federal Poverty Line"
         )
-        self.POVERTY_LESS_THAN_200_FPL_FIELD_NAME = (
+        self.POVERTY_LESS_THAN_200_FPL_FIELD_NAME: str = (
             "Percent of individuals < 200% Federal Poverty Line"
         )

         # There's another aggregation level (a second level of "buckets").
-        self.AGGREGATION_POLLUTION = "Pollution Burden"
-        self.AGGREGATION_POPULATION = "Population Characteristics"
+        self.AGGREGATION_POLLUTION: str = "Pollution Burden"
+        self.AGGREGATION_POPULATION: str = "Population Characteristics"

-        self.PERCENTILE_FIELD_SUFFIX = " (percentile)"
-        self.MIN_MAX_FIELD_SUFFIX = " (min-max normalized)"
+        self.PERCENTILE_FIELD_SUFFIX: str = " (percentile)"
+        self.MIN_MAX_FIELD_SUFFIX: str = " (min-max normalized)"

-        self.SCORE_CSV_PATH = self.DATA_PATH / "score" / "csv" / "full"
+        self.SCORE_CSV_PATH: Path = self.DATA_PATH / "score" / "csv" / "full"

         # dataframes
         self.df: pd.DataFrame
@@ -146,6 +151,16 @@ class ScoreETL(ExtractTransformLoad):
                 renamed_field=self.POVERTY_LESS_THAN_200_FPL_FIELD_NAME,
                 bucket=None,
             ),
+            DataSet(
+                input_field=self.AMI_FIELD_NAME,
+                renamed_field=self.AMI_FIELD_NAME,
+                bucket=None,
+            ),
+            DataSet(
+                input_field=self.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD_NAME,
+                renamed_field=self.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD_NAME,
+                bucket=None,
+            ),
             # The following data sets have buckets, because they're used in Score C
             DataSet(
                 input_field="CANCER",
@@ -523,7 +538,33 @@

+    def _add_score_g(self, df: pd.DataFrame) -> pd.DataFrame:
+        logger.info("Adding Score G")
+        # TODO: add scoring
+
+        high_school_cutoff_threshold = 0.05
+
+        df["Score G (communities)"] = (
+            (df[self.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD_NAME] < 0.7)
+            & (df[self.HIGH_SCHOOL_FIELD_NAME] > high_school_cutoff_threshold)
+        ) | (
+            (df[self.POVERTY_LESS_THAN_200_FPL_FIELD_NAME] > 0.50)
+            & (df[self.HIGH_SCHOOL_FIELD_NAME] > high_school_cutoff_threshold)
+        )
+        df["Score G"] = df["Score G (communities)"].astype(int)
+        df["Score G (percentile)"] = df["Score G"]
+
+        df["NMTC (communities)"] = (
+            (df[self.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD_NAME] < 0.8)
+        ) | (
+            (df[self.POVERTY_LESS_THAN_100_FPL_FIELD_NAME] > 0.20)
+        )
+
+        df["NMTC modified (communities)"] = (
+            (df[self.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD_NAME] < 0.8)
+            & (df[self.HIGH_SCHOOL_FIELD_NAME] > high_school_cutoff_threshold)
+        ) | (
+            (df[self.POVERTY_LESS_THAN_100_FPL_FIELD_NAME] > 0.20)
+            & (df[self.HIGH_SCHOOL_FIELD_NAME] > high_school_cutoff_threshold)
+        )
+
+        return df
+
     # TODO Move a lot of this to the ETL part of the pipeline
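Not part of the commit, but useful for tracing the thresholds above: a minimal
sketch of how the Score G booleans behave on toy data. The long column names
mirror the field-name constants defined earlier in this file; the three rows
and their values are illustrative only.

import pandas as pd

# Thresholds from the commit: AMI fraction < 0.7 or poverty share > 0.50,
# with a high-school share > 0.05 required in both branches.
df = pd.DataFrame(
    {
        "Median household income (% of AMI)": [0.6, 0.9, 0.9],
        "Percent of individuals < 200% Federal Poverty Line": [0.3, 0.6, 0.3],
        "Percent individuals age 25 or over with less than high school degree": [0.1, 0.1, 0.1],
    }
)

high_school_cutoff_threshold = 0.05
df["Score G (communities)"] = (
    (df["Median household income (% of AMI)"] < 0.7)
    & (df["Percent individuals age 25 or over with less than high school degree"] > high_school_cutoff_threshold)
) | (
    (df["Percent of individuals < 200% Federal Poverty Line"] > 0.50)
    & (df["Percent individuals age 25 or over with less than high school degree"] > high_school_cutoff_threshold)
)
print(df["Score G (communities)"].tolist())  # [True, True, False]

The first row qualifies on the income branch, the second on the poverty
branch, and the third on neither.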
@@ -564,13 +605,15 @@

         # Calculate median income variables.
         # First, calculate the income of the block group as a fraction of the state income.
+        # TODO: handle null values for CBG median income, which are `-666666666`.
         df[self.MEDIAN_INCOME_AS_PERCENT_OF_STATE_FIELD_NAME] = (
             df[self.MEDIAN_INCOME_FIELD_NAME]
             / df[self.STATE_MEDIAN_INCOME_FIELD_NAME]
         )

-        # TODO: Calculate the income of the block group as a fraction of the AMI (either state or metropolitan, depending on reference).
+        # Calculate the income of the block group as a fraction of the AMI (either state or metropolitan, depending on reference).
+        df[self.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD_NAME] = (
+            df[self.MEDIAN_INCOME_FIELD_NAME] / df[self.AMI_FIELD_NAME]
+        )

         # TODO Refactor to no longer use the data_sets list and do all renaming in ETL step
         # Rename columns:
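For intuition, a tiny worked example of the fraction computed above (the short
column names here are placeholders, not the pipeline's field constants): a
block group with a median income of 45,000 against an AMI of 60,000 yields
0.75, which sits above the 0.7 cutoff Score G applies downstream.

import pandas as pd

# Placeholder column names; the pipeline uses its long field-name constants.
df = pd.DataFrame({"median_income": [45_000.0], "ami": [60_000.0]})
df["income_pct_of_ami"] = df["median_income"] / df["ami"]
print(df["income_pct_of_ami"].iloc[0])  # 0.75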
@@ -669,4 +712,5 @@ class ScoreETL(ExtractTransformLoad):
     def load(self) -> None:
         logger.info("Saving Score CSV")
+        self.SCORE_CSV_PATH.mkdir(parents=True, exist_ok=True)

         self.df.to_csv(self.SCORE_CSV_PATH / "usa.csv", index=False)
@@ -26,8 +26,8 @@ class GeoScoreETL(ExtractTransformLoad):
             self.DATA_PATH / "census" / "geojson" / "us.json"
         )

-        self.TARGET_SCORE_NAME = "Score D (percentile)"
-        self.TARGET_SCORE_RENAME_TO = "D_SCORE"
+        self.TARGET_SCORE_NAME = "Score G"
+        self.TARGET_SCORE_RENAME_TO = "G_SCORE"

         self.NUMBER_OF_BUCKETS = 10
@@ -99,6 +99,16 @@ class CensusACSETL(ExtractTransformLoad):
             self.MEDIAN_INCOME_FIELD
         ]

+        # Handle null values for CBG median income, which are `-666666666`.
+        missing_value_count = sum(self.df[self.MEDIAN_INCOME_FIELD_NAME]==-666666666)
+        logger.info(
+            f"There are {missing_value_count} ({int(100*missing_value_count/self.df[self.MEDIAN_INCOME_FIELD_NAME].count())}%) values of "
+            + f"`{self.MEDIAN_INCOME_FIELD_NAME}` being marked as null values."
+        )
+        self.df[self.MEDIAN_INCOME_FIELD_NAME] = self.df[
+            self.MEDIAN_INCOME_FIELD_NAME
+        ].replace(to_replace=-666666666, value=None)
+
         # Calculate percent unemployment.
         # TODO: remove small-sample data that should be `None` instead of a high-variance fraction.
         self.df[self.UNEMPLOYED_FIELD_NAME] = (
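A side note on the replacement step above: a standalone sketch of clearing the
`-666666666` sentinel. Targeting `np.nan` is my choice for the sketch, not the
commit's; in some pandas versions, `replace(to_replace=..., value=None)` falls
back to pad-filling rather than inserting nulls, so an explicit NaN is the
unambiguous way to mark the value as missing.

import numpy as np
import pandas as pd

# Toy series containing the Census sentinel for a missing CBG median income.
income = pd.Series([45_000, -666666666, 52_000])

# Replace the sentinel with an explicit NaN so downstream math treats it as missing.
income = income.replace(to_replace=-666666666, value=np.nan)
print(income.isna().sum())  # 1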
@@ -85,7 +85,7 @@
   "execution_count": null,
   "id": "3b1b5ccf",
   "metadata": {
-    "scrolled": false
+    "scrolled": true
   },
   "outputs": [],
   "source": [
@@ -107,6 +107,21 @@
     "cejst_df.head()"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d9968187",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Analyze one field at a time (useful for setting thresholds)\n",
+    "field = \"Percent of individuals < 200% Federal Poverty Line\"\n",
+    "print(cejst_df[field].describe())\n",
+    "quantile = .8\n",
+    "print(f\"Quantile at {quantile} is {np.nanquantile(a=cejst_df[field], q=quantile)}\")\n",
+    "cejst_df[field].hist()"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -201,8 +216,8 @@
     ")\n",
     "\n",
     "\n",
-    "if len(merged_df) > 220333:\n",
-    "    raise ValueError(\"Too many rows in the join.\")\n",
+    "if len(merged_df) > 220335:\n",
+    "    raise ValueError(f\"Too many rows in the join: {len(merged_df)}\")\n",
     "\n",
     "merged_df.head()\n",
     "\n",
@@ -232,22 +247,33 @@
     "\n",
     "# Define the indices used for CEJST scoring (`census_block_group_indices`) as well as comparison\n",
     "# (`census_tract_indices`).\n",
     "\n",
     "census_block_group_indices = [\n",
+    "    Index(\n",
+    "        method_name=\"Score G\",\n",
+    "        priority_communities_field=\"Score G (communities)\",\n",
+    "        other_census_tract_fields_to_keep=[],\n",
+    "    ),\n",
+    "    Index(\n",
+    "        method_name=\"NMTC\",\n",
+    "        priority_communities_field=\"NMTC (communities)\",\n",
+    "        other_census_tract_fields_to_keep=[],\n",
+    "    ),\n",
     "    Index(\n",
     "        method_name=\"Score F\",\n",
     "        priority_communities_field=\"Score F (communities)\",\n",
     "        other_census_tract_fields_to_keep=[],\n",
     "    ),\n",
-    "    Index(\n",
-    "        method_name=\"Score F (socioeconomic only)\",\n",
-    "        priority_communities_field=\"Meets socioeconomic criteria\",\n",
-    "        other_census_tract_fields_to_keep=[],\n",
-    "    ),\n",
-    "    Index(\n",
-    "        method_name=\"Score F (burden only)\",\n",
-    "        priority_communities_field=\"Meets burden criteria\",\n",
-    "        other_census_tract_fields_to_keep=[],\n",
-    "    ),\n",
+    "#     Index(\n",
+    "#         method_name=\"Score F (socioeconomic only)\",\n",
+    "#         priority_communities_field=\"Meets socioeconomic criteria\",\n",
+    "#         other_census_tract_fields_to_keep=[],\n",
+    "#     ),\n",
+    "#     Index(\n",
+    "#         method_name=\"Score F (burden only)\",\n",
+    "#         priority_communities_field=\"Meets burden criteria\",\n",
+    "#         other_census_tract_fields_to_keep=[],\n",
+    "#     ),\n",
     "    Index(\n",
     "        method_name=\"Score A\",\n",
     "        priority_communities_field=\"Score A (top 25th percentile)\",\n",
@@ -38,6 +38,7 @@ def generate_tiles(data_path: Path) -> None:
     logger.info("Generating USA High mvt folders and files")
     cmd = "tippecanoe "
     cmd += f"--minimum-zoom={USA_HIGH_MIN_ZOOM} --maximum-zoom={USA_HIGH_MAX_ZOOM} --no-tile-compression "
+    cmd += "--drop-densest-as-needed "
     cmd += f"--output-to-directory={high_tile_path} --layer=blocks "
     cmd += str(score_geojson_dir / "usa-high.json")
     call(cmd, shell=True)
@@ -54,6 +55,7 @@ def generate_tiles(data_path: Path) -> None:
     logger.info("Generating USA Low mvt folders and files")
     cmd = "tippecanoe "
     cmd += f"--minimum-zoom={USA_LOW_MIN_ZOOM} --maximum-zoom={USA_LOW_MAX_ZOOM} --no-tile-compression "
+    cmd += "--drop-densest-as-needed "
    cmd += f"--output-to-directory={low_tile_path} --layer=blocks "
     cmd += str(score_geojson_dir / "usa-low.json")
     call(cmd, shell=True)
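For reference, the string concatenation above composes a single tippecanoe
invocation. A sketch of the assembled command, with placeholder zoom levels
and paths standing in for the module's constants:

from pathlib import Path

# Placeholder values; the pipeline defines its own zoom constants and paths.
USA_HIGH_MIN_ZOOM, USA_HIGH_MAX_ZOOM = 5, 11
high_tile_path = Path("data/tiles/high")
score_geojson_dir = Path("data/score/geojson")

cmd = "tippecanoe "
cmd += f"--minimum-zoom={USA_HIGH_MIN_ZOOM} --maximum-zoom={USA_HIGH_MAX_ZOOM} --no-tile-compression "
cmd += "--drop-densest-as-needed "
cmd += f"--output-to-directory={high_tile_path} --layer=blocks "
cmd += str(score_geojson_dir / "usa-high.json")
print(cmd)

Here --drop-densest-as-needed lets tippecanoe thin features in the densest
areas when a tile would exceed its size limit, which is the point of the
change in this file.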