Update Side Panel Tile Data (#866)

* Update Side Panel Tile Data

* Update Side Panel Tile Data

* Correct indicator names to match csv

* Replace Score with Rate

* Comment out FEMA Loss Rate to troubleshoot

* Removes all "FEMA Loss Rate" array elements

* Revert FEMA to Score

* Remove expected loss rate

* Remove RMP and NPL from BASIC array

* Attempt to make shape mismatch align

- update README typo

* Add Score L indicators to TILE_SCORE_FLOAT_COLUMNS

* removing cbg references

* completes the ticket

* Update side panel fields

* Update index file writing to create parent dir

* Updates from linting

* fixing missing field_names for island territories 90th percentile fields

* Update downloadable fields and fix field name

* Update file fields and tests

* Update ordering of fields and leave TODO

* Update pickle after re-ordering of file

* fixing bugs in etl_score_geo

* Repeating index for diesel fix

* passing tests

* adding pytest.ini

Co-authored-by: Vim USDS <vimal.k.shah@omb.eop.gov>
Co-authored-by: Shelby Switzer <shelby.switzer@cms.hhs.gov>
Co-authored-by: lucasmbrown-usds <lucas.m.brown@omb.eop.gov>
This commit is contained in:
Jorge Escobar 2021-12-13 14:53:50 -05:00 committed by GitHub
commit 9709d08ca3
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
13 changed files with 328 additions and 141 deletions

View file

@ -1,7 +1,6 @@
from pathlib import Path
import datetime
import pandas as pd
from data_pipeline.config import settings
from data_pipeline.score import field_names
@ -38,6 +37,9 @@ FULL_SCORE_CSV_FULL_PLUS_COUNTIES_FILE_PATH = (
# Score Tile CSV source path
DATA_SCORE_CSV_TILES_PATH = DATA_SCORE_CSV_DIR / "tiles"
DATA_SCORE_CSV_TILES_FILE_PATH = DATA_SCORE_CSV_TILES_PATH / "usa.csv"
DATA_SCORE_JSON_INDEX_FILE_PATH = (
DATA_SCORE_CSV_TILES_PATH / "tile_indexes.json"
)
## Tile path
DATA_SCORE_TILES_DIR = DATA_SCORE_DIR / "tiles"
@ -60,119 +62,214 @@ SCORE_DOWNLOADABLE_ZIP_FILE_PATH = (
# Column subsets
CENSUS_COUNTIES_COLUMNS = ["USPS", "GEOID", "NAME"]
TILES_SCORE_COLUMNS = [
field_names.GEOID_TRACT_FIELD,
field_names.STATE_FIELD,
field_names.COUNTY_FIELD,
field_names.TOTAL_POP_FIELD,
field_names.SCORE_D + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.SCORE_D + field_names.TOP_25_PERCENTILE_SUFFIX,
field_names.SCORE_E + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.SCORE_E + field_names.TOP_25_PERCENTILE_SUFFIX,
field_names.SCORE_G_COMMUNITIES,
field_names.SCORE_G,
field_names.SCORE_L_COMMUNITIES,
field_names.SCORE_L + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.POVERTY_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.HIGH_SCHOOL_ED_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.LINGUISTIC_ISO_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.UNEMPLOYMENT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.HOUSING_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.DIABETES_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.ASTHMA_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.HEART_DISEASE_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.LOW_LIFE_EXPECTANCY_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.TRAFFIC_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.FEMA_RISK_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.ENERGY_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.WASTEWATER_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.LEAD_PAINT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.DIESEL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.PM25_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
TILES_ROUND_NUM_DECIMALS = 2
# Tiles data: full field name, tile index name
TILES_SCORE_COLUMNS = {
field_names.GEOID_TRACT_FIELD: "GTF",
field_names.STATE_FIELD: "SF",
field_names.COUNTY_FIELD: "CF",
field_names.DIABETES_FIELD + field_names.PERCENTILE_FIELD_SUFFIX: "DF_PFS",
field_names.ASTHMA_FIELD + field_names.PERCENTILE_FIELD_SUFFIX: "AF_PFS",
field_names.HEART_DISEASE_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX: "HDF_PFS",
field_names.DIESEL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX: "DSF_PFS",
field_names.ENERGY_BURDEN_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX: "EBF_PFS",
field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX: "EALR_PFS",
field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX: "EBLR_PFS",
field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX: "EPLR_PFS",
field_names.HOUSING_BURDEN_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX: "HBF_PFS",
field_names.LOW_LIFE_EXPECTANCY_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX: "LLEF_PFS",
field_names.LINGUISTIC_ISO_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX: "LIF_PFS",
field_names.LOW_MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX,
+ field_names.PERCENTILE_FIELD_SUFFIX: "LMI_PFS",
field_names.MEDIAN_HOUSE_VALUE_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX: "MHVF_PFS",
field_names.PM25_FIELD + field_names.PERCENTILE_FIELD_SUFFIX: "PM25F_PFS",
field_names.HIGH_SCHOOL_ED_FIELD: "HSEF",
field_names.POVERTY_LESS_THAN_100_FPL_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX: "P100_PFS",
field_names.POVERTY_LESS_THAN_200_FPL_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX,
]
+ field_names.PERCENTILE_FIELD_SUFFIX: "P200_PFS",
field_names.LEAD_PAINT_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX: "LPF_PFS",
field_names.NPL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX: "NPL_PFS",
field_names.RMP_FIELD + field_names.PERCENTILE_FIELD_SUFFIX: "RMP_PFS",
field_names.TSDF_FIELD + field_names.PERCENTILE_FIELD_SUFFIX: "TSDF_PFS",
field_names.TOTAL_POP_FIELD: "TPF",
field_names.TRAFFIC_FIELD + field_names.PERCENTILE_FIELD_SUFFIX: "TF_PFS",
field_names.UNEMPLOYMENT_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX: "UF_PFS",
field_names.WASTEWATER_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX: "WF_PFS",
field_names.L_WATER: "L_WTR",
field_names.L_WORKFORCE: "L_WKFC",
field_names.L_CLIMATE: "L_CLT",
field_names.L_ENERGY: "L_ENY",
field_names.L_TRANSPORTATION: "L_TRN",
field_names.L_HOUSING: "L_HSG",
field_names.L_POLLUTION: "L_PLN",
field_names.L_HEALTH: "L_HLTH",
field_names.SCORE_L_COMMUNITIES: "SL_C",
field_names.SCORE_L + field_names.PERCENTILE_FIELD_SUFFIX: "SL_PFS",
field_names.EXPECTED_POPULATION_LOSS_RATE_LOW_INCOME_FIELD: "EPLRLI",
field_names.EXPECTED_AGRICULTURE_LOSS_RATE_LOW_INCOME_FIELD: "EALRLI",
field_names.EXPECTED_BUILDING_LOSS_RATE_LOW_INCOME_FIELD: "EBLRLI",
field_names.PM25_EXPOSURE_LOW_INCOME_FIELD: "PM25LI",
field_names.ENERGY_BURDEN_LOW_INCOME_FIELD: "EBLI",
field_names.DIESEL_PARTICULATE_MATTER_LOW_INCOME_FIELD: "DPMLI",
field_names.TRAFFIC_PROXIMITY_LOW_INCOME_FIELD: "TPLI",
field_names.LEAD_PAINT_MEDIAN_HOUSE_VALUE_LOW_INCOME_FIELD: "LPMHVLI",
field_names.HOUSING_BURDEN_LOW_INCOME_FIELD: "HBLI",
field_names.RMP_LOW_INCOME_FIELD: "RMPLI",
field_names.SUPERFUND_LOW_INCOME_FIELD: "SFLI",
field_names.HAZARDOUS_WASTE_LOW_INCOME_FIELD: "HWLI",
field_names.WASTEWATER_DISCHARGE_LOW_INCOME_FIELD: "WDLI",
field_names.DIABETES_LOW_INCOME_FIELD: "DLI",
field_names.ASTHMA_LOW_INCOME_FIELD: "ALI",
field_names.HEART_DISEASE_LOW_INCOME_FIELD: "HDLI",
field_names.LOW_LIFE_EXPECTANCY_LOW_INCOME_FIELD: "LLELI",
field_names.LINGUISTIC_ISOLATION_LOW_HS_EDUCATION_FIELD: "LILHSE",
field_names.POVERTY_LOW_HS_EDUCATION_FIELD: "PLHSE",
field_names.LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD: "LMILHSE",
field_names.UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD: "ULHSE",
field_names.FPL_200_SERIES: "FPL200S",
field_names.THRESHOLD_COUNT: "TC",
field_names.ISLAND_AREAS_UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD: "IAULHSE",
field_names.ISLAND_AREAS_POVERTY_LOW_HS_EDUCATION_FIELD: "ISPLHSE",
field_names.ISLAND_AREAS_LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD: "IALMILHSE",
}
# columns to round floats to 2 decimals
# TODO refactor to use much smaller subset of fields we DON'T want to round
TILES_SCORE_FLOAT_COLUMNS = [
field_names.SCORE_D + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.SCORE_D + field_names.TOP_25_PERCENTILE_SUFFIX,
field_names.SCORE_E + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.SCORE_E + field_names.TOP_25_PERCENTILE_SUFFIX,
field_names.SCORE_L + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.POVERTY_FIELD,
field_names.HIGH_SCHOOL_ED_FIELD,
field_names.LINGUISTIC_ISO_FIELD,
field_names.UNEMPLOYMENT_FIELD,
field_names.HOUSING_BURDEN_FIELD,
field_names.POVERTY_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.HIGH_SCHOOL_ED_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.LINGUISTIC_ISO_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.UNEMPLOYMENT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.HOUSING_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.DIABETES_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.ASTHMA_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.HEART_DISEASE_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.LOW_LIFE_EXPECTANCY_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.TRAFFIC_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.FEMA_RISK_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.ENERGY_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.WASTEWATER_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.LEAD_PAINT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.DIESEL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.PM25_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.ENERGY_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX,
field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX,
field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX,
field_names.HOUSING_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.LOW_LIFE_EXPECTANCY_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.LINGUISTIC_ISO_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.LOW_MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX,
field_names.MEDIAN_HOUSE_VALUE_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.PM25_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.POVERTY_LESS_THAN_100_FPL_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX,
field_names.POVERTY_LESS_THAN_200_FPL_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX,
field_names.LEAD_PAINT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.NPL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.RMP_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.TSDF_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.TRAFFIC_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.UNEMPLOYMENT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.WASTEWATER_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.SCORE_L + field_names.PERCENTILE_FIELD_SUFFIX,
]
TILES_ROUND_NUM_DECIMALS = 2
DOWNLOADABLE_SCORE_INDICATOR_COLUMNS_BASIC = [
field_names.AMI_FIELD,
field_names.POVERTY_LESS_THAN_100_FPL_FIELD,
field_names.HIGH_SCHOOL_ED_FIELD,
field_names.DIABETES_FIELD,
field_names.ASTHMA_FIELD,
field_names.HEART_DISEASE_FIELD,
field_names.TRAFFIC_FIELD,
field_names.FEMA_RISK_FIELD,
field_names.ENERGY_BURDEN_FIELD,
field_names.HOUSING_BURDEN_FIELD,
field_names.WASTEWATER_FIELD,
field_names.LEAD_PAINT_FIELD,
field_names.DIESEL_FIELD,
field_names.PM25_FIELD,
field_names.TOTAL_POP_FIELD,
]
# For every indicator above, we want to include percentile also.
DOWNLOADABLE_SCORE_INDICATOR_COLUMNS_FULL = list(
pd.core.common.flatten(
[
[p, f"{p}{field_names.PERCENTILE_FIELD_SUFFIX}"]
for p in DOWNLOADABLE_SCORE_INDICATOR_COLUMNS_BASIC
]
)
)
# Finally we augment with the GEOID10, county, and state
DOWNLOADABLE_SCORE_COLUMNS = [
field_names.GEOID_TRACT_FIELD,
field_names.COUNTY_FIELD,
field_names.STATE_FIELD,
field_names.SCORE_G_COMMUNITIES,
# Note: the reverse percentile fields get moved down here because
# we put the raw value in the download along with the *reversed* percentile.
# All other fields we put in f"{field_name}" and
# f"{field_name}{field_names.PERCENTILE_FIELD_SUFFIX}", which doesn't work for the
# reversed percentile fields.
field_names.SCORE_L_COMMUNITIES,
field_names.TOTAL_POP_FIELD,
field_names.FPL_200_SERIES,
field_names.POVERTY_LESS_THAN_200_FPL_FIELD,
field_names.POVERTY_LESS_THAN_200_FPL_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX,
field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD,
field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX,
field_names.EXPECTED_AGRICULTURE_LOSS_RATE_LOW_INCOME_FIELD,
field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD,
field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX,
field_names.EXPECTED_BUILDING_LOSS_RATE_LOW_INCOME_FIELD,
field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD,
field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX,
field_names.EXPECTED_POPULATION_LOSS_RATE_LOW_INCOME_FIELD,
field_names.ENERGY_BURDEN_FIELD,
field_names.ENERGY_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.ENERGY_BURDEN_LOW_INCOME_FIELD,
field_names.PM25_FIELD,
field_names.PM25_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.PM25_EXPOSURE_LOW_INCOME_FIELD,
field_names.DIESEL_FIELD,
field_names.DIESEL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.DIESEL_PARTICULATE_MATTER_LOW_INCOME_FIELD,
field_names.TRAFFIC_FIELD,
field_names.TRAFFIC_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.TRAFFIC_PROXIMITY_LOW_INCOME_FIELD,
field_names.HOUSING_BURDEN_FIELD,
field_names.HOUSING_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.HOUSING_BURDEN_LOW_INCOME_FIELD,
field_names.LEAD_PAINT_FIELD,
field_names.LEAD_PAINT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.LEAD_PAINT_MEDIAN_HOUSE_VALUE_LOW_INCOME_FIELD,
field_names.MEDIAN_HOUSE_VALUE_FIELD,
field_names.MEDIAN_HOUSE_VALUE_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.TSDF_FIELD,
field_names.TSDF_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.HAZARDOUS_WASTE_LOW_INCOME_FIELD,
field_names.NPL_FIELD,
field_names.NPL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.SUPERFUND_LOW_INCOME_FIELD,
field_names.RMP_FIELD,
field_names.RMP_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.RMP_LOW_INCOME_FIELD,
field_names.WASTEWATER_FIELD,
field_names.WASTEWATER_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.WASTEWATER_DISCHARGE_LOW_INCOME_FIELD,
field_names.ASTHMA_FIELD,
field_names.ASTHMA_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.ASTHMA_LOW_INCOME_FIELD,
field_names.DIABETES_FIELD,
field_names.DIABETES_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.DIABETES_LOW_INCOME_FIELD,
field_names.HEART_DISEASE_FIELD,
field_names.HEART_DISEASE_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.HEART_DISEASE_LOW_INCOME_FIELD,
field_names.LIFE_EXPECTANCY_FIELD,
field_names.LOW_LIFE_EXPECTANCY_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.LOW_LIFE_EXPECTANCY_LOW_INCOME_FIELD,
field_names.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD,
field_names.LOW_MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX,
field_names.LIFE_EXPECTANCY_FIELD,
field_names.LOW_LIFE_EXPECTANCY_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
*DOWNLOADABLE_SCORE_INDICATOR_COLUMNS_FULL,
field_names.LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD,
field_names.LINGUISTIC_ISO_FIELD,
field_names.LINGUISTIC_ISOLATION_LOW_HS_EDUCATION_FIELD,
field_names.UNEMPLOYMENT_FIELD,
field_names.UNEMPLOYMENT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.LINGUISTIC_ISO_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.POVERTY_LESS_THAN_100_FPL_FIELD,
field_names.POVERTY_LESS_THAN_100_FPL_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX,
field_names.POVERTY_LOW_HS_EDUCATION_FIELD,
field_names.HIGH_SCHOOL_ED_FIELD,
field_names.HIGH_SCHOOL_ED_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.THRESHOLD_COUNT,
field_names.UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD,
field_names.COMBINED_UNEMPLOYMENT_2010,
field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009,
field_names.COMBINED_POVERTY_LESS_THAN_100_FPL_FIELD_2010,
field_names.ISLAND_AREAS_UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD,
field_names.ISLAND_AREAS_POVERTY_LOW_HS_EDUCATION_FIELD,
field_names.ISLAND_AREAS_LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD,
]

View file

@ -3,10 +3,12 @@ import pandas as pd
import geopandas as gpd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.score import constants
from data_pipeline.etl.sources.census.etl_utils import (
check_census_data_source,
)
from data_pipeline.etl.score.etl_utils import check_score_data_source
from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
@ -31,9 +33,19 @@ class GeoScoreETL(ExtractTransformLoad):
self.DATA_PATH / "census" / "geojson" / "us.json"
)
self.TARGET_SCORE_NAME = "Definition L (percentile)"
# Import the shortened name for Score L percentile ("SL_PFS") that's used on the
# tiles.
self.TARGET_SCORE_SHORT_FIELD = constants.TILES_SCORE_COLUMNS[
field_names.SCORE_L + field_names.PERCENTILE_FIELD_SUFFIX
]
self.TARGET_SCORE_RENAME_TO = "L_SCORE"
# Import the shortened name for tract ("GTF") that's used on the tiles.
self.TRACT_SHORT_FIELD = constants.TILES_SCORE_COLUMNS[
field_names.GEOID_TRACT_FIELD
]
self.GEOMETRY_FIELD_NAME = "geometry"
self.NUMBER_OF_BUCKETS = 10
self.geojson_usa_df: gpd.GeoDataFrame
@ -57,45 +69,52 @@ class GeoScoreETL(ExtractTransformLoad):
logger.info("Reading US GeoJSON (~6 minutes)")
self.geojson_usa_df = gpd.read_file(
self.CENSUS_USA_GEOJSON,
dtype={"GEOID10": "string"},
usecols=["GEOID10", "geometry"],
dtype={self.GEOID_FIELD_NAME: "string"},
usecols=[self.GEOID_FIELD_NAME, self.GEOMETRY_FIELD_NAME],
low_memory=False,
)
self.geojson_usa_df.head()
logger.info("Reading score CSV")
self.score_usa_df = pd.read_csv(
self.TILE_SCORE_CSV,
dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
dtype={self.TRACT_SHORT_FIELD: "string"},
low_memory=False,
)
def transform(self) -> None:
# rename GEOID10_TRACT to GEOID10 on score to allow merging with Census GeoJSON
# Rename GEOID10_TRACT to GEOID10 on score to allow merging with Census GeoJSON
self.score_usa_df.rename(
columns={self.GEOID_TRACT_FIELD_NAME: "GEOID10"},
columns={self.TRACT_SHORT_FIELD: self.GEOID_FIELD_NAME},
inplace=True,
)
logger.info("Pruning Census GeoJSON")
fields = ["GEOID10", "geometry"]
fields = [self.GEOID_FIELD_NAME, self.GEOMETRY_FIELD_NAME]
self.geojson_usa_df = self.geojson_usa_df[fields]
logger.info("Merging and compressing score CSV with USA GeoJSON")
self.geojson_score_usa_high = self.score_usa_df.merge(
self.geojson_usa_df, on="GEOID10", how="left"
self.geojson_usa_df, on=self.GEOID_FIELD_NAME, how="left"
)
self.geojson_score_usa_high = gpd.GeoDataFrame(
self.geojson_score_usa_high, crs="EPSG:4326"
)
logger.info(f"Columns: {self.geojson_score_usa_high.columns}")
usa_simplified = self.geojson_score_usa_high[
["GEOID10", self.TARGET_SCORE_NAME, "geometry"]
[
self.GEOID_FIELD_NAME,
self.TARGET_SCORE_SHORT_FIELD,
self.GEOMETRY_FIELD_NAME,
]
].reset_index(drop=True)
usa_simplified.rename(
columns={self.TARGET_SCORE_NAME: self.TARGET_SCORE_RENAME_TO},
columns={
self.TARGET_SCORE_SHORT_FIELD: self.TARGET_SCORE_RENAME_TO
},
inplace=True,
)
@ -104,7 +123,7 @@ class GeoScoreETL(ExtractTransformLoad):
usa_tracts = gpd.GeoDataFrame(
usa_tracts,
columns=[self.TARGET_SCORE_RENAME_TO, "geometry"],
columns=[self.TARGET_SCORE_RENAME_TO, self.GEOMETRY_FIELD_NAME],
crs="EPSG:4326",
)
@ -122,7 +141,7 @@ class GeoScoreETL(ExtractTransformLoad):
self.geojson_score_usa_low = gpd.GeoDataFrame(
compressed,
columns=[self.TARGET_SCORE_RENAME_TO, "geometry"],
columns=[self.TARGET_SCORE_RENAME_TO, self.GEOMETRY_FIELD_NAME],
crs="EPSG:4326",
)
@ -135,7 +154,7 @@ class GeoScoreETL(ExtractTransformLoad):
) -> gpd.GeoDataFrame:
# The tract identifier is the first 11 digits of the GEOID
block_group_df["tract"] = block_group_df.apply(
lambda row: row["GEOID10"][0:11], axis=1
lambda row: row[self.GEOID_FIELD_NAME][0:11], axis=1
)
state_tracts = block_group_df.dissolve(by="tract", aggfunc="mean")
return state_tracts
@ -160,7 +179,7 @@ class GeoScoreETL(ExtractTransformLoad):
[
self.TARGET_SCORE_RENAME_TO,
f"{self.TARGET_SCORE_RENAME_TO}_bucket",
"geometry",
self.GEOMETRY_FIELD_NAME,
]
].reset_index(drop=True)
state_dissolve = state_attr.dissolve(
@ -173,11 +192,13 @@ class GeoScoreETL(ExtractTransformLoad):
) -> gpd.GeoDataFrame:
compressed = []
for i in range(num_buckets):
for j in range(len(state_bucketed_df["geometry"][i].geoms)):
for j in range(
len(state_bucketed_df[self.GEOMETRY_FIELD_NAME][i].geoms)
):
compressed.append(
[
state_bucketed_df[self.TARGET_SCORE_RENAME_TO][i],
state_bucketed_df["geometry"][i].geoms[j],
state_bucketed_df[self.GEOMETRY_FIELD_NAME][i].geoms[j],
]
)
return compressed

View file

@ -1,5 +1,7 @@
from pathlib import Path
import json
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.utils import get_module_logger, zip_files
from data_pipeline.score import field_names
@ -198,16 +200,37 @@ class PostScoreETL(ExtractTransformLoad):
self, score_county_state_merged_df: pd.DataFrame
) -> pd.DataFrame:
logger.info("Rounding Decimals")
score_tiles = score_county_state_merged_df[
constants.TILES_SCORE_COLUMNS
]
# grab all the keys from tiles score columns
tiles_score_column_titles = list(constants.TILES_SCORE_COLUMNS.keys())
# filter the columns on full score
score_tiles = score_county_state_merged_df[tiles_score_column_titles]
# round decimals
decimals = pd.Series(
[constants.TILES_ROUND_NUM_DECIMALS]
* len(constants.TILES_SCORE_FLOAT_COLUMNS),
index=constants.TILES_SCORE_FLOAT_COLUMNS,
)
score_tiles = score_tiles.round(decimals)
return score_tiles.round(decimals)
# create indexes
score_tiles = score_tiles.rename(
columns=constants.TILES_SCORE_COLUMNS,
inplace=False,
)
# write the json map to disk
inverse_tiles_columns = {
v: k for k, v in constants.TILES_SCORE_COLUMNS.items()
} # reverse dict
index_file_path = constants.DATA_SCORE_JSON_INDEX_FILE_PATH
index_file_path.parent.mkdir(parents=True, exist_ok=True)
with open(index_file_path, "w", encoding="utf-8") as fp:
json.dump(inverse_tiles_columns, fp)
return score_tiles
def _create_downloadable_data(
self, score_county_state_merged_df: pd.DataFrame

File diff suppressed because one or more lines are too long

View file

@ -136,7 +136,7 @@ class CensusETL(ExtractTransformLoad):
def transform(self) -> None:
"""Download all census shape files from the Census FTP and extract the geojson
to generate national and by state Census Block Group CSVs and GeoJSONs
to generate national and by state Census tract CSVs and GeoJSONs
Returns:
None
@ -225,7 +225,7 @@ class CensusETL(ExtractTransformLoad):
logger.info("Writing national geojson file")
usa_df.to_file(self.NATIONAL_TRACT_JSON_PATH, driver="GeoJSON")
logger.info("Census block groups downloading complete")
logger.info("Census tract downloading complete")
def load(self) -> None:
"""Create state CSVs, National CSV, and National GeoJSON

View file

@ -291,14 +291,27 @@ POVERTY_LOW_HS_EDUCATION_FIELD = (
" and has low HS education"
)
LOW_READING_LOW_HS_EDUCATION_FIELD = (
f"At or above the {PERCENTILE}th percentile for low 3rd grade reading proficiency"
" and has low HS education"
LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD = (
f"At or above the {PERCENTILE}th percentile for low median household income as a "
f"percent of area median income and has low HS education"
)
LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD = (
f"At or below the {PERCENTILE}th percentile for low median household income as a "
f"percent of area median income and has low HS education"
# Workforce for island areas
ISLAND_AREAS_SUFFIX = " in 2009 (island areas)"
ISLAND_AREAS_UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD = (
f"At or above the {PERCENTILE}th percentile for unemployment"
f" and has low HS education{ISLAND_AREAS_SUFFIX}"
)
ISLAND_AREAS_POVERTY_LOW_HS_EDUCATION_FIELD = (
f"At or above the {PERCENTILE}th percentile for households at or below 100% federal poverty level"
f" and has low HS education{ISLAND_AREAS_SUFFIX}"
)
ISLAND_AREAS_LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD = (
f"At or above the {PERCENTILE}th percentile for low median household income as a "
f"percent of area median income"
f" and has low HS education{ISLAND_AREAS_SUFFIX}"
)
# Not currently used in a factor
@ -317,6 +330,10 @@ HEALTHY_FOOD_LOW_INCOME_FIELD = (
f"At or above the {PERCENTILE}th percentile for low "
f"access to healthy food and is low income"
)
LOW_READING_LOW_HS_EDUCATION_FIELD = (
f"At or above the {PERCENTILE}th percentile for low 3rd grade reading proficiency"
" and has low HS education"
)
THRESHOLD_COUNT = "Total threshold criteria exceeded"

View file

@ -117,7 +117,7 @@ class ScoreL(Score):
"""
self.df[field_names.THRESHOLD_COUNT] += self.df[columns_for_subset].sum(
axis=1
axis=1, skipna=True
)
def add_columns(self) -> pd.DataFrame:
@ -162,7 +162,7 @@ class ScoreL(Score):
non_workforce_factors
].any(axis=1)
self.df["Definition L (percentile)"] = self.df[
self.df[field_names.SCORE_L + field_names.PERCENTILE_FIELD_SUFFIX] = self.df[
field_names.SCORE_L_COMMUNITIES
].astype(int)
@ -586,12 +586,16 @@ class ScoreL(Score):
)
# Now, calculate workforce criteria for island territories.
island_areas_workforce_eligibility_columns = [
field_names.ISLAND_AREAS_UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD,
field_names.ISLAND_AREAS_POVERTY_LOW_HS_EDUCATION_FIELD,
field_names.ISLAND_AREAS_LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD,
]
# F a couple of values, create a combined field and criteria field.
# First, combine unemployment.
(
self.df,
unemployment_island_areas_criteria_field_name,
island_areas_unemployment_criteria_field_name,
) = self._combine_island_areas_with_states_and_set_thresholds(
df=self.df,
column_from_island_areas=field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009,
@ -603,7 +607,7 @@ class ScoreL(Score):
# Next, combine poverty.
(
self.df,
poverty_island_areas_criteria_field_name,
island_areas_poverty_criteria_field_name,
) = self._combine_island_areas_with_states_and_set_thresholds(
df=self.df,
column_from_island_areas=field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_100_FPL_FIELD_2009,
@ -614,12 +618,12 @@ class ScoreL(Score):
# Also check whether low area median income is 90th percentile or higher
# within the islands.
low_median_income_as_a_percent_of_ami_island_areas_criteria_field_name = (
island_areas_low_median_income_as_a_percent_of_ami_criteria_field_name = (
f"{field_names.LOW_CENSUS_DECENNIAL_AREA_MEDIAN_INCOME_PERCENT_FIELD_2009} exceeds "
f"{field_names.PERCENTILE}th percentile"
)
self.df[
low_median_income_as_a_percent_of_ami_island_areas_criteria_field_name
island_areas_low_median_income_as_a_percent_of_ami_criteria_field_name
] = (
self.df[
field_names.LOW_CENSUS_DECENNIAL_AREA_MEDIAN_INCOME_PERCENT_FIELD_2009
@ -628,17 +632,40 @@ class ScoreL(Score):
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
)
workforce_combined_criteria_for_island_areas = (
self.df[unemployment_island_areas_criteria_field_name]
| self.df[poverty_island_areas_criteria_field_name]
| self.df[
low_median_income_as_a_percent_of_ami_island_areas_criteria_field_name
]
) & (
island_areas_high_scool_achievement_rate_threshold = (
self.df[field_names.CENSUS_DECENNIAL_HIGH_SCHOOL_ED_FIELD_2009]
>= self.LACK_OF_HIGH_SCHOOL_MINIMUM_THRESHOLD
)
self.df[
field_names.ISLAND_AREAS_UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD
] = (
self.df[island_areas_unemployment_criteria_field_name]
& island_areas_high_scool_achievement_rate_threshold
)
self.df[field_names.ISLAND_AREAS_POVERTY_LOW_HS_EDUCATION_FIELD] = (
self.df[island_areas_poverty_criteria_field_name]
& island_areas_high_scool_achievement_rate_threshold
)
self.df[
field_names.ISLAND_AREAS_LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD
] = (
self.df[
island_areas_low_median_income_as_a_percent_of_ami_criteria_field_name
]
& island_areas_high_scool_achievement_rate_threshold
)
workforce_combined_criteria_for_island_areas = self.df[
island_areas_workforce_eligibility_columns
].any(axis="columns")
self._increment_total_eligibility_exceeded(
island_areas_workforce_eligibility_columns
)
percent_of_island_tracts_highlighted = (
100
* workforce_combined_criteria_for_island_areas.sum()