Update Side Panel Tile Data (#866)

* Update Side Panel Tile Data

* Update Side Panel Tile Data

* Correct indicator names to match csv

* Replace Score with Rate

* Comment out FEMA Loss Rate to troubleshoot

* Removes all "FEMA Loss Rate" array elements

* Revert FEMA to Score

* Remove expected loss rate

* Remove RMP and NPL from BASIC array

* Attempt to make shape mismatch align

- update README typo

* Add Score L indicators to TILE_SCORE_FLOAT_COLUMNS

* removing cbg references

* completes the ticket

* Update side panel fields

* Update index file writing to create parent dir

* Updates from linting

* fixing missing field_names for island territories 90th percentile fields

* Update downloadable fields and fix field name

* Update file fields and tests

* Update ordering of fields and leave TODO

* Update pickle after re-ordering of file

* fixing bugs in etl_score_geo

* Repeating index for diesel fix

* passing tests

* adding pytest.ini

Co-authored-by: Vim USDS <vimal.k.shah@omb.eop.gov>
Co-authored-by: Shelby Switzer <shelby.switzer@cms.hhs.gov>
Co-authored-by: lucasmbrown-usds <lucas.m.brown@omb.eop.gov>
This commit is contained in:
Jorge Escobar 2021-12-13 14:53:50 -05:00 committed by GitHub
commit 9709d08ca3
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
13 changed files with 328 additions and 141 deletions

View file

@ -1,7 +1,6 @@
from pathlib import Path
import datetime
import pandas as pd
from data_pipeline.config import settings
from data_pipeline.score import field_names
@ -38,6 +37,9 @@ FULL_SCORE_CSV_FULL_PLUS_COUNTIES_FILE_PATH = (
# Score Tile CSV source path
DATA_SCORE_CSV_TILES_PATH = DATA_SCORE_CSV_DIR / "tiles"
DATA_SCORE_CSV_TILES_FILE_PATH = DATA_SCORE_CSV_TILES_PATH / "usa.csv"
DATA_SCORE_JSON_INDEX_FILE_PATH = (
DATA_SCORE_CSV_TILES_PATH / "tile_indexes.json"
)
## Tile path
DATA_SCORE_TILES_DIR = DATA_SCORE_DIR / "tiles"
@ -60,119 +62,214 @@ SCORE_DOWNLOADABLE_ZIP_FILE_PATH = (
# Column subsets
CENSUS_COUNTIES_COLUMNS = ["USPS", "GEOID", "NAME"]
TILES_SCORE_COLUMNS = [
field_names.GEOID_TRACT_FIELD,
field_names.STATE_FIELD,
field_names.COUNTY_FIELD,
field_names.TOTAL_POP_FIELD,
field_names.SCORE_D + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.SCORE_D + field_names.TOP_25_PERCENTILE_SUFFIX,
field_names.SCORE_E + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.SCORE_E + field_names.TOP_25_PERCENTILE_SUFFIX,
field_names.SCORE_G_COMMUNITIES,
field_names.SCORE_G,
field_names.SCORE_L_COMMUNITIES,
field_names.SCORE_L + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.POVERTY_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.HIGH_SCHOOL_ED_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.LINGUISTIC_ISO_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.UNEMPLOYMENT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.HOUSING_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.DIABETES_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.ASTHMA_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.HEART_DISEASE_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.LOW_LIFE_EXPECTANCY_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.TRAFFIC_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.FEMA_RISK_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.ENERGY_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.WASTEWATER_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.LEAD_PAINT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.DIESEL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.PM25_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
TILES_ROUND_NUM_DECIMALS = 2
# Tiles data: full field name, tile index name
TILES_SCORE_COLUMNS = {
field_names.GEOID_TRACT_FIELD: "GTF",
field_names.STATE_FIELD: "SF",
field_names.COUNTY_FIELD: "CF",
field_names.DIABETES_FIELD + field_names.PERCENTILE_FIELD_SUFFIX: "DF_PFS",
field_names.ASTHMA_FIELD + field_names.PERCENTILE_FIELD_SUFFIX: "AF_PFS",
field_names.HEART_DISEASE_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX: "HDF_PFS",
field_names.DIESEL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX: "DSF_PFS",
field_names.ENERGY_BURDEN_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX: "EBF_PFS",
field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX: "EALR_PFS",
field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX: "EBLR_PFS",
field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX: "EPLR_PFS",
field_names.HOUSING_BURDEN_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX: "HBF_PFS",
field_names.LOW_LIFE_EXPECTANCY_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX: "LLEF_PFS",
field_names.LINGUISTIC_ISO_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX: "LIF_PFS",
field_names.LOW_MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX,
+ field_names.PERCENTILE_FIELD_SUFFIX: "LMI_PFS",
field_names.MEDIAN_HOUSE_VALUE_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX: "MHVF_PFS",
field_names.PM25_FIELD + field_names.PERCENTILE_FIELD_SUFFIX: "PM25F_PFS",
field_names.HIGH_SCHOOL_ED_FIELD: "HSEF",
field_names.POVERTY_LESS_THAN_100_FPL_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX: "P100_PFS",
field_names.POVERTY_LESS_THAN_200_FPL_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX,
]
+ field_names.PERCENTILE_FIELD_SUFFIX: "P200_PFS",
field_names.LEAD_PAINT_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX: "LPF_PFS",
field_names.NPL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX: "NPL_PFS",
field_names.RMP_FIELD + field_names.PERCENTILE_FIELD_SUFFIX: "RMP_PFS",
field_names.TSDF_FIELD + field_names.PERCENTILE_FIELD_SUFFIX: "TSDF_PFS",
field_names.TOTAL_POP_FIELD: "TPF",
field_names.TRAFFIC_FIELD + field_names.PERCENTILE_FIELD_SUFFIX: "TF_PFS",
field_names.UNEMPLOYMENT_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX: "UF_PFS",
field_names.WASTEWATER_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX: "WF_PFS",
field_names.L_WATER: "L_WTR",
field_names.L_WORKFORCE: "L_WKFC",
field_names.L_CLIMATE: "L_CLT",
field_names.L_ENERGY: "L_ENY",
field_names.L_TRANSPORTATION: "L_TRN",
field_names.L_HOUSING: "L_HSG",
field_names.L_POLLUTION: "L_PLN",
field_names.L_HEALTH: "L_HLTH",
field_names.SCORE_L_COMMUNITIES: "SL_C",
field_names.SCORE_L + field_names.PERCENTILE_FIELD_SUFFIX: "SL_PFS",
field_names.EXPECTED_POPULATION_LOSS_RATE_LOW_INCOME_FIELD: "EPLRLI",
field_names.EXPECTED_AGRICULTURE_LOSS_RATE_LOW_INCOME_FIELD: "EALRLI",
field_names.EXPECTED_BUILDING_LOSS_RATE_LOW_INCOME_FIELD: "EBLRLI",
field_names.PM25_EXPOSURE_LOW_INCOME_FIELD: "PM25LI",
field_names.ENERGY_BURDEN_LOW_INCOME_FIELD: "EBLI",
field_names.DIESEL_PARTICULATE_MATTER_LOW_INCOME_FIELD: "DPMLI",
field_names.TRAFFIC_PROXIMITY_LOW_INCOME_FIELD: "TPLI",
field_names.LEAD_PAINT_MEDIAN_HOUSE_VALUE_LOW_INCOME_FIELD: "LPMHVLI",
field_names.HOUSING_BURDEN_LOW_INCOME_FIELD: "HBLI",
field_names.RMP_LOW_INCOME_FIELD: "RMPLI",
field_names.SUPERFUND_LOW_INCOME_FIELD: "SFLI",
field_names.HAZARDOUS_WASTE_LOW_INCOME_FIELD: "HWLI",
field_names.WASTEWATER_DISCHARGE_LOW_INCOME_FIELD: "WDLI",
field_names.DIABETES_LOW_INCOME_FIELD: "DLI",
field_names.ASTHMA_LOW_INCOME_FIELD: "ALI",
field_names.HEART_DISEASE_LOW_INCOME_FIELD: "HDLI",
field_names.LOW_LIFE_EXPECTANCY_LOW_INCOME_FIELD: "LLELI",
field_names.LINGUISTIC_ISOLATION_LOW_HS_EDUCATION_FIELD: "LILHSE",
field_names.POVERTY_LOW_HS_EDUCATION_FIELD: "PLHSE",
field_names.LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD: "LMILHSE",
field_names.UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD: "ULHSE",
field_names.FPL_200_SERIES: "FPL200S",
field_names.THRESHOLD_COUNT: "TC",
field_names.ISLAND_AREAS_UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD: "IAULHSE",
field_names.ISLAND_AREAS_POVERTY_LOW_HS_EDUCATION_FIELD: "ISPLHSE",
field_names.ISLAND_AREAS_LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD: "IALMILHSE",
}
# columns to round floats to 2 decimals
# TODO refactor to use much smaller subset of fields we DON'T want to round
TILES_SCORE_FLOAT_COLUMNS = [
field_names.SCORE_D + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.SCORE_D + field_names.TOP_25_PERCENTILE_SUFFIX,
field_names.SCORE_E + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.SCORE_E + field_names.TOP_25_PERCENTILE_SUFFIX,
field_names.SCORE_L + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.POVERTY_FIELD,
field_names.HIGH_SCHOOL_ED_FIELD,
field_names.LINGUISTIC_ISO_FIELD,
field_names.UNEMPLOYMENT_FIELD,
field_names.HOUSING_BURDEN_FIELD,
field_names.POVERTY_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.HIGH_SCHOOL_ED_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.LINGUISTIC_ISO_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.UNEMPLOYMENT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.HOUSING_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.DIABETES_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.ASTHMA_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.HEART_DISEASE_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.LOW_LIFE_EXPECTANCY_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.TRAFFIC_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.FEMA_RISK_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.ENERGY_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.WASTEWATER_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.LEAD_PAINT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.DIESEL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.PM25_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.ENERGY_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX,
field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX,
field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX,
field_names.HOUSING_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.LOW_LIFE_EXPECTANCY_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.LINGUISTIC_ISO_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.LOW_MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX,
field_names.MEDIAN_HOUSE_VALUE_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.PM25_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.POVERTY_LESS_THAN_100_FPL_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX,
field_names.POVERTY_LESS_THAN_200_FPL_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX,
field_names.LEAD_PAINT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.NPL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.RMP_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.TSDF_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.TRAFFIC_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.UNEMPLOYMENT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.WASTEWATER_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.SCORE_L + field_names.PERCENTILE_FIELD_SUFFIX,
]
TILES_ROUND_NUM_DECIMALS = 2
DOWNLOADABLE_SCORE_INDICATOR_COLUMNS_BASIC = [
field_names.AMI_FIELD,
field_names.POVERTY_LESS_THAN_100_FPL_FIELD,
field_names.HIGH_SCHOOL_ED_FIELD,
field_names.DIABETES_FIELD,
field_names.ASTHMA_FIELD,
field_names.HEART_DISEASE_FIELD,
field_names.TRAFFIC_FIELD,
field_names.FEMA_RISK_FIELD,
field_names.ENERGY_BURDEN_FIELD,
field_names.HOUSING_BURDEN_FIELD,
field_names.WASTEWATER_FIELD,
field_names.LEAD_PAINT_FIELD,
field_names.DIESEL_FIELD,
field_names.PM25_FIELD,
field_names.TOTAL_POP_FIELD,
]
# For every indicator above, we want to include percentile also.
DOWNLOADABLE_SCORE_INDICATOR_COLUMNS_FULL = list(
pd.core.common.flatten(
[
[p, f"{p}{field_names.PERCENTILE_FIELD_SUFFIX}"]
for p in DOWNLOADABLE_SCORE_INDICATOR_COLUMNS_BASIC
]
)
)
# Finally we augment with the GEOID10, county, and state
DOWNLOADABLE_SCORE_COLUMNS = [
field_names.GEOID_TRACT_FIELD,
field_names.COUNTY_FIELD,
field_names.STATE_FIELD,
field_names.SCORE_G_COMMUNITIES,
# Note: the reverse percentile fields get moved down here because
# we put the raw value in the download along with the *reversed* percentile.
# All other fields we put in f"{field_name}" and
# f"{field_name}{field_names.PERCENTILE_FIELD_SUFFIX}", which doesn't work for the
# reversed percentile fields.
field_names.SCORE_L_COMMUNITIES,
field_names.TOTAL_POP_FIELD,
field_names.FPL_200_SERIES,
field_names.POVERTY_LESS_THAN_200_FPL_FIELD,
field_names.POVERTY_LESS_THAN_200_FPL_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX,
field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD,
field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX,
field_names.EXPECTED_AGRICULTURE_LOSS_RATE_LOW_INCOME_FIELD,
field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD,
field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX,
field_names.EXPECTED_BUILDING_LOSS_RATE_LOW_INCOME_FIELD,
field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD,
field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX,
field_names.EXPECTED_POPULATION_LOSS_RATE_LOW_INCOME_FIELD,
field_names.ENERGY_BURDEN_FIELD,
field_names.ENERGY_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.ENERGY_BURDEN_LOW_INCOME_FIELD,
field_names.PM25_FIELD,
field_names.PM25_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.PM25_EXPOSURE_LOW_INCOME_FIELD,
field_names.DIESEL_FIELD,
field_names.DIESEL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.DIESEL_PARTICULATE_MATTER_LOW_INCOME_FIELD,
field_names.TRAFFIC_FIELD,
field_names.TRAFFIC_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.TRAFFIC_PROXIMITY_LOW_INCOME_FIELD,
field_names.HOUSING_BURDEN_FIELD,
field_names.HOUSING_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.HOUSING_BURDEN_LOW_INCOME_FIELD,
field_names.LEAD_PAINT_FIELD,
field_names.LEAD_PAINT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.LEAD_PAINT_MEDIAN_HOUSE_VALUE_LOW_INCOME_FIELD,
field_names.MEDIAN_HOUSE_VALUE_FIELD,
field_names.MEDIAN_HOUSE_VALUE_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.TSDF_FIELD,
field_names.TSDF_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.HAZARDOUS_WASTE_LOW_INCOME_FIELD,
field_names.NPL_FIELD,
field_names.NPL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.SUPERFUND_LOW_INCOME_FIELD,
field_names.RMP_FIELD,
field_names.RMP_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.RMP_LOW_INCOME_FIELD,
field_names.WASTEWATER_FIELD,
field_names.WASTEWATER_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.WASTEWATER_DISCHARGE_LOW_INCOME_FIELD,
field_names.ASTHMA_FIELD,
field_names.ASTHMA_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.ASTHMA_LOW_INCOME_FIELD,
field_names.DIABETES_FIELD,
field_names.DIABETES_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.DIABETES_LOW_INCOME_FIELD,
field_names.HEART_DISEASE_FIELD,
field_names.HEART_DISEASE_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.HEART_DISEASE_LOW_INCOME_FIELD,
field_names.LIFE_EXPECTANCY_FIELD,
field_names.LOW_LIFE_EXPECTANCY_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.LOW_LIFE_EXPECTANCY_LOW_INCOME_FIELD,
field_names.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD,
field_names.LOW_MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX,
field_names.LIFE_EXPECTANCY_FIELD,
field_names.LOW_LIFE_EXPECTANCY_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
*DOWNLOADABLE_SCORE_INDICATOR_COLUMNS_FULL,
field_names.LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD,
field_names.LINGUISTIC_ISO_FIELD,
field_names.LINGUISTIC_ISOLATION_LOW_HS_EDUCATION_FIELD,
field_names.UNEMPLOYMENT_FIELD,
field_names.UNEMPLOYMENT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.LINGUISTIC_ISO_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.POVERTY_LESS_THAN_100_FPL_FIELD,
field_names.POVERTY_LESS_THAN_100_FPL_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX,
field_names.POVERTY_LOW_HS_EDUCATION_FIELD,
field_names.HIGH_SCHOOL_ED_FIELD,
field_names.HIGH_SCHOOL_ED_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.THRESHOLD_COUNT,
field_names.UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD,
field_names.COMBINED_UNEMPLOYMENT_2010,
field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009,
field_names.COMBINED_POVERTY_LESS_THAN_100_FPL_FIELD_2010,
field_names.ISLAND_AREAS_UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD,
field_names.ISLAND_AREAS_POVERTY_LOW_HS_EDUCATION_FIELD,
field_names.ISLAND_AREAS_LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD,
]

View file

@ -3,10 +3,12 @@ import pandas as pd
import geopandas as gpd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.score import constants
from data_pipeline.etl.sources.census.etl_utils import (
check_census_data_source,
)
from data_pipeline.etl.score.etl_utils import check_score_data_source
from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
@ -31,9 +33,19 @@ class GeoScoreETL(ExtractTransformLoad):
self.DATA_PATH / "census" / "geojson" / "us.json"
)
self.TARGET_SCORE_NAME = "Definition L (percentile)"
# Import the shortened name for Score L percentile ("SL_PFS") that's used on the
# tiles.
self.TARGET_SCORE_SHORT_FIELD = constants.TILES_SCORE_COLUMNS[
field_names.SCORE_L + field_names.PERCENTILE_FIELD_SUFFIX
]
self.TARGET_SCORE_RENAME_TO = "L_SCORE"
# Import the shortened name for tract ("GTF") that's used on the tiles.
self.TRACT_SHORT_FIELD = constants.TILES_SCORE_COLUMNS[
field_names.GEOID_TRACT_FIELD
]
self.GEOMETRY_FIELD_NAME = "geometry"
self.NUMBER_OF_BUCKETS = 10
self.geojson_usa_df: gpd.GeoDataFrame
@ -57,45 +69,52 @@ class GeoScoreETL(ExtractTransformLoad):
logger.info("Reading US GeoJSON (~6 minutes)")
self.geojson_usa_df = gpd.read_file(
self.CENSUS_USA_GEOJSON,
dtype={"GEOID10": "string"},
usecols=["GEOID10", "geometry"],
dtype={self.GEOID_FIELD_NAME: "string"},
usecols=[self.GEOID_FIELD_NAME, self.GEOMETRY_FIELD_NAME],
low_memory=False,
)
self.geojson_usa_df.head()
logger.info("Reading score CSV")
self.score_usa_df = pd.read_csv(
self.TILE_SCORE_CSV,
dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
dtype={self.TRACT_SHORT_FIELD: "string"},
low_memory=False,
)
def transform(self) -> None:
# rename GEOID10_TRACT to GEOID10 on score to allow merging with Census GeoJSON
# Rename GEOID10_TRACT to GEOID10 on score to allow merging with Census GeoJSON
self.score_usa_df.rename(
columns={self.GEOID_TRACT_FIELD_NAME: "GEOID10"},
columns={self.TRACT_SHORT_FIELD: self.GEOID_FIELD_NAME},
inplace=True,
)
logger.info("Pruning Census GeoJSON")
fields = ["GEOID10", "geometry"]
fields = [self.GEOID_FIELD_NAME, self.GEOMETRY_FIELD_NAME]
self.geojson_usa_df = self.geojson_usa_df[fields]
logger.info("Merging and compressing score CSV with USA GeoJSON")
self.geojson_score_usa_high = self.score_usa_df.merge(
self.geojson_usa_df, on="GEOID10", how="left"
self.geojson_usa_df, on=self.GEOID_FIELD_NAME, how="left"
)
self.geojson_score_usa_high = gpd.GeoDataFrame(
self.geojson_score_usa_high, crs="EPSG:4326"
)
logger.info(f"Columns: {self.geojson_score_usa_high.columns}")
usa_simplified = self.geojson_score_usa_high[
["GEOID10", self.TARGET_SCORE_NAME, "geometry"]
[
self.GEOID_FIELD_NAME,
self.TARGET_SCORE_SHORT_FIELD,
self.GEOMETRY_FIELD_NAME,
]
].reset_index(drop=True)
usa_simplified.rename(
columns={self.TARGET_SCORE_NAME: self.TARGET_SCORE_RENAME_TO},
columns={
self.TARGET_SCORE_SHORT_FIELD: self.TARGET_SCORE_RENAME_TO
},
inplace=True,
)
@ -104,7 +123,7 @@ class GeoScoreETL(ExtractTransformLoad):
usa_tracts = gpd.GeoDataFrame(
usa_tracts,
columns=[self.TARGET_SCORE_RENAME_TO, "geometry"],
columns=[self.TARGET_SCORE_RENAME_TO, self.GEOMETRY_FIELD_NAME],
crs="EPSG:4326",
)
@ -122,7 +141,7 @@ class GeoScoreETL(ExtractTransformLoad):
self.geojson_score_usa_low = gpd.GeoDataFrame(
compressed,
columns=[self.TARGET_SCORE_RENAME_TO, "geometry"],
columns=[self.TARGET_SCORE_RENAME_TO, self.GEOMETRY_FIELD_NAME],
crs="EPSG:4326",
)
@ -135,7 +154,7 @@ class GeoScoreETL(ExtractTransformLoad):
) -> gpd.GeoDataFrame:
# The tract identifier is the first 11 digits of the GEOID
block_group_df["tract"] = block_group_df.apply(
lambda row: row["GEOID10"][0:11], axis=1
lambda row: row[self.GEOID_FIELD_NAME][0:11], axis=1
)
state_tracts = block_group_df.dissolve(by="tract", aggfunc="mean")
return state_tracts
@ -160,7 +179,7 @@ class GeoScoreETL(ExtractTransformLoad):
[
self.TARGET_SCORE_RENAME_TO,
f"{self.TARGET_SCORE_RENAME_TO}_bucket",
"geometry",
self.GEOMETRY_FIELD_NAME,
]
].reset_index(drop=True)
state_dissolve = state_attr.dissolve(
@ -173,11 +192,13 @@ class GeoScoreETL(ExtractTransformLoad):
) -> gpd.GeoDataFrame:
compressed = []
for i in range(num_buckets):
for j in range(len(state_bucketed_df["geometry"][i].geoms)):
for j in range(
len(state_bucketed_df[self.GEOMETRY_FIELD_NAME][i].geoms)
):
compressed.append(
[
state_bucketed_df[self.TARGET_SCORE_RENAME_TO][i],
state_bucketed_df["geometry"][i].geoms[j],
state_bucketed_df[self.GEOMETRY_FIELD_NAME][i].geoms[j],
]
)
return compressed

View file

@ -1,5 +1,7 @@
from pathlib import Path
import json
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.utils import get_module_logger, zip_files
from data_pipeline.score import field_names
@ -198,16 +200,37 @@ class PostScoreETL(ExtractTransformLoad):
self, score_county_state_merged_df: pd.DataFrame
) -> pd.DataFrame:
logger.info("Rounding Decimals")
score_tiles = score_county_state_merged_df[
constants.TILES_SCORE_COLUMNS
]
# grab all the keys from tiles score columns
tiles_score_column_titles = list(constants.TILES_SCORE_COLUMNS.keys())
# filter the columns on full score
score_tiles = score_county_state_merged_df[tiles_score_column_titles]
# round decimals
decimals = pd.Series(
[constants.TILES_ROUND_NUM_DECIMALS]
* len(constants.TILES_SCORE_FLOAT_COLUMNS),
index=constants.TILES_SCORE_FLOAT_COLUMNS,
)
score_tiles = score_tiles.round(decimals)
return score_tiles.round(decimals)
# create indexes
score_tiles = score_tiles.rename(
columns=constants.TILES_SCORE_COLUMNS,
inplace=False,
)
# write the json map to disk
inverse_tiles_columns = {
v: k for k, v in constants.TILES_SCORE_COLUMNS.items()
} # reverse dict
index_file_path = constants.DATA_SCORE_JSON_INDEX_FILE_PATH
index_file_path.parent.mkdir(parents=True, exist_ok=True)
with open(index_file_path, "w", encoding="utf-8") as fp:
json.dump(inverse_tiles_columns, fp)
return score_tiles
def _create_downloadable_data(
self, score_county_state_merged_df: pd.DataFrame

File diff suppressed because one or more lines are too long