Update Side Panel Tile Data (#866)

* Update Side Panel Tile Data

* Update Side Panel Tile Data

* Correct indicator names to match CSV

* Replace Score with Rate

* Comment out FEMA Loss Rate to troubleshoot

* Removes all "FEMA Loss Rate" array elements

* Revert FEMA to Score

* Remove expected loss rate

* Remove RMP and NPL from BASIC array

* Attempt to make shape mismatch align

* Fix README typo

* Add Score L indicators to TILES_SCORE_FLOAT_COLUMNS

* Remove CBG references

* Complete the ticket

* Update side panel fields

* Update index file writing to create parent dir

* Updates from linting

* Fix missing field_names for island territories' 90th percentile fields

* Update downloadable fields and fix field name

* Update file fields and tests

* Update ordering of fields and leave TODO

* Update pickle after re-ordering of file

* Fix bugs in etl_score_geo

* Repeat index for diesel fix

* Get tests passing

* Add pytest.ini

Co-authored-by: Vim USDS <vimal.k.shah@omb.eop.gov>
Co-authored-by: Shelby Switzer <shelby.switzer@cms.hhs.gov>
Co-authored-by: lucasmbrown-usds <lucas.m.brown@omb.eop.gov>
Jorge Escobar, 2021-12-13 14:53:50 -05:00, committed by GitHub
commit 9709d08ca3
13 changed files with 328 additions and 141 deletions

View file: README.md

@@ -159,7 +159,7 @@ We use Docker to install the necessary libraries in a container that can be run
 To build the docker container the first time, make sure you're in the root directory of the repository and run `docker-compose build --no-cache`.
-Once completed, run `docker-compose up`. Docker will spin up 3 containers: the client container, the static server container and the data container. Once all data is generated, you can see the application using a browser and navigating to `htto://localhost:8000`.
+Once completed, run `docker-compose up`. Docker will spin up 3 containers: the client container, the static server container and the data container. Once all data is generated, you can see the application using a browser and navigating to `http://localhost:8000`.
 If you want to run specific data tasks, you can open a terminal window, navigate to the root folder for this repository and then execute any command for the application using this format:
@@ -322,7 +322,7 @@ score_initial_df = pd.read_csv(score_csv_path, dtype={"GEOID10_TRACT": "string"})
 score_initial_df.to_csv(data_path / "data_pipeline" / "etl" / "score" / "tests" / "sample_data" /"score_data_initial.csv", index=False)
 ```
-Now you can move on to updating inidvidual pickles for the tests. Note that it is helpful to do them in this order:
+Now you can move on to updating individual pickles for the tests. Note that it is helpful to do them in this order:
 We have four pickle files that correspond to expected files:
 - `score_data_expected.pkl`: Initial score without counties

View file: data_pipeline/etl/score/constants.py

@@ -1,7 +1,6 @@
 from pathlib import Path
 import datetime
-import pandas as pd
 
 from data_pipeline.config import settings
 from data_pipeline.score import field_names
@@ -38,6 +37,9 @@ FULL_SCORE_CSV_FULL_PLUS_COUNTIES_FILE_PATH = (
 # Score Tile CSV source path
 DATA_SCORE_CSV_TILES_PATH = DATA_SCORE_CSV_DIR / "tiles"
 DATA_SCORE_CSV_TILES_FILE_PATH = DATA_SCORE_CSV_TILES_PATH / "usa.csv"
+DATA_SCORE_JSON_INDEX_FILE_PATH = (
+    DATA_SCORE_CSV_TILES_PATH / "tile_indexes.json"
+)
 
 ## Tile path
 DATA_SCORE_TILES_DIR = DATA_SCORE_DIR / "tiles"
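
The new DATA_SCORE_JSON_INDEX_FILE_PATH is composed with pathlib's `/` operator, and the post-score step further down creates its parent directory before writing (the "Update index file writing to create parent dir" commit). A minimal sketch of that pattern, using a throwaway temp directory rather than the pipeline's real data paths:

```
# Sketch only: the directory layout here is illustrative, not the pipeline's.
from pathlib import Path
import tempfile

data_dir = Path(tempfile.mkdtemp())
index_file_path = data_dir / "score" / "csv" / "tiles" / "tile_indexes.json"

# Without this, the write raises FileNotFoundError when "tiles/" doesn't exist yet.
index_file_path.parent.mkdir(parents=True, exist_ok=True)
index_file_path.write_text("{}", encoding="utf-8")
print(index_file_path.exists())  # True
```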
@@ -60,119 +62,214 @@ SCORE_DOWNLOADABLE_ZIP_FILE_PATH = (
 # Column subsets
 CENSUS_COUNTIES_COLUMNS = ["USPS", "GEOID", "NAME"]
 
-TILES_SCORE_COLUMNS = [
-    field_names.GEOID_TRACT_FIELD,
-    field_names.STATE_FIELD,
-    field_names.COUNTY_FIELD,
-    field_names.TOTAL_POP_FIELD,
-    field_names.SCORE_D + field_names.PERCENTILE_FIELD_SUFFIX,
-    field_names.SCORE_D + field_names.TOP_25_PERCENTILE_SUFFIX,
-    field_names.SCORE_E + field_names.PERCENTILE_FIELD_SUFFIX,
-    field_names.SCORE_E + field_names.TOP_25_PERCENTILE_SUFFIX,
-    field_names.SCORE_G_COMMUNITIES,
-    field_names.SCORE_G,
-    field_names.SCORE_L_COMMUNITIES,
-    field_names.SCORE_L + field_names.PERCENTILE_FIELD_SUFFIX,
-    field_names.POVERTY_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
-    field_names.HIGH_SCHOOL_ED_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
-    field_names.LINGUISTIC_ISO_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
-    field_names.UNEMPLOYMENT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
-    field_names.HOUSING_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
-    field_names.DIABETES_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
-    field_names.ASTHMA_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
-    field_names.HEART_DISEASE_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
-    field_names.LOW_LIFE_EXPECTANCY_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
-    field_names.TRAFFIC_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
-    field_names.FEMA_RISK_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
-    field_names.ENERGY_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
-    field_names.WASTEWATER_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
-    field_names.LEAD_PAINT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
-    field_names.DIESEL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
-    field_names.PM25_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
-    field_names.LOW_MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD
-    + field_names.PERCENTILE_FIELD_SUFFIX,
-    field_names.POVERTY_LESS_THAN_200_FPL_FIELD
-    + field_names.PERCENTILE_FIELD_SUFFIX,
-]
+TILES_ROUND_NUM_DECIMALS = 2
+
+# Tiles data: full field name, tile index name
+TILES_SCORE_COLUMNS = {
+    field_names.GEOID_TRACT_FIELD: "GTF",
+    field_names.STATE_FIELD: "SF",
+    field_names.COUNTY_FIELD: "CF",
+    field_names.DIABETES_FIELD + field_names.PERCENTILE_FIELD_SUFFIX: "DF_PFS",
+    field_names.ASTHMA_FIELD + field_names.PERCENTILE_FIELD_SUFFIX: "AF_PFS",
+    field_names.HEART_DISEASE_FIELD
+    + field_names.PERCENTILE_FIELD_SUFFIX: "HDF_PFS",
+    field_names.DIESEL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX: "DSF_PFS",
+    field_names.ENERGY_BURDEN_FIELD
+    + field_names.PERCENTILE_FIELD_SUFFIX: "EBF_PFS",
+    field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD
+    + field_names.PERCENTILE_FIELD_SUFFIX: "EALR_PFS",
+    field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD
+    + field_names.PERCENTILE_FIELD_SUFFIX: "EBLR_PFS",
+    field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD
+    + field_names.PERCENTILE_FIELD_SUFFIX: "EPLR_PFS",
+    field_names.HOUSING_BURDEN_FIELD
+    + field_names.PERCENTILE_FIELD_SUFFIX: "HBF_PFS",
+    field_names.LOW_LIFE_EXPECTANCY_FIELD
+    + field_names.PERCENTILE_FIELD_SUFFIX: "LLEF_PFS",
+    field_names.LINGUISTIC_ISO_FIELD
+    + field_names.PERCENTILE_FIELD_SUFFIX: "LIF_PFS",
+    field_names.LOW_MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD
+    + field_names.PERCENTILE_FIELD_SUFFIX: "LMI_PFS",
+    field_names.MEDIAN_HOUSE_VALUE_FIELD
+    + field_names.PERCENTILE_FIELD_SUFFIX: "MHVF_PFS",
+    field_names.PM25_FIELD + field_names.PERCENTILE_FIELD_SUFFIX: "PM25F_PFS",
+    field_names.HIGH_SCHOOL_ED_FIELD: "HSEF",
+    field_names.POVERTY_LESS_THAN_100_FPL_FIELD
+    + field_names.PERCENTILE_FIELD_SUFFIX: "P100_PFS",
+    field_names.POVERTY_LESS_THAN_200_FPL_FIELD
+    + field_names.PERCENTILE_FIELD_SUFFIX: "P200_PFS",
+    field_names.LEAD_PAINT_FIELD
+    + field_names.PERCENTILE_FIELD_SUFFIX: "LPF_PFS",
+    field_names.NPL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX: "NPL_PFS",
+    field_names.RMP_FIELD + field_names.PERCENTILE_FIELD_SUFFIX: "RMP_PFS",
+    field_names.TSDF_FIELD + field_names.PERCENTILE_FIELD_SUFFIX: "TSDF_PFS",
+    field_names.TOTAL_POP_FIELD: "TPF",
+    field_names.TRAFFIC_FIELD + field_names.PERCENTILE_FIELD_SUFFIX: "TF_PFS",
+    field_names.UNEMPLOYMENT_FIELD
+    + field_names.PERCENTILE_FIELD_SUFFIX: "UF_PFS",
+    field_names.WASTEWATER_FIELD
+    + field_names.PERCENTILE_FIELD_SUFFIX: "WF_PFS",
+    field_names.L_WATER: "L_WTR",
+    field_names.L_WORKFORCE: "L_WKFC",
+    field_names.L_CLIMATE: "L_CLT",
+    field_names.L_ENERGY: "L_ENY",
+    field_names.L_TRANSPORTATION: "L_TRN",
+    field_names.L_HOUSING: "L_HSG",
+    field_names.L_POLLUTION: "L_PLN",
+    field_names.L_HEALTH: "L_HLTH",
+    field_names.SCORE_L_COMMUNITIES: "SL_C",
+    field_names.SCORE_L + field_names.PERCENTILE_FIELD_SUFFIX: "SL_PFS",
+    field_names.EXPECTED_POPULATION_LOSS_RATE_LOW_INCOME_FIELD: "EPLRLI",
+    field_names.EXPECTED_AGRICULTURE_LOSS_RATE_LOW_INCOME_FIELD: "EALRLI",
+    field_names.EXPECTED_BUILDING_LOSS_RATE_LOW_INCOME_FIELD: "EBLRLI",
+    field_names.PM25_EXPOSURE_LOW_INCOME_FIELD: "PM25LI",
+    field_names.ENERGY_BURDEN_LOW_INCOME_FIELD: "EBLI",
+    field_names.DIESEL_PARTICULATE_MATTER_LOW_INCOME_FIELD: "DPMLI",
+    field_names.TRAFFIC_PROXIMITY_LOW_INCOME_FIELD: "TPLI",
+    field_names.LEAD_PAINT_MEDIAN_HOUSE_VALUE_LOW_INCOME_FIELD: "LPMHVLI",
+    field_names.HOUSING_BURDEN_LOW_INCOME_FIELD: "HBLI",
+    field_names.RMP_LOW_INCOME_FIELD: "RMPLI",
+    field_names.SUPERFUND_LOW_INCOME_FIELD: "SFLI",
+    field_names.HAZARDOUS_WASTE_LOW_INCOME_FIELD: "HWLI",
+    field_names.WASTEWATER_DISCHARGE_LOW_INCOME_FIELD: "WDLI",
+    field_names.DIABETES_LOW_INCOME_FIELD: "DLI",
+    field_names.ASTHMA_LOW_INCOME_FIELD: "ALI",
+    field_names.HEART_DISEASE_LOW_INCOME_FIELD: "HDLI",
+    field_names.LOW_LIFE_EXPECTANCY_LOW_INCOME_FIELD: "LLELI",
+    field_names.LINGUISTIC_ISOLATION_LOW_HS_EDUCATION_FIELD: "LILHSE",
+    field_names.POVERTY_LOW_HS_EDUCATION_FIELD: "PLHSE",
+    field_names.LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD: "LMILHSE",
+    field_names.UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD: "ULHSE",
+    field_names.FPL_200_SERIES: "FPL200S",
+    field_names.THRESHOLD_COUNT: "TC",
+    field_names.ISLAND_AREAS_UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD: "IAULHSE",
+    field_names.ISLAND_AREAS_POVERTY_LOW_HS_EDUCATION_FIELD: "ISPLHSE",
+    field_names.ISLAND_AREAS_LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD: "IALMILHSE",
+}
 
 # columns to round floats to 2 decimals
+# TODO refactor to use much smaller subset of fields we DON'T want to round
 TILES_SCORE_FLOAT_COLUMNS = [
-    field_names.SCORE_D + field_names.PERCENTILE_FIELD_SUFFIX,
-    field_names.SCORE_D + field_names.TOP_25_PERCENTILE_SUFFIX,
-    field_names.SCORE_E + field_names.PERCENTILE_FIELD_SUFFIX,
-    field_names.SCORE_E + field_names.TOP_25_PERCENTILE_SUFFIX,
-    field_names.SCORE_L + field_names.PERCENTILE_FIELD_SUFFIX,
-    field_names.POVERTY_FIELD,
-    field_names.HIGH_SCHOOL_ED_FIELD,
-    field_names.LINGUISTIC_ISO_FIELD,
-    field_names.UNEMPLOYMENT_FIELD,
-    field_names.HOUSING_BURDEN_FIELD,
-    field_names.POVERTY_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
-    field_names.HIGH_SCHOOL_ED_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
-    field_names.LINGUISTIC_ISO_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
-    field_names.UNEMPLOYMENT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
-    field_names.HOUSING_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
     field_names.DIABETES_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
     field_names.ASTHMA_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
     field_names.HEART_DISEASE_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
-    field_names.LOW_LIFE_EXPECTANCY_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
-    field_names.TRAFFIC_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
-    field_names.FEMA_RISK_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
-    field_names.ENERGY_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
-    field_names.WASTEWATER_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
-    field_names.LEAD_PAINT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
     field_names.DIESEL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
-    field_names.PM25_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.ENERGY_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD
+    + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD
+    + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD
+    + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.HOUSING_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.LOW_LIFE_EXPECTANCY_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.LINGUISTIC_ISO_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
     field_names.LOW_MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD
     + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.MEDIAN_HOUSE_VALUE_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.PM25_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.POVERTY_LESS_THAN_100_FPL_FIELD
+    + field_names.PERCENTILE_FIELD_SUFFIX,
     field_names.POVERTY_LESS_THAN_200_FPL_FIELD
     + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.LEAD_PAINT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.NPL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.RMP_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.TSDF_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.TRAFFIC_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.UNEMPLOYMENT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.WASTEWATER_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.SCORE_L + field_names.PERCENTILE_FIELD_SUFFIX,
 ]
-TILES_ROUND_NUM_DECIMALS = 2
-
-DOWNLOADABLE_SCORE_INDICATOR_COLUMNS_BASIC = [
-    field_names.AMI_FIELD,
-    field_names.POVERTY_LESS_THAN_100_FPL_FIELD,
-    field_names.HIGH_SCHOOL_ED_FIELD,
-    field_names.DIABETES_FIELD,
-    field_names.ASTHMA_FIELD,
-    field_names.HEART_DISEASE_FIELD,
-    field_names.TRAFFIC_FIELD,
-    field_names.FEMA_RISK_FIELD,
-    field_names.ENERGY_BURDEN_FIELD,
-    field_names.HOUSING_BURDEN_FIELD,
-    field_names.WASTEWATER_FIELD,
-    field_names.LEAD_PAINT_FIELD,
-    field_names.DIESEL_FIELD,
-    field_names.PM25_FIELD,
-    field_names.TOTAL_POP_FIELD,
-]
-
-# For every indicator above, we want to include percentile also.
-DOWNLOADABLE_SCORE_INDICATOR_COLUMNS_FULL = list(
-    pd.core.common.flatten(
-        [
-            [p, f"{p}{field_names.PERCENTILE_FIELD_SUFFIX}"]
-            for p in DOWNLOADABLE_SCORE_INDICATOR_COLUMNS_BASIC
-        ]
-    )
-)
 
 # Finally we augment with the GEOID10, county, and state
 DOWNLOADABLE_SCORE_COLUMNS = [
     field_names.GEOID_TRACT_FIELD,
     field_names.COUNTY_FIELD,
     field_names.STATE_FIELD,
-    field_names.SCORE_G_COMMUNITIES,
-    # Note: the reverse percentile fields get moved down here because
-    # we put the raw value in the download along with the *reversed* percentile.
-    # All other fields we put in f"{field_name}" and
-    # f"{field_name}{field_names.PERCENTILE_FIELD_SUFFIX}", which doesn't work for the
-    # reversed percentile fields.
-    field_names.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD,
-    field_names.LOW_MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD
-    + field_names.PERCENTILE_FIELD_SUFFIX,
-    field_names.LIFE_EXPECTANCY_FIELD,
-    field_names.LOW_LIFE_EXPECTANCY_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
-    *DOWNLOADABLE_SCORE_INDICATOR_COLUMNS_FULL,
+    field_names.SCORE_L_COMMUNITIES,
+    field_names.TOTAL_POP_FIELD,
+    field_names.FPL_200_SERIES,
+    field_names.POVERTY_LESS_THAN_200_FPL_FIELD,
+    field_names.POVERTY_LESS_THAN_200_FPL_FIELD
+    + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD,
+    field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD
+    + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.EXPECTED_AGRICULTURE_LOSS_RATE_LOW_INCOME_FIELD,
+    field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD,
+    field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD
+    + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.EXPECTED_BUILDING_LOSS_RATE_LOW_INCOME_FIELD,
+    field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD,
+    field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD
+    + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.EXPECTED_POPULATION_LOSS_RATE_LOW_INCOME_FIELD,
+    field_names.ENERGY_BURDEN_FIELD,
+    field_names.ENERGY_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.ENERGY_BURDEN_LOW_INCOME_FIELD,
+    field_names.PM25_FIELD,
+    field_names.PM25_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.PM25_EXPOSURE_LOW_INCOME_FIELD,
+    field_names.DIESEL_FIELD,
+    field_names.DIESEL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.DIESEL_PARTICULATE_MATTER_LOW_INCOME_FIELD,
+    field_names.TRAFFIC_FIELD,
+    field_names.TRAFFIC_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.TRAFFIC_PROXIMITY_LOW_INCOME_FIELD,
+    field_names.HOUSING_BURDEN_FIELD,
+    field_names.HOUSING_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.HOUSING_BURDEN_LOW_INCOME_FIELD,
+    field_names.LEAD_PAINT_FIELD,
+    field_names.LEAD_PAINT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.LEAD_PAINT_MEDIAN_HOUSE_VALUE_LOW_INCOME_FIELD,
+    field_names.MEDIAN_HOUSE_VALUE_FIELD,
+    field_names.MEDIAN_HOUSE_VALUE_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.TSDF_FIELD,
+    field_names.TSDF_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.HAZARDOUS_WASTE_LOW_INCOME_FIELD,
+    field_names.NPL_FIELD,
+    field_names.NPL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.SUPERFUND_LOW_INCOME_FIELD,
+    field_names.RMP_FIELD,
+    field_names.RMP_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.RMP_LOW_INCOME_FIELD,
+    field_names.WASTEWATER_FIELD,
+    field_names.WASTEWATER_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.WASTEWATER_DISCHARGE_LOW_INCOME_FIELD,
+    field_names.ASTHMA_FIELD,
+    field_names.ASTHMA_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.ASTHMA_LOW_INCOME_FIELD,
+    field_names.DIABETES_FIELD,
+    field_names.DIABETES_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.DIABETES_LOW_INCOME_FIELD,
+    field_names.HEART_DISEASE_FIELD,
+    field_names.HEART_DISEASE_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.HEART_DISEASE_LOW_INCOME_FIELD,
+    field_names.LIFE_EXPECTANCY_FIELD,
+    field_names.LOW_LIFE_EXPECTANCY_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.LOW_LIFE_EXPECTANCY_LOW_INCOME_FIELD,
+    field_names.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD,
+    field_names.LOW_MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD
+    + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD,
+    field_names.LINGUISTIC_ISO_FIELD,
+    field_names.LINGUISTIC_ISOLATION_LOW_HS_EDUCATION_FIELD,
+    field_names.UNEMPLOYMENT_FIELD,
+    field_names.UNEMPLOYMENT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.LINGUISTIC_ISO_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.POVERTY_LESS_THAN_100_FPL_FIELD,
+    field_names.POVERTY_LESS_THAN_100_FPL_FIELD
+    + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.POVERTY_LOW_HS_EDUCATION_FIELD,
+    field_names.HIGH_SCHOOL_ED_FIELD,
+    field_names.HIGH_SCHOOL_ED_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.THRESHOLD_COUNT,
+    field_names.UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD,
+    field_names.COMBINED_UNEMPLOYMENT_2010,
+    field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009,
+    field_names.COMBINED_POVERTY_LESS_THAN_100_FPL_FIELD_2010,
+    field_names.ISLAND_AREAS_UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD,
+    field_names.ISLAND_AREAS_POVERTY_LOW_HS_EDUCATION_FIELD,
+    field_names.ISLAND_AREAS_LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD,
 ]
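
Mapping the long, human-readable field names down to short codes like "GTF" and "SL_PFS" keeps the tile CSV (and the vector tiles built from it) small. A minimal sketch of how such a mapping is applied, with made-up field names standing in for the real `field_names` constants:

```
import pandas as pd

# Hypothetical stand-ins for the real field_names constants.
TILES_SCORE_COLUMNS = {
    "GEOID10_TRACT": "GTF",
    "Total population": "TPF",
    "Definition L (percentile)": "SL_PFS",
}

df = pd.DataFrame(
    {
        "GEOID10_TRACT": ["01001020100"],
        "Total population": [1993],
        "Definition L (percentile)": [0.87],
        "Some other column": ["dropped"],
    }
)

# Keep only the mapped columns, then rename them to their short codes,
# mirroring what the post-score step does with the real mapping.
score_tiles = df[list(TILES_SCORE_COLUMNS.keys())].rename(
    columns=TILES_SCORE_COLUMNS
)
print(list(score_tiles.columns))  # ['GTF', 'TPF', 'SL_PFS']
```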

View file: data_pipeline/etl/score/etl_score_geo.py

@@ -3,10 +3,12 @@ import pandas as pd
 import geopandas as gpd
 
 from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.score import constants
 from data_pipeline.etl.sources.census.etl_utils import (
     check_census_data_source,
 )
 from data_pipeline.etl.score.etl_utils import check_score_data_source
+from data_pipeline.score import field_names
 from data_pipeline.utils import get_module_logger
 
 logger = get_module_logger(__name__)
@@ -31,9 +33,19 @@ class GeoScoreETL(ExtractTransformLoad):
             self.DATA_PATH / "census" / "geojson" / "us.json"
         )
 
-        self.TARGET_SCORE_NAME = "Definition L (percentile)"
+        # Import the shortened name for Score L percentile ("SL_PFS") that's used on the
+        # tiles.
+        self.TARGET_SCORE_SHORT_FIELD = constants.TILES_SCORE_COLUMNS[
+            field_names.SCORE_L + field_names.PERCENTILE_FIELD_SUFFIX
+        ]
         self.TARGET_SCORE_RENAME_TO = "L_SCORE"
 
+        # Import the shortened name for tract ("GTF") that's used on the tiles.
+        self.TRACT_SHORT_FIELD = constants.TILES_SCORE_COLUMNS[
+            field_names.GEOID_TRACT_FIELD
+        ]
+        self.GEOMETRY_FIELD_NAME = "geometry"
+
         self.NUMBER_OF_BUCKETS = 10
 
         self.geojson_usa_df: gpd.GeoDataFrame
@@ -57,45 +69,52 @@ class GeoScoreETL(ExtractTransformLoad):
         logger.info("Reading US GeoJSON (~6 minutes)")
         self.geojson_usa_df = gpd.read_file(
             self.CENSUS_USA_GEOJSON,
-            dtype={"GEOID10": "string"},
-            usecols=["GEOID10", "geometry"],
+            dtype={self.GEOID_FIELD_NAME: "string"},
+            usecols=[self.GEOID_FIELD_NAME, self.GEOMETRY_FIELD_NAME],
             low_memory=False,
         )
-        self.geojson_usa_df.head()
 
         logger.info("Reading score CSV")
         self.score_usa_df = pd.read_csv(
             self.TILE_SCORE_CSV,
-            dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
+            dtype={self.TRACT_SHORT_FIELD: "string"},
             low_memory=False,
         )
 
     def transform(self) -> None:
-        # rename GEOID10_TRACT to GEOID10 on score to allow merging with Census GeoJSON
+        # Rename GEOID10_TRACT to GEOID10 on score to allow merging with Census GeoJSON
         self.score_usa_df.rename(
-            columns={self.GEOID_TRACT_FIELD_NAME: "GEOID10"},
+            columns={self.TRACT_SHORT_FIELD: self.GEOID_FIELD_NAME},
             inplace=True,
         )
 
         logger.info("Pruning Census GeoJSON")
-        fields = ["GEOID10", "geometry"]
+        fields = [self.GEOID_FIELD_NAME, self.GEOMETRY_FIELD_NAME]
         self.geojson_usa_df = self.geojson_usa_df[fields]
 
         logger.info("Merging and compressing score CSV with USA GeoJSON")
         self.geojson_score_usa_high = self.score_usa_df.merge(
-            self.geojson_usa_df, on="GEOID10", how="left"
+            self.geojson_usa_df, on=self.GEOID_FIELD_NAME, how="left"
         )
 
         self.geojson_score_usa_high = gpd.GeoDataFrame(
             self.geojson_score_usa_high, crs="EPSG:4326"
         )
 
+        logger.info(f"Columns: {self.geojson_score_usa_high.columns}")
+
         usa_simplified = self.geojson_score_usa_high[
-            ["GEOID10", self.TARGET_SCORE_NAME, "geometry"]
+            [
+                self.GEOID_FIELD_NAME,
+                self.TARGET_SCORE_SHORT_FIELD,
+                self.GEOMETRY_FIELD_NAME,
+            ]
         ].reset_index(drop=True)
 
         usa_simplified.rename(
-            columns={self.TARGET_SCORE_NAME: self.TARGET_SCORE_RENAME_TO},
+            columns={
+                self.TARGET_SCORE_SHORT_FIELD: self.TARGET_SCORE_RENAME_TO
+            },
             inplace=True,
         )
@@ -104,7 +123,7 @@ class GeoScoreETL(ExtractTransformLoad):
         usa_tracts = gpd.GeoDataFrame(
             usa_tracts,
-            columns=[self.TARGET_SCORE_RENAME_TO, "geometry"],
+            columns=[self.TARGET_SCORE_RENAME_TO, self.GEOMETRY_FIELD_NAME],
             crs="EPSG:4326",
         )
@@ -122,7 +141,7 @@ class GeoScoreETL(ExtractTransformLoad):
         self.geojson_score_usa_low = gpd.GeoDataFrame(
             compressed,
-            columns=[self.TARGET_SCORE_RENAME_TO, "geometry"],
+            columns=[self.TARGET_SCORE_RENAME_TO, self.GEOMETRY_FIELD_NAME],
             crs="EPSG:4326",
         )
@@ -135,7 +154,7 @@ class GeoScoreETL(ExtractTransformLoad):
     ) -> gpd.GeoDataFrame:
         # The tract identifier is the first 11 digits of the GEOID
         block_group_df["tract"] = block_group_df.apply(
-            lambda row: row["GEOID10"][0:11], axis=1
+            lambda row: row[self.GEOID_FIELD_NAME][0:11], axis=1
        )
         state_tracts = block_group_df.dissolve(by="tract", aggfunc="mean")
         return state_tracts
@@ -160,7 +179,7 @@ class GeoScoreETL(ExtractTransformLoad):
             [
                 self.TARGET_SCORE_RENAME_TO,
                 f"{self.TARGET_SCORE_RENAME_TO}_bucket",
-                "geometry",
+                self.GEOMETRY_FIELD_NAME,
             ]
         ].reset_index(drop=True)
         state_dissolve = state_attr.dissolve(
@@ -173,11 +192,13 @@ class GeoScoreETL(ExtractTransformLoad):
     ) -> gpd.GeoDataFrame:
         compressed = []
         for i in range(num_buckets):
-            for j in range(len(state_bucketed_df["geometry"][i].geoms)):
+            for j in range(
+                len(state_bucketed_df[self.GEOMETRY_FIELD_NAME][i].geoms)
+            ):
                 compressed.append(
                     [
                         state_bucketed_df[self.TARGET_SCORE_RENAME_TO][i],
-                        state_bucketed_df["geometry"][i].geoms[j],
+                        state_bucketed_df[self.GEOMETRY_FIELD_NAME][i].geoms[j],
                     ]
                 )
         return compressed
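
The dissolve step keys on the first 11 characters of the GEOID, which identify the census tract. A minimal sketch of that slicing with plain pandas (the GEOIDs are made up):

```
import pandas as pd

# Made-up block-group GEOIDs; the first 11 digits are the tract identifier.
df = pd.DataFrame({"GEOID10": ["010010201001", "010010201002"]})

# Vectorized equivalent of the row-wise lambda used before the dissolve.
df["tract"] = df["GEOID10"].str[0:11]
print(df["tract"].unique())  # ['01001020100']
```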

View file: data_pipeline/etl/score/etl_score_post.py

@@ -1,5 +1,7 @@
 from pathlib import Path
+import json
+
 import pandas as pd
 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.utils import get_module_logger, zip_files
 from data_pipeline.score import field_names
@@ -198,16 +200,37 @@ class PostScoreETL(ExtractTransformLoad):
         self, score_county_state_merged_df: pd.DataFrame
     ) -> pd.DataFrame:
         logger.info("Rounding Decimals")
-        score_tiles = score_county_state_merged_df[
-            constants.TILES_SCORE_COLUMNS
-        ]
+
+        # grab all the keys from tiles score columns
+        tiles_score_column_titles = list(constants.TILES_SCORE_COLUMNS.keys())
+
+        # filter the columns on full score
+        score_tiles = score_county_state_merged_df[tiles_score_column_titles]
+
+        # round decimals
         decimals = pd.Series(
             [constants.TILES_ROUND_NUM_DECIMALS]
             * len(constants.TILES_SCORE_FLOAT_COLUMNS),
             index=constants.TILES_SCORE_FLOAT_COLUMNS,
         )
+        score_tiles = score_tiles.round(decimals)
 
-        return score_tiles.round(decimals)
+        # create indexes
+        score_tiles = score_tiles.rename(
+            columns=constants.TILES_SCORE_COLUMNS,
+            inplace=False,
+        )
+
+        # write the json map to disk
+        inverse_tiles_columns = {
+            v: k for k, v in constants.TILES_SCORE_COLUMNS.items()
+        }  # reverse dict
+        index_file_path = constants.DATA_SCORE_JSON_INDEX_FILE_PATH
+        index_file_path.parent.mkdir(parents=True, exist_ok=True)
+        with open(index_file_path, "w", encoding="utf-8") as fp:
+            json.dump(inverse_tiles_columns, fp)
+
+        return score_tiles
 
     def _create_downloadable_data(
         self, score_county_state_merged_df: pd.DataFrame
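
Because the tiles CSV now ships with short column codes, the inverse map written to tile_indexes.json is what lets a consumer translate codes back to full field names. A minimal sketch of the round trip (the mapping is a toy one, and the file is written to the current directory just for demonstration):

```
import json
from pathlib import Path

# Write the inverse map the way the step above does, with a toy mapping.
inverse_tiles_columns = {
    "GTF": "GEOID10_TRACT",
    "SL_PFS": "Definition L (percentile)",
}
index_file_path = Path("tile_indexes.json")
with open(index_file_path, "w", encoding="utf-8") as fp:
    json.dump(inverse_tiles_columns, fp)

# A consumer can then recover the full name for any short code.
with open(index_file_path, encoding="utf-8") as fp:
    short_to_full = json.load(fp)
print(short_to_full["SL_PFS"])  # Definition L (percentile)
```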

File diff suppressed because one or more lines are too long

View file: data_pipeline/etl/sources/census/etl.py

@@ -136,7 +136,7 @@ class CensusETL(ExtractTransformLoad):
     def transform(self) -> None:
         """Download all census shape files from the Census FTP and extract the geojson
-        to generate national and by state Census Block Group CSVs and GeoJSONs
+        to generate national and by state Census tract CSVs and GeoJSONs
 
         Returns:
             None
@@ -225,7 +225,7 @@ class CensusETL(ExtractTransformLoad):
         logger.info("Writing national geojson file")
         usa_df.to_file(self.NATIONAL_TRACT_JSON_PATH, driver="GeoJSON")
 
-        logger.info("Census block groups downloading complete")
+        logger.info("Census tract downloading complete")
 
     def load(self) -> None:
         """Create state CSVs, National CSV, and National GeoJSON

View file: data_pipeline/score/field_names.py

@@ -291,14 +291,27 @@ POVERTY_LOW_HS_EDUCATION_FIELD = (
     " and has low HS education"
 )
 
-LOW_READING_LOW_HS_EDUCATION_FIELD = (
-    f"At or above the {PERCENTILE}th percentile for low 3rd grade reading proficiency"
-    " and has low HS education"
-)
-
 LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD = (
-    f"At or below the {PERCENTILE}th percentile for low median household income as a "
+    f"At or above the {PERCENTILE}th percentile for low median household income as a "
     f"percent of area median income and has low HS education"
 )
 
+# Workforce for island areas
+ISLAND_AREAS_SUFFIX = " in 2009 (island areas)"
+ISLAND_AREAS_UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD = (
+    f"At or above the {PERCENTILE}th percentile for unemployment"
+    f" and has low HS education{ISLAND_AREAS_SUFFIX}"
+)
+ISLAND_AREAS_POVERTY_LOW_HS_EDUCATION_FIELD = (
+    f"At or above the {PERCENTILE}th percentile for households at or below 100% federal poverty level"
+    f" and has low HS education{ISLAND_AREAS_SUFFIX}"
+)
+ISLAND_AREAS_LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD = (
+    f"At or above the {PERCENTILE}th percentile for low median household income as a "
+    f"percent of area median income"
+    f" and has low HS education{ISLAND_AREAS_SUFFIX}"
+)
 
 # Not currently used in a factor
@@ -317,6 +330,10 @@ HEALTHY_FOOD_LOW_INCOME_FIELD = (
     f"At or above the {PERCENTILE}th percentile for low "
     f"access to healthy food and is low income"
 )
+LOW_READING_LOW_HS_EDUCATION_FIELD = (
+    f"At or above the {PERCENTILE}th percentile for low 3rd grade reading proficiency"
+    " and has low HS education"
+)
 
 THRESHOLD_COUNT = "Total threshold criteria exceeded"
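
The island-area field names are built by appending ISLAND_AREAS_SUFFIX to otherwise ordinary threshold descriptions, so every such column carries the same " in 2009 (island areas)" marker. A quick illustration with the literal strings, assuming PERCENTILE is 90 (the 90th-percentile wording elsewhere in this diff suggests that value):

```
# Assumed value; the real constant lives in field_names.py.
PERCENTILE = 90
ISLAND_AREAS_SUFFIX = " in 2009 (island areas)"

ISLAND_AREAS_UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD = (
    f"At or above the {PERCENTILE}th percentile for unemployment"
    f" and has low HS education{ISLAND_AREAS_SUFFIX}"
)
print(ISLAND_AREAS_UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD)
# At or above the 90th percentile for unemployment and has low HS education in 2009 (island areas)
```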

View file: data_pipeline/score/score_l.py

@@ -117,7 +117,7 @@ class ScoreL(Score):
         """
         self.df[field_names.THRESHOLD_COUNT] += self.df[columns_for_subset].sum(
-            axis=1
+            axis=1, skipna=True
         )
 
     def add_columns(self) -> pd.DataFrame:
@@ -162,7 +162,7 @@ class ScoreL(Score):
             non_workforce_factors
         ].any(axis=1)
 
-        self.df["Definition L (percentile)"] = self.df[
+        self.df[field_names.SCORE_L + field_names.PERCENTILE_FIELD_SUFFIX] = self.df[
             field_names.SCORE_L_COMMUNITIES
         ].astype(int)
@@ -586,12 +586,16 @@ class ScoreL(Score):
         )
 
         # Now, calculate workforce criteria for island territories.
+        island_areas_workforce_eligibility_columns = [
+            field_names.ISLAND_AREAS_UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD,
+            field_names.ISLAND_AREAS_POVERTY_LOW_HS_EDUCATION_FIELD,
+            field_names.ISLAND_AREAS_LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD,
+        ]
 
+        # For a couple of values, create a combined field and criteria field.
         # First, combine unemployment.
         (
             self.df,
-            unemployment_island_areas_criteria_field_name,
+            island_areas_unemployment_criteria_field_name,
         ) = self._combine_island_areas_with_states_and_set_thresholds(
             df=self.df,
             column_from_island_areas=field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009,
@@ -603,7 +607,7 @@ class ScoreL(Score):
         # Next, combine poverty.
         (
             self.df,
-            poverty_island_areas_criteria_field_name,
+            island_areas_poverty_criteria_field_name,
         ) = self._combine_island_areas_with_states_and_set_thresholds(
             df=self.df,
             column_from_island_areas=field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_100_FPL_FIELD_2009,
@@ -614,12 +618,12 @@ class ScoreL(Score):
         # Also check whether low area median income is 90th percentile or higher
         # within the islands.
-        low_median_income_as_a_percent_of_ami_island_areas_criteria_field_name = (
+        island_areas_low_median_income_as_a_percent_of_ami_criteria_field_name = (
             f"{field_names.LOW_CENSUS_DECENNIAL_AREA_MEDIAN_INCOME_PERCENT_FIELD_2009} exceeds "
             f"{field_names.PERCENTILE}th percentile"
         )
         self.df[
-            low_median_income_as_a_percent_of_ami_island_areas_criteria_field_name
+            island_areas_low_median_income_as_a_percent_of_ami_criteria_field_name
         ] = (
             self.df[
                 field_names.LOW_CENSUS_DECENNIAL_AREA_MEDIAN_INCOME_PERCENT_FIELD_2009
@@ -628,17 +632,40 @@ class ScoreL(Score):
             >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
         )
 
-        workforce_combined_criteria_for_island_areas = (
-            self.df[unemployment_island_areas_criteria_field_name]
-            | self.df[poverty_island_areas_criteria_field_name]
-            | self.df[
-                low_median_income_as_a_percent_of_ami_island_areas_criteria_field_name
-            ]
-        ) & (
+        island_areas_high_school_achievement_rate_threshold = (
             self.df[field_names.CENSUS_DECENNIAL_HIGH_SCHOOL_ED_FIELD_2009]
             >= self.LACK_OF_HIGH_SCHOOL_MINIMUM_THRESHOLD
         )
 
+        self.df[
+            field_names.ISLAND_AREAS_UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD
+        ] = (
+            self.df[island_areas_unemployment_criteria_field_name]
+            & island_areas_high_school_achievement_rate_threshold
+        )
+
+        self.df[field_names.ISLAND_AREAS_POVERTY_LOW_HS_EDUCATION_FIELD] = (
+            self.df[island_areas_poverty_criteria_field_name]
+            & island_areas_high_school_achievement_rate_threshold
+        )
+
+        self.df[
+            field_names.ISLAND_AREAS_LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD
+        ] = (
+            self.df[
+                island_areas_low_median_income_as_a_percent_of_ami_criteria_field_name
+            ]
+            & island_areas_high_school_achievement_rate_threshold
+        )
+
+        workforce_combined_criteria_for_island_areas = self.df[
+            island_areas_workforce_eligibility_columns
+        ].any(axis="columns")
+
+        self._increment_total_eligibility_exceeded(
+            island_areas_workforce_eligibility_columns
+        )
+
         percent_of_island_tracts_highlighted = (
             100
             * workforce_combined_criteria_for_island_areas.sum()
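
The explicit `skipna=True` matters here because the island-area criteria columns are undefined (NaN) for most tracts: skipping NaN treats an undefined criterion as not exceeded rather than poisoning the running threshold count. A toy illustration of that row-sum behavior (the column names are made up; the real call sums the boolean criteria fields):

```
import numpy as np
import pandas as pd

df = pd.DataFrame(
    {
        "criteria_a": [1.0, 0.0, np.nan],  # NaN: criterion not defined for this tract
        "criteria_b": [1.0, 1.0, np.nan],
    }
)

# With skipna=True, NaN contributes nothing, so the all-NaN row sums to 0.
print(df.sum(axis=1, skipna=True).tolist())  # [2.0, 1.0, 0.0]
```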

View file: pytest.ini

@@ -0,0 +1,2 @@
+[pytest]
+norecursedirs = .git data
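
For context: `norecursedirs` tells pytest which directories to skip during test collection, so excluding `.git` and the pipeline's large `data` directory keeps collection fast and stops pytest from picking up stray files. Note that setting this option replaces pytest's default skip patterns (such as `build` and `dist`) rather than extending them.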