Merge branch 'esfoobar-usds/1062-implement-changes-export-files' into issue-239-saran-ahluwalia

commit 38990a1449
Author: Saran Ahluwalia
Date:   2022-01-12 13:33:00 -05:00
8 changed files with 237 additions and 131 deletions

(changed file; filename not captured)

@@ -63,6 +63,13 @@ SCORE_DOWNLOADABLE_ZIP_FILE_PATH = (
# Column subsets
CENSUS_COUNTIES_COLUMNS = ["USPS", "GEOID", "NAME"]
# Percent prefixes for rounding
PERCENT_PREFIXES_SUFFIXES = [
"Percent",
"Percentage",
field_names.PERCENTILE_FIELD_SUFFIX,
]
TILES_ROUND_NUM_DECIMALS = 2
# Tiles data: full field name, tile index name
TILES_SCORE_COLUMNS = {
@@ -191,91 +198,88 @@ DOWNLOADABLE_SCORE_COLUMNS = [
field_names.GEOID_TRACT_FIELD,
field_names.COUNTY_FIELD,
field_names.STATE_FIELD,
field_names.THRESHOLD_COUNT,
field_names.SCORE_L_COMMUNITIES,
field_names.TOTAL_POP_FIELD,
field_names.FPL_200_SERIES,
field_names.POVERTY_LESS_THAN_200_FPL_FIELD,
field_names.POVERTY_LESS_THAN_200_FPL_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX,
field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD,
field_names.EXPECTED_AGRICULTURE_LOSS_RATE_LOW_INCOME_FIELD,
field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX,
field_names.EXPECTED_AGRICULTURE_LOSS_RATE_LOW_INCOME_FIELD,
field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD,
field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD,
field_names.EXPECTED_BUILDING_LOSS_RATE_LOW_INCOME_FIELD,
field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX,
field_names.EXPECTED_BUILDING_LOSS_RATE_LOW_INCOME_FIELD,
field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD,
field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD,
field_names.EXPECTED_POPULATION_LOSS_RATE_LOW_INCOME_FIELD,
field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX,
field_names.EXPECTED_POPULATION_LOSS_RATE_LOW_INCOME_FIELD,
field_names.ENERGY_BURDEN_FIELD,
field_names.ENERGY_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD,
field_names.ENERGY_BURDEN_LOW_INCOME_FIELD,
field_names.PM25_FIELD,
field_names.PM25_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.ENERGY_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.ENERGY_BURDEN_FIELD,
field_names.PM25_EXPOSURE_LOW_INCOME_FIELD,
field_names.DIESEL_FIELD,
field_names.DIESEL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.PM25_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.PM25_FIELD,
field_names.DIESEL_PARTICULATE_MATTER_LOW_INCOME_FIELD,
field_names.TRAFFIC_FIELD,
field_names.TRAFFIC_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.DIESEL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.DIESEL_FIELD,
field_names.TRAFFIC_PROXIMITY_LOW_INCOME_FIELD,
field_names.HOUSING_BURDEN_FIELD,
field_names.HOUSING_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.TRAFFIC_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.TRAFFIC_FIELD,
field_names.HOUSING_BURDEN_LOW_INCOME_FIELD,
field_names.LEAD_PAINT_FIELD,
field_names.LEAD_PAINT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.HOUSING_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.HOUSING_BURDEN_FIELD,
field_names.LEAD_PAINT_MEDIAN_HOUSE_VALUE_LOW_INCOME_FIELD,
field_names.MEDIAN_HOUSE_VALUE_FIELD,
field_names.LEAD_PAINT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.LEAD_PAINT_FIELD,
field_names.MEDIAN_HOUSE_VALUE_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.TSDF_FIELD,
field_names.TSDF_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.MEDIAN_HOUSE_VALUE_FIELD,
field_names.HAZARDOUS_WASTE_LOW_INCOME_FIELD,
field_names.NPL_FIELD,
field_names.NPL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.TSDF_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.TSDF_FIELD,
field_names.SUPERFUND_LOW_INCOME_FIELD,
field_names.RMP_FIELD,
field_names.RMP_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.NPL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.NPL_FIELD,
field_names.RMP_LOW_INCOME_FIELD,
field_names.WASTEWATER_FIELD,
field_names.WASTEWATER_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.RMP_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.RMP_FIELD,
field_names.WASTEWATER_DISCHARGE_LOW_INCOME_FIELD,
field_names.ASTHMA_FIELD,
field_names.ASTHMA_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.WASTEWATER_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.WASTEWATER_FIELD,
field_names.ASTHMA_LOW_INCOME_FIELD,
field_names.DIABETES_FIELD,
field_names.DIABETES_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.ASTHMA_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.ASTHMA_FIELD,
field_names.DIABETES_LOW_INCOME_FIELD,
field_names.HEART_DISEASE_FIELD,
field_names.HEART_DISEASE_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.DIABETES_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.DIABETES_FIELD,
field_names.HEART_DISEASE_LOW_INCOME_FIELD,
field_names.LIFE_EXPECTANCY_FIELD,
field_names.LOW_LIFE_EXPECTANCY_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.HEART_DISEASE_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.HEART_DISEASE_FIELD,
field_names.LOW_LIFE_EXPECTANCY_LOW_INCOME_FIELD,
field_names.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD,
field_names.LOW_LIFE_EXPECTANCY_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.LIFE_EXPECTANCY_FIELD,
field_names.LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD,
field_names.LOW_MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX,
field_names.LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD,
field_names.LINGUISTIC_ISO_FIELD,
field_names.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD,
field_names.LINGUISTIC_ISOLATION_LOW_HS_EDUCATION_FIELD,
field_names.UNEMPLOYMENT_FIELD,
field_names.UNEMPLOYMENT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.LINGUISTIC_ISO_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.POVERTY_LESS_THAN_100_FPL_FIELD,
field_names.LINGUISTIC_ISO_FIELD,
field_names.UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD,
field_names.UNEMPLOYMENT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.UNEMPLOYMENT_FIELD,
field_names.POVERTY_LOW_HS_EDUCATION_FIELD,
field_names.POVERTY_LESS_THAN_200_FPL_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX,
field_names.POVERTY_LESS_THAN_100_FPL_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX,
field_names.POVERTY_LOW_HS_EDUCATION_FIELD,
field_names.HIGH_SCHOOL_ED_FIELD,
field_names.POVERTY_LESS_THAN_200_FPL_FIELD,
field_names.POVERTY_LESS_THAN_100_FPL_FIELD,
field_names.HIGH_SCHOOL_ED_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.LOW_HS_EDUCATION_FIELD,
field_names.THRESHOLD_COUNT,
field_names.UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD,
field_names.HIGH_SCHOOL_ED_FIELD,
field_names.COMBINED_UNEMPLOYMENT_2010,
field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009,
field_names.COMBINED_POVERTY_LESS_THAN_100_FPL_FIELD_2010,
field_names.ISLAND_AREAS_UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD,
field_names.ISLAND_AREAS_POVERTY_LOW_HS_EDUCATION_FIELD,
field_names.ISLAND_AREAS_LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD,
field_names.ISLAND_AREAS_LOW_HS_EDUCATION_FIELD,
]
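
The new PERCENT_PREFIXES_SUFFIXES constant above is a set of substring markers: an export column whose name contains one of them is treated as a percent-style value when rounding. A minimal sketch of that matching, using hypothetical column names rather than the real field_names values:

    # Hypothetical markers and columns; the real values come from constants.py
    # and field_names and may differ.
    PERCENT_PREFIXES_SUFFIXES = ["Percent", "Percentage", " (percentile)"]

    columns = [
        "Energy burden (percentile)",
        "Total population",
        "Percent pre-1960s housing (lead paint indicator)",
    ]
    percent_columns = [
        col
        for col in columns
        if any(marker in col for marker in PERCENT_PREFIXES_SUFFIXES)
    ]
    # -> ["Energy burden (percentile)", "Percent pre-1960s housing (lead paint indicator)"]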

(changed file; filename not captured)

@@ -129,7 +129,7 @@ class PostScoreETL(ExtractTransformLoad):
new_df = initial_states_df.rename(
columns={
"fips": "State Code",
"state_name": "State Name",
"state_name": field_names.STATE_FIELD,
"state_abbreviation": "State Abbreviation",
}
)
@@ -206,7 +206,9 @@ class PostScoreETL(ExtractTransformLoad):
tiles_score_column_titles = list(constants.TILES_SCORE_COLUMNS.keys())
# filter the columns on full score
score_tiles = score_county_state_merged_df[tiles_score_column_titles]
score_tiles = score_county_state_merged_df[
tiles_score_column_titles
].copy()
score_tiles[constants.TILES_SCORE_FLOAT_COLUMNS] = score_tiles[
constants.TILES_SCORE_FLOAT_COLUMNS
@@ -238,9 +240,44 @@ class PostScoreETL(ExtractTransformLoad):
def _create_downloadable_data(
self, score_county_state_merged_df: pd.DataFrame
) -> pd.DataFrame:
return score_county_state_merged_df[
df = score_county_state_merged_df[
constants.DOWNLOADABLE_SCORE_COLUMNS
]
].copy()
float_columns = df.select_dtypes(include=["float64"]).columns
# convert percentile_columns
percent_target_columns = []
for x in float_columns:
for col in constants.PERCENT_PREFIXES_SUFFIXES:
if col in x:
percent_target_columns.append(x)
df[percent_target_columns] = df[percent_target_columns].apply(
func=lambda series: floor_series(
series=series * 100,
number_of_decimals=constants.TILES_ROUND_NUM_DECIMALS,
)
)
# # convert percentile_columns
# non_percentile_float_columns = [
# x
# for x in float_columns
# if x not in constants.PERCENT_PREFIXES_SUFFIXES
# ]
# df[non_percentile_float_columns] = df[
# non_percentile_float_columns
# ].apply(
# func=lambda series: floor_series(
# series=series,
# number_of_decimals=constants.TILES_ROUND_NUM_DECIMALS,
# ),
# axis=0,
# )
return df
def transform(self) -> None:
logger.info("Transforming data sources for Score + County CSVs")
@@ -297,7 +334,7 @@ class PostScoreETL(ExtractTransformLoad):
# Rename score column
downloadable_df_copy = downloadable_df.rename(
columns={
field_names.SCORE_L_COMMUNITIES: "Community of focus (v0.1)"
field_names.SCORE_L_COMMUNITIES: "Identified as disadvantaged (v0.1)"
},
inplace=False,
)
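
Two details in this file are easy to miss: the tiles subset is taken with .copy() so the in-place column assignments that follow operate on an independent frame rather than a view, and the new _create_downloadable_data body scales every percent-style float column to 0-100 and floors it to TILES_ROUND_NUM_DECIMALS places. A self-contained sketch of that rounding step, with a simplified stand-in for the pipeline's floor_series helper:

    import numpy as np
    import pandas as pd

    def floor_series(series: pd.Series, number_of_decimals: int) -> pd.Series:
        # Simplified stand-in: floor values to a fixed number of decimals.
        factor = 10 ** number_of_decimals
        return np.floor(series * factor) / factor

    markers = ["Percent", "Percentage", " (percentile)"]  # hypothetical values
    df = pd.DataFrame(
        {
            "Poverty (percentile)": [0.98765, 0.12349],
            "Total population": [1500.0, 2300.0],
        }
    )

    float_columns = df.select_dtypes(include=["float64"]).columns
    percent_columns = [c for c in float_columns if any(m in c for m in markers)]

    # Convert 0-1 percentiles to 0-100 and floor to two decimal places.
    df[percent_columns] = df[percent_columns].apply(
        lambda s: floor_series(series=s * 100, number_of_decimals=2)
    )
    # "Poverty (percentile)" -> 98.76 and 12.34; "Total population" is untouched.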

(changed file; filename not captured)

@@ -2,6 +2,7 @@ import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.utils import get_module_logger, download_file_from_url
from data_pipeline.score import field_names
logger = get_module_logger(__name__)
@@ -49,6 +50,20 @@ class CDCPlacesETL(ExtractTransformLoad):
values=self.CDC_VALUE_FIELD_NAME,
)
# rename columns to be used in score
rename_fields = {
"Current asthma among adults aged >=18 years": field_names.ASTHMA_FIELD,
"Coronary heart disease among adults aged >=18 years": field_names.HEART_DISEASE_FIELD,
"Cancer (excluding skin cancer) among adults aged >=18 years": field_names.CANCER_FIELD,
"Diagnosed diabetes among adults aged >=18 years": field_names.DIABETES_FIELD,
"Physical health not good for >=14 days among adults aged >=18 years": field_names.PHYS_HEALTH_NOT_GOOD_FIELD,
}
self.df.rename(
columns=rename_fields,
inplace=True,
errors="raise",
)
# Make the index (the census tract ID) a column, not the index.
self.df.reset_index(inplace=True)
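
The rename above passes errors="raise", so if CDC ever changes one of these measure labels the ETL fails immediately instead of silently carrying the unrenamed column forward. A short illustration of that behavior, with a made-up target name:

    import pandas as pd

    df = pd.DataFrame({"Current asthma among adults aged >=18 years": [9.1]})

    # Key matches an existing column label: rename succeeds.
    df = df.rename(
        columns={"Current asthma among adults aged >=18 years": "Asthma among adults"},
        errors="raise",
    )

    # Key has a typo: with errors="raise" pandas throws KeyError instead of
    # leaving the column untouched.
    # df.rename(columns={"Curent asthma among adults": "Asthma"}, errors="raise")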

(changed file; filename not captured)

@@ -5,6 +5,7 @@ from data_pipeline.etl.sources.census_acs.etl_utils import (
retrieve_census_acs_data,
)
from data_pipeline.utils import get_module_logger
from data_pipeline.score import field_names
logger = get_module_logger(__name__)
@@ -22,7 +23,7 @@ class CensusACSETL(ExtractTransformLoad):
self.TOTAL_UNEMPLOYED_FIELD,
self.TOTAL_IN_LABOR_FORCE,
]
self.UNEMPLOYED_FIELD_NAME = "Unemployed civilians (percent)"
self.UNEMPLOYED_FIELD_NAME = "Unemployment (percent)"
self.LINGUISTIC_ISOLATION_FIELD_NAME = "Linguistic isolation (percent)"
self.LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME = (
@@ -353,18 +354,29 @@ class CensusACSETL(ExtractTransformLoad):
+ df[self.COLLEGE_ATTENDANCE_FEMALE_ENROLLED_PRIVATE]
) / df[self.COLLEGE_ATTENDANCE_TOTAL_POPULATION_ASKED]
# strip columns
df = df[self.COLUMNS_TO_KEEP]
# Save results to self.
self.df = df
# rename columns to be used in score
rename_fields = {
"Percent of individuals < 200% Federal Poverty Line": field_names.POVERTY_LESS_THAN_200_FPL_FIELD,
}
self.df.rename(
columns=rename_fields,
inplace=True,
errors="raise",
)
def load(self) -> None:
logger.info("Saving Census ACS Data")
# mkdir census
self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
self.df[self.COLUMNS_TO_KEEP].to_csv(
path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False
)
self.df.to_csv(path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False)
def validate(self) -> None:
logger.info("Validating Census ACS Data")

(changed file; filename not captured)

@@ -5,6 +5,7 @@ from data_pipeline.etl.sources.census_acs.etl_utils import (
retrieve_census_acs_data,
)
from data_pipeline.utils import get_module_logger
from data_pipeline.score import field_names
logger = get_module_logger(__name__)
@@ -73,7 +74,7 @@ class CensusACS2010ETL(ExtractTransformLoad):
self.EMPLOYMENT_COLLEGE_IN_LABOR_FORCE,
]
self.UNEMPLOYED_FIELD_NAME = "Unemployed civilians (percent)"
self.UNEMPLOYED_FIELD_NAME = "Unemployment (percent)"
self.POVERTY_FIELDS = [
"C17002_001E", # Estimate!!Total,
@@ -149,15 +150,6 @@ class CensusACS2010ETL(ExtractTransformLoad):
+ df["C17002_007E"]
) / df["C17002_001E"]
# Save results to self.
self.df = df
def load(self) -> None:
logger.info("Saving Census ACS Data")
# mkdir census
self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
columns_to_include = [
self.GEOID_TRACT_FIELD_NAME,
self.UNEMPLOYED_FIELD_NAME,
@@ -166,7 +158,7 @@ class CensusACS2010ETL(ExtractTransformLoad):
self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME,
]
output_df = self.df[columns_to_include]
output_df = df[columns_to_include]
# Add the year to the end of every column, so when it's all joined in the
# score df, it's obvious which year this data is from.
@@ -178,7 +170,26 @@ class CensusACS2010ETL(ExtractTransformLoad):
}
)
output_df.to_csv(path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False)
# rename columns to be used in score
rename_fields = {
"Percent of individuals < 100% Federal Poverty Line in 2010": field_names.CENSUS_POVERTY_LESS_THAN_100_FPL_FIELD_2010,
}
output_df.rename(
columns=rename_fields,
inplace=True,
errors="raise",
)
# Save results to self.
self.df = output_df
def load(self) -> None:
logger.info("Saving Census ACS Data")
# mkdir census
self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
self.df.to_csv(path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False)
def validate(self) -> None:
logger.info("Validating Census ACS Data")

(changed file; filename not captured)

@@ -6,6 +6,7 @@ import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.utils import get_module_logger
from data_pipeline.score import field_names
pd.options.mode.chained_assignment = "raise"
@@ -141,7 +142,9 @@ class CensusDecennialETL(ExtractTransformLoad):
"PBG036014" # Total!!Female!!In labor force!!Civilian!!Unemployed
)
self.UNEMPLOYMENT_FIELD_NAME = "Unemployed civilians (percent) in 2009"
self.UNEMPLOYMENT_FIELD_NAME = (
field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009
)
var_list = [
self.MEDIAN_INCOME_FIELD,
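
As in the other ETLs in this commit, the hard-coded label is swapped for a constant from data_pipeline.score.field_names, so the source ETL, the score step, and the export column lists all agree on one spelling. A minimal before/after sketch of the pattern (the constant's value is defined in field_names, not here):

    # Before: the ETL spelled the label out by hand, so a typo in one place
    # risked producing a column the score step could not find.
    UNEMPLOYMENT_FIELD_NAME = "Unemployed civilians (percent) in 2009"

    # After: one shared constant is the single source of truth for the name.
    from data_pipeline.score import field_names

    UNEMPLOYMENT_FIELD_NAME = field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009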