diff --git a/data/data-pipeline/data_pipeline/etl/sources/cdc_places/etl.py b/data/data-pipeline/data_pipeline/etl/sources/cdc_places/etl.py
index ad58a23d..9527b242 100644
--- a/data/data-pipeline/data_pipeline/etl/sources/cdc_places/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/cdc_places/etl.py
@@ -52,11 +52,11 @@ class CDCPlacesETL(ExtractTransformLoad):
 
         # rename columns to be used in score
         rename_fields = {
-            "Current asthma among adults aged >=18 years": field_names.ASTHMA_FIELD,  # 'Current asthma among adults aged greater than or equal to 18 years'
-            "Coronary heart disease among adults aged >=18 years": field_names.HEART_DISEASE_FIELD,  # "Coronary heart disease among adults aged greater than or equal to 18 years"
-            "Cancer (excluding skin cancer) among adults aged >=18 years": field_names.CANCER_FIELD,  # 'Cancer (excluding skin cancer) among adults aged greater than or equal to 18 years',
-            "Diagnosed diabetes among adults aged >=18 years": field_names.DIABETES_FIELD,  # 'Diagnosed diabetes among adults aged greater than or equal to 18 years',
-            "Physical health not good for >=14 days among adults aged >=18 years": field_names.PHYS_HEALTH_NOT_GOOD_FIELD,  # 'Physical health not good for greater than or equal to 14 days among adults aged greater than or equal to 18 years',
+            "Current asthma among adults aged >=18 years": field_names.ASTHMA_FIELD,
+            "Coronary heart disease among adults aged >=18 years": field_names.HEART_DISEASE_FIELD,
+            "Cancer (excluding skin cancer) among adults aged >=18 years": field_names.CANCER_FIELD,
+            "Diagnosed diabetes among adults aged >=18 years": field_names.DIABETES_FIELD,
+            "Physical health not good for >=14 days among adults aged >=18 years": field_names.PHYS_HEALTH_NOT_GOOD_FIELD,
         }
         self.df.rename(
             columns=rename_fields,
diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py
index 51097cbe..af6b3c48 100644
--- a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py
@@ -5,6 +5,7 @@ from data_pipeline.etl.sources.census_acs.etl_utils import (
     retrieve_census_acs_data,
 )
 from data_pipeline.utils import get_module_logger
+from data_pipeline.score import field_names
 
 logger = get_module_logger(__name__)
 
@@ -353,18 +354,29 @@ class CensusACSETL(ExtractTransformLoad):
             + df[self.COLLEGE_ATTENDANCE_FEMALE_ENROLLED_PRIVATE]
         ) / df[self.COLLEGE_ATTENDANCE_TOTAL_POPULATION_ASKED]
 
+        # strip columns
+        df = df[self.COLUMNS_TO_KEEP]
+
         # Save results to self.
         self.df = df
 
+        # rename columns to be used in score
+        rename_fields = {
+            "Percent of individuals < 200% Federal Poverty Line": field_names.POVERTY_LESS_THAN_200_FPL_FIELD,
+        }
+        self.df.rename(
+            columns=rename_fields,
+            inplace=True,
+            errors="raise",
+        )
+
     def load(self) -> None:
         logger.info("Saving Census ACS Data")
 
         # mkdir census
         self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
 
-        self.df[self.COLUMNS_TO_KEEP].to_csv(
-            path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False
-        )
+        self.df.to_csv(path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False)
 
     def validate(self) -> None:
         logger.info("Validating Census ACS Data")
diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_acs_2010/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census_acs_2010/etl.py
index 05d823a6..ebc98121 100644
--- a/data/data-pipeline/data_pipeline/etl/sources/census_acs_2010/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/census_acs_2010/etl.py
@@ -5,6 +5,7 @@ from data_pipeline.etl.sources.census_acs.etl_utils import (
     retrieve_census_acs_data,
 )
 from data_pipeline.utils import get_module_logger
+from data_pipeline.score import field_names
 
 logger = get_module_logger(__name__)
 
@@ -149,15 +150,6 @@ class CensusACS2010ETL(ExtractTransformLoad):
             + df["C17002_007E"]
         ) / df["C17002_001E"]
 
-        # Save results to self.
-        self.df = df
-
-    def load(self) -> None:
-        logger.info("Saving Census ACS Data")
-
-        # mkdir census
-        self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
-
         columns_to_include = [
             self.GEOID_TRACT_FIELD_NAME,
             self.UNEMPLOYED_FIELD_NAME,
@@ -166,7 +158,7 @@ class CensusACS2010ETL(ExtractTransformLoad):
             self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME,
         ]
 
-        output_df = self.df[columns_to_include]
+        output_df = df[columns_to_include]
 
         # Add the year to the end of every column, so when it's all joined in the
         # score df, it's obvious which year this data is from.
@@ -178,7 +170,26 @@ class CensusACS2010ETL(ExtractTransformLoad):
             }
         )
 
-        output_df.to_csv(path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False)
+        # rename columns to be used in score
+        rename_fields = {
+            "Percent of individuals < 100% Federal Poverty Line in 2010": field_names.CENSUS_POVERTY_LESS_THAN_100_FPL_FIELD_2010,
+        }
+        output_df.rename(
+            columns=rename_fields,
+            inplace=True,
+            errors="raise",
+        )
+
+        # Save results to self.
+        self.df = output_df
+
+    def load(self) -> None:
+        logger.info("Saving Census ACS Data")
+
+        # mkdir census
+        self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
+
+        self.df.to_csv(path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False)
 
     def validate(self) -> None:
         logger.info("Validating Census ACS Data")
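Note: the hunks above all converge on the same pattern: trim the frame to the columns to keep in transform(), rename the source columns to their canonical constants from data_pipeline.score.field_names with errors="raise", store the result on self.df, and have load() only serialize. A minimal standalone sketch of that pattern follows; the constant name, column labels, and output path below are illustrative stand-ins, not the pipeline's real field_names values.

    # Sketch of the rename-then-save pattern used by these ETL classes.
    # The constant and labels here are illustrative stand-ins, not the real
    # values defined in data_pipeline.score.field_names.
    import pandas as pd

    POVERTY_LESS_THAN_200_FPL_FIELD = (
        "Percent of individuals below 200% Federal Poverty Line (canonical)"
    )

    df = pd.DataFrame(
        {
            "GEOID10_TRACT": ["01001020100"],
            "Percent of individuals < 200% Federal Poverty Line": [0.42],
        }
    )

    # errors="raise" makes pandas raise a KeyError if the source column is
    # missing, so an upstream schema change fails here instead of silently
    # writing a CSV without the renamed field.
    df.rename(
        columns={
            "Percent of individuals < 200% Federal Poverty Line": POVERTY_LESS_THAN_200_FPL_FIELD,
        },
        inplace=True,
        errors="raise",
    )

    # load() then only writes the already-trimmed, already-renamed frame.
    df.to_csv("usa.csv", index=False)

Doing the column selection and renaming in transform() keeps load() as a pure write step, which is why the census_acs change drops the self.df[self.COLUMNS_TO_KEEP] slice from load().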