From 4ae7eff4c4366f20e0d6e6fc83447686bcc8b420 Mon Sep 17 00:00:00 2001 From: lucasmbrown-usds Date: Mon, 9 Aug 2021 20:47:51 -0500 Subject: [PATCH] adding median income field and running black --- .../data_pipeline/etl/score/etl_score.py | 15 +++++++++++---- .../data_pipeline/etl/score/etl_score_geo.py | 7 +++++-- .../data_pipeline/etl/sources/census_acs/etl.py | 7 +++++++ .../data_pipeline/etl/sources/ejscreen/etl.py | 3 ++- .../data_pipeline/etl/sources/hud_housing/etl.py | 8 ++++++-- .../etl/sources/tree_equity_score/etl.py | 3 ++- 6 files changed, 33 insertions(+), 10 deletions(-) diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score.py b/data/data-pipeline/data_pipeline/etl/score/etl_score.py index a43c0579..98e61260 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py @@ -59,7 +59,9 @@ class ScoreETL(ExtractTransformLoad): # Load census data census_csv = self.DATA_PATH / "dataset" / "census_acs_2019" / "usa.csv" self.census_df = pd.read_csv( - census_csv, dtype={self.GEOID_FIELD_NAME: "string"}, low_memory=False, + census_csv, + dtype={self.GEOID_FIELD_NAME: "string"}, + low_memory=False, ) # Load housing and transportation data @@ -121,7 +123,8 @@ class ScoreETL(ExtractTransformLoad): # Define a named tuple that will be used for each data set input. DataSet = collections.namedtuple( - typename="DataSet", field_names=["input_field", "renamed_field", "bucket"], + typename="DataSet", + field_names=["input_field", "renamed_field", "bucket"], ) data_sets = [ @@ -138,7 +141,9 @@ class ScoreETL(ExtractTransformLoad): bucket=None, ), DataSet( - input_field="ACSTOTPOP", renamed_field="Total population", bucket=None, + input_field="ACSTOTPOP", + renamed_field="Total population", + bucket=None, ), # The following data sets have buckets, because they're used in the score DataSet( @@ -244,7 +249,9 @@ class ScoreETL(ExtractTransformLoad): } self.df.rename( - columns=renaming_dict, inplace=True, errors="raise", + columns=renaming_dict, + inplace=True, + errors="raise", ) columns_to_keep = [data_set.renamed_field for data_set in data_sets] diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py b/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py index f94b5d97..e305a2c1 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py @@ -46,7 +46,9 @@ class GeoScoreETL(ExtractTransformLoad): logger.info("Reading score CSV") self.score_usa_df = pd.read_csv( - self.TILE_SCORE_CSV, dtype={"GEOID10": "string"}, low_memory=False, + self.TILE_SCORE_CSV, + dtype={"GEOID10": "string"}, + low_memory=False, ) def transform(self) -> None: @@ -68,7 +70,8 @@ class GeoScoreETL(ExtractTransformLoad): ].reset_index(drop=True) usa_simplified.rename( - columns={self.TARGET_SCORE_NAME: self.TARGET_SCORE_RENAME_TO}, inplace=True, + columns={self.TARGET_SCORE_NAME: self.TARGET_SCORE_RENAME_TO}, + inplace=True, ) logger.info("Aggregating into tracts (~5 minutes)") diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py index dd7e37f9..3bee9cb7 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py @@ -22,6 +22,8 @@ class CensusACSETL(ExtractTransformLoad): "C16002_010E", "C16002_013E", ] + self.MEDIAN_INCOME_FIELD = "B19013_001E" + self.MEDIAN_INCOME_FIELD_NAME = "Median household income in the past 12 months" self.df: pd.DataFrame def _fips_from_censusdata_censusgeo(self, censusgeo: censusdata.censusgeo) -> str: @@ -45,6 +47,7 @@ class CensusACSETL(ExtractTransformLoad): # Emploment fields "B23025_005E", "B23025_003E", + self.MEDIAN_INCOME_FIELD, ] + self.LINGUISTIC_ISOLATION_FIELDS, ) @@ -59,6 +62,9 @@ class CensusACSETL(ExtractTransformLoad): def transform(self) -> None: logger.info("Starting Census ACS Transform") + # Rename median income + self.df[self.MEDIAN_INCOME_FIELD_NAME] = self.df[self.MEDIAN_INCOME_FIELD] + # Calculate percent unemployment. # TODO: remove small-sample data that should be `None` instead of a high-variance fraction. self.df[self.UNEMPLOYED_FIELD_NAME] = self.df.B23025_005E / self.df.B23025_003E @@ -91,6 +97,7 @@ class CensusACSETL(ExtractTransformLoad): self.GEOID_FIELD_NAME, self.UNEMPLOYED_FIELD_NAME, self.LINGUISTIC_ISOLATION_FIELD_NAME, + self.MEDIAN_INCOME_FIELD_NAME, ] self.df[columns_to_include].to_csv( diff --git a/data/data-pipeline/data_pipeline/etl/sources/ejscreen/etl.py b/data/data-pipeline/data_pipeline/etl/sources/ejscreen/etl.py index ba5d5af2..fd69f3ec 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/ejscreen/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/ejscreen/etl.py @@ -18,7 +18,8 @@ class EJScreenETL(ExtractTransformLoad): def extract(self) -> None: logger.info("Downloading EJScreen Data") super().extract( - self.EJSCREEN_FTP_URL, self.TMP_PATH, + self.EJSCREEN_FTP_URL, + self.TMP_PATH, ) def transform(self) -> None: diff --git a/data/data-pipeline/data_pipeline/etl/sources/hud_housing/etl.py b/data/data-pipeline/data_pipeline/etl/sources/hud_housing/etl.py index aafda83c..3cc2f2af 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/hud_housing/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/hud_housing/etl.py @@ -34,7 +34,8 @@ class HudHousingETL(ExtractTransformLoad): def extract(self) -> None: logger.info("Extracting HUD Housing Data") super().extract( - self.HOUSING_FTP_URL, self.HOUSING_ZIP_FILE_DIR, + self.HOUSING_FTP_URL, + self.HOUSING_ZIP_FILE_DIR, ) def transform(self) -> None: @@ -48,7 +49,10 @@ class HudHousingETL(ExtractTransformLoad): / "140" / "Table8.csv" ) - self.df = pd.read_csv(filepath_or_buffer=tmp_csv_file_path, encoding="latin-1",) + self.df = pd.read_csv( + filepath_or_buffer=tmp_csv_file_path, + encoding="latin-1", + ) # Rename and reformat block group ID self.df.rename(columns={"geoid": self.GEOID_TRACT_FIELD_NAME}, inplace=True) diff --git a/data/data-pipeline/data_pipeline/etl/sources/tree_equity_score/etl.py b/data/data-pipeline/data_pipeline/etl/sources/tree_equity_score/etl.py index 52b6d90c..a4351bf8 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/tree_equity_score/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/tree_equity_score/etl.py @@ -70,7 +70,8 @@ class TreeEquityScoreETL(ExtractTransformLoad): logger.info("Downloading Tree Equity Score Data") for state in self.states: super().extract( - f"{self.TES_URL}{state}.zip.zip", f"{self.TMP_PATH}/{state}", + f"{self.TES_URL}{state}.zip.zip", + f"{self.TMP_PATH}/{state}", ) def transform(self) -> None: