From 4ae7eff4c4366f20e0d6e6fc83447686bcc8b420 Mon Sep 17 00:00:00 2001
From: lucasmbrown-usds <lucas.m.brown@omb.eop.gov>
Date: Mon, 9 Aug 2021 20:47:51 -0500
Subject: [PATCH] Add median household income field to Census ACS ETL and run black

---
 .../data_pipeline/etl/score/etl_score.py          | 15 +++++++++++----
 .../data_pipeline/etl/score/etl_score_geo.py      |  7 +++++--
 .../data_pipeline/etl/sources/census_acs/etl.py   |  7 +++++++
 .../data_pipeline/etl/sources/ejscreen/etl.py     |  3 ++-
 .../data_pipeline/etl/sources/hud_housing/etl.py  |  8 ++++++--
 .../etl/sources/tree_equity_score/etl.py          |  3 ++-
 6 files changed, 33 insertions(+), 10 deletions(-)

diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score.py b/data/data-pipeline/data_pipeline/etl/score/etl_score.py
index a43c0579..98e61260 100644
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py
@@ -59,7 +59,9 @@ class ScoreETL(ExtractTransformLoad):
         # Load census data
         census_csv = self.DATA_PATH / "dataset" / "census_acs_2019" / "usa.csv"
         self.census_df = pd.read_csv(
-            census_csv, dtype={self.GEOID_FIELD_NAME: "string"}, low_memory=False,
+            census_csv,
+            dtype={self.GEOID_FIELD_NAME: "string"},
+            low_memory=False,
         )
 
         # Load housing and transportation data
@@ -121,7 +123,8 @@ class ScoreETL(ExtractTransformLoad):
 
         # Define a named tuple that will be used for each data set input.
         DataSet = collections.namedtuple(
-            typename="DataSet", field_names=["input_field", "renamed_field", "bucket"],
+            typename="DataSet",
+            field_names=["input_field", "renamed_field", "bucket"],
         )
 
         data_sets = [
@@ -138,7 +141,9 @@ class ScoreETL(ExtractTransformLoad):
                 bucket=None,
             ),
             DataSet(
-                input_field="ACSTOTPOP", renamed_field="Total population", bucket=None,
+                input_field="ACSTOTPOP",
+                renamed_field="Total population",
+                bucket=None,
             ),
             # The following data sets have buckets, because they're used in the score
             DataSet(
@@ -244,7 +249,9 @@ class ScoreETL(ExtractTransformLoad):
         }
 
         self.df.rename(
-            columns=renaming_dict, inplace=True, errors="raise",
+            columns=renaming_dict,
+            inplace=True,
+            errors="raise",
         )
 
         columns_to_keep = [data_set.renamed_field for data_set in data_sets]
diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py b/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py
index f94b5d97..e305a2c1 100644
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py
@@ -46,7 +46,9 @@ class GeoScoreETL(ExtractTransformLoad):
 
         logger.info("Reading score CSV")
         self.score_usa_df = pd.read_csv(
-            self.TILE_SCORE_CSV, dtype={"GEOID10": "string"}, low_memory=False,
+            self.TILE_SCORE_CSV,
+            dtype={"GEOID10": "string"},
+            low_memory=False,
         )
 
     def transform(self) -> None:
@@ -68,7 +70,8 @@ class GeoScoreETL(ExtractTransformLoad):
         ].reset_index(drop=True)
 
         usa_simplified.rename(
-            columns={self.TARGET_SCORE_NAME: self.TARGET_SCORE_RENAME_TO}, inplace=True,
+            columns={self.TARGET_SCORE_NAME: self.TARGET_SCORE_RENAME_TO},
+            inplace=True,
         )
 
         logger.info("Aggregating into tracts (~5 minutes)")
diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py
index dd7e37f9..3bee9cb7 100644
--- a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py
@@ -22,6 +22,8 @@ class CensusACSETL(ExtractTransformLoad):
             "C16002_010E",
             "C16002_013E",
         ]
+        self.MEDIAN_INCOME_FIELD = "B19013_001E"
+        self.MEDIAN_INCOME_FIELD_NAME = "Median household income in the past 12 months"
         self.df: pd.DataFrame
 
     def _fips_from_censusdata_censusgeo(self, censusgeo: censusdata.censusgeo) -> str:
@@ -45,6 +47,7 @@ class CensusACSETL(ExtractTransformLoad):
                         # Emploment fields
                         "B23025_005E",
                         "B23025_003E",
+                        self.MEDIAN_INCOME_FIELD,
                     ]
                     + self.LINGUISTIC_ISOLATION_FIELDS,
                 )
@@ -59,6 +62,9 @@ class CensusACSETL(ExtractTransformLoad):
     def transform(self) -> None:
         logger.info("Starting Census ACS Transform")
 
+        # Copy median income into a descriptively named column
+        self.df[self.MEDIAN_INCOME_FIELD_NAME] = self.df[self.MEDIAN_INCOME_FIELD]
+
         # Calculate percent unemployment.
         # TODO: remove small-sample data that should be `None` instead of a high-variance fraction.
         self.df[self.UNEMPLOYED_FIELD_NAME] = self.df.B23025_005E / self.df.B23025_003E
@@ -91,6 +97,7 @@ class CensusACSETL(ExtractTransformLoad):
             self.GEOID_FIELD_NAME,
             self.UNEMPLOYED_FIELD_NAME,
             self.LINGUISTIC_ISOLATION_FIELD_NAME,
+            self.MEDIAN_INCOME_FIELD_NAME,
         ]
 
         self.df[columns_to_include].to_csv(
diff --git a/data/data-pipeline/data_pipeline/etl/sources/ejscreen/etl.py b/data/data-pipeline/data_pipeline/etl/sources/ejscreen/etl.py
index ba5d5af2..fd69f3ec 100644
--- a/data/data-pipeline/data_pipeline/etl/sources/ejscreen/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/ejscreen/etl.py
@@ -18,7 +18,8 @@ class EJScreenETL(ExtractTransformLoad):
     def extract(self) -> None:
         logger.info("Downloading EJScreen Data")
         super().extract(
-            self.EJSCREEN_FTP_URL, self.TMP_PATH,
+            self.EJSCREEN_FTP_URL,
+            self.TMP_PATH,
         )
 
     def transform(self) -> None:
diff --git a/data/data-pipeline/data_pipeline/etl/sources/hud_housing/etl.py b/data/data-pipeline/data_pipeline/etl/sources/hud_housing/etl.py
index aafda83c..3cc2f2af 100644
--- a/data/data-pipeline/data_pipeline/etl/sources/hud_housing/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/hud_housing/etl.py
@@ -34,7 +34,8 @@ class HudHousingETL(ExtractTransformLoad):
     def extract(self) -> None:
         logger.info("Extracting HUD Housing Data")
         super().extract(
-            self.HOUSING_FTP_URL, self.HOUSING_ZIP_FILE_DIR,
+            self.HOUSING_FTP_URL,
+            self.HOUSING_ZIP_FILE_DIR,
         )
 
     def transform(self) -> None:
@@ -48,7 +49,10 @@ class HudHousingETL(ExtractTransformLoad):
             / "140"
             / "Table8.csv"
         )
-        self.df = pd.read_csv(filepath_or_buffer=tmp_csv_file_path, encoding="latin-1",)
+        self.df = pd.read_csv(
+            filepath_or_buffer=tmp_csv_file_path,
+            encoding="latin-1",
+        )
 
         # Rename and reformat block group ID
         self.df.rename(columns={"geoid": self.GEOID_TRACT_FIELD_NAME}, inplace=True)
diff --git a/data/data-pipeline/data_pipeline/etl/sources/tree_equity_score/etl.py b/data/data-pipeline/data_pipeline/etl/sources/tree_equity_score/etl.py
index 52b6d90c..a4351bf8 100644
--- a/data/data-pipeline/data_pipeline/etl/sources/tree_equity_score/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/tree_equity_score/etl.py
@@ -70,7 +70,8 @@ class TreeEquityScoreETL(ExtractTransformLoad):
         logger.info("Downloading Tree Equity Score Data")
         for state in self.states:
             super().extract(
-                f"{self.TES_URL}{state}.zip.zip", f"{self.TMP_PATH}/{state}",
+                f"{self.TES_URL}{state}.zip.zip",
+                f"{self.TMP_PATH}/{state}",
             )
 
     def transform(self) -> None: