Mirror of https://github.com/DOI-DO/j40-cejst-2.git (synced 2025-02-23 10:04:18 -08:00)
Running Black
commit 508925618b (parent dd8c37e06d)
11 changed files with 30 additions and 87 deletions
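Every hunk below follows the same pattern: calls and literals that had been exploded one argument per line are joined onto a single line wherever they fit, with their trailing commas left in place. A command along these lines would reproduce the run; the version pin and working directory are assumptions, since the commit records neither (Black 20.8b0 and later treat a trailing comma as "magic" and would keep these calls exploded, so this run likely used an earlier release):

    pip install "black<20.8b0"   # assumed: a pre-magic-trailing-comma release
    black .                      # assumed to run from the pipeline package root; rewrites files in place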
@@ -22,9 +22,7 @@ def cli():
     pass


-@cli.command(
-    help="Clean up all census data folders",
-)
+@cli.command(help="Clean up all census data folders",)
 def census_cleanup():
     """CLI command to clean up the census data folder"""

@@ -37,9 +35,7 @@ def census_cleanup():
     logger.info("Cleaned up all census data files")


-@cli.command(
-    help="Clean up all data folders",
-)
+@cli.command(help="Clean up all data folders",)
 def data_cleanup():
     """CLI command to clean up the all the data folders"""

@@ -50,9 +46,7 @@ def data_cleanup():
     logger.info("Cleaned up all data folders")


-@cli.command(
-    help="Census data download",
-)
+@cli.command(help="Census data download",)
 def census_data_download():
     """CLI command to download all census shape files from the Census FTP and extract the geojson
     to generate national and by state Census Block Group CSVs"""
@@ -64,9 +58,7 @@ def census_data_download():
     logger.info("Completed downloading census data")


-@cli.command(
-    help="Run all ETL processes or a specific one",
-)
+@cli.command(help="Run all ETL processes or a specific one",)
 @click.option("-d", "--dataset", required=False, type=str)
 def etl_run(dataset: str):
     """Run a specific or all ETL processes
@@ -81,27 +73,21 @@ def etl_run(dataset: str):
     etl_runner(dataset)


-@cli.command(
-    help="Generate Score",
-)
+@cli.command(help="Generate Score",)
 def score_run():
     """CLI command to generate the score"""

     score_generate()


-@cli.command(
-    help="Generate Geojson files with scores baked in",
-)
+@cli.command(help="Generate Geojson files with scores baked in",)
 def geo_score():
     """CLI command to generate the score"""

     score_geo()


-@cli.command(
-    help="Generate map tiles",
-)
+@cli.command(help="Generate map tiles",)
 def generate_map_tiles():
     """CLI command to generate the map tiles"""

@@ -27,11 +27,7 @@ def etl_runner(dataset_to_run: str = None) -> None:
             "module_dir": "census_acs",
             "class_name": "CensusACSETL",
         },
-        {
-            "name": "ejscreen",
-            "module_dir": "ejscreen",
-            "class_name": "EJScreenETL",
-        },
+        {"name": "ejscreen", "module_dir": "ejscreen", "class_name": "EJScreenETL",},
         {
             "name": "housing_and_transportation",
             "module_dir": "housing_and_transportation",
@@ -47,17 +43,12 @@ def etl_runner(dataset_to_run: str = None) -> None:
             "module_dir": "calenviroscreen",
             "class_name": "CalEnviroScreenETL",
         },
-        {
-            "name": "hud_recap",
-            "module_dir": "hud_recap",
-            "class_name": "HudRecapETL",
-        },
+        {"name": "hud_recap", "module_dir": "hud_recap", "class_name": "HudRecapETL",},
     ]

     if dataset_to_run:
         dataset_element = next(
-            (item for item in dataset_list if item["name"] == dataset_to_run),
-            None,
+            (item for item in dataset_list if item["name"] == dataset_to_run), None,
         )
         if not dataset_list:
             raise ValueError("Invalid dataset name")
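A side effect of this hunk is that the guard after the `next(...)` lookup is easier to spot as a likely bug: it tests `dataset_list`, the full hard-coded list, which is never empty, so the `ValueError` can never fire. A miss is signalled by `dataset_element` being `None`. A minimal sketch of the presumably intended guard (not part of this commit):

    dataset_element = next(
        (item for item in dataset_list if item["name"] == dataset_to_run), None,
    )
    if not dataset_element:  # assumed intent: reject unknown dataset names
        raise ValueError("Invalid dataset name")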
@@ -59,9 +59,7 @@ class ScoreETL(ExtractTransformLoad):
         # Load census data
         census_csv = self.DATA_PATH / "dataset" / "census_acs_2019" / "usa.csv"
         self.census_df = pd.read_csv(
-            census_csv,
-            dtype={self.GEOID_FIELD_NAME: "string"},
-            low_memory=False,
+            census_csv, dtype={self.GEOID_FIELD_NAME: "string"}, low_memory=False,
         )

         # Load housing and transportation data
@@ -123,8 +121,7 @@ class ScoreETL(ExtractTransformLoad):

         # Define a named tuple that will be used for each data set input.
         DataSet = collections.namedtuple(
-            typename="DataSet",
-            field_names=["input_field", "renamed_field", "bucket"],
+            typename="DataSet", field_names=["input_field", "renamed_field", "bucket"],
         )

         data_sets = [
@@ -141,9 +138,7 @@ class ScoreETL(ExtractTransformLoad):
                 bucket=None,
             ),
             DataSet(
-                input_field="ACSTOTPOP",
-                renamed_field="Total population",
-                bucket=None,
+                input_field="ACSTOTPOP", renamed_field="Total population", bucket=None,
             ),
             # The following data sets have buckets, because they're used in the score
             DataSet(
@@ -249,9 +244,7 @@ class ScoreETL(ExtractTransformLoad):
         }

         self.df.rename(
-            columns=renaming_dict,
-            inplace=True,
-            errors="raise",
+            columns=renaming_dict, inplace=True, errors="raise",
         )

         columns_to_keep = [data_set.renamed_field for data_set in data_sets]
@@ -46,9 +46,7 @@ class GeoScoreETL(ExtractTransformLoad):

         logger.info("Reading score CSV")
         self.score_usa_df = pd.read_csv(
-            self.TILE_SCORE_CSV,
-            dtype={"GEOID10": "string"},
-            low_memory=False,
+            self.TILE_SCORE_CSV, dtype={"GEOID10": "string"}, low_memory=False,
         )

     def transform(self) -> None:
@@ -70,8 +68,7 @@ class GeoScoreETL(ExtractTransformLoad):
         ].reset_index(drop=True)

         usa_simplified.rename(
-            columns={self.TARGET_SCORE_NAME: self.TARGET_SCORE_RENAME_TO},
-            inplace=True,
+            columns={self.TARGET_SCORE_NAME: self.TARGET_SCORE_RENAME_TO}, inplace=True,
         )

         logger.info("Aggregating into tracts (~5 minutes)")
@@ -43,8 +43,7 @@ class PostScoreETL(ExtractTransformLoad):

     def extract(self) -> None:
         super().extract(
-            self.CENSUS_COUNTIES_ZIP_URL,
-            self.TMP_PATH,
+            self.CENSUS_COUNTIES_ZIP_URL, self.TMP_PATH,
         )

         logger.info("Reading Counties CSV")
@@ -68,8 +67,7 @@ class PostScoreETL(ExtractTransformLoad):
         # rename some of the columns to prepare for merge
         self.counties_df = self.counties_df[["USPS", "GEOID", "NAME"]]
         self.counties_df.rename(
-            columns={"USPS": "State Abbreviation", "NAME": "County Name"},
-            inplace=True,
+            columns={"USPS": "State Abbreviation", "NAME": "County Name"}, inplace=True,
         )

         # remove unnecessary columns
@@ -28,8 +28,7 @@ class CalEnviroScreenETL(ExtractTransformLoad):
     def extract(self) -> None:
         logger.info("Downloading CalEnviroScreen Data")
         super().extract(
-            self.CALENVIROSCREEN_FTP_URL,
-            self.TMP_PATH,
+            self.CALENVIROSCREEN_FTP_URL, self.TMP_PATH,
         )

     def transform(self) -> None:
@@ -39,9 +39,7 @@ def download_census_csvs(data_path: Path) -> None:
         # But using 2010 for now
         cbg_state_url = f"https://www2.census.gov/geo/tiger/TIGER2010/BG/2010/tl_2010_{fips}_bg10.zip"
         unzip_file_from_url(
-            cbg_state_url,
-            data_path / "tmp",
-            data_path / "census" / "shp" / fips,
+            cbg_state_url, data_path / "tmp", data_path / "census" / "shp" / fips,
         )

         cmd = (
@@ -80,32 +78,22 @@ def download_census_csvs(data_path: Path) -> None:
             csv_dir_path / f"{state_id}.csv", mode="w", newline=""
         ) as cbg_csv_file:
             cbg_csv_file_writer = csv.writer(
-                cbg_csv_file,
-                delimiter=",",
-                quotechar='"',
-                quoting=csv.QUOTE_MINIMAL,
+                cbg_csv_file, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL,
             )

             for geoid10 in geoid10_list:
                 cbg_csv_file_writer.writerow(
-                    [
-                        geoid10,
-                    ]
+                    [geoid10,]
                 )

     ## write US csv
     with open(csv_dir_path / "us.csv", mode="w", newline="") as cbg_csv_file:
         cbg_csv_file_writer = csv.writer(
-            cbg_csv_file,
-            delimiter=",",
-            quotechar='"',
-            quoting=csv.QUOTE_MINIMAL,
+            cbg_csv_file, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL,
         )
         for geoid10 in cbg_national:
             cbg_csv_file_writer.writerow(
-                [
-                    geoid10,
-                ]
+                [geoid10,]
             )

     ## create national geojson
@@ -18,8 +18,7 @@ class EJScreenETL(ExtractTransformLoad):
     def extract(self) -> None:
         logger.info("Downloading EJScreen Data")
         super().extract(
-            self.EJSCREEN_FTP_URL,
-            self.TMP_PATH,
+            self.EJSCREEN_FTP_URL, self.TMP_PATH,
         )

     def transform(self) -> None:
@@ -34,8 +34,7 @@ class HudHousingETL(ExtractTransformLoad):
     def extract(self) -> None:
         logger.info("Extracting HUD Housing Data")
         super().extract(
-            self.HOUSING_FTP_URL,
-            self.HOUSING_ZIP_FILE_DIR,
+            self.HOUSING_FTP_URL, self.HOUSING_ZIP_FILE_DIR,
         )

     def transform(self) -> None:
@@ -49,10 +48,7 @@ class HudHousingETL(ExtractTransformLoad):
             / "140"
             / "Table8.csv"
         )
-        self.df = pd.read_csv(
-            filepath_or_buffer=tmp_csv_file_path,
-            encoding="latin-1",
-        )
+        self.df = pd.read_csv(filepath_or_buffer=tmp_csv_file_path, encoding="latin-1",)

         # Rename and reformat block group ID
         self.df.rename(columns={"geoid": self.GEOID_TRACT_FIELD_NAME}, inplace=True)
@@ -70,8 +70,7 @@ class TreeEquityScoreETL(ExtractTransformLoad):
         logger.info("Downloading Tree Equity Score Data")
         for state in self.states:
             super().extract(
-                f"{self.TES_URL}{state}.zip.zip",
-                f"{self.TMP_PATH}/{state}",
+                f"{self.TES_URL}{state}.zip.zip", f"{self.TMP_PATH}/{state}",
             )

     def transform(self) -> None:
@@ -97,10 +97,7 @@ def remove_all_dirs_from_dir(dir_path: Path) -> None:


 def unzip_file_from_url(
-    file_url: str,
-    download_path: Path,
-    unzipped_file_path: Path,
-    verify: bool = False,
+    file_url: str, download_path: Path, unzipped_file_path: Path, verify: bool = False,
 ) -> None:
     """Downloads a zip file from a remote URL location and unzips it in a specific directory, removing the temporary file after

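One caution before re-running a newer formatter over this tree: the collapsed one-liners above keep their trailing commas (`[geoid10,]`, `pd.read_csv(..., encoding="latin-1",)`), and from Black 20.8b0 onward such a "magic" trailing comma forces the exploded form again, so a modern run would largely revert these 30 added lines back to the 87 deleted ones. A small illustration of that documented rule, using a call from this diff:

    # With the trailing comma, Black >= 20.8b0 keeps one argument per line:
    super().extract(
        self.EJSCREEN_FTP_URL,
        self.TMP_PATH,
    )
    # Dropping the trailing comma lets it collapse whenever the line fits:
    super().extract(self.EJSCREEN_FTP_URL, self.TMP_PATH)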