diff --git a/data/data-pipeline/data_pipeline/application.py b/data/data-pipeline/data_pipeline/application.py
index 53dd9edb..cba4762c 100644
--- a/data/data-pipeline/data_pipeline/application.py
+++ b/data/data-pipeline/data_pipeline/application.py
@@ -22,9 +22,7 @@ def cli():
     pass
 
 
-@cli.command(
-    help="Clean up all census data folders",
-)
+@cli.command(help="Clean up all census data folders",)
 def census_cleanup():
     """CLI command to clean up the census data folder"""
 
@@ -37,9 +35,7 @@ def census_cleanup():
     logger.info("Cleaned up all census data files")
 
 
-@cli.command(
-    help="Clean up all data folders",
-)
+@cli.command(help="Clean up all data folders",)
 def data_cleanup():
     """CLI command to clean up the all the data folders"""
 
@@ -50,9 +46,7 @@ def data_cleanup():
     logger.info("Cleaned up all data folders")
 
 
-@cli.command(
-    help="Census data download",
-)
+@cli.command(help="Census data download",)
 def census_data_download():
     """CLI command to download all census shape files from the Census FTP and extract the geojson
     to generate national and by state Census Block Group CSVs"""
@@ -64,9 +58,7 @@ def census_data_download():
     logger.info("Completed downloading census data")
 
 
-@cli.command(
-    help="Run all ETL processes or a specific one",
-)
+@cli.command(help="Run all ETL processes or a specific one",)
 @click.option("-d", "--dataset", required=False, type=str)
 def etl_run(dataset: str):
     """Run a specific or all ETL processes
@@ -81,27 +73,21 @@ def etl_run(dataset: str):
     etl_runner(dataset)
 
 
-@cli.command(
-    help="Generate Score",
-)
+@cli.command(help="Generate Score",)
 def score_run():
     """CLI command to generate the score"""
 
     score_generate()
 
 
-@cli.command(
-    help="Generate Geojson files with scores baked in",
-)
+@cli.command(help="Generate Geojson files with scores baked in",)
 def geo_score():
     """CLI command to generate the score"""
 
     score_geo()
 
 
-@cli.command(
-    help="Generate map tiles",
-)
+@cli.command(help="Generate map tiles",)
 def generate_map_tiles():
     """CLI command to generate the map tiles"""
 
diff --git a/data/data-pipeline/data_pipeline/etl/runner.py b/data/data-pipeline/data_pipeline/etl/runner.py
index 093012de..7e5f2449 100644
--- a/data/data-pipeline/data_pipeline/etl/runner.py
+++ b/data/data-pipeline/data_pipeline/etl/runner.py
@@ -27,11 +27,7 @@ def etl_runner(dataset_to_run: str = None) -> None:
             "module_dir": "census_acs",
             "class_name": "CensusACSETL",
         },
-        {
-            "name": "ejscreen",
-            "module_dir": "ejscreen",
-            "class_name": "EJScreenETL",
-        },
+        {"name": "ejscreen", "module_dir": "ejscreen", "class_name": "EJScreenETL",},
         {
             "name": "housing_and_transportation",
             "module_dir": "housing_and_transportation",
@@ -47,17 +43,12 @@ def etl_runner(dataset_to_run: str = None) -> None:
             "module_dir": "calenviroscreen",
             "class_name": "CalEnviroScreenETL",
         },
-        {
-            "name": "hud_recap",
-            "module_dir": "hud_recap",
-            "class_name": "HudRecapETL",
-        },
+        {"name": "hud_recap", "module_dir": "hud_recap", "class_name": "HudRecapETL",},
     ]
 
     if dataset_to_run:
         dataset_element = next(
-            (item for item in dataset_list if item["name"] == dataset_to_run),
-            None,
+            (item for item in dataset_list if item["name"] == dataset_to_run), None,
         )
         if not dataset_list:
             raise ValueError("Invalid dataset name")
diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score.py b/data/data-pipeline/data_pipeline/etl/score/etl_score.py
index 98e61260..a43c0579 100644
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py
@@ -59,9 +59,7 @@ class ScoreETL(ExtractTransformLoad):
         # Load census data
         census_csv = self.DATA_PATH / "dataset" / "census_acs_2019" / "usa.csv"
         self.census_df = pd.read_csv(
-            census_csv,
-            dtype={self.GEOID_FIELD_NAME: "string"},
-            low_memory=False,
+            census_csv, dtype={self.GEOID_FIELD_NAME: "string"}, low_memory=False,
         )
 
         # Load housing and transportation data
@@ -123,8 +121,7 @@ class ScoreETL(ExtractTransformLoad):
 
         # Define a named tuple that will be used for each data set input.
         DataSet = collections.namedtuple(
-            typename="DataSet",
-            field_names=["input_field", "renamed_field", "bucket"],
+            typename="DataSet", field_names=["input_field", "renamed_field", "bucket"],
         )
 
         data_sets = [
@@ -141,9 +138,7 @@ class ScoreETL(ExtractTransformLoad):
                 bucket=None,
             ),
             DataSet(
-                input_field="ACSTOTPOP",
-                renamed_field="Total population",
-                bucket=None,
+                input_field="ACSTOTPOP", renamed_field="Total population", bucket=None,
             ),
             # The following data sets have buckets, because they're used in the score
             DataSet(
@@ -249,9 +244,7 @@ class ScoreETL(ExtractTransformLoad):
         }
 
         self.df.rename(
-            columns=renaming_dict,
-            inplace=True,
-            errors="raise",
+            columns=renaming_dict, inplace=True, errors="raise",
         )
 
         columns_to_keep = [data_set.renamed_field for data_set in data_sets]
diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py b/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py
index ead15fe0..f94b5d97 100644
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py
@@ -46,9 +46,7 @@ class GeoScoreETL(ExtractTransformLoad):
 
         logger.info("Reading score CSV")
         self.score_usa_df = pd.read_csv(
-            self.TILE_SCORE_CSV,
-            dtype={"GEOID10": "string"},
-            low_memory=False,
+            self.TILE_SCORE_CSV, dtype={"GEOID10": "string"}, low_memory=False,
         )
 
     def transform(self) -> None:
@@ -70,8 +68,7 @@ class GeoScoreETL(ExtractTransformLoad):
         ].reset_index(drop=True)
 
         usa_simplified.rename(
-            columns={self.TARGET_SCORE_NAME: self.TARGET_SCORE_RENAME_TO},
-            inplace=True,
+            columns={self.TARGET_SCORE_NAME: self.TARGET_SCORE_RENAME_TO}, inplace=True,
         )
 
         logger.info("Aggregating into tracts (~5 minutes)")
@@ -156,4 +153,4 @@ class GeoScoreETL(ExtractTransformLoad):
         logger.info("Writing usa-low (~9 minutes)")
         self.geojson_score_usa_low.to_file(self.SCORE_LOW_GEOJSON, driver="GeoJSON")
 
-        logger.info("Completed writing usa-low")
\ No newline at end of file
+        logger.info("Completed writing usa-low")
diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py b/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py
index 9f3365f8..f7b6bff9 100644
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py
@@ -43,8 +43,7 @@ class PostScoreETL(ExtractTransformLoad):
 
     def extract(self) -> None:
         super().extract(
-            self.CENSUS_COUNTIES_ZIP_URL,
-            self.TMP_PATH,
+            self.CENSUS_COUNTIES_ZIP_URL, self.TMP_PATH,
         )
 
         logger.info("Reading Counties CSV")
@@ -68,8 +67,7 @@ class PostScoreETL(ExtractTransformLoad):
         # rename some of the columns to prepare for merge
         self.counties_df = self.counties_df[["USPS", "GEOID", "NAME"]]
         self.counties_df.rename(
-            columns={"USPS": "State Abbreviation", "NAME": "County Name"},
-            inplace=True,
+            columns={"USPS": "State Abbreviation", "NAME": "County Name"}, inplace=True,
         )
 
         # remove unnecessary columns
diff --git a/data/data-pipeline/data_pipeline/etl/sources/calenviroscreen/etl.py b/data/data-pipeline/data_pipeline/etl/sources/calenviroscreen/etl.py
index 63a40f62..33c9c6ad 100644
--- a/data/data-pipeline/data_pipeline/etl/sources/calenviroscreen/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/calenviroscreen/etl.py
@@ -28,8 +28,7 @@ class CalEnviroScreenETL(ExtractTransformLoad):
     def extract(self) -> None:
         logger.info("Downloading CalEnviroScreen Data")
         super().extract(
-            self.CALENVIROSCREEN_FTP_URL,
-            self.TMP_PATH,
+            self.CALENVIROSCREEN_FTP_URL, self.TMP_PATH,
         )
 
     def transform(self) -> None:
diff --git a/data/data-pipeline/data_pipeline/etl/sources/census/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census/etl.py
index 5e70dbd9..9dae7beb 100644
--- a/data/data-pipeline/data_pipeline/etl/sources/census/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/census/etl.py
@@ -39,9 +39,7 @@ def download_census_csvs(data_path: Path) -> None:
         # But using 2010 for now
         cbg_state_url = f"https://www2.census.gov/geo/tiger/TIGER2010/BG/2010/tl_2010_{fips}_bg10.zip"
         unzip_file_from_url(
-            cbg_state_url,
-            data_path / "tmp",
-            data_path / "census" / "shp" / fips,
+            cbg_state_url, data_path / "tmp", data_path / "census" / "shp" / fips,
         )
 
         cmd = (
@@ -80,32 +78,22 @@ def download_census_csvs(data_path: Path) -> None:
             csv_dir_path / f"{state_id}.csv", mode="w", newline=""
         ) as cbg_csv_file:
             cbg_csv_file_writer = csv.writer(
-                cbg_csv_file,
-                delimiter=",",
-                quotechar='"',
-                quoting=csv.QUOTE_MINIMAL,
+                cbg_csv_file, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL,
             )
 
             for geoid10 in geoid10_list:
                 cbg_csv_file_writer.writerow(
-                    [
-                        geoid10,
-                    ]
+                    [geoid10,]
                 )
 
     ## write US csv
     with open(csv_dir_path / "us.csv", mode="w", newline="") as cbg_csv_file:
         cbg_csv_file_writer = csv.writer(
-            cbg_csv_file,
-            delimiter=",",
-            quotechar='"',
-            quoting=csv.QUOTE_MINIMAL,
+            cbg_csv_file, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL,
         )
         for geoid10 in cbg_national:
             cbg_csv_file_writer.writerow(
-                [
-                    geoid10,
-                ]
+                [geoid10,]
             )
 
     ## create national geojson
diff --git a/data/data-pipeline/data_pipeline/etl/sources/ejscreen/etl.py b/data/data-pipeline/data_pipeline/etl/sources/ejscreen/etl.py
index fd69f3ec..ba5d5af2 100644
--- a/data/data-pipeline/data_pipeline/etl/sources/ejscreen/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/ejscreen/etl.py
@@ -18,8 +18,7 @@ class EJScreenETL(ExtractTransformLoad):
     def extract(self) -> None:
         logger.info("Downloading EJScreen Data")
         super().extract(
-            self.EJSCREEN_FTP_URL,
-            self.TMP_PATH,
+            self.EJSCREEN_FTP_URL, self.TMP_PATH,
         )
 
     def transform(self) -> None:
diff --git a/data/data-pipeline/data_pipeline/etl/sources/hud_housing/etl.py b/data/data-pipeline/data_pipeline/etl/sources/hud_housing/etl.py
index 3cc2f2af..aafda83c 100644
--- a/data/data-pipeline/data_pipeline/etl/sources/hud_housing/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/hud_housing/etl.py
@@ -34,8 +34,7 @@ class HudHousingETL(ExtractTransformLoad):
     def extract(self) -> None:
         logger.info("Extracting HUD Housing Data")
         super().extract(
-            self.HOUSING_FTP_URL,
-            self.HOUSING_ZIP_FILE_DIR,
+            self.HOUSING_FTP_URL, self.HOUSING_ZIP_FILE_DIR,
         )
 
     def transform(self) -> None:
@@ -49,10 +48,7 @@ class HudHousingETL(ExtractTransformLoad):
             / "140"
             / "Table8.csv"
         )
-        self.df = pd.read_csv(
-            filepath_or_buffer=tmp_csv_file_path,
-            encoding="latin-1",
-        )
+        self.df = pd.read_csv(filepath_or_buffer=tmp_csv_file_path, encoding="latin-1",)
 
         # Rename and reformat block group ID
         self.df.rename(columns={"geoid": self.GEOID_TRACT_FIELD_NAME}, inplace=True)
diff --git a/data/data-pipeline/data_pipeline/etl/sources/tree_equity_score/etl.py b/data/data-pipeline/data_pipeline/etl/sources/tree_equity_score/etl.py
index a4351bf8..52b6d90c 100644
--- a/data/data-pipeline/data_pipeline/etl/sources/tree_equity_score/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/tree_equity_score/etl.py
@@ -70,8 +70,7 @@ class TreeEquityScoreETL(ExtractTransformLoad):
         logger.info("Downloading Tree Equity Score Data")
         for state in self.states:
             super().extract(
-                f"{self.TES_URL}{state}.zip.zip",
-                f"{self.TMP_PATH}/{state}",
+                f"{self.TES_URL}{state}.zip.zip", f"{self.TMP_PATH}/{state}",
             )
 
     def transform(self) -> None:
diff --git a/data/data-pipeline/data_pipeline/utils.py b/data/data-pipeline/data_pipeline/utils.py
index 807d9ebe..b076a4f5 100644
--- a/data/data-pipeline/data_pipeline/utils.py
+++ b/data/data-pipeline/data_pipeline/utils.py
@@ -97,10 +97,7 @@ def remove_all_dirs_from_dir(dir_path: Path) -> None:
 
 
 def unzip_file_from_url(
-    file_url: str,
-    download_path: Path,
-    unzipped_file_path: Path,
-    verify: bool = False,
+    file_url: str, download_path: Path, unzipped_file_path: Path, verify: bool = False,
 ) -> None:
     """Downloads a zip file from a remote URL location and
     unzips it in a specific directory, removing the temporary file after