diff --git a/data/data-pipeline/data_pipeline/application.py b/data/data-pipeline/data_pipeline/application.py index 576b3d6d..ad621894 100644 --- a/data/data-pipeline/data_pipeline/application.py +++ b/data/data-pipeline/data_pipeline/application.py @@ -28,6 +28,8 @@ logger = get_module_logger(__name__) dataset_cli_help = "Grab the data from either 'local' for local access or 'aws' to retrieve from Justice40 S3 repository" +LOG_LINE_WIDTH = 60 + @click.group() def cli(): @@ -37,23 +39,26 @@ def cli(): @cli.command(help="Clean up all census data folders") def census_cleanup(): """CLI command to clean up the census data folder""" + log_title("Clean Up Census Data") data_path = settings.APP_ROOT / "data" # census directories - logger.info("Initializing all census data") + log_info("Cleaning up all census data") census_reset(data_path) - logger.info("Cleaned up all census data files") + log_goodbye() sys.exit() @cli.command(help="Clean up all data folders") def data_cleanup(): """CLI command to clean up the all the data folders""" + log_title("Clean Up Data ") data_path = settings.APP_ROOT / "data" + log_info("Cleaning up all data folders") census_reset(data_path) data_folder_cleanup() tribal_reset(data_path) @@ -61,7 +66,7 @@ def data_cleanup(): temp_folder_cleanup() geo_score_folder_cleanup() - logger.info("Cleaned up all data folders") + log_goodbye() sys.exit() @@ -77,19 +82,19 @@ def data_cleanup(): def census_data_download(zip_compress): """CLI command to download all census shape files from the Census FTP and extract the geojson to generate national and by state Census Block Group CSVs""" - - logger.info("Initializing all census data") + log_title("Download Census Data ") data_path = settings.APP_ROOT / "data" census_reset(data_path) - logger.info("Downloading census data") + log_info("Downloading census data") etl_runner("census") if zip_compress: + log_info("Zipping census data") zip_census_data() - logger.info("Completed downloading census data") + log_goodbye() sys.exit() @@ -103,10 +108,14 @@ def census_data_download(zip_compress): help=dataset_cli_help, ) def pull_census_data(data_source: str): - logger.info("Pulling census data from %s", data_source) + + log_title("Pull Census Data") + + log_info(f"Pulling census data from {data_source}") data_path = settings.APP_ROOT / "data" / "census" check_census_data_source(data_path, data_source) - logger.info("Finished pulling census data") + + log_goodbye() sys.exit() @@ -129,8 +138,12 @@ def etl_run(dataset: str): Returns: None """ + log_title("Run ETL") + log_info("Running dataset(s)") etl_runner(dataset) + + log_goodbye() sys.exit() @@ -139,9 +152,15 @@ def etl_run(dataset: str): ) def score_run(): """CLI command to generate the score""" + log_title("Score", "Generate Score") + log_info("Cleaning up data folders") score_folder_cleanup() + + log_info("Generating score") score_generate() + + log_goodbye() sys.exit() @@ -150,63 +169,25 @@ def score_run(): ) def score_full_run(): """CLI command to run ETL and generate the score in one command""" + log_title("Score Full Run", "Run ETL and Generate Score (no tiles)") + log_info("Cleaning up data folders") data_folder_cleanup() score_folder_cleanup() temp_folder_cleanup() + + log_info("Running all ETLs") etl_runner() + + log_info("Generating score") score_generate() - sys.exit() - -@cli.command(help="Generate Geojson files with scores baked in") -@click.option( - "-s", - "--data-source", - default="local", - required=False, - type=str, - help=dataset_cli_help, -) -def geo_score(data_source: 
str): - """CLI command to combine score with GeoJSON data and generate low and high files - - Args: - data_source (str): Source for the census data (optional) - Options: - - local: fetch census and score data from the local data directory - - aws: fetch census and score from AWS S3 J40 data repository - - Returns: - None - """ - - geo_score_folder_cleanup() - score_geo(data_source=data_source) + log_goodbye() sys.exit() @cli.command( - help="Generate map tiles. Pass -t to generate tribal layer as well.", -) -@click.option( - "-t", - "--generate-tribal-layer", - default=False, - required=False, - is_flag=True, - type=bool, -) -def generate_map_tiles(generate_tribal_layer): - """CLI command to generate the map tiles""" - - data_path = settings.APP_ROOT / "data" - generate_tiles(data_path, generate_tribal_layer) - sys.exit() - - -@cli.command( - help="Run etl_score_post to create score csv, tile csv, and downloadable zip", + help="Run etl_score_post to create score csv, tile csv, and downloadable zip" ) @click.option( "-s", @@ -228,9 +209,74 @@ def generate_score_post(data_source: str): Returns: None """ + log_title( + "Generate Score Post ", "Create Score CSV, Tile CSV, Downloadable ZIP" + ) + log_info("Cleaning up downloadable folder") downloadable_cleanup() + + log_info("Running score post activities") score_post(data_source) + + log_goodbye() + sys.exit() + + +@cli.command(help="Generate GeoJSON files with scores baked in") +@click.option( + "-s", + "--data-source", + default="local", + required=False, + type=str, + help=dataset_cli_help, +) +def geo_score(data_source: str): + """CLI command to combine score with GeoJSON data and generate low and high files + + Args: + data_source (str): Source for the census data (optional) + Options: + - local: fetch census and score data from the local data directory + - aws: fetch census and score from AWS S3 J40 data repository + + Returns: + None + """ + log_title("Generate GeoJSON", "Combine Score and GeoJSON") + + log_info("Cleaning up geo score folder") + geo_score_folder_cleanup() + + log_info("Combining score with GeoJSON") + score_geo(data_source=data_source) + + log_goodbye() + sys.exit() + + +@cli.command( + help="Generate map tiles. 
Pass -t to generate tribal layer as well.", +) +@click.option( + "-t", + "--generate-tribal-layer", + default=False, + required=False, + is_flag=True, + type=bool, +) +def generate_map_tiles(generate_tribal_layer): + """CLI command to generate the map tiles""" + log_title("Generate Map Tiles") + + data_path = settings.APP_ROOT / "data" + + log_info("Generating tiles") + generate_tiles(data_path, generate_tribal_layer) + + log_goodbye() sys.exit() @@ -264,49 +310,74 @@ def data_full_run(check: bool, data_source: str): Returns: None """ + log_title("Full Run", "Census DL, ETL, Score, Combine, Generate Tiles") + data_path = settings.APP_ROOT / "data" if check: if not check_first_run(): # check if the data full run has been run before - logger.info("*** The data full run was already executed") + log_info("The data full run was already executed") sys.exit() else: # census directories - logger.info("*** Initializing all data folders") + log_info("Cleaning up data folders") census_reset(data_path) data_folder_cleanup() score_folder_cleanup() temp_folder_cleanup() if data_source == "local": - logger.info("*** Downloading census data") + log_info("Downloading census data") etl_runner("census") - logger.info("*** Running all ETLs") + log_info("Running all ETLs") etl_runner() - logger.info("*** Generating Score") + log_info("Generating score") score_generate() - logger.info("*** Running Post Score scripts") + log_info("Running post score") downloadable_cleanup() score_post(data_source) - logger.info("*** Combining Score with Census Geojson") + log_info("Combining score with census GeoJSON") score_geo(data_source) - logger.info("*** Generating Map Tiles") + log_info("Generating map tiles") generate_tiles(data_path, True) + log_info("Completing pipeline") file = "first_run.txt" cmd = f"touch {data_path}/{file}" call(cmd, shell=True) - logger.info("*** Map data ready") + log_goodbye() sys.exit() +def log_title(title: str, subtitle: str = None): + """Logs a title in our fancy title format""" + logger.info("-" * LOG_LINE_WIDTH) + logger.info("") + logger.info(f"{title}") + if subtitle: + logger.info(f"{subtitle}") + logger.info("") + logger.info("-" * LOG_LINE_WIDTH) + logger.info("") + + +def log_info(info: str): + """Logs a general informational message""" + logger.info(f"- {info}") + + +def log_goodbye(): + """Logs a goodbye message""" + logger.info("- Finished. Bye!") + + if __name__ == "__main__": cli() diff --git a/data/data-pipeline/data_pipeline/etl/base.py b/data/data-pipeline/data_pipeline/etl/base.py index c9f647cd..c15f0240 100644 --- a/data/data-pipeline/data_pipeline/etl/base.py +++ b/data/data-pipeline/data_pipeline/etl/base.py @@ -225,8 +225,8 @@ class ExtractTransformLoad: # TODO: remove this once all ETL classes are converted to using the new # base class parameters and patterns. if self.GEO_LEVEL is None: - logger.info( - "Skipping validation step for this class because it does not " + logger.warning( + f"Skipping validation step for {self.__class__.__name__} because it does not " "seem to be converted to new ETL class patterns." ) return @@ -331,7 +331,7 @@ class ExtractTransformLoad: Uses the directory and the file name from `self._get_output_file_path`. """ - logger.info(f"Saving `{self.NAME}` CSV") + logger.debug(f"Saving `{self.NAME}` CSV") # Create directory if necessary. 
output_file_path = self._get_output_file_path() @@ -342,7 +342,7 @@ class ExtractTransformLoad: output_file_path, index=False, float_format=float_format ) - logger.info(f"File written to `{output_file_path}`.") + logger.debug(f"File written to `{output_file_path}`.") # This is a classmethod so it can be used without needing to create an instance of # the class. This is a use case in `etl_score`. @@ -362,7 +362,7 @@ class ExtractTransformLoad: f"No file found at `{output_file_path}`." ) - logger.info( + logger.debug( f"Reading in CSV `{output_file_path}` for ETL of class `{cls}`." ) output_df = pd.read_csv( diff --git a/data/data-pipeline/data_pipeline/etl/runner.py b/data/data-pipeline/data_pipeline/etl/runner.py index 89663824..8d896ded 100644 --- a/data/data-pipeline/data_pipeline/etl/runner.py +++ b/data/data-pipeline/data_pipeline/etl/runner.py @@ -42,6 +42,9 @@ def _get_datasets_to_run(dataset_to_run: str) -> typing.List[dict]: def _run_one_dataset(dataset: dict) -> None: """Runs one etl process.""" + + logger.info(f"Running ETL for {dataset['name']}") + etl_module = importlib.import_module( f"data_pipeline.etl.sources.{dataset['module_dir']}.etl" ) @@ -49,21 +52,26 @@ def _run_one_dataset(dataset: dict) -> None: etl_instance = etl_class() # run extract + logger.debug(f"Extracting {dataset['name']}") etl_instance.extract() # run transform + logger.debug(f"Transforming {dataset['name']}") etl_instance.transform() # run load + logger.debug(f"Loading {dataset['name']}") etl_instance.load() # run validate + logger.debug(f"Validating {dataset['name']}") etl_instance.validate() # cleanup + logger.debug(f"Cleaning up {dataset['name']}") etl_instance.cleanup() - logger.info(f"Finished `etl-run` for dataset `{dataset['name']}`.") + logger.info(f"Finished ETL for dataset {dataset['name']}") def etl_runner(dataset_to_run: str = None) -> None: @@ -94,7 +102,7 @@ def etl_runner(dataset_to_run: str = None) -> None: ] if concurrent_datasets: - logger.info("Running concurrent jobs") + logger.info("Running concurrent ETL jobs") with concurrent.futures.ThreadPoolExecutor() as executor: futures = { executor.submit(_run_one_dataset, dataset=dataset) @@ -106,10 +114,10 @@ def etl_runner(dataset_to_run: str = None) -> None: # Otherwise, the exceptions are silently ignored. fut.result() - # Note: these high-memory datasets also usually require the Census geojson to be - # generated, and one of them requires the Tribal geojson to be generated. + # Note: these high-memory datasets also usually require the Census GeoJSON to be + # generated, and one of them requires the Tribal GeoJSON to be generated. 
if high_memory_datasets: - logger.info("Running high-memory jobs") + logger.info("Running high-memory ETL jobs") for dataset in high_memory_datasets: _run_one_dataset(dataset=dataset) diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score.py b/data/data-pipeline/data_pipeline/etl/score/etl_score.py index b6c8c407..cf6c4366 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py @@ -56,8 +56,6 @@ class ScoreETL(ExtractTransformLoad): self.ISLAND_DEMOGRAPHIC_BACKFILL_FIELDS: List[str] = [] def extract(self) -> None: - logger.info("Loading data sets from disk.") - # EJSCreen csv Load ejscreen_csv = constants.DATA_PATH / "dataset" / "ejscreen" / "usa.csv" self.ejscreen_df = pd.read_csv( @@ -200,7 +198,7 @@ class ScoreETL(ExtractTransformLoad): ) def _join_tract_dfs(self, census_tract_dfs: list) -> pd.DataFrame: - logger.info("Joining Census Tract dataframes") + logger.debug("Joining Census Tract dataframes") def merge_function( left: pd.DataFrame, right: pd.DataFrame @@ -317,7 +315,7 @@ class ScoreETL(ExtractTransformLoad): ~df[field_names.GEOID_TRACT_FIELD].isin(drop_tracts), np.nan, ) - logger.info( + logger.debug( f"Creating special case column for percentiles from {input_column_name}" ) df[ @@ -335,7 +333,7 @@ class ScoreETL(ExtractTransformLoad): # TODO Move a lot of this to the ETL part of the pipeline def _prepare_initial_df(self) -> pd.DataFrame: - logger.info("Preparing initial dataframe") + logger.debug("Preparing initial dataframe") # Join all the data sources that use census tracts census_tract_dfs = [ @@ -377,7 +375,7 @@ class ScoreETL(ExtractTransformLoad): assert ( census_tract_df.shape[0] <= pre_join_len ), "Join against national tract list ADDED rows" - logger.info( + logger.debug( "Dropped %s tracts not in the 2010 tract data", pre_join_len - census_tract_df[field_names.GEOID_TRACT_FIELD].nunique(), @@ -560,7 +558,7 @@ class ScoreETL(ExtractTransformLoad): for col in boolean_columns: tmp = df_copy[col].copy() df_copy[col] = np.where(tmp.notna(), tmp.astype(bool), None) - logger.info(f"{col} contains {df_copy[col].isna().sum()} nulls.") + logger.debug(f"{col} contains {df_copy[col].isna().sum()} nulls.") # Convert all columns to numeric and do math # Note that we have a few special conditions here and we handle them explicitly. 
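A note on the new helpers: `log_title`, `log_info`, and `log_goodbye` in application.py emit banner-style messages at INFO level, while most per-step messages elsewhere in this patch move from `info` to `debug` and therefore only show up when the pipeline logger is configured for DEBUG. The sketch below only illustrates the banner layout; it assumes a message-only formatter, which is an assumption about what `get_module_logger` sets up, not something this patch shows.

```python
import logging

# Assumption: a message-only formatter, standing in for whatever
# get_module_logger configures; used only to illustrate the banner layout.
logging.basicConfig(format="%(message)s", level=logging.INFO)
logger = logging.getLogger("data_pipeline.application")

LOG_LINE_WIDTH = 60


def log_title(title: str, subtitle: str = None):
    """Banner title block, mirroring the helper added in application.py."""
    logger.info("-" * LOG_LINE_WIDTH)
    logger.info("")
    logger.info(title)
    if subtitle:
        logger.info(subtitle)
    logger.info("")
    logger.info("-" * LOG_LINE_WIDTH)
    logger.info("")


def log_info(info: str):
    logger.info(f"- {info}")


log_title("Score", "Generate Score")
log_info("Cleaning up data folders")
# Output: a 60-dash rule, a blank line, "Score", "Generate Score", a blank
# line, another rule, a blank line, then "- Cleaning up data folders".
```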
@@ -591,7 +589,7 @@ class ScoreETL(ExtractTransformLoad): .astype(bool) .fillna(False) ][field_names.GEOID_TRACT_FIELD].to_list() - logger.info( + logger.debug( f"Dropping {len(drop_tracts)} tracts from Agricultural Value Loss" ) elif numeric_column == field_names.LINGUISTIC_ISO_FIELD: @@ -599,7 +597,7 @@ class ScoreETL(ExtractTransformLoad): # 72 is the FIPS code for Puerto Rico df_copy[field_names.GEOID_TRACT_FIELD].str.startswith("72") ][field_names.GEOID_TRACT_FIELD].to_list() - logger.info( + logger.debug( f"Dropping {len(drop_tracts)} tracts from Linguistic Isolation" ) @@ -615,7 +613,7 @@ class ScoreETL(ExtractTransformLoad): df_copy[field_names.TOTAL_POP_FIELD].fillna(0) <= low_population ][field_names.GEOID_TRACT_FIELD].to_list() - logger.info( + logger.debug( f"Dropping {len(drop_tracts)} tracts from DOT traffic burden" ) @@ -666,7 +664,7 @@ class ScoreETL(ExtractTransformLoad): ) def _backfill_island_demographics(self, df: pd.DataFrame) -> pd.DataFrame: - logger.info("Backfilling island demographic data") + logger.debug("Backfilling island demographic data") island_index = self._get_island_areas(df) for backfill_field_name in self.ISLAND_DEMOGRAPHIC_BACKFILL_FIELDS: actual_field_name = backfill_field_name.replace( @@ -684,8 +682,6 @@ class ScoreETL(ExtractTransformLoad): return df def transform(self) -> None: - logger.info("Transforming Score Data") - # prepare the df with the right CBG/tract IDs, column names/types, and percentiles self.df = self._prepare_initial_df() @@ -696,9 +692,6 @@ class ScoreETL(ExtractTransformLoad): self.df = self._backfill_island_demographics(self.df) def load(self) -> None: - logger.info( - f"Saving Score CSV to {constants.DATA_SCORE_CSV_FULL_FILE_PATH}." - ) constants.DATA_SCORE_CSV_FULL_DIR.mkdir(parents=True, exist_ok=True) self.df.to_csv(constants.DATA_SCORE_CSV_FULL_FILE_PATH, index=False) diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py b/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py index 047ae80d..b7937272 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py @@ -118,7 +118,7 @@ class GeoScoreETL(ExtractTransformLoad): fields = [self.GEOID_FIELD_NAME, self.GEOMETRY_FIELD_NAME] # TODO update this join - logger.info("Merging and compressing score CSV with USA GeoJSON") + logger.info("Merging and compressing score csv with USA GeoJSON") self.geojson_score_usa_high = self.score_usa_df.set_index( self.GEOID_FIELD_NAME ).merge( @@ -143,7 +143,7 @@ class GeoScoreETL(ExtractTransformLoad): columns={self.TARGET_SCORE_SHORT_FIELD: self.TARGET_SCORE_RENAME_TO} ) - logger.info("Converting geojson into geodf with tracts") + logger.info("Converting GeoJSON into GeoDataFrame with tracts") usa_tracts = gpd.GeoDataFrame( usa_tracts, columns=[ @@ -154,15 +154,15 @@ class GeoScoreETL(ExtractTransformLoad): crs="EPSG:4326", ) - logger.info("Creating buckets from tracts") + logger.debug("Creating buckets from tracts") usa_bucketed, keep_high_zoom_df = self._create_buckets_from_tracts( usa_tracts, self.NUMBER_OF_BUCKETS ) - logger.info("Aggregating buckets") + logger.debug("Aggregating buckets") usa_aggregated = self._aggregate_buckets(usa_bucketed, agg_func="mean") - logger.info("Breaking up polygons") + logger.debug("Breaking up polygons") compressed = self._breakup_multipolygons( usa_aggregated, self.NUMBER_OF_BUCKETS ) @@ -220,7 +220,7 @@ class GeoScoreETL(ExtractTransformLoad): len(state_tracts.index) / self.NUMBER_OF_BUCKETS ) - 
logger.info( + logger.debug( f"The number of buckets has increased to {self.NUMBER_OF_BUCKETS}" ) for i in range(len(state_tracts.index)): diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py b/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py index 87fbecda..0111bb04 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py @@ -62,7 +62,7 @@ class PostScoreETL(ExtractTransformLoad): # End YAML definition constants def _extract_counties(self, county_path: Path) -> pd.DataFrame: - logger.info("Reading Counties CSV") + logger.debug("Reading Counties CSV") return pd.read_csv( county_path, sep="\t", @@ -75,7 +75,7 @@ class PostScoreETL(ExtractTransformLoad): ) def _extract_states(self, state_path: Path) -> pd.DataFrame: - logger.info("Reading States CSV") + logger.debug("Reading States CSV") return pd.read_csv( state_path, dtype={"fips": "string", "state_abbreviation": "string"}, @@ -83,7 +83,7 @@ class PostScoreETL(ExtractTransformLoad): ) def _extract_score(self, score_path: Path) -> pd.DataFrame: - logger.info("Reading Score CSV") + logger.debug("Reading Score CSV") df = pd.read_csv( score_path, dtype={self.GEOID_TRACT_FIELD_NAME: "string"}, @@ -98,8 +98,6 @@ class PostScoreETL(ExtractTransformLoad): return df def extract(self) -> None: - logger.info("Starting Extraction") - # check census data check_census_data_source( census_data_path=self.DATA_PATH / "census", @@ -170,7 +168,7 @@ class PostScoreETL(ExtractTransformLoad): score_df: pd.DataFrame, ) -> pd.DataFrame: - logger.info("Merging county info with score info") + logger.debug("Merging county info with score info") score_county_merged = score_df.merge( # We drop state abbreviation so we don't get it twice counties_df[["GEOID", "County Name"]], @@ -178,7 +176,7 @@ class PostScoreETL(ExtractTransformLoad): how="left", ) - logger.info("Merging state info with county-score info") + logger.debug("Merging state info with county-score info") # Here, we need to join on a separate key, since there's no # entry for the island areas in the counties df (there are no # counties!) Thus, unless we join state separately from county, @@ -207,7 +205,7 @@ class PostScoreETL(ExtractTransformLoad): score_county_state_merged_df: pd.DataFrame, ) -> pd.DataFrame: - logger.info("Rounding Decimals") + logger.debug("Rounding Decimals") # grab all the keys from tiles score columns tiles_score_column_titles = list(constants.TILES_SCORE_COLUMNS.keys()) @@ -218,7 +216,7 @@ class PostScoreETL(ExtractTransformLoad): # We may not want some states/territories on the map, so this will drop all # rows with those FIPS codes (first two digits of the census tract) - logger.info( + logger.debug( f"Dropping specified FIPS codes from tile data: {constants.DROP_FIPS_CODES}" ) tracts_to_drop = [] @@ -236,12 +234,12 @@ class PostScoreETL(ExtractTransformLoad): for col, col_dtype in score_tiles.dtypes.items() if col_dtype == np.dtype("float64") ] - scale_factor = 10**constants.TILES_ROUND_NUM_DECIMALS + scale_factor = 10 ** constants.TILES_ROUND_NUM_DECIMALS score_tiles[float_cols] = ( score_tiles[float_cols] * scale_factor ).apply(np.floor) / scale_factor - logger.info("Adding fields for island areas and Puerto Rico") + logger.debug("Adding fields for island areas and Puerto Rico") # The below operation constructs variables for the front end. 
# Since the Island Areas, Puerto Rico, and the nation all have a different # set of available data, each has its own user experience. @@ -381,8 +379,6 @@ class PostScoreETL(ExtractTransformLoad): return final_df def transform(self) -> None: - logger.info("Transforming data sources for Score + County CSVs") - transformed_counties = self._transform_counties(self.input_counties_df) transformed_states = self._transform_states(self.input_states_df) transformed_score = self._transform_score(self.input_score_df) @@ -403,7 +399,7 @@ class PostScoreETL(ExtractTransformLoad): def _load_score_csv_full( self, score_county_state_merged: pd.DataFrame, score_csv_path: Path ) -> None: - logger.info("Saving Full Score CSV with County Information") + logger.debug("Saving Full Score CSV with County Information") score_csv_path.parent.mkdir(parents=True, exist_ok=True) score_county_state_merged.to_csv( score_csv_path, @@ -476,7 +472,7 @@ class PostScoreETL(ExtractTransformLoad): def _load_tile_csv( self, score_tiles_df: pd.DataFrame, tile_score_path: Path ) -> None: - logger.info("Saving Tile Score CSV") + logger.debug("Saving Tile Score CSV") tile_score_path.parent.mkdir(parents=True, exist_ok=True) score_tiles_df.to_csv(tile_score_path, index=False, encoding="utf-8") @@ -498,13 +494,13 @@ class PostScoreETL(ExtractTransformLoad): constants.SCORE_VERSIONING_DATA_DOCUMENTATION_ZIP_FILE_PATH ) - logger.info("Writing downloadable excel") + logger.debug("Writing downloadable excel") excel_config = self._load_excel_from_df( excel_df=self.output_score_county_state_merged_df, excel_path=excel_path, ) - logger.info("Writing downloadable csv") + logger.debug("Writing downloadable csv") # open yaml config downloadable_csv_config = load_yaml_dict_from_file( self.CONTENT_CONFIG / "csv.yml", CSVConfig @@ -516,7 +512,7 @@ class PostScoreETL(ExtractTransformLoad): ) downloadable_df.to_csv(csv_path, index=False) - logger.info("Creating codebook for download zip") + logger.debug("Creating codebook for download zip") # consolidate all excel fields from the config yml. 
The codebook # code takes in a list of fields, but the excel config file @@ -562,17 +558,17 @@ class PostScoreETL(ExtractTransformLoad): codebook_df.to_csv(codebook_path, index=False) # zip assets - logger.info("Compressing csv files") + logger.debug("Compressing csv files") files_to_compress = [csv_path, codebook_path, readme_path] zip_files(csv_zip_path, files_to_compress) - logger.info("Compressing xls files") + logger.debug("Compressing xls files") files_to_compress = [excel_path, codebook_path, readme_path] zip_files(xls_zip_path, files_to_compress) # Per #1557 # zip file that contains the .xls, .csv, .pdf, tech support document, checksum file - logger.info("Compressing data and documentation files") + logger.debug("Compressing data and documentation files") files_to_compress = [ excel_path, csv_path, diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_utils.py b/data/data-pipeline/data_pipeline/etl/score/etl_utils.py index 66ee7131..bc0f45ac 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_utils.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_utils.py @@ -47,14 +47,14 @@ def check_score_data_source( # download from s3 if census_data_source is aws if score_data_source == "aws": - logger.info("Fetching Score Tile data from AWS S3") + logger.debug("Fetching Score Tile data from AWS S3") download_file_from_url( file_url=TILE_SCORE_CSV_S3_URL, download_file_name=TILE_SCORE_CSV ) else: # check if score data is found locally if not os.path.isfile(TILE_SCORE_CSV): - logger.info( + logger.warning( "No local score tiles data found. Please use '-s aws` to fetch from AWS" ) sys.exit() @@ -96,7 +96,7 @@ def floor_series(series: pd.Series, number_of_decimals: int) -> pd.Series: if series.isin(unacceptable_values).any(): series.replace(mapping, regex=False, inplace=True) - multiplication_factor = 10**number_of_decimals + multiplication_factor = 10 ** number_of_decimals # In order to safely cast NaNs # First coerce series to float type: series.astype(float) @@ -409,7 +409,7 @@ def compare_to_list_of_expected_state_fips_codes( f"{sorted(list(actual_state_fips_codes_set - expected_states_set))}\n" ) else: - logger.info( + logger.debug( "Data matches expected state and territory representation" f"{dataset_name_phrase}." 
) diff --git a/data/data-pipeline/data_pipeline/etl/sources/calenviroscreen/etl.py b/data/data-pipeline/data_pipeline/etl/sources/calenviroscreen/etl.py index 5c970062..9e3b2db4 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/calenviroscreen/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/calenviroscreen/etl.py @@ -33,15 +33,12 @@ class CalEnviroScreenETL(ExtractTransformLoad): self.df: pd.DataFrame def extract(self) -> None: - logger.info("Downloading CalEnviroScreen Data") super().extract( self.CALENVIROSCREEN_FTP_URL, self.get_tmp_path(), ) def transform(self) -> None: - logger.info("Transforming CalEnviroScreen Data") - # Data from https://calenviroscreen-oehha.hub.arcgis.com/#Data, specifically: # https://oehha.ca.gov/media/downloads/calenviroscreen/document/calenviroscreen40resultsdatadictionaryd12021.zip # Load comparison index (CalEnviroScreen 4) @@ -70,7 +67,6 @@ class CalEnviroScreenETL(ExtractTransformLoad): ) def load(self) -> None: - logger.info("Saving CalEnviroScreen CSV") # write nationwide csv self.CSV_PATH.mkdir(parents=True, exist_ok=True) self.df.to_csv(self.CSV_PATH / "data06.csv", index=False) diff --git a/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py b/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py index e0d51952..8c2da2e9 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py @@ -81,7 +81,6 @@ class CDCLifeExpectancy(ExtractTransformLoad): return df def extract(self) -> None: - logger.info("Starting data download.") all_usa_raw_df = self._download_and_prep_data( file_url=self.USA_FILE_URL, @@ -102,13 +101,13 @@ class CDCLifeExpectancy(ExtractTransformLoad): additional_fips_codes_not_expected=self.STATES_MISSING_FROM_USA_FILE, ) - logger.info("Downloading data for Maine") + logger.debug("Downloading data for Maine") maine_raw_df = self._download_and_prep_data( file_url=self.MAINE_FILE_URL, download_file_name=self.get_tmp_path() / "maine.csv", ) - logger.info("Downloading data for Wisconsin") + logger.debug("Downloading data for Wisconsin") wisconsin_raw_df = self._download_and_prep_data( file_url=self.WISCONSIN_FILE_URL, download_file_name=self.get_tmp_path() / "wisconsin.csv", @@ -138,7 +137,6 @@ class CDCLifeExpectancy(ExtractTransformLoad): self.raw_df = combined_df def transform(self) -> None: - logger.info("Starting CDC life expectancy transform.") self.output_df = self.raw_df.rename( columns={ @@ -148,7 +146,6 @@ class CDCLifeExpectancy(ExtractTransformLoad): ) def load(self) -> None: - logger.info("Saving CDC Life Expectancy CSV") self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True) self.output_df[self.COLUMNS_TO_KEEP].to_csv( diff --git a/data/data-pipeline/data_pipeline/etl/sources/cdc_places/etl.py b/data/data-pipeline/data_pipeline/etl/sources/cdc_places/etl.py index fc5589ce..d940cec9 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/cdc_places/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/cdc_places/etl.py @@ -44,7 +44,6 @@ class CDCPlacesETL(ExtractTransformLoad): self.df: pd.DataFrame def extract(self) -> None: - logger.info("Starting to download 520MB CDC Places file.") file_path = download_file_from_url( file_url=self.CDC_PLACES_URL, download_file_name=self.get_tmp_path() / "census_tract.csv", @@ -57,8 +56,6 @@ class CDCPlacesETL(ExtractTransformLoad): ) def transform(self) -> None: - logger.info("Starting CDC Places transform") - # Rename GEOID field 
self.df.rename( columns={self.CDC_GEOID_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME}, diff --git a/data/data-pipeline/data_pipeline/etl/sources/cdc_svi_index/etl.py b/data/data-pipeline/data_pipeline/etl/sources/cdc_svi_index/etl.py index c4f9853e..7f725e91 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/cdc_svi_index/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/cdc_svi_index/etl.py @@ -48,7 +48,6 @@ class CDCSVIIndex(ExtractTransformLoad): self.df: pd.DataFrame def extract(self) -> None: - logger.info("Downloading 43 MB CDC SVI INDEX") self.df = pd.read_csv( filepath_or_buffer=self.CDC_SVI_INDEX_URL, dtype={self.CDC_SVI_INDEX_TRACTS_FIPS_CODE: "string"}, @@ -56,7 +55,6 @@ class CDCSVIIndex(ExtractTransformLoad): ) def transform(self) -> None: - logger.info("Starting CDC SVI INDEX transform") # Note: In this dataset all US census tracts are ranked against one another. # Puerto Rico is not included in this dataset self.df.rename( @@ -109,8 +107,6 @@ class CDCSVIIndex(ExtractTransformLoad): ) def load(self) -> None: - logger.info("Saving CDC SVI Index Data") - self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True) self.df[self.COLUMNS_TO_KEEP].to_csv( diff --git a/data/data-pipeline/data_pipeline/etl/sources/census/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census/etl.py index 103453d1..407b83fc 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/census/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/census/etl.py @@ -70,14 +70,9 @@ class CensusETL(ExtractTransformLoad): None """ shp_file_path = self._path_for_fips_file(fips_code, GeoFileType.SHP) - logger.info(f"Checking if {fips_code} shp file exists") # check if file exists if not shp_file_path.is_file(): - logger.info( - f"{fips_code} shp file does not exist. Downloading and extracting shape file" - ) - tract_state_url = f"https://www2.census.gov/geo/tiger/TIGER2010/TRACT/2010/tl_2010_{fips_code}_tract10.zip" unzip_file_from_url( tract_state_url, @@ -86,8 +81,11 @@ class CensusETL(ExtractTransformLoad): ) def extract(self) -> None: - logger.info("Downloading Census Data") - for fips_code in self.STATE_FIPS_CODES: + logger.debug("Extracting census data") + for index, fips_code in enumerate(self.STATE_FIPS_CODES): + logger.debug( + f"Extracting shape for FIPS {fips_code} – {index+1} of {len(self.STATE_FIPS_CODES)}" + ) self._extract_shp(fips_code) def _transform_to_geojson(self, fips_code: str) -> None: @@ -100,11 +98,8 @@ class CensusETL(ExtractTransformLoad): geojson_file_path = self._path_for_fips_file( fips_code, GeoFileType.GEOJSON ) - logger.info(f"Checking if {fips_code} geoJSON file exists ") + if not geojson_file_path.is_file(): - logger.info( - f"GeoJSON file {fips_code} does not exist. 
Converting shp to geoJSON" - ) cmd = [ "ogr2ogr", "-f", @@ -120,9 +115,11 @@ class CensusETL(ExtractTransformLoad): Returns: None """ + logger.debug("Transforming tracts") + for file in self.GEOJSON_BASE_PATH.iterdir(): if file.suffix == ".json": - logger.info(f"Ingesting geoid10 for file {file}") + logger.debug(f"Adding GEOID10 for file {file.name}") with open(self.GEOJSON_BASE_PATH / file, encoding="utf-8") as f: geojson = json.load(f) for feature in geojson["features"]: @@ -142,13 +139,19 @@ class CensusETL(ExtractTransformLoad): Returns: None """ - logger.info("Transforming Census Data") - for fips_code in self.STATE_FIPS_CODES: + logger.debug("Transforming census data") + + logger.debug("Transforming SHP files to GeoJSON") + for index, fips_code in enumerate(self.STATE_FIPS_CODES): + logger.debug( + f"Transforming FIPS {fips_code} to GeoJSON – {index+1} of {len(self.STATE_FIPS_CODES)}" + ) self._transform_to_geojson(fips_code) + self._generate_tract_table() def _load_into_state_csvs(self, fips_code: str) -> None: - """Load state CSVS into individual CSV files + """Load state CSVs into individual CSV files Args: fips_code (str): the FIPS code for the region of interest @@ -182,10 +185,9 @@ class CensusETL(ExtractTransformLoad): Returns: None """ - logger.info("Writing national us.csv file") + logger.debug("Loading national US.csv") if not self.NATIONAL_TRACT_CSV_PATH.is_file(): - logger.info(f"Creating {self.NATIONAL_TRACT_CSV_PATH}") with open( self.NATIONAL_TRACT_CSV_PATH, mode="w", @@ -211,22 +213,21 @@ class CensusETL(ExtractTransformLoad): Returns: None """ - logger.info("Generating national geojson file") + logger.debug("Loading National GeoJson") usa_df = gpd.GeoDataFrame() for file_name in self.GEOJSON_BASE_PATH.rglob("*.json"): - logger.info(f"Ingesting {file_name}") + logger.debug(f"Adding national GeoJSON file {file_name.name}") state_gdf = gpd.read_file(file_name) usa_df = usa_df.append(state_gdf) usa_df = usa_df.to_crs( "+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs" ) - logger.info("Writing national geojson file") - usa_df.to_file(self.NATIONAL_TRACT_JSON_PATH, driver="GeoJSON") - logger.info("Census tract downloading complete") + logger.debug("Saving national GeoJSON file") + usa_df.to_file(self.NATIONAL_TRACT_JSON_PATH, driver="GeoJSON") def load(self) -> None: """Create state CSVs, National CSV, and National GeoJSON @@ -234,8 +235,13 @@ class CensusETL(ExtractTransformLoad): Returns: None """ - logger.info("Saving Census CSV") + logger.debug("Loading census data") + + logger.debug("Loading individual state csv files") for fips_code in self.TRACT_PER_STATE: self._load_into_state_csvs(fips_code) + self._load_national_csv() self._load_national_geojson() + + logger.debug("Census data complete") diff --git a/data/data-pipeline/data_pipeline/etl/sources/census/etl_utils.py b/data/data-pipeline/data_pipeline/etl/sources/census/etl_utils.py index d172fafb..67a9b32e 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/census/etl_utils.py +++ b/data/data-pipeline/data_pipeline/etl/sources/census/etl_utils.py @@ -39,7 +39,6 @@ def get_state_fips_codes(data_path: Path) -> list: """Returns a list with state data""" fips_csv_path = data_path / "census" / "csv" / "fips_states_2010.csv" - logger.info("Downloading fips from S3 repository") unzip_file_from_url( settings.AWS_JUSTICE40_DATASOURCES_URL + "/fips_states_2010.zip", data_path / "tmp", @@ -97,7 +96,6 @@ def check_census_data_source( # download from s3 if census_data_source is aws if census_data_source == "aws": - 
logger.info("Fetching Census data from AWS S3") unzip_file_from_url( CENSUS_DATA_S3_URL, DATA_PATH / "tmp", @@ -106,14 +104,13 @@ def check_census_data_source( else: # check if census data is found locally if not os.path.isfile(census_data_path / "geojson" / "us.json"): - logger.info( + logger.error( "No local census data found. Please use '-s aws` to fetch from AWS" ) sys.exit() def zip_census_data(): - logger.info("Compressing census files to data/tmp folder") CENSUS_DATA_PATH = settings.APP_ROOT / "data" / "census" TMP_PATH = settings.APP_ROOT / "data" / "tmp" diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py index b3613976..c2965493 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py @@ -363,18 +363,16 @@ class CensusACSETL(ExtractTransformLoad): ) def transform(self) -> None: - logger.info("Starting Census ACS Transform") - df = self.df # Here we join the geometry of the US to the dataframe so that we can impute # The income of neighbors. first this looks locally; if there's no local # geojson file for all of the US, this will read it off of S3 - logger.info("Reading in geojson for the country") + logger.debug("Reading in geojson for the country") if not os.path.exists( self.DATA_PATH / "census" / "geojson" / "us.json" ): - logger.info("Fetching Census data from AWS S3") + logger.debug("Fetching Census data from AWS S3") unzip_file_from_url( CENSUS_DATA_S3_URL, self.DATA_PATH / "tmp", @@ -406,7 +404,7 @@ class CensusACSETL(ExtractTransformLoad): self.MEDIAN_HOUSE_VALUE_FIELD_NAME, ]: missing_value_count = sum(df[field] == -666666666) - logger.info( + logger.debug( f"There are {missing_value_count} ({int(100*missing_value_count/df[field].count())}%) values of " + f"`{field}` being marked as null values." ) @@ -591,7 +589,7 @@ class CensusACSETL(ExtractTransformLoad): # we impute income for both income measures ## TODO: Convert to pydantic for clarity - logger.info("Imputing income information") + logger.debug("Imputing income information") ImputeVariables = namedtuple( "ImputeVariables", ["raw_field_name", "imputed_field_name"] ) @@ -612,7 +610,7 @@ class CensusACSETL(ExtractTransformLoad): minimum_population_required_for_imputation=self.MINIMUM_POPULATION_REQUIRED_FOR_IMPUTATION, ) - logger.info("Calculating with imputed values") + logger.debug("Calculating with imputed values") df[ self.ADJUSTED_AND_IMPUTED_POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME @@ -644,7 +642,7 @@ class CensusACSETL(ExtractTransformLoad): == 0 ), "Error: not all values were filled..." 
- logger.info("Renaming columns...") + logger.debug("Renaming columns...") df = df.rename( columns={ self.ADJUSTED_AND_IMPUTED_POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME: field_names.POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD, diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl_imputations.py b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl_imputations.py index e3d52988..ad0514ff 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl_imputations.py +++ b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl_imputations.py @@ -88,7 +88,7 @@ def _prepare_dataframe_for_imputation( ][geoid_field].unique() # Check that imputation is a valid choice for this set of fields - logger.info(f"Imputing values for {len(tract_list)} unique tracts.") + logger.debug(f"Imputing values for {len(tract_list)} unique tracts.") assert len(tract_list) > 0, "Error: No missing values to impute" return tract_list, geo_df @@ -156,7 +156,7 @@ def calculate_income_measures( mask_to_use ][impute_var_pair.raw_field_name].mean() - logger.info("Casting geodataframe as a typical dataframe") + logger.debug("Casting geodataframe as a typical dataframe") # get rid of the geometry column and cast as a typical df df = pd.DataFrame( geo_df[[col for col in geo_df.columns if col != "geometry"]] diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl_utils.py b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl_utils.py index 633576c7..76743f48 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl_utils.py +++ b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl_utils.py @@ -30,14 +30,14 @@ def retrieve_census_acs_data( dfs = [] for fips in get_state_fips_codes(data_path_for_fips_codes): if fips in CENSUS_ACS_FIPS_CODES_TO_SKIP: - logger.info( + logger.debug( f"Skipping download for state/territory with FIPS code {fips}" ) else: census_api_key = "" if os.environ.get("CENSUS_API_KEY"): census_api_key = "with API key" - logger.info( + logger.debug( f"Downloading data for state/territory with FIPS code {fips} {census_api_key}" ) @@ -55,7 +55,7 @@ def retrieve_census_acs_data( except ValueError as e: logger.error( - f"Could not download data for state/territory with FIPS code {fips}" + f"Could not download data for state/territory with FIPS code {fips} because {e}" ) raise e diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_acs_2010/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census_acs_2010/etl.py index 49648eae..a6dc5869 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/census_acs_2010/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/census_acs_2010/etl.py @@ -100,7 +100,6 @@ class CensusACS2010ETL(ExtractTransformLoad): self.df: pd.DataFrame def extract(self) -> None: - logger.info("Starting Census 2010 ACS Transform") # Define the variables to retrieve variables = ( self.UNEMPLOYED_FIELDS @@ -118,8 +117,6 @@ class CensusACS2010ETL(ExtractTransformLoad): ) def transform(self) -> None: - logger.info("Starting Census 2010 ACS Transform") - df = self.df # Calculate percent unemployment. 
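One optional alternative for the `retrieve_census_acs_data` error handler just above: rather than interpolating the exception text into the message, `logger.exception` (or `exc_info=True`) records the full traceback automatically. This is only a sketch; `fetch_state` is a hypothetical stand-in for the real download call in etl_utils, not code from this patch.

```python
import logging

logger = logging.getLogger(__name__)


def fetch_state(fips: str) -> None:
    """Hypothetical stand-in for the real per-state download call."""
    raise ValueError("simulated download failure")


def download_with_traceback(fips: str) -> None:
    try:
        fetch_state(fips)
    except ValueError:
        # logger.exception logs at ERROR level and appends the traceback.
        logger.exception(
            "Could not download data for state/territory with FIPS code %s",
            fips,
        )
        raise
```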
@@ -184,8 +181,6 @@ class CensusACS2010ETL(ExtractTransformLoad): self.df = output_df def load(self) -> None: - logger.info("Saving Census 2010 ACS Data") - # mkdir census self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True) diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_acs_median_income/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census_acs_median_income/etl.py index e0322e4f..f8abc7c4 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/census_acs_median_income/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/census_acs_median_income/etl.py @@ -224,7 +224,6 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad): return state_median_incomes_df def extract(self) -> None: - logger.info("Starting four separate downloads.") # Load and clean GEOCORR data # Note: this data is generated by https://mcdc.missouri.edu/applications/geocorr2014.html, at the advice of the Census. # The specific query used is the following, which takes a couple of minutes to run: @@ -239,7 +238,7 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad): # and with the "target geographies" selected being: # - Core based statistical area (CBSA) # - CBSA Type (Metro or Micro) - logger.info("Starting download of 1.5MB Geocorr information.") + logger.debug("Starting download of 1.5MB Geocorr information.") unzip_file_from_url( file_url=settings.AWS_JUSTICE40_DATASOURCES_URL @@ -265,7 +264,7 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad): low_memory=False, ) - logger.info("Pulling PR tract list down.") + logger.debug("Pulling PR tract list down.") # This step is necessary because PR is not in geocorr at the level that gets joined pr_file = self.get_tmp_path() / "pr_tracts" / "pr_tracts.csv" download_file_from_url( @@ -282,7 +281,7 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad): self.pr_tracts["State Abbreviation"] = "PR" # Download MSA median incomes - logger.info("Starting download of MSA median incomes.") + logger.debug("Starting download of MSA median incomes.") download = requests.get( self.MSA_MEDIAN_INCOME_URL, verify=None, @@ -291,7 +290,7 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad): self.msa_median_incomes = json.loads(download.content) # Download state median incomes - logger.info("Starting download of state median incomes.") + logger.debug("Starting download of state median incomes.") download_state = requests.get( self.STATE_MEDIAN_INCOME_URL, verify=None, @@ -301,8 +300,6 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad): ## NOTE we already have PR's MI here def transform(self) -> None: - logger.info("Starting transforms.") - # Run transforms: geocorr_df = self._transform_geocorr() msa_median_incomes_df = self._transform_msa_median_incomes() @@ -352,8 +349,6 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad): self.output_df = merged_with_state_income_df def load(self) -> None: - logger.info("Saving Census ACS Median Income CSV") - self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True) self.output_df[self.COLUMNS_TO_KEEP].to_csv( path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py index e230fa33..395697fc 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py @@ -352,7 +352,7 @@ class CensusDecennialETL(ExtractTransformLoad): dfs = [] dfs_vi = [] for island in 
self.ISLAND_TERRITORIES: - logger.info( + logger.debug( f"Downloading data for state/territory {island['state_abbreviation']}" ) for county in island["county_fips"]: @@ -369,7 +369,13 @@ class CensusDecennialETL(ExtractTransformLoad): timeout=settings.REQUESTS_DEFAULT_TIMOUT, ) - df = json.loads(download.content) + try: + df = json.loads(download.content) + except ValueError as e: + logger.error( + f"Could not load content in census decennial ETL because {e}. Content is {download.content}." + ) + # First row is the header df = pd.DataFrame(df[1:], columns=df[0]) @@ -393,8 +399,6 @@ class CensusDecennialETL(ExtractTransformLoad): self.df_vi = pd.concat(dfs_vi) def transform(self) -> None: - logger.info("Starting Census Decennial Transform") - # Rename All Fields self.df.rename(columns=self.FIELD_NAME_XWALK, inplace=True) self.df_vi.rename(columns=self.FIELD_NAME_XWALK, inplace=True) @@ -489,13 +493,11 @@ class CensusDecennialETL(ExtractTransformLoad): # Reporting Missing Values for col in self.df_all.columns: missing_value_count = self.df_all[col].isnull().sum() - logger.info( + logger.debug( f"There are {missing_value_count} missing values in the field {col} out of a total of {self.df_all.shape[0]} rows" ) def load(self) -> None: - logger.info("Saving Census Decennial Data") - # mkdir census self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True) diff --git a/data/data-pipeline/data_pipeline/etl/sources/child_opportunity_index/etl.py b/data/data-pipeline/data_pipeline/etl/sources/child_opportunity_index/etl.py index c3ecb5fb..5f9a10b8 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/child_opportunity_index/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/child_opportunity_index/etl.py @@ -65,14 +65,12 @@ class ChildOpportunityIndex(ExtractTransformLoad): self.output_df: pd.DataFrame def extract(self) -> None: - logger.info("Starting 51MB data download.") super().extract( source_url=self.SOURCE_URL, extract_path=self.get_tmp_path(), ) def transform(self) -> None: - logger.info("Starting transforms.") raw_df = pd.read_csv( filepath_or_buffer=self.get_tmp_path() / "raw.csv", # The following need to remain as strings for all of their digits, not get diff --git a/data/data-pipeline/data_pipeline/etl/sources/doe_energy_burden/etl.py b/data/data-pipeline/data_pipeline/etl/sources/doe_energy_burden/etl.py index 3ce2e5a6..0056be9a 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/doe_energy_burden/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/doe_energy_burden/etl.py @@ -30,7 +30,6 @@ class DOEEnergyBurden(ExtractTransformLoad): self.output_df: pd.DataFrame def transform(self) -> None: - logger.info("Starting DOE Energy Burden transforms.") raw_df: pd.DataFrame = pd.read_csv( filepath_or_buffer=self.get_tmp_path() / "DOE_LEAD_AMI_TRACT_2018_ALL.csv", @@ -41,7 +40,7 @@ class DOEEnergyBurden(ExtractTransformLoad): low_memory=False, ) - logger.info("Renaming columns and ensuring output format is correct") + logger.debug("Renaming columns and ensuring output format is correct") output_df = raw_df.rename( columns={ self.INPUT_ENERGY_BURDEN_FIELD_NAME: self.REVISED_ENERGY_BURDEN_FIELD_NAME, diff --git a/data/data-pipeline/data_pipeline/etl/sources/dot_travel_composite/etl.py b/data/data-pipeline/data_pipeline/etl/sources/dot_travel_composite/etl.py index f68c2d78..3329ec6a 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/dot_travel_composite/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/dot_travel_composite/etl.py @@ -53,7 +53,6 @@ class 
TravelCompositeETL(ExtractTransformLoad): - Renames the Census Tract column to match the other datasets - Converts to CSV """ - logger.info("Transforming DOT Travel Disadvantage Data") # read in the unzipped shapefile from data source # reformat it to be standard df, remove unassigned rows, and diff --git a/data/data-pipeline/data_pipeline/etl/sources/eamlis/etl.py b/data/data-pipeline/data_pipeline/etl/sources/eamlis/etl.py index 100c3d6a..3162c637 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/eamlis/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/eamlis/etl.py @@ -60,7 +60,6 @@ class AbandonedMineETL(ExtractTransformLoad): self.output_df: pd.DataFrame def transform(self) -> None: - logger.info("Starting eAMLIS transforms.") df = pd.read_csv( self.get_tmp_path() / "eAMLIS export of all data.tsv", sep="\t", diff --git a/data/data-pipeline/data_pipeline/etl/sources/ejscreen/etl.py b/data/data-pipeline/data_pipeline/etl/sources/ejscreen/etl.py index cba408cc..0db8e648 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/ejscreen/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/ejscreen/etl.py @@ -44,7 +44,6 @@ class EJSCREENETL(ExtractTransformLoad): ] def extract(self) -> None: - logger.info("Downloading EJScreen Data") super().extract( self.EJSCREEN_FTP_URL, self.get_tmp_path(), @@ -52,7 +51,6 @@ class EJSCREENETL(ExtractTransformLoad): ) def transform(self) -> None: - logger.info("Transforming EJScreen Data") self.df = pd.read_csv( self.EJSCREEN_CSV, dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str}, diff --git a/data/data-pipeline/data_pipeline/etl/sources/ejscreen_areas_of_concern/etl.py b/data/data-pipeline/data_pipeline/etl/sources/ejscreen_areas_of_concern/etl.py index 366e3738..8c18034d 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/ejscreen_areas_of_concern/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/ejscreen_areas_of_concern/etl.py @@ -39,7 +39,7 @@ class EJSCREENAreasOfConcernETL(ExtractTransformLoad): def extract(self) -> None: if self.ejscreen_areas_of_concern_data_exists(): - logger.info("Loading EJSCREEN Areas of Concern Data Locally") + logger.debug("Loading EJSCREEN Areas of Concern Data Locally") self.df = pd.read_csv( filepath_or_buffer=self.EJSCREEN_AREAS_OF_CONCERN_SOURCE_DATA, dtype={ @@ -48,24 +48,24 @@ class EJSCREENAreasOfConcernETL(ExtractTransformLoad): low_memory=False, ) else: - logger.info( + logger.warning( "EJSCREEN areas of concern data does not exist locally. Not loading the data." ) def transform(self) -> None: - logger.info("Transforming EJSCREEN Areas of Concern Data") + logger.debug("Transforming EJSCREEN Areas of Concern Data") # TO DO: As a one off we did all the processing in a separate Notebook # Can add here later for a future PR def load(self) -> None: if self.ejscreen_areas_of_concern_data_exists(): - logger.info("Saving EJSCREEN Areas of Concern Data") + logger.debug("Saving EJSCREEN Areas of Concern Data") # write nationwide csv self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True) self.df.to_csv(self.OUTPUT_PATH / "usa.csv", index=False) else: - logger.info( + logger.warning( "EJSCREEN areas of concern data does not exist locally. Not saving the data." 
) diff --git a/data/data-pipeline/data_pipeline/etl/sources/energy_definition_alternative_draft/etl.py b/data/data-pipeline/data_pipeline/etl/sources/energy_definition_alternative_draft/etl.py index d83287fa..3f27898e 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/energy_definition_alternative_draft/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/energy_definition_alternative_draft/etl.py @@ -49,8 +49,6 @@ class EnergyDefinitionAlternativeDraft(ExtractTransformLoad): self.df: pd.DataFrame def extract(self) -> None: - logger.info("Starting data download.") - unzip_file_from_url( file_url=self.DEFINITION_ALTERNATIVE_FILE_URL, download_path=self.get_tmp_path(), @@ -70,8 +68,6 @@ class EnergyDefinitionAlternativeDraft(ExtractTransformLoad): ) def transform(self) -> None: - logger.info("Starting transforms.") - self.df = self.df.rename( columns={ self.TRACT_INPUT_COLUMN_NAME: self.GEOID_TRACT_FIELD_NAME, @@ -105,8 +101,6 @@ class EnergyDefinitionAlternativeDraft(ExtractTransformLoad): ) def load(self) -> None: - logger.info("Saving CSV") - self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True) self.df[self.COLUMNS_TO_KEEP].to_csv( path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False diff --git a/data/data-pipeline/data_pipeline/etl/sources/epa_rsei/etl.py b/data/data-pipeline/data_pipeline/etl/sources/epa_rsei/etl.py index 1788b835..56f8bcc4 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/epa_rsei/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/epa_rsei/etl.py @@ -65,8 +65,6 @@ class EPARiskScreeningEnvironmentalIndicatorsETL(ExtractTransformLoad): self.df: pd.DataFrame def extract(self) -> None: - logger.info("Starting 2.5 MB data download.") - # the column headers from the above dataset are actually a census tract's data at this point # We will use this data structure later to specify the column names input_columns = [ @@ -98,8 +96,6 @@ class EPARiskScreeningEnvironmentalIndicatorsETL(ExtractTransformLoad): ) def transform(self) -> None: - logger.info("Starting transforms.") - score_columns = [x for x in self.df.columns if "SCORE" in x] # coerce dataframe type to perform correct next steps @@ -157,8 +153,6 @@ class EPARiskScreeningEnvironmentalIndicatorsETL(ExtractTransformLoad): ) def load(self) -> None: - logger.info("Saving CSV") - self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True) self.df[self.COLUMNS_TO_KEEP].to_csv( path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False diff --git a/data/data-pipeline/data_pipeline/etl/sources/fsf_flood_risk/etl.py b/data/data-pipeline/data_pipeline/etl/sources/fsf_flood_risk/etl.py index 1e829f3d..60534daa 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/fsf_flood_risk/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/fsf_flood_risk/etl.py @@ -48,7 +48,6 @@ class FloodRiskETL(ExtractTransformLoad): - Renames the Census Tract column to match the other datasets - Calculates share of properties at risk, left-clipping number of properties at 250 """ - logger.info("Transforming National Risk Index Data") # read in the unzipped csv data source then rename the # Census Tract column for merging diff --git a/data/data-pipeline/data_pipeline/etl/sources/fsf_wildfire_risk/etl.py b/data/data-pipeline/data_pipeline/etl/sources/fsf_wildfire_risk/etl.py index 38a77f00..2680eaf3 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/fsf_wildfire_risk/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/fsf_wildfire_risk/etl.py @@ -48,7 +48,6 @@ class WildfireRiskETL(ExtractTransformLoad): - Renames the 
Census Tract column to match the other datasets - Calculates share of properties at risk, left-clipping number of properties at 250 """ - logger.info("Transforming National Risk Index Data") # read in the unzipped csv data source then rename the # Census Tract column for merging df_fsf_fire: pd.DataFrame = pd.read_csv( diff --git a/data/data-pipeline/data_pipeline/etl/sources/geo_utils.py b/data/data-pipeline/data_pipeline/etl/sources/geo_utils.py index ec0f1b51..7093f045 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/geo_utils.py +++ b/data/data-pipeline/data_pipeline/etl/sources/geo_utils.py @@ -16,7 +16,7 @@ logger = get_module_logger(__name__) def get_tract_geojson( _tract_data_path: Optional[Path] = None, ) -> gpd.GeoDataFrame: - logger.info("Loading tract geometry data from census ETL") + logger.debug("Loading tract geometry data from census ETL") GEOJSON_PATH = _tract_data_path if GEOJSON_PATH is None: GEOJSON_PATH = CensusETL.NATIONAL_TRACT_JSON_PATH @@ -40,7 +40,7 @@ def get_tract_geojson( def get_tribal_geojson( _tribal_data_path: Optional[Path] = None, ) -> gpd.GeoDataFrame: - logger.info("Loading Tribal geometry data from Tribal ETL") + logger.debug("Loading Tribal geometry data from Tribal ETL") GEOJSON_PATH = _tribal_data_path if GEOJSON_PATH is None: GEOJSON_PATH = TribalETL().NATIONAL_TRIBAL_GEOJSON_PATH diff --git a/data/data-pipeline/data_pipeline/etl/sources/geocorr/etl.py b/data/data-pipeline/data_pipeline/etl/sources/geocorr/etl.py index a75845be..223f0b09 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/geocorr/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/geocorr/etl.py @@ -34,9 +34,6 @@ class GeoCorrETL(ExtractTransformLoad): self.df: pd.DataFrame def extract(self) -> None: - logger.info( - "Starting to download 2MB GeoCorr Urban Rural Census Tract Map file." 
- ) unzip_file_from_url( file_url=settings.AWS_JUSTICE40_DATASOURCES_URL + "/geocorr_urban_rural.csv.zip", @@ -53,7 +50,6 @@ class GeoCorrETL(ExtractTransformLoad): ) def transform(self) -> None: - logger.info("Starting GeoCorr Urban Rural Map transform") # Put in logic from Jupyter Notebook transform when we switch in the hyperlink to Geocorr self.output_df = self.df.rename( diff --git a/data/data-pipeline/data_pipeline/etl/sources/historic_redlining/etl.py b/data/data-pipeline/data_pipeline/etl/sources/historic_redlining/etl.py index 4a8de315..a65ed126 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/historic_redlining/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/historic_redlining/etl.py @@ -43,7 +43,6 @@ class HistoricRedliningETL(ExtractTransformLoad): self.df: pd.DataFrame def transform(self) -> None: - logger.info("Transforming Historic Redlining Data") # this is obviously temporary historic_redlining_data = pd.read_excel( self.HISTORIC_REDLINING_FILE_PATH @@ -55,7 +54,7 @@ class HistoricRedliningETL(ExtractTransformLoad): columns={"HRS2010": self.REDLINING_SCALAR} ) - logger.info(f"{historic_redlining_data.columns}") + logger.debug(f"{historic_redlining_data.columns}") # Calculate lots of different score thresholds for convenience for threshold in [3.25, 3.5, 3.75]: diff --git a/data/data-pipeline/data_pipeline/etl/sources/housing_and_transportation/etl.py b/data/data-pipeline/data_pipeline/etl/sources/housing_and_transportation/etl.py index 681d2cb9..b5e5a875 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/housing_and_transportation/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/housing_and_transportation/etl.py @@ -23,7 +23,7 @@ class HousingTransportationETL(ExtractTransformLoad): dfs = [] zip_file_dir = self.get_tmp_path() / "housing_and_transportation_index" for fips in get_state_fips_codes(self.DATA_PATH): - logger.info( + logger.debug( f"Downloading housing data for state/territory with FIPS code {fips}" ) @@ -50,8 +50,6 @@ class HousingTransportationETL(ExtractTransformLoad): self.df = pd.concat(dfs) def transform(self) -> None: - logger.info("Transforming Housing and Transportation Data") - # Rename and reformat tract ID self.df.rename( columns={"tract": self.GEOID_TRACT_FIELD_NAME}, inplace=True @@ -61,7 +59,5 @@ class HousingTransportationETL(ExtractTransformLoad): ].str.replace('"', "") def load(self) -> None: - logger.info("Saving Housing and Transportation Data") - self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True) self.df.to_csv(path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False) diff --git a/data/data-pipeline/data_pipeline/etl/sources/hud_housing/etl.py b/data/data-pipeline/data_pipeline/etl/sources/hud_housing/etl.py index 0e08d225..4cf0ee7d 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/hud_housing/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/hud_housing/etl.py @@ -56,7 +56,6 @@ class HudHousingETL(ExtractTransformLoad): self.df: pd.DataFrame def extract(self) -> None: - logger.info("Extracting 1.09 GB HUD Housing Data") super().extract( self.HOUSING_FTP_URL, self.HOUSING_ZIP_FILE_DIR, @@ -80,8 +79,6 @@ class HudHousingETL(ExtractTransformLoad): return tmp_df def transform(self) -> None: - logger.info("Transforming HUD Housing Data") - table_8 = self._read_chas_table("Table8.csv") table_3 = self._read_chas_table("Table3.csv") diff --git a/data/data-pipeline/data_pipeline/etl/sources/hud_recap/etl.py b/data/data-pipeline/data_pipeline/etl/sources/hud_recap/etl.py index 447202f3..ddf476b6 100644 
--- a/data/data-pipeline/data_pipeline/etl/sources/hud_recap/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/hud_recap/etl.py @@ -36,7 +36,6 @@ class HudRecapETL(ExtractTransformLoad): self.df: pd.DataFrame def extract(self) -> None: - logger.info("Downloading HUD Recap Data") download = requests.get( self.HUD_RECAP_CSV_URL, verify=None, @@ -48,8 +47,6 @@ class HudRecapETL(ExtractTransformLoad): csv_file.close() def transform(self) -> None: - logger.info("Transforming HUD Recap Data") - # Load comparison index (CalEnviroScreen 4) self.df = pd.read_csv(self.HUD_RECAP_CSV, dtype={"GEOID": "string"}) @@ -75,7 +72,6 @@ class HudRecapETL(ExtractTransformLoad): self.df.sort_values(by=self.GEOID_TRACT_FIELD_NAME, inplace=True) def load(self) -> None: - logger.info("Saving HUD Recap CSV") # write nationwide csv self.CSV_PATH.mkdir(parents=True, exist_ok=True) self.df.to_csv(self.CSV_PATH / "usa.csv", index=False) diff --git a/data/data-pipeline/data_pipeline/etl/sources/mapping_for_ej/etl.py b/data/data-pipeline/data_pipeline/etl/sources/mapping_for_ej/etl.py index 4299dc04..7b4879f3 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/mapping_for_ej/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/mapping_for_ej/etl.py @@ -39,7 +39,6 @@ class MappingForEJETL(ExtractTransformLoad): self.df: pd.DataFrame def extract(self) -> None: - logger.info("Downloading Mapping for EJ Data") super().extract( self.MAPPING_FOR_EJ_VA_URL, self.get_tmp_path(), @@ -50,8 +49,6 @@ class MappingForEJETL(ExtractTransformLoad): ) def transform(self) -> None: - logger.info("Transforming Mapping for EJ Data") - # Join (here, it's just concatenating) the two dataframes from # CO and VA self.df = pd.concat( @@ -86,7 +83,6 @@ class MappingForEJETL(ExtractTransformLoad): ) def load(self) -> None: - logger.info("Saving Mapping for EJ CSV") # write selected states csv self.CSV_PATH.mkdir(parents=True, exist_ok=True) self.df[self.COLUMNS_TO_KEEP].to_csv( @@ -94,4 +90,4 @@ class MappingForEJETL(ExtractTransformLoad): ) def validate(self) -> None: - logger.info("Validating Mapping For EJ Data") + logger.debug("Skipping validation for MappingForEJETL") diff --git a/data/data-pipeline/data_pipeline/etl/sources/mapping_inequality/etl.py b/data/data-pipeline/data_pipeline/etl/sources/mapping_inequality/etl.py index d9caac8d..05ff0593 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/mapping_inequality/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/mapping_inequality/etl.py @@ -75,14 +75,12 @@ class MappingInequalityETL(ExtractTransformLoad): self.df: pd.DataFrame def extract(self) -> None: - logger.info("Downloading Mapping Inequality Data") download_file_from_url( file_url=self.MAPPING_INEQUALITY_CSV_URL, download_file_name=self.MAPPING_INEQUALITY_CSV, ) def transform(self) -> None: - logger.info("Transforming Mapping Inequality Data") df: pd.DataFrame = pd.read_csv( self.MAPPING_INEQUALITY_CSV, dtype={self.TRACT_INPUT_FIELD: "string"}, @@ -207,7 +205,6 @@ class MappingInequalityETL(ExtractTransformLoad): self.df = grouped_df def load(self) -> None: - logger.info("Saving Mapping Inequality CSV") # write nationwide csv self.CSV_PATH.mkdir(parents=True, exist_ok=True) self.df[self.COLUMNS_TO_KEEP].to_csv( diff --git a/data/data-pipeline/data_pipeline/etl/sources/maryland_ejscreen/etl.py b/data/data-pipeline/data_pipeline/etl/sources/maryland_ejscreen/etl.py index fb1679ce..8f714c81 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/maryland_ejscreen/etl.py +++ 
b/data/data-pipeline/data_pipeline/etl/sources/maryland_ejscreen/etl.py @@ -33,15 +33,13 @@ class MarylandEJScreenETL(ExtractTransformLoad): self.df: pd.DataFrame def extract(self) -> None: - logger.info("Downloading 207MB Maryland EJSCREEN Data") + logger.debug("Downloading 207MB Maryland EJSCREEN Data") super().extract( self.MARYLAND_EJSCREEN_URL, self.get_tmp_path(), ) def transform(self) -> None: - logger.info("Transforming Maryland EJSCREEN Data") - list_of_files = list(glob(str(self.SHAPE_FILES_PATH) + "/*.shp")) # Ignore counties becauses this is not the level of measurement @@ -105,7 +103,6 @@ class MarylandEJScreenETL(ExtractTransformLoad): ) def load(self) -> None: - logger.info("Saving Maryland EJSCREEN CSV") # write maryland tracts to csv self.OUTPUT_CSV_PATH.mkdir(parents=True, exist_ok=True) self.df[self.COLUMNS_TO_KEEP].to_csv( diff --git a/data/data-pipeline/data_pipeline/etl/sources/michigan_ejscreen/etl.py b/data/data-pipeline/data_pipeline/etl/sources/michigan_ejscreen/etl.py index cfbad0d3..efde123c 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/michigan_ejscreen/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/michigan_ejscreen/etl.py @@ -33,7 +33,6 @@ class MichiganEnviroScreenETL(ExtractTransformLoad): self.df: pd.DataFrame def extract(self) -> None: - logger.info("Downloading Michigan EJSCREEN Data") self.df = pd.read_csv( filepath_or_buffer=self.MICHIGAN_EJSCREEN_S3_URL, dtype={"GEO_ID": "string"}, @@ -41,8 +40,6 @@ class MichiganEnviroScreenETL(ExtractTransformLoad): ) def transform(self) -> None: - logger.info("Transforming Michigan EJSCREEN Data") - self.df.rename( columns={ "GEO_ID": self.GEOID_TRACT_FIELD_NAME, @@ -60,7 +57,6 @@ class MichiganEnviroScreenETL(ExtractTransformLoad): ) def load(self) -> None: - logger.info("Saving Michigan Environmental Screening Tool to CSV") # write nationwide csv self.CSV_PATH.mkdir(parents=True, exist_ok=True) self.df[self.COLUMNS_TO_KEEP].to_csv( diff --git a/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py b/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py index ae31dab1..bced98f5 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py @@ -69,7 +69,6 @@ class NationalRiskIndexETL(ExtractTransformLoad): """Unzips NRI dataset from the FEMA data source and writes the files to the temporary data folder for use in the transform() method """ - logger.info("Downloading 405MB National Risk Index Data") super().extract( source_url=self.SOURCE_URL, @@ -84,7 +83,6 @@ class NationalRiskIndexETL(ExtractTransformLoad): - Applies the NRI score for each Census Tract to the Census Block Groups inside of that Tract """ - logger.info("Transforming National Risk Index Data") # read in the unzipped csv from NRI data source then rename the # Census Tract column for merging diff --git a/data/data-pipeline/data_pipeline/etl/sources/nlcd_nature_deprived/etl.py b/data/data-pipeline/data_pipeline/etl/sources/nlcd_nature_deprived/etl.py index ff1fab77..39b12af0 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/nlcd_nature_deprived/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/nlcd_nature_deprived/etl.py @@ -53,7 +53,6 @@ class NatureDeprivedETL(ExtractTransformLoad): - Renames columns as needed """ - logger.info("Transforming NLCD Data") df_ncld: pd.DataFrame = pd.read_csv( self.INPUT_CSV, diff --git 
a/data/data-pipeline/data_pipeline/etl/sources/persistent_poverty/etl.py b/data/data-pipeline/data_pipeline/etl/sources/persistent_poverty/etl.py index 82daf861..b797c418 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/persistent_poverty/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/persistent_poverty/etl.py @@ -76,8 +76,6 @@ class PersistentPovertyETL(ExtractTransformLoad): return df def extract(self) -> None: - logger.info("Starting to download 86MB persistent poverty file.") - unzipped_file_path = self.get_tmp_path() unzip_file_from_url( @@ -124,7 +122,6 @@ class PersistentPovertyETL(ExtractTransformLoad): self.df = self._join_input_dfs(temporary_input_dfs) def transform(self) -> None: - logger.info("Starting persistent poverty transform") transformed_df = self.df # Note: the fields are defined as following. diff --git a/data/data-pipeline/data_pipeline/etl/sources/tree_equity_score/etl.py b/data/data-pipeline/data_pipeline/etl/sources/tree_equity_score/etl.py index a6b89218..0b99b01a 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/tree_equity_score/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/tree_equity_score/etl.py @@ -77,7 +77,6 @@ class TreeEquityScoreETL(ExtractTransformLoad): ] def extract(self) -> None: - logger.info("Downloading Tree Equity Score Data") for state in self.states: super().extract( f"{self.TES_URL}{state}.zip.zip", @@ -85,7 +84,6 @@ class TreeEquityScoreETL(ExtractTransformLoad): ) def transform(self) -> None: - logger.info("Transforming Tree Equity Score Data") tes_state_dfs = [] for state in self.states: tes_state_dfs.append( @@ -103,7 +101,6 @@ class TreeEquityScoreETL(ExtractTransformLoad): ) def load(self) -> None: - logger.info("Saving Tree Equity Score CSV") # write nationwide csv self.CSV_PATH.mkdir(parents=True, exist_ok=True) self.df = self.df[ diff --git a/data/data-pipeline/data_pipeline/etl/sources/tribal/etl.py b/data/data-pipeline/data_pipeline/etl/sources/tribal/etl.py index 0fdb852c..f8bd9df7 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/tribal/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/tribal/etl.py @@ -28,7 +28,6 @@ class TribalETL(ExtractTransformLoad): Returns: None """ - logger.info("Downloading Tribal Data") bia_shapefile_zip_url = ( settings.AWS_JUSTICE40_DATASOURCES_URL @@ -77,7 +76,7 @@ class TribalETL(ExtractTransformLoad): bia_national_lar_df = gpd.read_file(path) # DELETE - logger.info(f"Columns: {bia_national_lar_df.columns}\n") + logger.debug(f"Columns: {bia_national_lar_df.columns}\n") bia_national_lar_df.drop( ["GISAcres"], @@ -186,8 +185,6 @@ class TribalETL(ExtractTransformLoad): Returns: None """ - logger.info("Transforming Tribal Data") - # Set the filepaths: bia_national_lar_shapefile = ( self.GEOGRAPHIC_BASE_PATH / "bia_national_lar" @@ -220,7 +217,7 @@ class TribalETL(ExtractTransformLoad): Returns: None """ - logger.info("Saving Tribal GeoJson and CSV") + logger.debug("Saving Tribal GeoJson and CSV") usa_tribal_df = gpd.GeoDataFrame( pd.concat(self.USA_TRIBAL_DF_LIST, ignore_index=True) ) @@ -228,7 +225,7 @@ class TribalETL(ExtractTransformLoad): "+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs" ) - logger.info("Writing national geojson file") + logger.debug("Writing national geojson file") usa_tribal_df.to_file( self.NATIONAL_TRIBAL_GEOJSON_PATH, driver="GeoJSON" ) diff --git a/data/data-pipeline/data_pipeline/etl/sources/tribal_overlap/etl.py b/data/data-pipeline/data_pipeline/etl/sources/tribal_overlap/etl.py index 35945228..ba2e2226 100644 --- 
a/data/data-pipeline/data_pipeline/etl/sources/tribal_overlap/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/tribal_overlap/etl.py @@ -94,8 +94,6 @@ class TribalOverlapETL(ExtractTransformLoad): self.tribal_gdf = get_tribal_geojson() def transform(self) -> None: - logger.info("Starting tribal overlap transforms.") - # First, calculate whether tracts include any areas from the Tribal areas, # for both the points in AK and the polygons in the continental US (CONUS). tribal_overlap_with_tracts = add_tracts_for_geometries( diff --git a/data/data-pipeline/data_pipeline/etl/sources/us_army_fuds/etl.py b/data/data-pipeline/data_pipeline/etl/sources/us_army_fuds/etl.py index e915d7d9..7f692603 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/us_army_fuds/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/us_army_fuds/etl.py @@ -56,8 +56,6 @@ class USArmyFUDS(ExtractTransformLoad): self.output_df: pd.DataFrame def extract(self) -> None: - logger.info("Starting FUDS data download.") - download_file_from_url( file_url=self.FILE_URL, download_file_name=self.DOWNLOAD_FILE_NAME, @@ -65,11 +63,10 @@ class USArmyFUDS(ExtractTransformLoad): ) def transform(self) -> None: - logger.info("Starting FUDS transform.") # before we try to do any transformation, get the tract data # so it's loaded and the census ETL is out of scope - logger.info("Loading FUDS data as GeoDataFrame for transform") + logger.debug("Loading FUDS data as GeoDataFrame for transform") raw_df = gpd.read_file( filename=self.DOWNLOAD_FILE_NAME, low_memory=False, diff --git a/data/data-pipeline/data_pipeline/score/score_a.py b/data/data-pipeline/data_pipeline/score/score_a.py index c1a21298..0788f094 100644 --- a/data/data-pipeline/data_pipeline/score/score_a.py +++ b/data/data-pipeline/data_pipeline/score/score_a.py @@ -8,7 +8,7 @@ logger = get_module_logger(__name__) class ScoreA(Score): def add_columns(self) -> pd.DataFrame: - logger.info("Adding Score A") + logger.debug("Adding Score A") self.df[field_names.SCORE_A] = self.df[ [ field_names.POVERTY_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, diff --git a/data/data-pipeline/data_pipeline/score/score_b.py b/data/data-pipeline/data_pipeline/score/score_b.py index 12f7048e..2e980b3d 100644 --- a/data/data-pipeline/data_pipeline/score/score_b.py +++ b/data/data-pipeline/data_pipeline/score/score_b.py @@ -8,7 +8,7 @@ logger = get_module_logger(__name__) class ScoreB(Score): def add_columns(self) -> pd.DataFrame: - logger.info("Adding Score B") + logger.debug("Adding Score B") self.df[field_names.SCORE_B] = ( self.df[ field_names.POVERTY_FIELD + field_names.PERCENTILE_FIELD_SUFFIX diff --git a/data/data-pipeline/data_pipeline/score/score_c.py b/data/data-pipeline/data_pipeline/score/score_c.py index ef10d86d..b91b7417 100644 --- a/data/data-pipeline/data_pipeline/score/score_c.py +++ b/data/data-pipeline/data_pipeline/score/score_c.py @@ -72,7 +72,7 @@ class ScoreC(Score): # "CalEnviroScreen for the US" score def add_columns(self) -> pd.DataFrame: - logger.info("Adding Score C") + logger.debug("Adding Score C") # Average all the percentile values in each bucket into a single score for each of the four buckets. 
for bucket in self.BUCKETS: self.df[bucket.name] = self.df[bucket.fields].mean(axis=1) diff --git a/data/data-pipeline/data_pipeline/score/score_d.py b/data/data-pipeline/data_pipeline/score/score_d.py index 55b430a4..08e55f01 100644 --- a/data/data-pipeline/data_pipeline/score/score_d.py +++ b/data/data-pipeline/data_pipeline/score/score_d.py @@ -8,7 +8,7 @@ logger = get_module_logger(__name__) class ScoreD(Score): def add_columns(self) -> pd.DataFrame: - logger.info("Adding Scores D and E") + logger.debug("Adding Scores D and E") fields_to_use_in_score = [ field_names.UNEMPLOYMENT_FIELD, field_names.LINGUISTIC_ISO_FIELD, diff --git a/data/data-pipeline/data_pipeline/score/score_f.py b/data/data-pipeline/data_pipeline/score/score_f.py index 523f2cf1..be38e514 100644 --- a/data/data-pipeline/data_pipeline/score/score_f.py +++ b/data/data-pipeline/data_pipeline/score/score_f.py @@ -10,7 +10,7 @@ class ScoreF(Score): # TODO Make variables and constants clearer (meaning and type) def add_columns(self) -> pd.DataFrame: - logger.info("Adding Score F") + logger.debug("Adding Score F") ami_and_high_school_field = "Low AMI, Low HS graduation" meets_socio_field = "Meets socioeconomic criteria" meets_burden_field = "Meets burden criteria" diff --git a/data/data-pipeline/data_pipeline/score/score_g.py b/data/data-pipeline/data_pipeline/score/score_g.py index 7cd342de..d4e10726 100644 --- a/data/data-pipeline/data_pipeline/score/score_g.py +++ b/data/data-pipeline/data_pipeline/score/score_g.py @@ -8,7 +8,7 @@ logger = get_module_logger(__name__) class ScoreG(Score): def add_columns(self) -> pd.DataFrame: - logger.info("Adding Score G") + logger.debug("Adding Score G") high_school_cutoff_threshold = 0.05 diff --git a/data/data-pipeline/data_pipeline/score/score_h.py b/data/data-pipeline/data_pipeline/score/score_h.py index efc24ee1..56168c53 100644 --- a/data/data-pipeline/data_pipeline/score/score_h.py +++ b/data/data-pipeline/data_pipeline/score/score_h.py @@ -8,7 +8,7 @@ logger = get_module_logger(__name__) class ScoreH(Score): def add_columns(self) -> pd.DataFrame: - logger.info("Adding Score H") + logger.debug("Adding Score H") high_school_cutoff_threshold = 0.06 diff --git a/data/data-pipeline/data_pipeline/score/score_i.py b/data/data-pipeline/data_pipeline/score/score_i.py index 02528ebc..25f2c098 100644 --- a/data/data-pipeline/data_pipeline/score/score_i.py +++ b/data/data-pipeline/data_pipeline/score/score_i.py @@ -8,7 +8,7 @@ logger = get_module_logger(__name__) class ScoreI(Score): def add_columns(self) -> pd.DataFrame: - logger.info("Adding Score I") + logger.debug("Adding Score I") high_school_cutoff_threshold = 0.05 diff --git a/data/data-pipeline/data_pipeline/score/score_k.py b/data/data-pipeline/data_pipeline/score/score_k.py index bc6b8057..db05344c 100644 --- a/data/data-pipeline/data_pipeline/score/score_k.py +++ b/data/data-pipeline/data_pipeline/score/score_k.py @@ -8,7 +8,7 @@ logger = get_module_logger(__name__) class ScoreK(Score): def add_columns(self) -> pd.DataFrame: - logger.info("Adding Score K") + logger.debug("Adding Score K") high_school_cutoff_threshold = 0.06 diff --git a/data/data-pipeline/data_pipeline/score/score_l.py b/data/data-pipeline/data_pipeline/score/score_l.py index ab935b72..322b9129 100644 --- a/data/data-pipeline/data_pipeline/score/score_l.py +++ b/data/data-pipeline/data_pipeline/score/score_l.py @@ -52,7 +52,7 @@ class ScoreL(Score): [column_from_island_areas, column_from_decennial_census] ].mean(axis=1, skipna=True) - logger.info( + logger.debug( 
f"Combined field `{combined_column_name}` has " f"{df[combined_column_name].isnull().sum()} " f"({df[combined_column_name].isnull().sum() * 100 / len(df):.2f}%) " @@ -64,7 +64,7 @@ class ScoreL(Score): a=df[combined_column_name], q=threshold_cutoff_for_island_areas ) - logger.info( + logger.debug( f"For combined field `{combined_column_name}`, " f"the {threshold_cutoff_for_island_areas*100:.0f} percentile cutoff is a " f"raw value of {raw_threshold:.3f}." @@ -627,7 +627,7 @@ class ScoreL(Score): .sum() ) - logger.info( + logger.debug( f"For workforce criteria in island areas, " f"{workforce_combined_criteria_for_island_areas.sum()} (" f"{percent_of_island_tracts_highlighted:.2f}% of tracts that have non-null data " @@ -642,7 +642,7 @@ class ScoreL(Score): ) def add_columns(self) -> pd.DataFrame: - logger.info("Adding Score L") + logger.debug("Adding Score L") self.df[field_names.THRESHOLD_COUNT] = 0 self.df[field_names.FPL_200_SERIES] = self._create_low_income_threshold( diff --git a/data/data-pipeline/data_pipeline/score/score_m.py b/data/data-pipeline/data_pipeline/score/score_m.py index 89eef6fa..a8751c2a 100644 --- a/data/data-pipeline/data_pipeline/score/score_m.py +++ b/data/data-pipeline/data_pipeline/score/score_m.py @@ -768,7 +768,7 @@ class ScoreM(Score): .sum() ) - logger.info( + logger.debug( f"For workforce criteria in island areas, " f"{workforce_combined_criteria_for_island_areas.sum()} (" f"{percent_of_island_tracts_highlighted:.2f}% of tracts that have non-null data " @@ -812,7 +812,7 @@ class ScoreM(Score): ) def add_columns(self) -> pd.DataFrame: - logger.info("Adding Score M") + logger.debug("Adding Score M") self.df[field_names.THRESHOLD_COUNT] = 0 diff --git a/data/data-pipeline/data_pipeline/score/score_narwhal.py b/data/data-pipeline/data_pipeline/score/score_narwhal.py index 07e2ab61..65753090 100644 --- a/data/data-pipeline/data_pipeline/score/score_narwhal.py +++ b/data/data-pipeline/data_pipeline/score/score_narwhal.py @@ -889,7 +889,7 @@ class ScoreNarwhal(Score): .sum() ) - logger.info( + logger.debug( f"For workforce criteria in island areas, " f"{workforce_combined_criteria_for_island_areas.sum()} (" f"{percent_of_island_tracts_highlighted:.2f}% of tracts that have non-null data " @@ -947,7 +947,7 @@ class ScoreNarwhal(Score): We calculate "donut holes" after the initial score generation """ - logger.info("Marking donut hole tracts") + logger.debug("Marking donut hole tracts") # This is the boolean we pass to the front end for the donut-hole-specific # low income criterion @@ -1025,7 +1025,7 @@ class ScoreNarwhal(Score): ) def add_columns(self) -> pd.DataFrame: - logger.info("Adding Score Narhwal") + logger.debug("Adding Score Narhwal") self.df[field_names.THRESHOLD_COUNT] = 0 self.df[field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED] = ( diff --git a/data/data-pipeline/data_pipeline/tests/score/test_score_narwhal_methods.py b/data/data-pipeline/data_pipeline/tests/score/test_score_narwhal_methods.py index df7c1b4b..3132c51a 100644 --- a/data/data-pipeline/data_pipeline/tests/score/test_score_narwhal_methods.py +++ b/data/data-pipeline/data_pipeline/tests/score/test_score_narwhal_methods.py @@ -23,7 +23,7 @@ def toy_score_df(scope="module"): def _helper_test_dropping_tracts(toy_score_df, drop_tracts): - logger.info(drop_tracts) + logger.debug(drop_tracts) test_frame = toy_score_df[ ~toy_score_df[field_names.GEOID_TRACT_FIELD].isin(drop_tracts) ] diff --git a/data/data-pipeline/data_pipeline/tests/sources/example/etl.py 
b/data/data-pipeline/data_pipeline/tests/sources/example/etl.py index 9aca968a..7f78d3e4 100644 --- a/data/data-pipeline/data_pipeline/tests/sources/example/etl.py +++ b/data/data-pipeline/data_pipeline/tests/sources/example/etl.py @@ -41,12 +41,10 @@ class ExampleETL(ExtractTransformLoad): / "input.zip" ) - logger.info(f"Extracting {zip_file_path}") with zipfile.ZipFile(zip_file_path, "r") as zip_ref: zip_ref.extractall(self.get_tmp_path()) def transform(self): - logger.info(f"Loading file from {self.get_tmp_path() / 'input.csv'}.") df: pd.DataFrame = pd.read_csv( self.get_tmp_path() / "input.csv", dtype={self.GEOID_TRACT_FIELD_NAME: "string"}, diff --git a/data/data-pipeline/data_pipeline/tests/sources/example/test_etl.py b/data/data-pipeline/data_pipeline/tests/sources/example/test_etl.py index 3e7d9cb0..888cb5f1 100644 --- a/data/data-pipeline/data_pipeline/tests/sources/example/test_etl.py +++ b/data/data-pipeline/data_pipeline/tests/sources/example/test_etl.py @@ -202,7 +202,7 @@ class TestETL: expected_file_path = data_path / "dataset" / etl.NAME / "usa.csv" - logger.info(f"Expected: {expected_file_path}") + logger.debug(f"Expected: {expected_file_path}") assert actual_file_path == expected_file_path @@ -545,7 +545,7 @@ class TestETL: # Delete output file. output_file_path = etl._get_output_file_path() if os.path.exists(output_file_path): - logger.info("Deleting output file created by other tests.") + logger.debug("Deleting output file created by other tests.") os.remove(output_file_path) # Run more steps to generate test data. diff --git a/data/data-pipeline/data_pipeline/tile/generate.py b/data/data-pipeline/data_pipeline/tile/generate.py index ec03ad3c..b8d5c7ce 100644 --- a/data/data-pipeline/data_pipeline/tile/generate.py +++ b/data/data-pipeline/data_pipeline/tile/generate.py @@ -39,7 +39,7 @@ def generate_tiles(data_path: Path, generate_tribal_layer: bool) -> None: os.mkdir(low_tile_path) # generate high mbtiles file - logger.info("Generating USA High mbtiles file") + logger.debug("Generating USA High mbtiles file") cmd = "tippecanoe " cmd += f"--minimum-zoom={USA_HIGH_MIN_ZOOM} --maximum-zoom={USA_HIGH_MAX_ZOOM} --layer=blocks " cmd += "--no-feature-limit --no-tile-size-limit " @@ -48,7 +48,7 @@ def generate_tiles(data_path: Path, generate_tribal_layer: bool) -> None: call(cmd, shell=True) # generate high mvts - logger.info("Generating USA High mvt folders and files") + logger.debug("Generating USA High mvt folders and files") cmd = "tippecanoe " cmd += f"--minimum-zoom={USA_HIGH_MIN_ZOOM} --maximum-zoom={USA_HIGH_MAX_ZOOM} --no-tile-compression " cmd += "--no-feature-limit --no-tile-size-limit " @@ -57,7 +57,7 @@ def generate_tiles(data_path: Path, generate_tribal_layer: bool) -> None: call(cmd, shell=True) # generate low mbtiles file - logger.info("Generating USA Low mbtiles file") + logger.debug("Generating USA Low mbtiles file") cmd = "tippecanoe " cmd += f"--minimum-zoom={USA_LOW_MIN_ZOOM} --maximum-zoom={USA_LOW_MAX_ZOOM} --layer=blocks " cmd += f"--output={low_tile_path}/usa_low.mbtiles " @@ -65,7 +65,7 @@ def generate_tiles(data_path: Path, generate_tribal_layer: bool) -> None: call(cmd, shell=True) # generate low mvts - logger.info("Generating USA Low mvt folders and files") + logger.debug("Generating USA Low mvt folders and files") cmd = "tippecanoe " cmd += f"--minimum-zoom={USA_LOW_MIN_ZOOM} --maximum-zoom={USA_LOW_MAX_ZOOM} --no-tile-compression " cmd += "--drop-densest-as-needed " @@ -86,7 +86,7 @@ def generate_tiles(data_path: Path, generate_tribal_layer: 
bool) -> None: remove_all_from_dir(tribal_tiles_path) # generate mbtiles file - logger.info("Generating Tribal mbtiles file") + logger.debug("Generating Tribal mbtiles file") cmd = "tippecanoe " cmd += "--layer=blocks " cmd += "--base-zoom=3 " @@ -96,7 +96,7 @@ def generate_tiles(data_path: Path, generate_tribal_layer: bool) -> None: call(cmd, shell=True) # generate mvts - logger.info("Generating Tribal mvt folders and files") + logger.debug("Generating Tribal mvt folders and files") cmd = "tippecanoe " cmd += "--layer=blocks " cmd += "--base-zoom=3 " diff --git a/data/data-pipeline/data_pipeline/utils.py b/data/data-pipeline/data_pipeline/utils.py index a5e08c4a..31b52880 100644 --- a/data/data-pipeline/data_pipeline/utils.py +++ b/data/data-pipeline/data_pipeline/utils.py @@ -1,5 +1,4 @@ import datetime -import json import logging import os import shutil @@ -17,7 +16,9 @@ from data_pipeline.config import settings from data_pipeline.content.schemas.download_schemas import CodebookConfig from data_pipeline.content.schemas.download_schemas import CSVConfig from data_pipeline.content.schemas.download_schemas import ExcelConfig -from data_pipeline.etl.score.constants import SCORE_VERSIONING_SHAPEFILE_CODEBOOK_FILE_PATH +from data_pipeline.etl.score.constants import ( + SCORE_VERSIONING_SHAPEFILE_CODEBOOK_FILE_PATH, +) from marshmallow import ValidationError from marshmallow_dataclass import class_schema @@ -43,11 +44,12 @@ def get_module_logger(module_name: str) -> logging.Logger: logger = logging.getLogger(module_name) handler = logging.StreamHandler() formatter = logging.Formatter( - "%(asctime)s [%(name)-12s] %(levelname)-8s %(message)s" + "%(asctime)s [%(name)40.40s] %(levelname)-8s %(message)s" ) handler.setFormatter(formatter) logger.addHandler(handler) - logger.setLevel(logging.DEBUG) + logger.setLevel(logging.INFO) + logger.propagate = False # don't send log messages to the parent logger (to avoid duplicate log messages) return logger @@ -81,7 +83,6 @@ def remove_files_from_dir( if not file.endswith(extension): continue os.remove(files_path / file) - logger.info(f"Removing {file}") def remove_all_from_dir(files_path: Path) -> None: @@ -103,9 +104,8 @@ def remove_all_from_dir(files_path: Path) -> None: os.remove(files_path / file) else: shutil.rmtree(files_path / file) - logger.info(f"Removing {file}") else: - logger.info(f"The following path does not exist: `{files_path}`.") + logger.warning(f"The following path does not exist: `{files_path}`.") def remove_all_dirs_from_dir(dir_path: Path) -> None: @@ -122,7 +122,6 @@ def remove_all_dirs_from_dir(dir_path: Path) -> None: file_path = os.path.join(dir_path, filename) if os.path.isdir(file_path): shutil.rmtree(file_path) - logging.info(f"Removing directory {file_path}") def download_file_from_url( @@ -147,7 +146,6 @@ def download_file_from_url( if not os.path.isdir(download_file_name.parent): os.mkdir(download_file_name.parent) - logger.info(f"Downloading {file_url}") response = requests.get( file_url, verify=verify, timeout=settings.REQUESTS_DEFAULT_TIMOUT ) @@ -193,7 +191,6 @@ def unzip_file_from_url( verify=verify, ) - logger.info(f"Extracting {zip_file_path}") with zipfile.ZipFile(zip_file_path, "r") as zip_ref: zip_ref.extractall(unzipped_file_path) @@ -206,7 +203,7 @@ def data_folder_cleanup() -> None: data_path = settings.APP_ROOT / "data" - logger.info("Initializing all dataset directoriees") + logger.debug("Initializing all dataset directoriees") remove_all_from_dir(data_path / "dataset") @@ -215,7 +212,7 @@ def 
score_folder_cleanup() -> None: data_path = settings.APP_ROOT / "data" - logger.info("Initializing all score data") + logger.debug("Initializing all score data") remove_all_from_dir(data_path / "score" / "csv") remove_all_from_dir(data_path / "score" / "geojson") remove_all_from_dir(data_path / "score" / "tiles") @@ -227,17 +224,19 @@ def geo_score_folder_cleanup() -> None: """Removes the necessary files to run geo-score. This works out to be zip files, since if we don't remove them python's zip utils continuously add to them instead of overwriting the contents.""" - + data_path = settings.APP_ROOT / "data" - - logger.info("Removing zip files") + + logger.debug("Removing zip files") remove_files_from_dir(data_path / "score" / "shapefile", ".zip") - - shapefile_and_codebook_zipped = SCORE_VERSIONING_SHAPEFILE_CODEBOOK_FILE_PATH - + + shapefile_and_codebook_zipped = ( + SCORE_VERSIONING_SHAPEFILE_CODEBOOK_FILE_PATH + ) + if os.path.isfile(shapefile_and_codebook_zipped): os.remove(shapefile_and_codebook_zipped) - + def downloadable_cleanup() -> None: """Remove all files from downloadable directory in the local data/score path""" @@ -251,7 +250,7 @@ def temp_folder_cleanup() -> None: data_path = settings.APP_ROOT / "data" - logger.info("Initializing all temp directories") + logger.debug("Initializing all temp directories") remove_all_from_dir(data_path / "tmp") @@ -307,8 +306,6 @@ def zip_files(zip_file_path: Path, files_to_compress: List[Path]): with zipfile.ZipFile(zip_file_path, "w") as zf: for f in files_to_compress: zf.write(f, arcname=Path(f).name, compress_type=compression) - zip_info = get_zip_info(zip_file_path) - logger.info(json.dumps(zip_info, indent=4, sort_keys=True, default=str)) def zip_directory( @@ -327,7 +324,6 @@ def zip_directory( def zipdir(origin_directory: Path, ziph: zipfile.ZipFile): for root, dirs, files in os.walk(origin_directory): for file in files: - logger.info(f"Compressing file: {file}") ziph.write( os.path.join(root, file), os.path.relpath( @@ -337,7 +333,6 @@ def zip_directory( compress_type=compression, ) - logger.info(f"Compressing {Path(origin_zip_directory).name} directory") zip_file_name = f"{Path(origin_zip_directory).name}.zip" # start archiving @@ -347,10 +342,6 @@ def zip_directory( zipdir(f"{origin_zip_directory}/", zipf) zipf.close() - logger.info( - f"Completed compression of {Path(origin_zip_directory).name} directory" - ) - def load_yaml_dict_from_file( yaml_file_path: Path,
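
Reviewer note on the logging changes above: most of this patch demotes per-step ETL messages from logger.info to logger.debug (or removes them outright), while the utils.py hunks reconfigure the shared module logger so that DEBUG output is hidden by default, module names are printed in a fixed-width column, and records no longer propagate to the root logger. Below is a minimal sketch of that logger factory, reconstructed from the utils.py hunks in this patch; the usage lines at the bottom are illustrative assumptions, not something the patch itself adds.

import logging

def get_module_logger(module_name: str) -> logging.Logger:
    """Per-module logger as configured above: fixed-width name column,
    INFO default level, no propagation to the parent logger (which would
    otherwise produce duplicate records)."""
    logger = logging.getLogger(module_name)
    handler = logging.StreamHandler()
    formatter = logging.Formatter(
        "%(asctime)s [%(name)40.40s] %(levelname)-8s %(message)s"
    )
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    logger.setLevel(logging.INFO)
    logger.propagate = False
    return logger

# Hypothetical usage showing the effect of the info -> debug demotions:
logger = get_module_logger(__name__)
logger.debug("Adding Score A")              # suppressed at the default INFO level
logger.warning("Path does not exist")       # still visible, as in remove_all_from_dir

To surface the demoted messages for a particular module while debugging, a caller could lower that module's level at runtime, e.g. get_module_logger("some.module").setLevel(logging.DEBUG); this is an assumption about intended use rather than behavior introduced by the patch.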