From a7a4df037ee7d9aaa2662552a333cd2d5b784d64 Mon Sep 17 00:00:00 2001 From: lucasmbrown-usds Date: Wed, 28 Sep 2022 17:02:54 -0400 Subject: [PATCH] wip --- .../data_pipeline/etl/score/constants.py | 4 ++++ .../data_pipeline/etl/score/etl_score_post.py | 6 ++++++ .../etl/sources/geocorr_alternatives/etl.py | 20 ++++++++++++++++++- 3 files changed, 29 insertions(+), 1 deletion(-) diff --git a/data/data-pipeline/data_pipeline/etl/score/constants.py b/data/data-pipeline/data_pipeline/etl/score/constants.py index 410d194c..e33d1dd6 100644 --- a/data/data-pipeline/data_pipeline/etl/score/constants.py +++ b/data/data-pipeline/data_pipeline/etl/score/constants.py @@ -58,6 +58,10 @@ SCORE_DOWNLOADABLE_CSV_FILE_PATH = ( SCORE_DOWNLOADABLE_EXCEL_FILE_PATH = ( SCORE_DOWNLOADABLE_DIR / f"communities-{timestamp_str}.xlsx" ) +ZIP_CODES_DOWNLOADABLE_CSV_FILE_PATH = ( + SCORE_DOWNLOADABLE_DIR + / f"communities-compared-to-zip-codes-{timestamp_str}.csv" +) SCORE_DOWNLOADABLE_CODEBOOK_FILE_PATH = ( SCORE_DOWNLOADABLE_DIR / f"codebook-{timestamp_str}.csv" ) diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py b/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py index 35ddfa98..498a3b8c 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py @@ -387,6 +387,9 @@ class PostScoreETL(ExtractTransformLoad): return final_df + def _create_zip_codes_data(self, downloadable_df): + pass + def transform(self) -> None: logger.info("Transforming data sources for Score + County CSVs") @@ -543,6 +546,9 @@ class PostScoreETL(ExtractTransformLoad): codebook_df.to_csv(codebook_path, index=False) # TODO: Write zip-code based files + zip_codes_df = self._create_zip_codes_data( + downloadable_df=downloadable_df + ) logger.info("Compressing files") files_to_compress = [ diff --git a/data/data-pipeline/data_pipeline/etl/sources/geocorr_alternatives/etl.py 
b/data/data-pipeline/data_pipeline/etl/sources/geocorr_alternatives/etl.py index ce4b6567..68c451a2 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/geocorr_alternatives/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/geocorr_alternatives/etl.py @@ -11,7 +11,25 @@ logger = get_module_logger(__name__) class GeoCorrAlternativesETL(ExtractTransformLoad): - """Calculates overlap between Census tracts & various alternative geographies.""" + """Calculates overlap between Census tracts & various alternative geographies. + + Note: for almost all 2020 zip codes in the USA (33,781 zip codes), this ETL + divides them into census tracts such that 100% of the zip code is represented + within the census tracts in the output of this file. + + For a very small number of 2020 zip codes in the USA (9 zip codes), this ETL + only matches 98% or more of the zip code into tracts. For one 2020 zip code, this + ETL only matches 86% of the zip code. + + The reason for these 10 outliers is unclear. + + Here are the value counts for `PERCENT_OF_ZIP_CODE_IN_TRACT` aggregated at two + digits of precision: + 1.00 33781 + 0.99 7 + 0.98 2 + 0.86 1 + """ NAME = "geocorr_alternatives" GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT