This commit is contained in:
lucasmbrown-usds 2022-09-28 17:02:54 -04:00
parent a6ba9f6970
commit a7a4df037e
3 changed files with 29 additions and 1 deletions

View file

@ -58,6 +58,10 @@ SCORE_DOWNLOADABLE_CSV_FILE_PATH = (
# Path of the downloadable Excel file; the file name embeds `timestamp_str`.
SCORE_DOWNLOADABLE_EXCEL_FILE_PATH = (
    SCORE_DOWNLOADABLE_DIR / f"communities-{timestamp_str}.xlsx"
)
# Path of the downloadable "communities compared to zip codes" CSV, placed in
# the same directory and using the same `timestamp_str` suffix.
ZIP_CODES_DOWNLOADABLE_CSV_FILE_PATH = (
    SCORE_DOWNLOADABLE_DIR
    / f"communities-compared-to-zip-codes-{timestamp_str}.csv"
)
# Path of the downloadable codebook CSV; the file name embeds `timestamp_str`.
SCORE_DOWNLOADABLE_CODEBOOK_FILE_PATH = (
    SCORE_DOWNLOADABLE_DIR / f"codebook-{timestamp_str}.csv"
)

View file

@ -387,6 +387,9 @@ class PostScoreETL(ExtractTransformLoad):
return final_df
def _create_zip_codes_data(self, downloadable_df):
    """Placeholder (not yet implemented).

    The body is a bare `pass`, so this currently ignores `downloadable_df`
    and implicitly returns None.
    """
    pass
def transform(self) -> None:
logger.info("Transforming data sources for Score + County CSVs")
@ -543,6 +546,9 @@ class PostScoreETL(ExtractTransformLoad):
codebook_df.to_csv(codebook_path, index=False)
# TODO: Write zip-code based files
zip_codes_df = self._create_zip_codes_data(
downloadable_df=downloadable_df
)
logger.info("Compressing files")
files_to_compress = [

View file

@ -11,7 +11,25 @@ logger = get_module_logger(__name__)
class GeoCorrAlternativesETL(ExtractTransformLoad):
"""Calculates overlap between Census tracts & various alternative geographies."""
"""Calculates overlap between Census tracts & various alternative geographies.
Note: for almost all 2020 zip codes in the USA (33,781 zip codes), this ETL
divides them into census tracts such that 100% of the zip code is represented
within the census tracts in the output of this file.
For a very small number of 2020 zip codes in the USA (9 zip codes), this ETL
only matches 98% or more of the zip code into tracts. For one 2020 zip code, this
ETL only matches 86% of the zip code.
The reason for these 10 outliers is unclear.
Here are the value counts for `PERCENT_OF_ZIP_CODE_IN_TRACT` aggregated at two
digits of precision:
1.00 33781
0.99 7
0.98 2
0.86 1
"""
NAME = "geocorr_alternatives"
GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT