From 0316906a69b13786fd45df3ff0a0130b89986511 Mon Sep 17 00:00:00 2001
From: Jorge Escobar <83969469+esfoobar-usds@users.noreply.github.com>
Date: Thu, 15 Jul 2021 13:34:08 -0400
Subject: [PATCH] County Names for Score #188 (#347)

* starting PR

* completed feature

* checkpoint

* adding new fips and updating counties to 2010

* updated sources to 2010 - 2019

* more cleanup

* creating tiles score csv
---
 score/data/census/csv/fips_states_2010.csv |  53 +++++++
 score/etl/runner.py                        |  41 ++++--
 score/etl/score/{etl.py => etl_score.py}   |  79 ++++++----
 score/etl/score/etl_score_post.py          | 112 ++++++++++++++
 score/etl/sources/census_acs/etl.py        |  20 ++-
 score/etl/sources/ejscreen/etl.py          |   8 +-
 score/ipython/county_lookup.ipynb          | 161 +++++++++++++++++++++
 score/utils.py                             |   5 +-
 8 files changed, 425 insertions(+), 54 deletions(-)
 create mode 100644 score/data/census/csv/fips_states_2010.csv
 rename score/etl/score/{etl.py => etl_score.py} (86%)
 create mode 100644 score/etl/score/etl_score_post.py
 create mode 100644 score/ipython/county_lookup.ipynb

diff --git a/score/data/census/csv/fips_states_2010.csv b/score/data/census/csv/fips_states_2010.csv
new file mode 100644
index 00000000..cf2e1550
--- /dev/null
+++ b/score/data/census/csv/fips_states_2010.csv
@@ -0,0 +1,53 @@
+fips,state_name,state_abbreviation,region,division
+01,Alabama,AL,South,East South Central
+02,Alaska,AK,West,Pacific
+04,Arizona,AZ,West,Mountain
+05,Arkansas,AR,South,West South Central
+06,California,CA,West,Pacific
+08,Colorado,CO,West,Mountain
+09,Connecticut,CT,Northeast,New England
+10,Delaware,DE,South,South Atlantic
+11,District of Columbia,DC,South,South Atlantic
+12,Florida,FL,South,South Atlantic
+13,Georgia,GA,South,South Atlantic
+15,Hawaii,HI,West,Pacific
+16,Idaho,ID,West,Mountain
+17,Illinois,IL,Midwest,East North Central
+18,Indiana,IN,Midwest,East North Central
+19,Iowa,IA,Midwest,West North Central
+20,Kansas,KS,Midwest,West North Central
+21,Kentucky,KY,South,East South Central
+22,Louisiana,LA,South,West South Central
+23,Maine,ME,Northeast,New England
+24,Maryland,MD,South,South Atlantic
+25,Massachusetts,MA,Northeast,New England
+26,Michigan,MI,Midwest,East North Central
+27,Minnesota,MN,Midwest,West North Central
+28,Mississippi,MS,South,East South Central
+29,Missouri,MO,Midwest,West North Central
+30,Montana,MT,West,Mountain
+31,Nebraska,NE,Midwest,West North Central
+32,Nevada,NV,West,Mountain
+33,New Hampshire,NH,Northeast,New England
+34,New Jersey,NJ,Northeast,Middle Atlantic
+35,New Mexico,NM,West,Mountain
+36,New York,NY,Northeast,Middle Atlantic
+37,North Carolina,NC,South,South Atlantic
+38,North Dakota,ND,Midwest,West North Central
+39,Ohio,OH,Midwest,East North Central
+40,Oklahoma,OK,South,West South Central
+41,Oregon,OR,West,Pacific
+42,Pennsylvania,PA,Northeast,Middle Atlantic
+44,Rhode Island,RI,Northeast,New England
+45,South Carolina,SC,South,South Atlantic
+46,South Dakota,SD,Midwest,West North Central
+47,Tennessee,TN,South,East South Central
+48,Texas,TX,South,West South Central
+49,Utah,UT,West,Mountain
+50,Vermont,VT,Northeast,New England
+51,Virginia,VA,South,South Atlantic
+53,Washington,WA,West,Pacific
+54,West Virginia,WV,South,South Atlantic
+55,Wisconsin,WI,Midwest,East North Central
+56,Wyoming,WY,West,Mountain
+72,Puerto Rico,PR,Puerto Rico,Puerto Rico
diff --git a/score/etl/runner.py b/score/etl/runner.py
index 3da284f4..01ade7bc 100644
--- a/score/etl/runner.py
+++ b/score/etl/runner.py
@@ -1,6 +1,7 @@
 import importlib
 
-from etl.score.etl import ScoreETL
+from etl.score.etl_score import ScoreETL
+from etl.score.etl_score_post import PostScoreETL
 
 
 def etl_runner(dataset_to_run: str = None) -> None:
@@ -20,7 +21,11 @@ def etl_runner(dataset_to_run: str = None) -> None:
             "module_dir": "census_acs",
             "class_name": "CensusACSETL",
         },
-        {"name": "ejscreen", "module_dir": "ejscreen", "class_name": "EJScreenETL"},
+        {
+            "name": "ejscreen",
+            "module_dir": "ejscreen",
+            "class_name": "EJScreenETL",
+        },
         {
             "name": "housing_and_transportation",
             "module_dir": "housing_and_transportation",
@@ -36,12 +41,17 @@ def etl_runner(dataset_to_run: str = None) -> None:
             "module_dir": "calenviroscreen",
             "class_name": "CalEnviroScreenETL",
         },
-        {"name": "hud_recap", "module_dir": "hud_recap", "class_name": "HudRecapETL"},
+        {
+            "name": "hud_recap",
+            "module_dir": "hud_recap",
+            "class_name": "HudRecapETL",
+        },
     ]
 
     if dataset_to_run:
         dataset_element = next(
-            (item for item in dataset_list if item["name"] == dataset_to_run), None
+            (item for item in dataset_list if item["name"] == dataset_to_run),
+            None,
         )
         if not dataset_element:
             raise ValueError("Invalid dataset name")
@@ -51,7 +61,9 @@ def etl_runner(dataset_to_run: str = None) -> None:
 
     # Run the ETLs for the dataset_list
     for dataset in dataset_list:
-        etl_module = importlib.import_module(f"etl.sources.{dataset['module_dir']}.etl")
+        etl_module = importlib.import_module(
+            f"etl.sources.{dataset['module_dir']}.etl"
+        )
         etl_class = getattr(etl_module, dataset["class_name"])
         etl_instance = etl_class()
 
@@ -80,16 +92,19 @@ def score_generate() -> None:
     Returns:
         None
     """
-    score = ScoreETL()
 
-    # run extract
-    score.extract()
+    # Score Gen
+    score_gen = ScoreETL()
+    score_gen.extract()
+    score_gen.transform()
+    score_gen.load()
 
-    # run transform
-    score.transform()
-
-    # run load
-    score.load()
+    # Post Score Processing
+    score_post = PostScoreETL()
+    score_post.extract()
+    score_post.transform()
+    score_post.load()
+    score_post.cleanup()
 
 
 def _find_dataset_index(dataset_list, key, value):
diff --git a/score/etl/score/etl.py b/score/etl/score/etl_score.py
similarity index 86%
rename from score/etl/score/etl.py
rename to score/etl/score/etl_score.py
index a956888a..a4a48d7b 100644
--- a/score/etl/score/etl.py
+++ b/score/etl/score/etl_score.py
@@ -28,10 +28,10 @@ class ScoreETL(ExtractTransformLoad):
         self.UNEMPLOYED_FIELD_NAME = "Unemployed civilians (percent)"
         self.LINGUISTIC_ISOLATION_FIELD_NAME = "Linguistic isolation (percent)"
         self.HOUSING_BURDEN_FIELD_NAME = "Housing burden (percent)"
-        self.POVERTY_FIELD_NAME = "Poverty (Less than 200% of federal poverty line)"
-        self.HIGH_SCHOOL_FIELD_NAME = (
-            "Percent individuals age 25 or over with less than high school degree"
+        self.POVERTY_FIELD_NAME = (
+            "Poverty (Less than 200% of federal poverty line)"
         )
+        self.HIGH_SCHOOL_FIELD_NAME = "Percent individuals age 25 or over with less than high school degree"
 
         # There's another aggregation level (a second level of "buckets").
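+        # Score C is later computed as the product of the "Pollution Burden"
+        # and "Population Characteristics" aggregates (see transform()).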
         self.AGGREGATION_POLLUTION = "Pollution Burden"
@@ -40,7 +40,7 @@ class ScoreETL(ExtractTransformLoad):
         self.PERCENTILE_FIELD_SUFFIX = " (percentile)"
         self.MIN_MAX_FIELD_SUFFIX = " (min-max normalized)"
 
-        self.SCORE_CSV_PATH = self.DATA_PATH / "score" / "csv"
+        self.SCORE_CSV_PATH = self.DATA_PATH / "score" / "csv" / "full"
 
         # dataframes
         self.df: pd.DataFrame
@@ -51,21 +51,28 @@ class ScoreETL(ExtractTransformLoad):
 
     def extract(self) -> None:
         # EJSCreen csv Load
-        ejscreen_csv = self.DATA_PATH / "dataset" / "ejscreen_2020" / "usa.csv"
+        ejscreen_csv = self.DATA_PATH / "dataset" / "ejscreen_2019" / "usa.csv"
         self.ejscreen_df = pd.read_csv(
             ejscreen_csv, dtype={"ID": "string"}, low_memory=False
         )
-        self.ejscreen_df.rename(columns={"ID": self.GEOID_FIELD_NAME}, inplace=True)
+        self.ejscreen_df.rename(
+            columns={"ID": self.GEOID_FIELD_NAME}, inplace=True
+        )
 
         # Load census data
         census_csv = self.DATA_PATH / "dataset" / "census_acs_2019" / "usa.csv"
         self.census_df = pd.read_csv(
-            census_csv, dtype={self.GEOID_FIELD_NAME: "string"}, low_memory=False
+            census_csv,
+            dtype={self.GEOID_FIELD_NAME: "string"},
+            low_memory=False,
        )
 
         # Load housing and transportation data
         housing_and_transportation_index_csv = (
-            self.DATA_PATH / "dataset" / "housing_and_transportation_index" / "usa.csv"
+            self.DATA_PATH
+            / "dataset"
+            / "housing_and_transportation_index"
+            / "usa.csv"
         )
         self.housing_and_transportation_df = pd.read_csv(
             housing_and_transportation_index_csv,
@@ -99,7 +106,10 @@ class ScoreETL(ExtractTransformLoad):
         )
 
         # Sanity check the join.
-        if len(census_block_group_df[self.GEOID_FIELD_NAME].str.len().unique()) != 1:
+        if (
+            len(census_block_group_df[self.GEOID_FIELD_NAME].str.len().unique())
+            != 1
+        ):
             raise ValueError(
                 f"One of the input CSVs uses {self.GEOID_FIELD_NAME} with a different length."
             )
@@ -109,9 +119,9 @@ class ScoreETL(ExtractTransformLoad):
         census_tract_df = self.hud_housing_df
 
         # Calculate the tract for the CBG data.
-        census_block_group_df[self.GEOID_TRACT_FIELD_NAME] = census_block_group_df[
-            self.GEOID_FIELD_NAME
-        ].str[0:11]
+        census_block_group_df[
+            self.GEOID_TRACT_FIELD_NAME
+        ] = census_block_group_df[self.GEOID_FIELD_NAME].str[0:11]
 
         self.df = census_block_group_df.merge(
             census_tract_df, on=self.GEOID_TRACT_FIELD_NAME
         )
@@ -122,7 +132,8 @@ class ScoreETL(ExtractTransformLoad):
 
         # Define a named tuple that will be used for each data set input.
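+        # Each DataSet maps a source column (input_field) to a human-readable
+        # name (renamed_field); bucket is None for columns carried through
+        # without feeding one of the score buckets.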
         DataSet = collections.namedtuple(
-            typename="DataSet", field_names=["input_field", "renamed_field", "bucket"]
+            typename="DataSet",
+            field_names=["input_field", "renamed_field", "bucket"],
         )
 
         data_sets = [
@@ -139,7 +150,9 @@ class ScoreETL(ExtractTransformLoad):
                 bucket=None,
             ),
             DataSet(
-                input_field="ACSTOTPOP", renamed_field="Total population", bucket=None
+                input_field="ACSTOTPOP",
+                renamed_field="Total population",
+                bucket=None,
             ),
             # The following data sets have buckets, because they're used in the score
             DataSet(
@@ -163,7 +176,9 @@ class ScoreETL(ExtractTransformLoad):
                 bucket=self.BUCKET_EXPOSURES,
             ),
             DataSet(
-                input_field="OZONE", renamed_field="Ozone", bucket=self.BUCKET_EXPOSURES
+                input_field="OZONE",
+                renamed_field="Ozone",
+                bucket=self.BUCKET_EXPOSURES,
             ),
             DataSet(
                 input_field="PTRAF",
@@ -239,7 +254,8 @@ class ScoreETL(ExtractTransformLoad):
 
         # Rename columns:
         renaming_dict = {
-            data_set.input_field: data_set.renamed_field for data_set in data_sets
+            data_set.input_field: data_set.renamed_field
+            for data_set in data_sets
         }
 
         self.df.rename(
@@ -308,7 +324,9 @@ class ScoreETL(ExtractTransformLoad):
             ]
         ].mean(axis=1)
         self.df["Score B"] = (
-            self.df["Poverty (Less than 200% of federal poverty line) (percentile)"]
+            self.df[
+                "Poverty (Less than 200% of federal poverty line) (percentile)"
+            ]
             * self.df[
                 "Percent individuals age 25 or over with less than high school degree (percentile)"
             ]
@@ -337,7 +355,8 @@ class ScoreETL(ExtractTransformLoad):
 
         # Multiply the "Pollution Burden" score and the "Population Characteristics" together to produce the cumulative impact score.
         self.df["Score C"] = (
-            self.df[self.AGGREGATION_POLLUTION] * self.df[self.AGGREGATION_POPULATION]
+            self.df[self.AGGREGATION_POLLUTION]
+            * self.df[self.AGGREGATION_POPULATION]
         )
 
         if len(census_block_group_df) > 220333:
@@ -352,10 +371,12 @@ class ScoreETL(ExtractTransformLoad):
         ]
 
         fields_min_max = [
-            f"{field}{self.MIN_MAX_FIELD_SUFFIX}" for field in fields_to_use_in_score
+            f"{field}{self.MIN_MAX_FIELD_SUFFIX}"
+            for field in fields_to_use_in_score
         ]
         fields_percentile = [
-            f"{field}{self.PERCENTILE_FIELD_SUFFIX}" for field in fields_to_use_in_score
+            f"{field}{self.PERCENTILE_FIELD_SUFFIX}"
+            for field in fields_to_use_in_score
         ]
 
         # Calculate "Score D", which uses min-max normalization
@@ -367,7 +388,13 @@ class ScoreETL(ExtractTransformLoad):
         self.df[fields_min_max].corr()
 
         # Create percentiles for the scores
-        for score_field in ["Score A", "Score B", "Score C", "Score D", "Score E"]:
+        for score_field in [
+            "Score A",
+            "Score B",
+            "Score C",
+            "Score D",
+            "Score E",
+        ]:
             self.df[f"{score_field}{self.PERCENTILE_FIELD_SUFFIX}"] = self.df[
                 score_field
             ].rank(pct=True)
@@ -376,14 +403,8 @@ class ScoreETL(ExtractTransformLoad):
         )
 
     def load(self) -> None:
-        logger.info(f"Saving Score CSVs")
+        logger.info(f"Saving Score CSV")
 
         # write nationwide csv
+        self.SCORE_CSV_PATH.mkdir(parents=True, exist_ok=True)
         self.df.to_csv(self.SCORE_CSV_PATH / f"usa.csv", index=False)
-
-        # write per state csvs
-        for states_fips in get_state_fips_codes(self.DATA_PATH):
-            logger.info(f"Generating data{states_fips} csv")
-            df1 = self.df[self.df["GEOID10"].str[:2] == states_fips]
-            # we need to name the file data01.csv for ogr2ogr csv merge to work
-            df1.to_csv(self.SCORE_CSV_PATH / f"data{states_fips}.csv", index=False)
diff --git a/score/etl/score/etl_score_post.py b/score/etl/score/etl_score_post.py
new file mode 100644
index 00000000..17da0790
--- /dev/null
+++ b/score/etl/score/etl_score_post.py
@@ -0,0 +1,112 @@
+import pandas as pd
+
+from etl.base import ExtractTransformLoad
+from utils import get_module_logger
+
+logger = get_module_logger(__name__)
+
+
+class PostScoreETL(ExtractTransformLoad):
+    """
+    A class used to instantiate an ETL object to retrieve and process data from
+    datasets.
+    """
+
+    def __init__(self):
+        self.CENSUS_COUNTIES_ZIP_URL = "https://www2.census.gov/geo/docs/maps-data/data/gazetteer/2020_Gazetteer/2020_Gaz_counties_national.zip"
+        self.CENSUS_COUNTIES_TXT = self.TMP_PATH / "2020_Gaz_counties_national.txt"
+        self.CENSUS_COUNTIES_COLS = ["USPS", "GEOID", "NAME"]
+        self.SCORE_CSV_PATH = self.DATA_PATH / "score" / "csv"
+        self.STATE_CSV = (
+            self.DATA_PATH / "census" / "csv" / "fips_states_2010.csv"
+        )
+        self.SCORE_CSV = self.SCORE_CSV_PATH / "full" / "usa.csv"
+        self.COUNTY_SCORE_CSV = self.SCORE_CSV_PATH / "full" / "usa-county.csv"
+
+        self.TILES_SCORE_COLUMNS = [
+            "GEOID10",
+            "Score E (percentile)",
+            "Score E (top 25th percentile)",
+            "GEOID",
+            "State Abbreviation",
+            "County Name",
+        ]
+        self.TILES_SCORE_CSV_PATH = self.SCORE_CSV_PATH / "tiles"
+        self.TILES_SCORE_CSV = self.TILES_SCORE_CSV_PATH / "usa.csv"
+
+        self.counties_df: pd.DataFrame
+        self.states_df: pd.DataFrame
+        self.score_df: pd.DataFrame
+        self.score_county_state_merged: pd.DataFrame
+        self.score_for_tiles: pd.DataFrame
+
+    def extract(self) -> None:
+        super().extract(
+            self.CENSUS_COUNTIES_ZIP_URL,
+            self.TMP_PATH,
+        )
+
+        logger.info(f"Reading Counties CSV")
+        self.counties_df = pd.read_csv(
+            self.CENSUS_COUNTIES_TXT,
+            sep="\t",
+            dtype={"GEOID": "string", "USPS": "string"},
+            low_memory=False,
+            encoding="latin-1",
+        )
+
+        logger.info(f"Reading States CSV")
+        self.states_df = pd.read_csv(
+            self.STATE_CSV, dtype={"fips": "string", "state_abbreviation": "string"}
+        )
+        self.score_df = pd.read_csv(self.SCORE_CSV, dtype={"GEOID10": "string"})
+
+    def transform(self) -> None:
+        logger.info(f"Transforming data sources for Score + County CSV")
+
+        # rename some of the columns to prepare for merge
+        self.counties_df = self.counties_df[["USPS", "GEOID", "NAME"]]
+        self.counties_df.rename(
+            columns={"USPS": "State Abbreviation", "NAME": "County Name"},
+            inplace=True,
+        )
+
+        # remove unnecessary columns
+        self.states_df.rename(
+            columns={
+                "fips": "State Code",
+                "state_name": "State Name",
+                "state_abbreviation": "State Abbreviation",
+            },
+            inplace=True,
+        )
+        self.states_df.drop(["region", "division"], axis=1, inplace=True)
+
+        # add the county level column
+        self.score_df["GEOID"] = self.score_df.GEOID10.str[:5]
+
+        # merge state and counties
+        county_state_merged = self.counties_df.join(
+            self.states_df, rsuffix=" Other"
+        )
+        del county_state_merged["State Abbreviation Other"]
+
+        # merge county and score
+        self.score_county_state_merged = self.score_df.join(
+            county_state_merged, rsuffix="_OTHER"
+        )
+        del self.score_county_state_merged["GEOID_OTHER"]
+
+    def load(self) -> None:
+        logger.info(f"Saving Score + County CSV")
+        self.SCORE_CSV_PATH.mkdir(parents=True, exist_ok=True)
+        self.score_county_state_merged.to_csv(
+            self.COUNTY_SCORE_CSV, index=False
+        )
+
+        logger.info(f"Saving Tile Score CSV")
+        # TODO: check which are the columns we'll use
+        # Related to: https://github.com/usds/justice40-tool/issues/302
+        score_tiles = self.score_county_state_merged[self.TILES_SCORE_COLUMNS]
+        self.TILES_SCORE_CSV_PATH.mkdir(parents=True, exist_ok=True)
+        score_tiles.to_csv(self.TILES_SCORE_CSV, index=False)
diff --git a/score/etl/sources/census_acs/etl.py b/score/etl/sources/census_acs/etl.py
index a144c169..39db151c 100644
--- a/score/etl/sources/census_acs/etl.py
+++ b/score/etl/sources/census_acs/etl.py
@@ -11,10 +11,14 @@ logger = get_module_logger(__name__)
 class CensusACSETL(ExtractTransformLoad):
     def __init__(self):
         self.ACS_YEAR = 2019
-        self.OUTPUT_PATH = self.DATA_PATH / "dataset" / f"census_acs_{self.ACS_YEAR}"
+        self.OUTPUT_PATH = (
+            self.DATA_PATH / "dataset" / f"census_acs_{self.ACS_YEAR}"
+        )
         self.UNEMPLOYED_FIELD_NAME = "Unemployed civilians (percent)"
         self.LINGUISTIC_ISOLATION_FIELD_NAME = "Linguistic isolation (percent)"
-        self.LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME = "Linguistic isolation (total)"
+        self.LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME = (
+            "Linguistic isolation (total)"
+        )
         self.LINGUISTIC_ISOLATION_FIELDS = [
             "C16002_001E",
             "C16002_004E",
@@ -24,7 +28,9 @@ class CensusACSETL(ExtractTransformLoad):
         ]
         self.df: pd.DataFrame
 
-    def _fips_from_censusdata_censusgeo(self, censusgeo: censusdata.censusgeo) -> str:
+    def _fips_from_censusdata_censusgeo(
+        self, censusgeo: censusdata.censusgeo
+    ) -> str:
         """Create a FIPS code from the proprietary censusgeo index."""
         fips = "".join([value for (key, value) in censusgeo.params()])
         return fips
@@ -32,7 +38,9 @@ class CensusACSETL(ExtractTransformLoad):
     def extract(self) -> None:
         dfs = []
         for fips in get_state_fips_codes(self.DATA_PATH):
-            logger.info(f"Downloading data for state/territory with FIPS code {fips}")
+            logger.info(
+                f"Downloading data for state/territory with FIPS code {fips}"
+            )
 
             dfs.append(
                 censusdata.download(
@@ -61,7 +69,9 @@ class CensusACSETL(ExtractTransformLoad):
 
         # Calculate percent unemployment.
         # TODO: remove small-sample data that should be `None` instead of a high-variance fraction.
-        self.df[self.UNEMPLOYED_FIELD_NAME] = self.df.B23025_005E / self.df.B23025_003E
+        self.df[self.UNEMPLOYED_FIELD_NAME] = (
+            self.df.B23025_005E / self.df.B23025_003E
+        )
 
         # Calculate linguistic isolation.
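+        # Linguistic isolation comes from ACS table C16002 ("Household
+        # Language by Household Limited English-Speaking Status"); see
+        # LINGUISTIC_ISOLATION_FIELDS above.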
         individual_limited_english_fields = [
diff --git a/score/etl/sources/ejscreen/etl.py b/score/etl/sources/ejscreen/etl.py
index f9175551..5fbffb24 100644
--- a/score/etl/sources/ejscreen/etl.py
+++ b/score/etl/sources/ejscreen/etl.py
@@ -8,11 +8,9 @@ logger = get_module_logger(__name__)
 
 class EJScreenETL(ExtractTransformLoad):
     def __init__(self):
-        self.EJSCREEN_FTP_URL = (
-            "https://gaftp.epa.gov/EJSCREEN/2020/EJSCREEN_2020_StatePctile.csv.zip"
-        )
-        self.EJSCREEN_CSV = self.TMP_PATH / "EJSCREEN_2020_StatePctile.csv"
-        self.CSV_PATH = self.DATA_PATH / "dataset" / "ejscreen_2020"
+        self.EJSCREEN_FTP_URL = "https://gaftp.epa.gov/EJSCREEN/2019/EJSCREEN_2019_StatePctile.csv.zip"
+        self.EJSCREEN_CSV = self.TMP_PATH / "EJSCREEN_2019_StatePctile.csv"
+        self.CSV_PATH = self.DATA_PATH / "dataset" / "ejscreen_2019"
         self.df: pd.DataFrame
 
     def extract(self) -> None:
diff --git a/score/ipython/county_lookup.ipynb b/score/ipython/county_lookup.ipynb
new file mode 100644
index 00000000..32d90d6b
--- /dev/null
+++ b/score/ipython/county_lookup.ipynb
@@ -0,0 +1,161 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7185e18d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import csv\n",
+    "from pathlib import Path\n",
+    "import os\n",
+    "import sys"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "174bbd09",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "module_path = os.path.abspath(os.path.join(\"..\"))\n",
+    "if module_path not in sys.path:\n",
+    "    sys.path.append(module_path)\n",
+    "    \n",
+    "from utils import unzip_file_from_url\n",
+    "from etl.sources.census.etl_utils import get_state_fips_codes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dd090fcc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "DATA_PATH = Path.cwd().parent / \"data\"\n",
+    "TMP_PATH: Path = DATA_PATH / \"tmp\"\n",
+    "STATE_CSV = DATA_PATH / \"census\" / \"csv\" / \"fips_states_2010.csv\"\n",
+    "SCORE_CSV = DATA_PATH / \"score\" / \"csv\" / \"usa.csv\"\n",
+    "COUNTY_SCORE_CSV = DATA_PATH / \"score\" / \"csv\" / \"usa-county.csv\"\n",
+    "CENSUS_COUNTIES_ZIP_URL = \"https://www2.census.gov/geo/docs/maps-data/data/gazetteer/2020_Gazetteer/2020_Gaz_counties_national.zip\"\n",
+    "CENSUS_COUNTIES_TXT = TMP_PATH / \"2020_Gaz_counties_national.txt\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cf2e266b",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "unzip_file_from_url(CENSUS_COUNTIES_ZIP_URL, TMP_PATH, TMP_PATH)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9ff96da8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "counties_df = pd.read_csv(CENSUS_COUNTIES_TXT, sep=\"\\t\", dtype={\"GEOID\": \"string\", \"USPS\": \"string\"}, low_memory=False)\n",
+    "counties_df = counties_df[['USPS', 'GEOID', 'NAME']]\n",
+    "counties_df.rename(columns={\"USPS\": \"State Abbreviation\", \"NAME\": \"County Name\"}, inplace=True)\n",
+    "counties_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5af103da",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "states_df = pd.read_csv(STATE_CSV, dtype={\"fips\": \"string\", \"state_abbreviation\": \"string\"})\n",
+    "states_df.rename(columns={\"fips\": \"State Code\", \"state_name\": \"State Name\", \"state_abbreviation\": \"State Abbreviation\"}, inplace=True)\n",
+    "states_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c8680258",
"metadata": {}, + "outputs": [], + "source": [ + "county_state_merged = counties_df.join(states_df, rsuffix=' Other')\n", + "del county_state_merged[\"State Abbreviation Other\"]\n", + "county_state_merged.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "58dca55a", + "metadata": {}, + "outputs": [], + "source": [ + "score_df = pd.read_csv(SCORE_CSV, dtype={\"GEOID10\": \"string\"})\n", + "score_df[\"GEOID\"] = score_df.GEOID10.str[:5]\n", + "score_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "45e04d42", + "metadata": {}, + "outputs": [], + "source": [ + "score_county_state_merged = score_df.join(county_state_merged, rsuffix='_OTHER')\n", + "del score_county_state_merged[\"GEOID_OTHER\"]\n", + "score_county_state_merged.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a5a0b32b", + "metadata": {}, + "outputs": [], + "source": [ + "score_county_state_merged.to_csv(COUNTY_SCORE_CSV, index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b690937e", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/score/utils.py b/score/utils.py index bb915003..4ad2d985 100644 --- a/score/utils.py +++ b/score/utils.py @@ -120,6 +120,7 @@ def unzip_file_from_url( logger.info(f"Downloading {file_url}") download = requests.get(file_url, verify=verify) file_contents = download.content + zip_file_path = download_path / "downloaded.zip" zip_file = open(zip_file_path, "wb") zip_file.write(file_contents) @@ -148,8 +149,8 @@ def score_folder_cleanup() -> None: data_path = settings.APP_ROOT / "data" logger.info(f"Initializing all score data") - remove_files_from_dir(data_path / "score" / "csv", ".csv") - remove_files_from_dir(data_path / "score" / "geojson", ".json") + remove_all_from_dir(data_path / "score" / "csv") + remove_all_from_dir(data_path / "score" / "geojson") def temp_folder_cleanup() -> None: