County Names for Score #188 (#347)

* Starting PR

* Completed feature

* Checkpoint

* Adding new FIPS data and updating counties to 2010

* Updated sources to 2010 - 2019

* More cleanup

* Creating tiles score CSV
Jorge Escobar 2021-07-15 13:34:08 -04:00 committed by GitHub
commit 0316906a69
8 changed files with 425 additions and 54 deletions

View file

@ -0,0 +1,53 @@
fips,state_name,state_abbreviation,region,division
01,Alabama,AL,South,East South Central
02,Alaska,AK,West,Pacific
04,Arizona,AZ,West,Mountain
05,Arkansas,AR,South,West South Central
06,California,CA,West,Pacific
08,Colorado,CO,West,Mountain
09,Connecticut,CT,Northeast,New England
10,Delaware,DE,South,South Atlantic
11,District of Columbia,DC,South,South Atlantic
12,Florida,FL,South,South Atlantic
13,Georgia,GA,South,South Atlantic
15,Hawaii,HI,West,Pacific
16,Idaho,ID,West,Mountain
17,Illinois,IL,Midwest,East North Central
18,Indiana,IN,Midwest,East North Central
19,Iowa,IA,Midwest,West North Central
20,Kansas,KS,Midwest,West North Central
21,Kentucky,KY,South,East South Central
22,Louisiana,LA,South,West South Central
23,Maine,ME,Northeast,New England
24,Maryland,MD,South,South Atlantic
25,Massachusetts,MA,Northeast,New England
26,Michigan,MI,Midwest,East North Central
27,Minnesota,MN,Midwest,West North Central
28,Mississippi,MS,South,East South Central
29,Missouri,MO,Midwest,West North Central
30,Montana,MT,West,Mountain
31,Nebraska,NE,Midwest,West North Central
32,Nevada,NV,West,Mountain
33,New Hampshire,NH,Northeast,New England
34,New Jersey,NJ,Northeast,Middle Atlantic
35,New Mexico,NM,West,Mountain
36,New York,NY,Northeast,Middle Atlantic
37,North Carolina,NC,South,South Atlantic
38,North Dakota,ND,Midwest,West North Central
39,Ohio,OH,Midwest,East North Central
40,Oklahoma,OK,South,West South Central
41,Oregon,OR,West,Pacific
42,Pennsylvania,PA,Northeast,Middle Atlantic
44,Rhode Island,RI,Northeast,New England
45,South Carolina,SC,South,South Atlantic
46,South Dakota,SD,Midwest,West North Central
47,Tennessee,TN,South,East South Central
48,Texas,TX,South,West South Central
49,Utah,UT,West,Mountain
50,Vermont,VT,Northeast,New England
51,Virginia,VA,South,South Atlantic
53,Washington,WA,West,Pacific
54,West Virginia,WV,South,South Atlantic
55,Wisconsin,WI,Midwest,East North Central
56,Wyoming,WY,West,Mountain
72,Puerto Rico,PR,Puerto Rico,Puerto Rico
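
The two-digit state FIPS codes above keep their leading zeros (01 for Alabama, 02 for Alaska), so the file has to be read with a string dtype rather than letting pandas infer integers. A minimal sketch of loading it, assuming the data directory layout referenced by PostScoreETL further down (data/census/csv/fips_states_2010.csv):

import pandas as pd

# "fips" must stay a string so "01" does not collapse to the integer 1.
states_df = pd.read_csv(
    "data/census/csv/fips_states_2010.csv",
    dtype={"fips": "string", "state_abbreviation": "string"},
)
print(states_df.loc[states_df["fips"] == "01", "state_name"].item())  # Alabama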

View file

@ -1,6 +1,7 @@
import importlib
from etl.score.etl import ScoreETL
from etl.score.etl_score import ScoreETL
from etl.score.etl_score_post import PostScoreETL
def etl_runner(dataset_to_run: str = None) -> None:
@ -20,7 +21,11 @@ def etl_runner(dataset_to_run: str = None) -> None:
"module_dir": "census_acs",
"class_name": "CensusACSETL",
},
{"name": "ejscreen", "module_dir": "ejscreen", "class_name": "EJScreenETL"},
{
"name": "ejscreen",
"module_dir": "ejscreen",
"class_name": "EJScreenETL",
},
{
"name": "housing_and_transportation",
"module_dir": "housing_and_transportation",
@ -36,12 +41,17 @@ def etl_runner(dataset_to_run: str = None) -> None:
"module_dir": "calenviroscreen",
"class_name": "CalEnviroScreenETL",
},
{"name": "hud_recap", "module_dir": "hud_recap", "class_name": "HudRecapETL"},
{
"name": "hud_recap",
"module_dir": "hud_recap",
"class_name": "HudRecapETL",
},
]
if dataset_to_run:
dataset_element = next(
(item for item in dataset_list if item["name"] == dataset_to_run), None
(item for item in dataset_list if item["name"] == dataset_to_run),
None,
)
if not dataset_list:
raise ValueError("Invalid dataset name")
@ -51,7 +61,9 @@ def etl_runner(dataset_to_run: str = None) -> None:
# Run the ETLs for the dataset_list
for dataset in dataset_list:
etl_module = importlib.import_module(f"etl.sources.{dataset['module_dir']}.etl")
etl_module = importlib.import_module(
f"etl.sources.{dataset['module_dir']}.etl"
)
etl_class = getattr(etl_module, dataset["class_name"])
etl_instance = etl_class()
@ -80,16 +92,19 @@ def score_generate() -> None:
Returns:
None
"""
score = ScoreETL()
# run extract
score.extract()
# Score Gen
score_gen = ScoreETL()
score_gen.extract()
score_gen.transform()
score_gen.load()
# run transform
score.transform()
# run load
score.load()
# Post Score Processing
score_post = PostScoreETL()
score_post.extract()
score_post.transform()
score_post.load()
score_post.cleanup()
def _find_dataset_index(dataset_list, key, value):
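
With this change the runner looks up a single dataset by its "name" key, and score generation is split into two passes: ScoreETL builds the full score CSV and PostScoreETL layers the county and state names on top of it. A minimal usage sketch of the entry points above; the import path is hypothetical, since this hunk does not show the module's own name:

# Hypothetical import path for the runner module shown in this diff.
from etl.runner import etl_runner, score_generate

etl_runner("ejscreen")  # run one source ETL, matched against the "name" key in dataset_list
etl_runner()            # or run every dataset in dataset_list
score_generate()        # ScoreETL extract/transform/load, then the PostScoreETL steps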

View file

@ -28,10 +28,10 @@ class ScoreETL(ExtractTransformLoad):
self.UNEMPLOYED_FIELD_NAME = "Unemployed civilians (percent)"
self.LINGUISTIC_ISOLATION_FIELD_NAME = "Linguistic isolation (percent)"
self.HOUSING_BURDEN_FIELD_NAME = "Housing burden (percent)"
self.POVERTY_FIELD_NAME = "Poverty (Less than 200% of federal poverty line)"
self.HIGH_SCHOOL_FIELD_NAME = (
"Percent individuals age 25 or over with less than high school degree"
self.POVERTY_FIELD_NAME = (
"Poverty (Less than 200% of federal poverty line)"
)
self.HIGH_SCHOOL_FIELD_NAME = "Percent individuals age 25 or over with less than high school degree"
# There's another aggregation level (a second level of "buckets").
self.AGGREGATION_POLLUTION = "Pollution Burden"
@ -40,7 +40,7 @@ class ScoreETL(ExtractTransformLoad):
self.PERCENTILE_FIELD_SUFFIX = " (percentile)"
self.MIN_MAX_FIELD_SUFFIX = " (min-max normalized)"
self.SCORE_CSV_PATH = self.DATA_PATH / "score" / "csv"
self.SCORE_CSV_PATH = self.DATA_PATH / "score" / "csv" / "full"
# dataframes
self.df: pd.DataFrame
@ -51,21 +51,28 @@ class ScoreETL(ExtractTransformLoad):
def extract(self) -> None:
# EJSCreen csv Load
ejscreen_csv = self.DATA_PATH / "dataset" / "ejscreen_2020" / "usa.csv"
ejscreen_csv = self.DATA_PATH / "dataset" / "ejscreen_2019" / "usa.csv"
self.ejscreen_df = pd.read_csv(
ejscreen_csv, dtype={"ID": "string"}, low_memory=False
)
self.ejscreen_df.rename(columns={"ID": self.GEOID_FIELD_NAME}, inplace=True)
self.ejscreen_df.rename(
columns={"ID": self.GEOID_FIELD_NAME}, inplace=True
)
# Load census data
census_csv = self.DATA_PATH / "dataset" / "census_acs_2019" / "usa.csv"
self.census_df = pd.read_csv(
census_csv, dtype={self.GEOID_FIELD_NAME: "string"}, low_memory=False
census_csv,
dtype={self.GEOID_FIELD_NAME: "string"},
low_memory=False,
)
# Load housing and transportation data
housing_and_transportation_index_csv = (
self.DATA_PATH / "dataset" / "housing_and_transportation_index" / "usa.csv"
self.DATA_PATH
/ "dataset"
/ "housing_and_transportation_index"
/ "usa.csv"
)
self.housing_and_transportation_df = pd.read_csv(
housing_and_transportation_index_csv,
@ -99,7 +106,10 @@ class ScoreETL(ExtractTransformLoad):
)
# Sanity check the join.
if len(census_block_group_df[self.GEOID_FIELD_NAME].str.len().unique()) != 1:
if (
len(census_block_group_df[self.GEOID_FIELD_NAME].str.len().unique())
!= 1
):
raise ValueError(
f"One of the input CSVs uses {self.GEOID_FIELD_NAME} with a different length."
)
@ -109,9 +119,9 @@ class ScoreETL(ExtractTransformLoad):
census_tract_df = self.hud_housing_df
# Calculate the tract for the CBG data.
census_block_group_df[self.GEOID_TRACT_FIELD_NAME] = census_block_group_df[
self.GEOID_FIELD_NAME
].str[0:11]
census_block_group_df[
self.GEOID_TRACT_FIELD_NAME
] = census_block_group_df[self.GEOID_FIELD_NAME].str[0:11]
self.df = census_block_group_df.merge(
census_tract_df, on=self.GEOID_TRACT_FIELD_NAME
@ -122,7 +132,8 @@ class ScoreETL(ExtractTransformLoad):
# Define a named tuple that will be used for each data set input.
DataSet = collections.namedtuple(
typename="DataSet", field_names=["input_field", "renamed_field", "bucket"]
typename="DataSet",
field_names=["input_field", "renamed_field", "bucket"],
)
data_sets = [
@ -139,7 +150,9 @@ class ScoreETL(ExtractTransformLoad):
bucket=None,
),
DataSet(
input_field="ACSTOTPOP", renamed_field="Total population", bucket=None
input_field="ACSTOTPOP",
renamed_field="Total population",
bucket=None,
),
# The following data sets have buckets, because they're used in the score
DataSet(
@ -163,7 +176,9 @@ class ScoreETL(ExtractTransformLoad):
bucket=self.BUCKET_EXPOSURES,
),
DataSet(
input_field="OZONE", renamed_field="Ozone", bucket=self.BUCKET_EXPOSURES
input_field="OZONE",
renamed_field="Ozone",
bucket=self.BUCKET_EXPOSURES,
),
DataSet(
input_field="PTRAF",
@ -239,7 +254,8 @@ class ScoreETL(ExtractTransformLoad):
# Rename columns:
renaming_dict = {
data_set.input_field: data_set.renamed_field for data_set in data_sets
data_set.input_field: data_set.renamed_field
for data_set in data_sets
}
self.df.rename(
@ -308,7 +324,9 @@ class ScoreETL(ExtractTransformLoad):
]
].mean(axis=1)
self.df["Score B"] = (
self.df["Poverty (Less than 200% of federal poverty line) (percentile)"]
self.df[
"Poverty (Less than 200% of federal poverty line) (percentile)"
]
* self.df[
"Percent individuals age 25 or over with less than high school degree (percentile)"
]
@ -337,7 +355,8 @@ class ScoreETL(ExtractTransformLoad):
# Multiply the "Pollution Burden" score and the "Population Characteristics" together to produce the cumulative impact score.
self.df["Score C"] = (
self.df[self.AGGREGATION_POLLUTION] * self.df[self.AGGREGATION_POPULATION]
self.df[self.AGGREGATION_POLLUTION]
* self.df[self.AGGREGATION_POPULATION]
)
if len(census_block_group_df) > 220333:
@ -352,10 +371,12 @@ class ScoreETL(ExtractTransformLoad):
]
fields_min_max = [
f"{field}{self.MIN_MAX_FIELD_SUFFIX}" for field in fields_to_use_in_score
f"{field}{self.MIN_MAX_FIELD_SUFFIX}"
for field in fields_to_use_in_score
]
fields_percentile = [
f"{field}{self.PERCENTILE_FIELD_SUFFIX}" for field in fields_to_use_in_score
f"{field}{self.PERCENTILE_FIELD_SUFFIX}"
for field in fields_to_use_in_score
]
# Calculate "Score D", which uses min-max normalization
@ -367,7 +388,13 @@ class ScoreETL(ExtractTransformLoad):
self.df[fields_min_max].corr()
# Create percentiles for the scores
for score_field in ["Score A", "Score B", "Score C", "Score D", "Score E"]:
for score_field in [
"Score A",
"Score B",
"Score C",
"Score D",
"Score E",
]:
self.df[f"{score_field}{self.PERCENTILE_FIELD_SUFFIX}"] = self.df[
score_field
].rank(pct=True)
@ -376,14 +403,8 @@ class ScoreETL(ExtractTransformLoad):
)
def load(self) -> None:
logger.info(f"Saving Score CSVs")
logger.info(f"Saving Score CSV")
# write nationwide csv
self.SCORE_CSV_PATH.mkdir(parents=True, exist_ok=True)
self.df.to_csv(self.SCORE_CSV_PATH / f"usa.csv", index=False)
# write per state csvs
for states_fips in get_state_fips_codes(self.DATA_PATH):
logger.info(f"Generating data{states_fips} csv")
df1 = self.df[self.df["GEOID10"].str[:2] == states_fips]
# we need to name the file data01.csv for ogr2ogr csv merge to work
df1.to_csv(self.SCORE_CSV_PATH / f"data{states_fips}.csv", index=False)
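
The tract and county joins in this file and in PostScoreETL below both rely on the structure of the 12-character census block group GEOID10: two characters of state FIPS, three of county, six of tract, and one block group digit. Slicing it therefore yields the coarser geographies, as in this small illustration (the GEOID value is made up):

geoid10 = "010730059033"          # 12-character block group GEOID (illustrative)
state_fips = geoid10[:2]          # "01"          -> state (Alabama)
county_geoid = geoid10[:5]        # "01073"       -> state + county, joined to county names
tract_geoid = geoid10[:11]        # "01073005903" -> state + county + tract, joined to HUD housing data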

View file

@ -0,0 +1,112 @@
import pandas as pd
from etl.base import ExtractTransformLoad
from utils import get_module_logger
logger = get_module_logger(__name__)
class PostScoreETL(ExtractTransformLoad):
"""
A class used to instantiate an ETL object to retrieve and process data from
datasets.
"""
def __init__(self):
self.CENSUS_COUNTIES_ZIP_URL = "https://www2.census.gov/geo/docs/maps-data/data/gazetteer/Gaz_counties_national.zip"
self.CENSUS_COUNTIES_TXT = self.TMP_PATH / "Gaz_counties_national.txt"
self.CENSUS_COUNTIES_COLS = ["USPS", "GEOID", "NAME"]
self.SCORE_CSV_PATH = self.DATA_PATH / "score" / "csv"
self.STATE_CSV = (
self.DATA_PATH / "census" / "csv" / "fips_states_2010.csv"
)
self.SCORE_CSV = self.SCORE_CSV_PATH / "full" / "usa.csv"
self.COUNTY_SCORE_CSV = self.SCORE_CSV_PATH / "full" / "usa-county.csv"
self.TILES_SCORE_COLUMNS = [
"GEOID10",
"Score E (percentile)",
"Score E (top 25th percentile)",
"GEOID",
"State Abbreviation",
"County Name",
]
self.TILES_SCORE_CSV_PATH = self.SCORE_CSV_PATH / "tiles"
self.TILES_SCORE_CSV = self.TILES_SCORE_CSV_PATH / "usa.csv"
self.counties_df: pd.DataFrame
self.states_df: pd.DataFrame
self.score_df: pd.DataFrame
self.score_county_state_merged: pd.DataFrame
self.score_for_tiles: pd.DataFrame
def extract(self) -> None:
super().extract(
self.CENSUS_COUNTIES_ZIP_URL,
self.TMP_PATH,
)
logger.info(f"Reading Counties CSV")
self.counties_df = pd.read_csv(
self.CENSUS_COUNTIES_TXT,
sep="\t",
dtype={"GEOID": "string", "USPS": "string"},
low_memory=False,
encoding="latin-1",
)
logger.info(f"Reading States CSV")
self.states_df = pd.read_csv(
self.STATE_CSV, dtype={"fips": "string", "state_code": "string"}
)
self.score_df = pd.read_csv(self.SCORE_CSV, dtype={"GEOID10": "string"})
def transform(self) -> None:
logger.info(f"Transforming data sources for Score + County CSV")
# rename some of the columns to prepare for merge
self.counties_df = self.counties_df[["USPS", "GEOID", "NAME"]]
self.counties_df.rename(
columns={"USPS": "State Abbreviation", "NAME": "County Name"},
inplace=True,
)
# remove unnecessary columns
self.states_df.rename(
columns={
"fips": "State Code",
"state_name": "State Name",
"state_abbreviation": "State Abbreviation",
},
inplace=True,
)
self.states_df.drop(["region", "division"], axis=1, inplace=True)
# add the tract level column
self.score_df["GEOID"] = self.score_df.GEOID10.str[:5]
# merge state and counties
county_state_merged = self.counties_df.join(
self.states_df, rsuffix=" Other"
)
del county_state_merged["State Abbreviation Other"]
# merge county and score
self.score_county_state_merged = self.score_df.join(
county_state_merged, rsuffix="_OTHER"
)
del self.score_county_state_merged["GEOID_OTHER"]
def load(self) -> None:
logger.info(f"Saving Score + County CSV")
self.SCORE_CSV_PATH.mkdir(parents=True, exist_ok=True)
# self.score_county_state_merged.to_csv(
# self.COUNTY_SCORE_CSV, index=False
# )
logger.info(f"Saving Tile Score CSV")
# TODO: check which are the columns we'll use
# Related to: https://github.com/usds/justice40-tool/issues/302
score_tiles = self.score_county_state_merged[self.TILES_SCORE_COLUMNS]
self.TILES_SCORE_CSV_PATH.mkdir(parents=True, exist_ok=True)
score_tiles.to_csv(self.TILES_SCORE_CSV, index=False)
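
The tiles CSV keeps only the handful of columns the map tiles need: the block group GEOID10, the Score E percentile and top-25th-percentile flag, and the county GEOID, state abbreviation, and county name. A quick way to sanity-check the output once it has been written, assuming the data directory sits at data/ as in the paths above:

import pandas as pd

tiles = pd.read_csv(
    "data/score/csv/tiles/usa.csv",
    dtype={"GEOID10": "string", "GEOID": "string"},
)
# The column order should match TILES_SCORE_COLUMNS above.
assert list(tiles.columns) == [
    "GEOID10",
    "Score E (percentile)",
    "Score E (top 25th percentile)",
    "GEOID",
    "State Abbreviation",
    "County Name",
]
# Every county GEOID is the first five characters of its block group GEOID10.
assert (tiles["GEOID"] == tiles["GEOID10"].str[:5]).all()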

View file

@ -11,10 +11,14 @@ logger = get_module_logger(__name__)
class CensusACSETL(ExtractTransformLoad):
def __init__(self):
self.ACS_YEAR = 2019
self.OUTPUT_PATH = self.DATA_PATH / "dataset" / f"census_acs_{self.ACS_YEAR}"
self.OUTPUT_PATH = (
self.DATA_PATH / "dataset" / f"census_acs_{self.ACS_YEAR}"
)
self.UNEMPLOYED_FIELD_NAME = "Unemployed civilians (percent)"
self.LINGUISTIC_ISOLATION_FIELD_NAME = "Linguistic isolation (percent)"
self.LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME = "Linguistic isolation (total)"
self.LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME = (
"Linguistic isolation (total)"
)
self.LINGUISTIC_ISOLATION_FIELDS = [
"C16002_001E",
"C16002_004E",
@ -24,7 +28,9 @@ class CensusACSETL(ExtractTransformLoad):
]
self.df: pd.DataFrame
def _fips_from_censusdata_censusgeo(self, censusgeo: censusdata.censusgeo) -> str:
def _fips_from_censusdata_censusgeo(
self, censusgeo: censusdata.censusgeo
) -> str:
"""Create a FIPS code from the proprietary censusgeo index."""
fips = "".join([value for (key, value) in censusgeo.params()])
return fips
@ -32,7 +38,9 @@ class CensusACSETL(ExtractTransformLoad):
def extract(self) -> None:
dfs = []
for fips in get_state_fips_codes(self.DATA_PATH):
logger.info(f"Downloading data for state/territory with FIPS code {fips}")
logger.info(
f"Downloading data for state/territory with FIPS code {fips}"
)
dfs.append(
censusdata.download(
@ -61,7 +69,9 @@ class CensusACSETL(ExtractTransformLoad):
# Calculate percent unemployment.
# TODO: remove small-sample data that should be `None` instead of a high-variance fraction.
self.df[self.UNEMPLOYED_FIELD_NAME] = self.df.B23025_005E / self.df.B23025_003E
self.df[self.UNEMPLOYED_FIELD_NAME] = (
self.df.B23025_005E / self.df.B23025_003E
)
# Calculate linguistic isolation.
individual_limited_english_fields = [
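
The unemployment figure divides two variables from ACS table B23025 (employment status for the population 16 years and over): B23025_005E, unemployed civilians, by B23025_003E, the civilian labor force. A rough sketch of the per-state download that the hunks above perform, assuming the censusdata package's download/censusgeo signatures and an illustrative variable list:

import censusdata

acs_vars = ["B23025_003E", "B23025_005E"]  # civilian labor force, unemployed civilians
geo = censusdata.censusgeo(
    [("state", "01"), ("county", "*"), ("tract", "*"), ("block group", "*")]
)
df = censusdata.download("acs5", 2019, geo, acs_vars)
df["Unemployed civilians (percent)"] = df.B23025_005E / df.B23025_003E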

View file

@ -8,11 +8,9 @@ logger = get_module_logger(__name__)
class EJScreenETL(ExtractTransformLoad):
def __init__(self):
self.EJSCREEN_FTP_URL = (
"https://gaftp.epa.gov/EJSCREEN/2020/EJSCREEN_2020_StatePctile.csv.zip"
)
self.EJSCREEN_CSV = self.TMP_PATH / "EJSCREEN_2020_StatePctile.csv"
self.CSV_PATH = self.DATA_PATH / "dataset" / "ejscreen_2020"
self.EJSCREEN_FTP_URL = "https://gaftp.epa.gov/EJSCREEN/2019/EJSCREEN_2019_StatePctile.csv.zip"
self.EJSCREEN_CSV = self.TMP_PATH / "EJSCREEN_2019_StatePctile.csv"
self.CSV_PATH = self.DATA_PATH / "dataset" / "ejscreen_2019"
self.df: pd.DataFrame
def extract(self) -> None:

View file

@ -0,0 +1,161 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "7185e18d",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import csv\n",
"from pathlib import Path\n",
"import os\n",
"import sys"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "174bbd09",
"metadata": {},
"outputs": [],
"source": [
"module_path = os.path.abspath(os.path.join(\"..\"))\n",
"if module_path not in sys.path:\n",
" sys.path.append(module_path)\n",
" \n",
"from utils import unzip_file_from_url\n",
"from etl.sources.census.etl_utils import get_state_fips_codes"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dd090fcc",
"metadata": {},
"outputs": [],
"source": [
"DATA_PATH = Path.cwd().parent / \"data\"\n",
"TMP_PATH: Path = DATA_PATH / \"tmp\"\n",
"STATE_CSV = DATA_PATH / \"census\" / \"csv\" / \"fips_states_2010.csv\"\n",
"SCORE_CSV = DATA_PATH / \"score\" / \"csv\" / \"usa.csv\"\n",
"COUNTY_SCORE_CSV = DATA_PATH / \"score\" / \"csv\" / \"usa-county.csv\"\n",
"CENSUS_COUNTIES_ZIP_URL = \"https://www2.census.gov/geo/docs/maps-data/data/gazetteer/2020_Gazetteer/2020_Gaz_counties_national.zip\"\n",
"CENSUS_COUNTIES_TXT = TMP_PATH / \"2020_Gaz_counties_national.txt\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cf2e266b",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"unzip_file_from_url(CENSUS_COUNTIES_ZIP_URL, TMP_PATH, TMP_PATH)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9ff96da8",
"metadata": {},
"outputs": [],
"source": [
"counties_df = pd.read_csv(CENSUS_COUNTIES_TXT, sep=\"\\t\", dtype={\"GEOID\": \"string\", \"USPS\": \"string\"}, low_memory=False)\n",
"counties_df = counties_df[['USPS', 'GEOID', 'NAME']]\n",
"counties_df.rename(columns={\"USPS\": \"State Abbreviation\", \"NAME\": \"County Name\"}, inplace=True)\n",
"counties_df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5af103da",
"metadata": {},
"outputs": [],
"source": [
"states_df = pd.read_csv(STATE_CSV, dtype={\"fips\": \"string\", \"state_abbreviation\": \"string\"})\n",
"states_df.rename(columns={\"fips\": \"State Code\", \"state_name\": \"State Name\", \"state_abbreviation\": \"State Abbreviation\"}, inplace=True)\n",
"states_df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c8680258",
"metadata": {},
"outputs": [],
"source": [
"county_state_merged = counties_df.join(states_df, rsuffix=' Other')\n",
"del county_state_merged[\"State Abbreviation Other\"]\n",
"county_state_merged.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "58dca55a",
"metadata": {},
"outputs": [],
"source": [
"score_df = pd.read_csv(SCORE_CSV, dtype={\"GEOID10\": \"string\"})\n",
"score_df[\"GEOID\"] = score_df.GEOID10.str[:5]\n",
"score_df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "45e04d42",
"metadata": {},
"outputs": [],
"source": [
"score_county_state_merged = score_df.join(county_state_merged, rsuffix='_OTHER')\n",
"del score_county_state_merged[\"GEOID_OTHER\"]\n",
"score_county_state_merged.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a5a0b32b",
"metadata": {},
"outputs": [],
"source": [
"score_county_state_merged.to_csv(COUNTY_SCORE_CSV, index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b690937e",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View file

@ -120,6 +120,7 @@ def unzip_file_from_url(
logger.info(f"Downloading {file_url}")
download = requests.get(file_url, verify=verify)
file_contents = download.content
zip_file_path = download_path / "downloaded.zip"
zip_file = open(zip_file_path, "wb")
zip_file.write(file_contents)
@ -148,8 +149,8 @@ def score_folder_cleanup() -> None:
data_path = settings.APP_ROOT / "data"
logger.info(f"Initializing all score data")
remove_files_from_dir(data_path / "score" / "csv", ".csv")
remove_files_from_dir(data_path / "score" / "geojson", ".json")
remove_all_from_dir(data_path / "score" / "csv")
remove_all_from_dir(data_path / "score" / "geojson")
def temp_folder_cleanup() -> None:
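
The switch from remove_files_from_dir to remove_all_from_dir follows from the new layout: score output now lives in full/ and tiles/ subfolders, so clearing only loose .csv files under data/score/csv would leave stale subdirectories behind. The helper's implementation is not shown in this diff; a hypothetical sketch of what such a function might do with shutil:

import shutil
from pathlib import Path

def remove_all_from_dir(files_path: Path) -> None:
    # Illustrative only - delete every file and subdirectory under files_path.
    for item in files_path.iterdir():
        if item.is_dir():
            shutil.rmtree(item)
        else:
            item.unlink()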