Updating tiles CSV to include state code (#1272)

Adding state codes for island areas and Puerto Rico to the tiles CSV.
This commit is contained in:
Emma Nechamkin 2022-02-25 11:10:09 -05:00 committed by GitHub
parent 4517db6229
commit fab828dc66
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 30 additions and 12 deletions

View file

@ -27,6 +27,8 @@ class PostScoreETL(ExtractTransformLoad):
datasets. datasets.
""" """
STATE_CODE_COLUMN = "State Code"
def __init__(self, data_source: str = None): def __init__(self, data_source: str = None):
self.DATA_SOURCE = data_source self.DATA_SOURCE = data_source
self.input_counties_df: pd.DataFrame self.input_counties_df: pd.DataFrame
@ -54,7 +56,9 @@ class PostScoreETL(ExtractTransformLoad):
def _extract_states(self, state_path: Path) -> pd.DataFrame: def _extract_states(self, state_path: Path) -> pd.DataFrame:
logger.info("Reading States CSV") logger.info("Reading States CSV")
return pd.read_csv( return pd.read_csv(
state_path, dtype={"fips": "string", "state_abbreviation": "string"} state_path,
dtype={"fips": "string", "state_abbreviation": "string"},
usecols=["fips", "state_name", "state_abbreviation"],
) )
def _extract_score(self, score_path: Path) -> pd.DataFrame: def _extract_score(self, score_path: Path) -> pd.DataFrame:
@ -133,12 +137,11 @@ class PostScoreETL(ExtractTransformLoad):
# remove unnecessary columns # remove unnecessary columns
new_df = initial_states_df.rename( new_df = initial_states_df.rename(
columns={ columns={
"fips": "State Code", "fips": self.STATE_CODE_COLUMN,
"state_name": field_names.STATE_FIELD, "state_name": field_names.STATE_FIELD,
"state_abbreviation": "State Abbreviation", "state_abbreviation": "State Abbreviation",
} }
) )
new_df.drop(["region", "division"], axis=1, inplace=True)
return new_df return new_df
def _transform_score(self, initial_score_df: pd.DataFrame) -> pd.DataFrame: def _transform_score(self, initial_score_df: pd.DataFrame) -> pd.DataFrame:
@ -159,16 +162,31 @@ class PostScoreETL(ExtractTransformLoad):
states_df: pd.DataFrame, states_df: pd.DataFrame,
score_df: pd.DataFrame, score_df: pd.DataFrame,
) -> pd.DataFrame: ) -> pd.DataFrame:
# merge state with counties
logger.info("Merging state with county info") logger.info("Merging county info with score info")
county_state_merged = counties_df.merge( score_county_merged = score_df.merge(
states_df, on="State Abbreviation", how="left" # We drop state abbreviation so we don't get it twice
counties_df[["GEOID", "County Name"]],
on="GEOID", # GEOID is the county ID
how="left",
) )
# merge state + county with score logger.info("Merging state info with county-score info")
score_county_state_merged = score_df.merge( # Here, we need to join on a separate key, since there's no
county_state_merged, # entry for the island areas in the counties df (there are no
on="GEOID", # GEOID is the county ID # counties!) Thus, unless we join state separately from county,
# when we join on GEOID, we lose information about the islands
score_county_merged[self.STATE_CODE_COLUMN] = score_county_merged[
self.GEOID_TRACT_FIELD_NAME
].str[:2]
# TODO: For future reference, we could also refactor this code so that
# the FIPS / State or Territory / County info gets created as an ETL
# process and joined in etl_score, rather than added in post like this.
# That would be a bit more consistent and automatically parallelized
score_county_state_merged = score_county_merged.merge(
states_df,
left_on=self.STATE_CODE_COLUMN,
right_on=self.STATE_CODE_COLUMN,
how="left", how="left",
) )
@ -184,7 +202,7 @@ class PostScoreETL(ExtractTransformLoad):
# recast population to integer # recast population to integer
score_county_state_merged["Total population"] = ( score_county_state_merged["Total population"] = (
merged_df["Total population"].fillna(0.0).astype(int) merged_df["Total population"].fillna(0).astype(int)
) )
de_duplicated_df = merged_df.dropna( de_duplicated_df = merged_df.dropna(