mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-02-23 10:04:18 -08:00
Updating tiles csv to include state code (#1272)
Adding state codes for island areas and Puerto Rico to the tiles CSV.
This commit is contained in:
parent
4517db6229
commit
fab828dc66
2 changed files with 30 additions and 12 deletions
|
@ -27,6 +27,8 @@ class PostScoreETL(ExtractTransformLoad):
|
||||||
datasets.
|
datasets.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
STATE_CODE_COLUMN = "State Code"
|
||||||
|
|
||||||
def __init__(self, data_source: str = None):
|
def __init__(self, data_source: str = None):
|
||||||
self.DATA_SOURCE = data_source
|
self.DATA_SOURCE = data_source
|
||||||
self.input_counties_df: pd.DataFrame
|
self.input_counties_df: pd.DataFrame
|
||||||
|
@ -54,7 +56,9 @@ class PostScoreETL(ExtractTransformLoad):
|
||||||
def _extract_states(self, state_path: Path) -> pd.DataFrame:
|
def _extract_states(self, state_path: Path) -> pd.DataFrame:
|
||||||
logger.info("Reading States CSV")
|
logger.info("Reading States CSV")
|
||||||
return pd.read_csv(
|
return pd.read_csv(
|
||||||
state_path, dtype={"fips": "string", "state_abbreviation": "string"}
|
state_path,
|
||||||
|
dtype={"fips": "string", "state_abbreviation": "string"},
|
||||||
|
usecols=["fips", "state_name", "state_abbreviation"],
|
||||||
)
|
)
|
||||||
|
|
||||||
def _extract_score(self, score_path: Path) -> pd.DataFrame:
|
def _extract_score(self, score_path: Path) -> pd.DataFrame:
|
||||||
|
@ -133,12 +137,11 @@ class PostScoreETL(ExtractTransformLoad):
|
||||||
# remove unnecessary columns
|
# remove unnecessary columns
|
||||||
new_df = initial_states_df.rename(
|
new_df = initial_states_df.rename(
|
||||||
columns={
|
columns={
|
||||||
"fips": "State Code",
|
"fips": self.STATE_CODE_COLUMN,
|
||||||
"state_name": field_names.STATE_FIELD,
|
"state_name": field_names.STATE_FIELD,
|
||||||
"state_abbreviation": "State Abbreviation",
|
"state_abbreviation": "State Abbreviation",
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
new_df.drop(["region", "division"], axis=1, inplace=True)
|
|
||||||
return new_df
|
return new_df
|
||||||
|
|
||||||
def _transform_score(self, initial_score_df: pd.DataFrame) -> pd.DataFrame:
|
def _transform_score(self, initial_score_df: pd.DataFrame) -> pd.DataFrame:
|
||||||
|
@ -159,16 +162,31 @@ class PostScoreETL(ExtractTransformLoad):
|
||||||
states_df: pd.DataFrame,
|
states_df: pd.DataFrame,
|
||||||
score_df: pd.DataFrame,
|
score_df: pd.DataFrame,
|
||||||
) -> pd.DataFrame:
|
) -> pd.DataFrame:
|
||||||
# merge state with counties
|
|
||||||
logger.info("Merging state with county info")
|
logger.info("Merging county info with score info")
|
||||||
county_state_merged = counties_df.merge(
|
score_county_merged = score_df.merge(
|
||||||
states_df, on="State Abbreviation", how="left"
|
# We drop state abbreviation so we don't get it twice
|
||||||
|
counties_df[["GEOID", "County Name"]],
|
||||||
|
on="GEOID", # GEOID is the county ID
|
||||||
|
how="left",
|
||||||
)
|
)
|
||||||
|
|
||||||
# merge state + county with score
|
logger.info("Merging state info with county-score info")
|
||||||
score_county_state_merged = score_df.merge(
|
# Here, we need to join on a separate key, since there's no
|
||||||
county_state_merged,
|
# entry for the island areas in the counties df (there are no
|
||||||
on="GEOID", # GEOID is the county ID
|
# counties!) Thus, unless we join state separately from county,
|
||||||
|
# when we join on GEOID, we lose information about the islands
|
||||||
|
score_county_merged[self.STATE_CODE_COLUMN] = score_county_merged[
|
||||||
|
self.GEOID_TRACT_FIELD_NAME
|
||||||
|
].str[:2]
|
||||||
|
# TODO: For future reference, we could also refactor this code so that
|
||||||
|
# the FIPS / State or Territory / County info gets created as an ETL
|
||||||
|
# process and joined in etl_score, rather than added in post like this.
|
||||||
|
# That would be a bit more consistent and automatically parallelized
|
||||||
|
score_county_state_merged = score_county_merged.merge(
|
||||||
|
states_df,
|
||||||
|
left_on=self.STATE_CODE_COLUMN,
|
||||||
|
right_on=self.STATE_CODE_COLUMN,
|
||||||
how="left",
|
how="left",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -184,7 +202,7 @@ class PostScoreETL(ExtractTransformLoad):
|
||||||
|
|
||||||
# recast population to integer
|
# recast population to integer
|
||||||
score_county_state_merged["Total population"] = (
|
score_county_state_merged["Total population"] = (
|
||||||
merged_df["Total population"].fillna(0.0).astype(int)
|
merged_df["Total population"].fillna(0).astype(int)
|
||||||
)
|
)
|
||||||
|
|
||||||
de_duplicated_df = merged_df.dropna(
|
de_duplicated_df = merged_df.dropna(
|
||||||
|
|
Binary file not shown.
Loading…
Add table
Reference in a new issue