mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-07-25 08:20:16 -07:00
County mapping to Tracts (#923)
* County mapping to Tracts * passing tests * last traces of cbg
This commit is contained in:
parent
a4108d24c0
commit
cc7bf0d73d
11 changed files with 50 additions and 44 deletions
|
@ -59,7 +59,7 @@ SCORE_DOWNLOADABLE_ZIP_FILE_PATH = (
|
|||
# Column subsets
|
||||
CENSUS_COUNTIES_COLUMNS = ["USPS", "GEOID", "NAME"]
|
||||
TILES_SCORE_COLUMNS = [
|
||||
"GEOID10",
|
||||
"GEOID10_TRACT",
|
||||
"State Name",
|
||||
"County Name",
|
||||
"Total population",
|
||||
|
@ -155,7 +155,7 @@ DOWNLOADABLE_SCORE_INDICATOR_COLUMNS_FULL = list(
|
|||
|
||||
# Finally we augment with the GEOID10, county, and state
|
||||
DOWNLOADABLE_SCORE_COLUMNS = [
|
||||
"GEOID10",
|
||||
"GEOID10_TRACT",
|
||||
"County Name",
|
||||
"State Name",
|
||||
"Score G (communities)",
|
||||
|
|
|
@ -23,7 +23,7 @@ class PostScoreETL(ExtractTransformLoad):
|
|||
self.input_counties_df: pd.DataFrame
|
||||
self.input_states_df: pd.DataFrame
|
||||
self.input_score_df: pd.DataFrame
|
||||
self.input_national_cbg_df: pd.DataFrame
|
||||
self.input_national_tract_df: pd.DataFrame
|
||||
|
||||
self.output_score_county_state_merged_df: pd.DataFrame
|
||||
self.output_score_tiles_df: pd.DataFrame
|
||||
|
@ -50,7 +50,9 @@ class PostScoreETL(ExtractTransformLoad):
|
|||
|
||||
def _extract_score(self, score_path: Path) -> pd.DataFrame:
|
||||
logger.info("Reading Score CSV")
|
||||
df = pd.read_csv(score_path, dtype={"GEOID10": "string"})
|
||||
df = pd.read_csv(
|
||||
score_path, dtype={self.GEOID_TRACT_FIELD_NAME: "string"}
|
||||
)
|
||||
|
||||
# Convert total population to an int:
|
||||
df["Total population"] = df["Total population"].astype(
|
||||
|
@ -59,12 +61,14 @@ class PostScoreETL(ExtractTransformLoad):
|
|||
|
||||
return df
|
||||
|
||||
def _extract_national_cbg(self, national_cbg_path: Path) -> pd.DataFrame:
|
||||
logger.info("Reading national CBG")
|
||||
def _extract_national_tract(
|
||||
self, national_tract_path: Path
|
||||
) -> pd.DataFrame:
|
||||
logger.info("Reading national tract file")
|
||||
return pd.read_csv(
|
||||
national_cbg_path,
|
||||
names=["GEOID10"],
|
||||
dtype={"GEOID10": "string"},
|
||||
national_tract_path,
|
||||
names=[self.GEOID_TRACT_FIELD_NAME],
|
||||
dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
|
||||
low_memory=False,
|
||||
header=None,
|
||||
)
|
||||
|
@ -91,7 +95,7 @@ class PostScoreETL(ExtractTransformLoad):
|
|||
self.input_score_df = self._extract_score(
|
||||
constants.DATA_SCORE_CSV_FULL_FILE_PATH
|
||||
)
|
||||
self.input_national_cbg_df = self._extract_national_cbg(
|
||||
self.input_national_tract_df = self._extract_national_tract(
|
||||
constants.DATA_CENSUS_CSV_FILE_PATH
|
||||
)
|
||||
|
||||
|
@ -130,21 +134,22 @@ class PostScoreETL(ExtractTransformLoad):
|
|||
|
||||
def _transform_score(self, initial_score_df: pd.DataFrame) -> pd.DataFrame:
|
||||
"""
|
||||
Necessary modifications to the score dataframe
|
||||
Add the GEOID field to the score dataframe to do the merge with counties
|
||||
"""
|
||||
# Add the tract level column
|
||||
new_df = initial_score_df.copy()
|
||||
new_df["GEOID"] = initial_score_df.GEOID10.str[:5]
|
||||
return new_df
|
||||
# add GEOID column for counties
|
||||
initial_score_df["GEOID"] = initial_score_df[
|
||||
self.GEOID_TRACT_FIELD_NAME
|
||||
].str[:5]
|
||||
|
||||
return initial_score_df
|
||||
|
||||
def _create_score_data(
|
||||
self,
|
||||
national_cbg_df: pd.DataFrame,
|
||||
national_tract_df: pd.DataFrame,
|
||||
counties_df: pd.DataFrame,
|
||||
states_df: pd.DataFrame,
|
||||
score_df: pd.DataFrame,
|
||||
) -> pd.DataFrame:
|
||||
|
||||
# merge state with counties
|
||||
logger.info("Merging state with county info")
|
||||
county_state_merged = counties_df.merge(
|
||||
|
@ -153,15 +158,19 @@ class PostScoreETL(ExtractTransformLoad):
|
|||
|
||||
# merge state + county with score
|
||||
score_county_state_merged = score_df.merge(
|
||||
county_state_merged, on="GEOID", how="left"
|
||||
county_state_merged,
|
||||
on="GEOID", # GEOID is the county ID
|
||||
how="left",
|
||||
)
|
||||
|
||||
# check if there are census cbgs without score
|
||||
logger.info("Removing CBG rows without score")
|
||||
# check if there are census tracts without score
|
||||
logger.info("Removing tract rows without score")
|
||||
|
||||
# merge census cbgs with score
|
||||
merged_df = national_cbg_df.merge(
|
||||
score_county_state_merged, on="GEOID10", how="left"
|
||||
# merge census tracts with score
|
||||
merged_df = national_tract_df.merge(
|
||||
score_county_state_merged,
|
||||
on=self.GEOID_TRACT_FIELD_NAME,
|
||||
how="left",
|
||||
)
|
||||
|
||||
# recast population to integer
|
||||
|
@ -169,14 +178,14 @@ class PostScoreETL(ExtractTransformLoad):
|
|||
merged_df["Total population"].fillna(0.0).astype(int)
|
||||
)
|
||||
|
||||
# list the null score cbgs
|
||||
null_cbg_df = merged_df[merged_df["Score E (percentile)"].isnull()]
|
||||
# list the null score tracts
|
||||
null_tract_df = merged_df[merged_df["Score E (percentile)"].isnull()]
|
||||
|
||||
# subtract data sets
|
||||
# this follows the XOR pattern outlined here:
|
||||
# https://stackoverflow.com/a/37313953
|
||||
de_duplicated_df = pd.concat(
|
||||
[merged_df, null_cbg_df, null_cbg_df]
|
||||
[merged_df, null_tract_df, null_tract_df]
|
||||
).drop_duplicates(keep=False)
|
||||
|
||||
# set the score to the new df
|
||||
|
@ -212,7 +221,7 @@ class PostScoreETL(ExtractTransformLoad):
|
|||
transformed_score = self._transform_score(self.input_score_df)
|
||||
|
||||
output_score_county_state_merged_df = self._create_score_data(
|
||||
self.input_national_cbg_df,
|
||||
self.input_national_tract_df,
|
||||
transformed_counties,
|
||||
transformed_states,
|
||||
transformed_score,
|
||||
|
|
|
@ -75,11 +75,9 @@ def score_pdf_initial(sample_data_dir):
|
|||
def counties_transformed_expected():
|
||||
return pd.DataFrame.from_dict(
|
||||
data={
|
||||
"State Abbreviation": pd.Series(["AL", "AL"], dtype="string"),
|
||||
"GEOID": pd.Series(["01001", "01003"], dtype="string"),
|
||||
"County Name": pd.Series(
|
||||
["AutaugaCounty", "BaldwinCounty"], dtype="object"
|
||||
),
|
||||
"State Abbreviation": pd.Series(["AL"], dtype="string"),
|
||||
"GEOID": pd.Series(["01073"], dtype="string"),
|
||||
"County Name": pd.Series(["Jefferson County"], dtype="object"),
|
||||
},
|
||||
)
|
||||
|
||||
|
@ -105,11 +103,11 @@ def score_transformed_expected():
|
|||
|
||||
|
||||
@pytest.fixture()
|
||||
def national_cbg_df():
|
||||
def national_tract_df():
|
||||
return pd.DataFrame.from_dict(
|
||||
data={
|
||||
"GEOID10": pd.Series(
|
||||
["010010201001", "010010201002"], dtype="string"
|
||||
"GEOID10_TRACT": pd.Series(
|
||||
["01073001100", "01073001400"], dtype="string"
|
||||
),
|
||||
},
|
||||
)
|
||||
|
|
|
@ -1,3 +1,2 @@
|
|||
USPS GEOID ANSICODE NAME POP10 HU10 ALAND AWATER ALAND_SQMI AWATER_SQMI INTPTLAT INTPTLONG
|
||||
AL 01001 00161526 AutaugaCounty 54571 22135 1539582278 25775735 594.436 9.952 32.536382 -86.644490
|
||||
AL 01003 00161527 BaldwinCounty 182265 104061 4117521611 1133190229 1589.784 437.527 30.659218 -87.746067
|
||||
AL 01073 00161562 Jefferson County 658466 300552 2878192209 32474487 1111.276 12.538 33.553444 -86.896536
|
||||
|
|
|
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -29,7 +29,7 @@ def test_extract_states(etl, state_data_initial):
|
|||
|
||||
def test_extract_score(etl, score_data_initial):
|
||||
extracted = etl._extract_score(score_data_initial)
|
||||
string_cols = ["GEOID10"]
|
||||
string_cols = ["GEOID10_TRACT"]
|
||||
assert all(ptypes.is_string_dtype(extracted[col]) for col in string_cols)
|
||||
|
||||
|
||||
|
@ -63,14 +63,14 @@ def test_transform_score(etl, score_data_initial, score_transformed_expected):
|
|||
# pylint: disable=too-many-arguments
|
||||
def test_create_score_data(
|
||||
etl,
|
||||
national_cbg_df,
|
||||
national_tract_df,
|
||||
counties_transformed_expected,
|
||||
states_transformed_expected,
|
||||
score_transformed_expected,
|
||||
score_data_expected,
|
||||
):
|
||||
score_data_actual = etl._create_score_data(
|
||||
national_cbg_df,
|
||||
national_tract_df,
|
||||
counties_transformed_expected,
|
||||
states_transformed_expected,
|
||||
score_transformed_expected,
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue