County mapping to Tracts (#923)

* County mapping to Tracts

* passing tests

* Remove the last traces of CBG (census block group) naming
This commit is contained in:
Jorge Escobar 2021-11-26 11:23:40 -05:00 committed by lucasmbrown-usds
commit cc7bf0d73d
11 changed files with 50 additions and 44 deletions

View file

@ -59,7 +59,7 @@ SCORE_DOWNLOADABLE_ZIP_FILE_PATH = (
# Column subsets
CENSUS_COUNTIES_COLUMNS = ["USPS", "GEOID", "NAME"]
TILES_SCORE_COLUMNS = [
"GEOID10",
"GEOID10_TRACT",
"State Name",
"County Name",
"Total population",
@ -155,7 +155,7 @@ DOWNLOADABLE_SCORE_INDICATOR_COLUMNS_FULL = list(
# Finally we augment with the tract GEOID (GEOID10_TRACT), county, and state
DOWNLOADABLE_SCORE_COLUMNS = [
"GEOID10",
"GEOID10_TRACT",
"County Name",
"State Name",
"Score G (communities)",

View file

@ -23,7 +23,7 @@ class PostScoreETL(ExtractTransformLoad):
self.input_counties_df: pd.DataFrame
self.input_states_df: pd.DataFrame
self.input_score_df: pd.DataFrame
self.input_national_cbg_df: pd.DataFrame
self.input_national_tract_df: pd.DataFrame
self.output_score_county_state_merged_df: pd.DataFrame
self.output_score_tiles_df: pd.DataFrame
@ -50,7 +50,9 @@ class PostScoreETL(ExtractTransformLoad):
def _extract_score(self, score_path: Path) -> pd.DataFrame:
logger.info("Reading Score CSV")
df = pd.read_csv(score_path, dtype={"GEOID10": "string"})
df = pd.read_csv(
score_path, dtype={self.GEOID_TRACT_FIELD_NAME: "string"}
)
# Convert total population to an int:
df["Total population"] = df["Total population"].astype(
@ -59,12 +61,14 @@ class PostScoreETL(ExtractTransformLoad):
return df
def _extract_national_cbg(self, national_cbg_path: Path) -> pd.DataFrame:
logger.info("Reading national CBG")
def _extract_national_tract(
self, national_tract_path: Path
) -> pd.DataFrame:
logger.info("Reading national tract file")
return pd.read_csv(
national_cbg_path,
names=["GEOID10"],
dtype={"GEOID10": "string"},
national_tract_path,
names=[self.GEOID_TRACT_FIELD_NAME],
dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
low_memory=False,
header=None,
)
@ -91,7 +95,7 @@ class PostScoreETL(ExtractTransformLoad):
self.input_score_df = self._extract_score(
constants.DATA_SCORE_CSV_FULL_FILE_PATH
)
self.input_national_cbg_df = self._extract_national_cbg(
self.input_national_tract_df = self._extract_national_tract(
constants.DATA_CENSUS_CSV_FILE_PATH
)
@ -130,21 +134,22 @@ class PostScoreETL(ExtractTransformLoad):
def _transform_score(self, initial_score_df: pd.DataFrame) -> pd.DataFrame:
"""
Necessary modifications to the score dataframe
Add the GEOID field to the score dataframe to do the merge with counties
"""
# Add the tract level column
new_df = initial_score_df.copy()
new_df["GEOID"] = initial_score_df.GEOID10.str[:5]
return new_df
# add GEOID column for counties
initial_score_df["GEOID"] = initial_score_df[
self.GEOID_TRACT_FIELD_NAME
].str[:5]
return initial_score_df
def _create_score_data(
self,
national_cbg_df: pd.DataFrame,
national_tract_df: pd.DataFrame,
counties_df: pd.DataFrame,
states_df: pd.DataFrame,
score_df: pd.DataFrame,
) -> pd.DataFrame:
# merge state with counties
logger.info("Merging state with county info")
county_state_merged = counties_df.merge(
@ -153,15 +158,19 @@ class PostScoreETL(ExtractTransformLoad):
# merge state + county with score
score_county_state_merged = score_df.merge(
county_state_merged, on="GEOID", how="left"
county_state_merged,
on="GEOID", # GEOID is the county ID
how="left",
)
# check if there are census cbgs without score
logger.info("Removing CBG rows without score")
# check if there are census tracts without score
logger.info("Removing tract rows without score")
# merge census cbgs with score
merged_df = national_cbg_df.merge(
score_county_state_merged, on="GEOID10", how="left"
# merge census tracts with score
merged_df = national_tract_df.merge(
score_county_state_merged,
on=self.GEOID_TRACT_FIELD_NAME,
how="left",
)
# recast population to integer
@ -169,14 +178,14 @@ class PostScoreETL(ExtractTransformLoad):
merged_df["Total population"].fillna(0.0).astype(int)
)
# list the null score cbgs
null_cbg_df = merged_df[merged_df["Score E (percentile)"].isnull()]
# list the null score tracts
null_tract_df = merged_df[merged_df["Score E (percentile)"].isnull()]
# subtract data sets
# this follows the XOR pattern outlined here:
# https://stackoverflow.com/a/37313953
de_duplicated_df = pd.concat(
[merged_df, null_cbg_df, null_cbg_df]
[merged_df, null_tract_df, null_tract_df]
).drop_duplicates(keep=False)
# set the score to the new df
@ -212,7 +221,7 @@ class PostScoreETL(ExtractTransformLoad):
transformed_score = self._transform_score(self.input_score_df)
output_score_county_state_merged_df = self._create_score_data(
self.input_national_cbg_df,
self.input_national_tract_df,
transformed_counties,
transformed_states,
transformed_score,

View file

@ -75,11 +75,9 @@ def score_pdf_initial(sample_data_dir):
def counties_transformed_expected():
return pd.DataFrame.from_dict(
data={
"State Abbreviation": pd.Series(["AL", "AL"], dtype="string"),
"GEOID": pd.Series(["01001", "01003"], dtype="string"),
"County Name": pd.Series(
["AutaugaCounty", "BaldwinCounty"], dtype="object"
),
"State Abbreviation": pd.Series(["AL"], dtype="string"),
"GEOID": pd.Series(["01073"], dtype="string"),
"County Name": pd.Series(["Jefferson County"], dtype="object"),
},
)
@ -105,11 +103,11 @@ def score_transformed_expected():
@pytest.fixture()
def national_cbg_df():
def national_tract_df():
return pd.DataFrame.from_dict(
data={
"GEOID10": pd.Series(
["010010201001", "010010201002"], dtype="string"
"GEOID10_TRACT": pd.Series(
["01073001100", "01073001400"], dtype="string"
),
},
)

View file

@ -1,3 +1,2 @@
USPS GEOID ANSICODE NAME POP10 HU10 ALAND AWATER ALAND_SQMI AWATER_SQMI INTPTLAT INTPTLONG
AL 01001 00161526 AutaugaCounty 54571 22135 1539582278 25775735 594.436 9.952 32.536382 -86.644490
AL 01003 00161527 BaldwinCounty 182265 104061 4117521611 1133190229 1589.784 437.527 30.659218 -87.746067
AL 01073 00161562 Jefferson County 658466 300552 2878192209 32474487 1111.276 12.538 33.553444 -86.896536

1 USPS GEOID ANSICODE NAME POP10 HU10 ALAND AWATER ALAND_SQMI AWATER_SQMI INTPTLAT INTPTLONG
2 - AL 01001 00161526 AutaugaCounty 54571 22135 1539582278 25775735 594.436 9.952 32.536382 -86.644490
2 + AL 01073 00161562 Jefferson County 658466 300552 2878192209 32474487 1111.276 12.538 33.553444 -86.896536
3 - AL 01003 00161527 BaldwinCounty 182265 104061 4117521611 1133190229 1589.784 437.527 30.659218 -87.746067

File diff suppressed because one or more lines are too long

View file

@ -29,7 +29,7 @@ def test_extract_states(etl, state_data_initial):
def test_extract_score(etl, score_data_initial):
extracted = etl._extract_score(score_data_initial)
string_cols = ["GEOID10"]
string_cols = ["GEOID10_TRACT"]
assert all(ptypes.is_string_dtype(extracted[col]) for col in string_cols)
@ -63,14 +63,14 @@ def test_transform_score(etl, score_data_initial, score_transformed_expected):
# pylint: disable=too-many-arguments
def test_create_score_data(
etl,
national_cbg_df,
national_tract_df,
counties_transformed_expected,
states_transformed_expected,
score_transformed_expected,
score_data_expected,
):
score_data_actual = etl._create_score_data(
national_cbg_df,
national_tract_df,
counties_transformed_expected,
states_transformed_expected,
score_transformed_expected,