County mapping to Tracts (#923)

* County mapping to Tracts

* passing tests

* last traces of cbg
This commit is contained in:
Jorge Escobar 2021-11-26 11:23:40 -05:00 committed by lucasmbrown-usds
commit cc7bf0d73d
11 changed files with 50 additions and 44 deletions

View file

@ -318,7 +318,7 @@ data_path = Path.cwd()
# score data expected # score data expected
score_csv_path = data_path / "data_pipeline" / "data" / "score" / "csv" / "full" / "usa.csv" score_csv_path = data_path / "data_pipeline" / "data" / "score" / "csv" / "full" / "usa.csv"
score_initial_df = pd.read_csv(score_csv_path, dtype={"GEOID10": "string"}, low_memory=False)[:2] score_initial_df = pd.read_csv(score_csv_path, dtype={"GEOID10_TRACT": "string"}, low_memory=False)[:2]
score_initial_df.to_csv(data_path / "data_pipeline" / "etl" / "score" / "tests" / "sample_data" /"score_data_initial.csv", index=False) score_initial_df.to_csv(data_path / "data_pipeline" / "etl" / "score" / "tests" / "sample_data" /"score_data_initial.csv", index=False)
``` ```

View file

@ -59,7 +59,7 @@ SCORE_DOWNLOADABLE_ZIP_FILE_PATH = (
# Column subsets # Column subsets
CENSUS_COUNTIES_COLUMNS = ["USPS", "GEOID", "NAME"] CENSUS_COUNTIES_COLUMNS = ["USPS", "GEOID", "NAME"]
TILES_SCORE_COLUMNS = [ TILES_SCORE_COLUMNS = [
"GEOID10", "GEOID10_TRACT",
"State Name", "State Name",
"County Name", "County Name",
"Total population", "Total population",
@ -155,7 +155,7 @@ DOWNLOADABLE_SCORE_INDICATOR_COLUMNS_FULL = list(
# Finally we augment with the GEOID10, county, and state # Finally we augment with the GEOID10, county, and state
DOWNLOADABLE_SCORE_COLUMNS = [ DOWNLOADABLE_SCORE_COLUMNS = [
"GEOID10", "GEOID10_TRACT",
"County Name", "County Name",
"State Name", "State Name",
"Score G (communities)", "Score G (communities)",

View file

@ -23,7 +23,7 @@ class PostScoreETL(ExtractTransformLoad):
self.input_counties_df: pd.DataFrame self.input_counties_df: pd.DataFrame
self.input_states_df: pd.DataFrame self.input_states_df: pd.DataFrame
self.input_score_df: pd.DataFrame self.input_score_df: pd.DataFrame
self.input_national_cbg_df: pd.DataFrame self.input_national_tract_df: pd.DataFrame
self.output_score_county_state_merged_df: pd.DataFrame self.output_score_county_state_merged_df: pd.DataFrame
self.output_score_tiles_df: pd.DataFrame self.output_score_tiles_df: pd.DataFrame
@ -50,7 +50,9 @@ class PostScoreETL(ExtractTransformLoad):
def _extract_score(self, score_path: Path) -> pd.DataFrame: def _extract_score(self, score_path: Path) -> pd.DataFrame:
logger.info("Reading Score CSV") logger.info("Reading Score CSV")
df = pd.read_csv(score_path, dtype={"GEOID10": "string"}) df = pd.read_csv(
score_path, dtype={self.GEOID_TRACT_FIELD_NAME: "string"}
)
# Convert total population to an int: # Convert total population to an int:
df["Total population"] = df["Total population"].astype( df["Total population"] = df["Total population"].astype(
@ -59,12 +61,14 @@ class PostScoreETL(ExtractTransformLoad):
return df return df
def _extract_national_cbg(self, national_cbg_path: Path) -> pd.DataFrame: def _extract_national_tract(
logger.info("Reading national CBG") self, national_tract_path: Path
) -> pd.DataFrame:
logger.info("Reading national tract file")
return pd.read_csv( return pd.read_csv(
national_cbg_path, national_tract_path,
names=["GEOID10"], names=[self.GEOID_TRACT_FIELD_NAME],
dtype={"GEOID10": "string"}, dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
low_memory=False, low_memory=False,
header=None, header=None,
) )
@ -91,7 +95,7 @@ class PostScoreETL(ExtractTransformLoad):
self.input_score_df = self._extract_score( self.input_score_df = self._extract_score(
constants.DATA_SCORE_CSV_FULL_FILE_PATH constants.DATA_SCORE_CSV_FULL_FILE_PATH
) )
self.input_national_cbg_df = self._extract_national_cbg( self.input_national_tract_df = self._extract_national_tract(
constants.DATA_CENSUS_CSV_FILE_PATH constants.DATA_CENSUS_CSV_FILE_PATH
) )
@ -130,21 +134,22 @@ class PostScoreETL(ExtractTransformLoad):
def _transform_score(self, initial_score_df: pd.DataFrame) -> pd.DataFrame: def _transform_score(self, initial_score_df: pd.DataFrame) -> pd.DataFrame:
""" """
Necessary modifications to the score dataframe Add the GEOID field to the score dataframe to do the merge with counties
""" """
# Add the tract level column # add GEOID column for counties
new_df = initial_score_df.copy() initial_score_df["GEOID"] = initial_score_df[
new_df["GEOID"] = initial_score_df.GEOID10.str[:5] self.GEOID_TRACT_FIELD_NAME
return new_df ].str[:5]
return initial_score_df
def _create_score_data( def _create_score_data(
self, self,
national_cbg_df: pd.DataFrame, national_tract_df: pd.DataFrame,
counties_df: pd.DataFrame, counties_df: pd.DataFrame,
states_df: pd.DataFrame, states_df: pd.DataFrame,
score_df: pd.DataFrame, score_df: pd.DataFrame,
) -> pd.DataFrame: ) -> pd.DataFrame:
# merge state with counties # merge state with counties
logger.info("Merging state with county info") logger.info("Merging state with county info")
county_state_merged = counties_df.merge( county_state_merged = counties_df.merge(
@ -153,15 +158,19 @@ class PostScoreETL(ExtractTransformLoad):
# merge state + county with score # merge state + county with score
score_county_state_merged = score_df.merge( score_county_state_merged = score_df.merge(
county_state_merged, on="GEOID", how="left" county_state_merged,
on="GEOID", # GEOID is the county ID
how="left",
) )
# check if there are census cbgs without score # check if there are census tracts without score
logger.info("Removing CBG rows without score") logger.info("Removing tract rows without score")
# merge census cbgs with score # merge census tracts with score
merged_df = national_cbg_df.merge( merged_df = national_tract_df.merge(
score_county_state_merged, on="GEOID10", how="left" score_county_state_merged,
on=self.GEOID_TRACT_FIELD_NAME,
how="left",
) )
# recast population to integer # recast population to integer
@ -169,14 +178,14 @@ class PostScoreETL(ExtractTransformLoad):
merged_df["Total population"].fillna(0.0).astype(int) merged_df["Total population"].fillna(0.0).astype(int)
) )
# list the null score cbgs # list the null score tracts
null_cbg_df = merged_df[merged_df["Score E (percentile)"].isnull()] null_tract_df = merged_df[merged_df["Score E (percentile)"].isnull()]
# subtract data sets # subtract data sets
# this follows the XOR pattern outlined here: # this follows the XOR pattern outlined here:
# https://stackoverflow.com/a/37313953 # https://stackoverflow.com/a/37313953
de_duplicated_df = pd.concat( de_duplicated_df = pd.concat(
[merged_df, null_cbg_df, null_cbg_df] [merged_df, null_tract_df, null_tract_df]
).drop_duplicates(keep=False) ).drop_duplicates(keep=False)
# set the score to the new df # set the score to the new df
@ -212,7 +221,7 @@ class PostScoreETL(ExtractTransformLoad):
transformed_score = self._transform_score(self.input_score_df) transformed_score = self._transform_score(self.input_score_df)
output_score_county_state_merged_df = self._create_score_data( output_score_county_state_merged_df = self._create_score_data(
self.input_national_cbg_df, self.input_national_tract_df,
transformed_counties, transformed_counties,
transformed_states, transformed_states,
transformed_score, transformed_score,

View file

@ -75,11 +75,9 @@ def score_pdf_initial(sample_data_dir):
def counties_transformed_expected(): def counties_transformed_expected():
return pd.DataFrame.from_dict( return pd.DataFrame.from_dict(
data={ data={
"State Abbreviation": pd.Series(["AL", "AL"], dtype="string"), "State Abbreviation": pd.Series(["AL"], dtype="string"),
"GEOID": pd.Series(["01001", "01003"], dtype="string"), "GEOID": pd.Series(["01073"], dtype="string"),
"County Name": pd.Series( "County Name": pd.Series(["Jefferson County"], dtype="object"),
["AutaugaCounty", "BaldwinCounty"], dtype="object"
),
}, },
) )
@ -105,11 +103,11 @@ def score_transformed_expected():
@pytest.fixture() @pytest.fixture()
def national_cbg_df(): def national_tract_df():
return pd.DataFrame.from_dict( return pd.DataFrame.from_dict(
data={ data={
"GEOID10": pd.Series( "GEOID10_TRACT": pd.Series(
["010010201001", "010010201002"], dtype="string" ["01073001100", "01073001400"], dtype="string"
), ),
}, },
) )

View file

@ -1,3 +1,2 @@
USPS GEOID ANSICODE NAME POP10 HU10 ALAND AWATER ALAND_SQMI AWATER_SQMI INTPTLAT INTPTLONG USPS GEOID ANSICODE NAME POP10 HU10 ALAND AWATER ALAND_SQMI AWATER_SQMI INTPTLAT INTPTLONG
AL 01001 00161526 AutaugaCounty 54571 22135 1539582278 25775735 594.436 9.952 32.536382 -86.644490 AL 01073 00161562 Jefferson County 658466 300552 2878192209 32474487 1111.276 12.538 33.553444 -86.896536
AL 01003 00161527 BaldwinCounty 182265 104061 4117521611 1133190229 1589.784 437.527 30.659218 -87.746067

1 USPS GEOID ANSICODE NAME POP10 HU10 ALAND AWATER ALAND_SQMI AWATER_SQMI INTPTLAT INTPTLONG
2 AL 01001 01073 00161526 00161562 AutaugaCounty Jefferson County 54571 658466 22135 300552 1539582278 2878192209 25775735 32474487 594.436 1111.276 9.952 12.538 32.536382 33.553444 -86.644490 -86.896536
AL 01003 00161527 BaldwinCounty 182265 104061 4117521611 1133190229 1589.784 437.527 30.659218 -87.746067

File diff suppressed because one or more lines are too long

View file

@ -29,7 +29,7 @@ def test_extract_states(etl, state_data_initial):
def test_extract_score(etl, score_data_initial): def test_extract_score(etl, score_data_initial):
extracted = etl._extract_score(score_data_initial) extracted = etl._extract_score(score_data_initial)
string_cols = ["GEOID10"] string_cols = ["GEOID10_TRACT"]
assert all(ptypes.is_string_dtype(extracted[col]) for col in string_cols) assert all(ptypes.is_string_dtype(extracted[col]) for col in string_cols)
@ -63,14 +63,14 @@ def test_transform_score(etl, score_data_initial, score_transformed_expected):
# pylint: disable=too-many-arguments # pylint: disable=too-many-arguments
def test_create_score_data( def test_create_score_data(
etl, etl,
national_cbg_df, national_tract_df,
counties_transformed_expected, counties_transformed_expected,
states_transformed_expected, states_transformed_expected,
score_transformed_expected, score_transformed_expected,
score_data_expected, score_data_expected,
): ):
score_data_actual = etl._create_score_data( score_data_actual = etl._create_score_data(
national_cbg_df, national_tract_df,
counties_transformed_expected, counties_transformed_expected,
states_transformed_expected, states_transformed_expected,
score_transformed_expected, score_transformed_expected,