mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-07-28 05:41:16 -07:00
County mapping to Tracts (#923)
* County mapping to Tracts
* passing tests
* last traces of cbg
This commit is contained in:
parent
a4108d24c0
commit
cc7bf0d73d
11 changed files with 50 additions and 44 deletions
|
@ -318,7 +318,7 @@ data_path = Path.cwd()
|
||||||
|
|
||||||
# score data expected
|
# score data expected
|
||||||
score_csv_path = data_path / "data_pipeline" / "data" / "score" / "csv" / "full" / "usa.csv"
|
score_csv_path = data_path / "data_pipeline" / "data" / "score" / "csv" / "full" / "usa.csv"
|
||||||
score_initial_df = pd.read_csv(score_csv_path, dtype={"GEOID10": "string"}, low_memory=False)[:2]
|
score_initial_df = pd.read_csv(score_csv_path, dtype={"GEOID10_TRACT": "string"}, low_memory=False)[:2]
|
||||||
score_initial_df.to_csv(data_path / "data_pipeline" / "etl" / "score" / "tests" / "sample_data" /"score_data_initial.csv", index=False)
|
score_initial_df.to_csv(data_path / "data_pipeline" / "etl" / "score" / "tests" / "sample_data" /"score_data_initial.csv", index=False)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
|
@ -59,7 +59,7 @@ SCORE_DOWNLOADABLE_ZIP_FILE_PATH = (
|
||||||
# Column subsets
|
# Column subsets
|
||||||
CENSUS_COUNTIES_COLUMNS = ["USPS", "GEOID", "NAME"]
|
CENSUS_COUNTIES_COLUMNS = ["USPS", "GEOID", "NAME"]
|
||||||
TILES_SCORE_COLUMNS = [
|
TILES_SCORE_COLUMNS = [
|
||||||
"GEOID10",
|
"GEOID10_TRACT",
|
||||||
"State Name",
|
"State Name",
|
||||||
"County Name",
|
"County Name",
|
||||||
"Total population",
|
"Total population",
|
||||||
|
@ -155,7 +155,7 @@ DOWNLOADABLE_SCORE_INDICATOR_COLUMNS_FULL = list(
|
||||||
|
|
||||||
# Finally we augment with the GEOID10, county, and state
|
# Finally we augment with the GEOID10, county, and state
|
||||||
DOWNLOADABLE_SCORE_COLUMNS = [
|
DOWNLOADABLE_SCORE_COLUMNS = [
|
||||||
"GEOID10",
|
"GEOID10_TRACT",
|
||||||
"County Name",
|
"County Name",
|
||||||
"State Name",
|
"State Name",
|
||||||
"Score G (communities)",
|
"Score G (communities)",
|
||||||
|
|
|
@ -23,7 +23,7 @@ class PostScoreETL(ExtractTransformLoad):
|
||||||
self.input_counties_df: pd.DataFrame
|
self.input_counties_df: pd.DataFrame
|
||||||
self.input_states_df: pd.DataFrame
|
self.input_states_df: pd.DataFrame
|
||||||
self.input_score_df: pd.DataFrame
|
self.input_score_df: pd.DataFrame
|
||||||
self.input_national_cbg_df: pd.DataFrame
|
self.input_national_tract_df: pd.DataFrame
|
||||||
|
|
||||||
self.output_score_county_state_merged_df: pd.DataFrame
|
self.output_score_county_state_merged_df: pd.DataFrame
|
||||||
self.output_score_tiles_df: pd.DataFrame
|
self.output_score_tiles_df: pd.DataFrame
|
||||||
|
@ -50,7 +50,9 @@ class PostScoreETL(ExtractTransformLoad):
|
||||||
|
|
||||||
def _extract_score(self, score_path: Path) -> pd.DataFrame:
|
def _extract_score(self, score_path: Path) -> pd.DataFrame:
|
||||||
logger.info("Reading Score CSV")
|
logger.info("Reading Score CSV")
|
||||||
df = pd.read_csv(score_path, dtype={"GEOID10": "string"})
|
df = pd.read_csv(
|
||||||
|
score_path, dtype={self.GEOID_TRACT_FIELD_NAME: "string"}
|
||||||
|
)
|
||||||
|
|
||||||
# Convert total population to an int:
|
# Convert total population to an int:
|
||||||
df["Total population"] = df["Total population"].astype(
|
df["Total population"] = df["Total population"].astype(
|
||||||
|
@ -59,12 +61,14 @@ class PostScoreETL(ExtractTransformLoad):
|
||||||
|
|
||||||
return df
|
return df
|
||||||
|
|
||||||
def _extract_national_cbg(self, national_cbg_path: Path) -> pd.DataFrame:
|
def _extract_national_tract(
|
||||||
logger.info("Reading national CBG")
|
self, national_tract_path: Path
|
||||||
|
) -> pd.DataFrame:
|
||||||
|
logger.info("Reading national tract file")
|
||||||
return pd.read_csv(
|
return pd.read_csv(
|
||||||
national_cbg_path,
|
national_tract_path,
|
||||||
names=["GEOID10"],
|
names=[self.GEOID_TRACT_FIELD_NAME],
|
||||||
dtype={"GEOID10": "string"},
|
dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
|
||||||
low_memory=False,
|
low_memory=False,
|
||||||
header=None,
|
header=None,
|
||||||
)
|
)
|
||||||
|
@ -91,7 +95,7 @@ class PostScoreETL(ExtractTransformLoad):
|
||||||
self.input_score_df = self._extract_score(
|
self.input_score_df = self._extract_score(
|
||||||
constants.DATA_SCORE_CSV_FULL_FILE_PATH
|
constants.DATA_SCORE_CSV_FULL_FILE_PATH
|
||||||
)
|
)
|
||||||
self.input_national_cbg_df = self._extract_national_cbg(
|
self.input_national_tract_df = self._extract_national_tract(
|
||||||
constants.DATA_CENSUS_CSV_FILE_PATH
|
constants.DATA_CENSUS_CSV_FILE_PATH
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -130,21 +134,22 @@ class PostScoreETL(ExtractTransformLoad):
|
||||||
|
|
||||||
def _transform_score(self, initial_score_df: pd.DataFrame) -> pd.DataFrame:
|
def _transform_score(self, initial_score_df: pd.DataFrame) -> pd.DataFrame:
|
||||||
"""
|
"""
|
||||||
Necessary modifications to the score dataframe
|
Add the GEOID field to the score dataframe to do the merge with counties
|
||||||
"""
|
"""
|
||||||
# Add the tract level column
|
# add GEOID column for counties
|
||||||
new_df = initial_score_df.copy()
|
initial_score_df["GEOID"] = initial_score_df[
|
||||||
new_df["GEOID"] = initial_score_df.GEOID10.str[:5]
|
self.GEOID_TRACT_FIELD_NAME
|
||||||
return new_df
|
].str[:5]
|
||||||
|
|
||||||
|
return initial_score_df
|
||||||
|
|
||||||
def _create_score_data(
|
def _create_score_data(
|
||||||
self,
|
self,
|
||||||
national_cbg_df: pd.DataFrame,
|
national_tract_df: pd.DataFrame,
|
||||||
counties_df: pd.DataFrame,
|
counties_df: pd.DataFrame,
|
||||||
states_df: pd.DataFrame,
|
states_df: pd.DataFrame,
|
||||||
score_df: pd.DataFrame,
|
score_df: pd.DataFrame,
|
||||||
) -> pd.DataFrame:
|
) -> pd.DataFrame:
|
||||||
|
|
||||||
# merge state with counties
|
# merge state with counties
|
||||||
logger.info("Merging state with county info")
|
logger.info("Merging state with county info")
|
||||||
county_state_merged = counties_df.merge(
|
county_state_merged = counties_df.merge(
|
||||||
|
@ -153,15 +158,19 @@ class PostScoreETL(ExtractTransformLoad):
|
||||||
|
|
||||||
# merge state + county with score
|
# merge state + county with score
|
||||||
score_county_state_merged = score_df.merge(
|
score_county_state_merged = score_df.merge(
|
||||||
county_state_merged, on="GEOID", how="left"
|
county_state_merged,
|
||||||
|
on="GEOID", # GEOID is the county ID
|
||||||
|
how="left",
|
||||||
)
|
)
|
||||||
|
|
||||||
# check if there are census cbgs without score
|
# check if there are census tracts without score
|
||||||
logger.info("Removing CBG rows without score")
|
logger.info("Removing tract rows without score")
|
||||||
|
|
||||||
# merge census cbgs with score
|
# merge census tracts with score
|
||||||
merged_df = national_cbg_df.merge(
|
merged_df = national_tract_df.merge(
|
||||||
score_county_state_merged, on="GEOID10", how="left"
|
score_county_state_merged,
|
||||||
|
on=self.GEOID_TRACT_FIELD_NAME,
|
||||||
|
how="left",
|
||||||
)
|
)
|
||||||
|
|
||||||
# recast population to integer
|
# recast population to integer
|
||||||
|
@ -169,14 +178,14 @@ class PostScoreETL(ExtractTransformLoad):
|
||||||
merged_df["Total population"].fillna(0.0).astype(int)
|
merged_df["Total population"].fillna(0.0).astype(int)
|
||||||
)
|
)
|
||||||
|
|
||||||
# list the null score cbgs
|
# list the null score tracts
|
||||||
null_cbg_df = merged_df[merged_df["Score E (percentile)"].isnull()]
|
null_tract_df = merged_df[merged_df["Score E (percentile)"].isnull()]
|
||||||
|
|
||||||
# subtract data sets
|
# subtract data sets
|
||||||
# this follows the XOR pattern outlined here:
|
# this follows the XOR pattern outlined here:
|
||||||
# https://stackoverflow.com/a/37313953
|
# https://stackoverflow.com/a/37313953
|
||||||
de_duplicated_df = pd.concat(
|
de_duplicated_df = pd.concat(
|
||||||
[merged_df, null_cbg_df, null_cbg_df]
|
[merged_df, null_tract_df, null_tract_df]
|
||||||
).drop_duplicates(keep=False)
|
).drop_duplicates(keep=False)
|
||||||
|
|
||||||
# set the score to the new df
|
# set the score to the new df
|
||||||
|
@ -212,7 +221,7 @@ class PostScoreETL(ExtractTransformLoad):
|
||||||
transformed_score = self._transform_score(self.input_score_df)
|
transformed_score = self._transform_score(self.input_score_df)
|
||||||
|
|
||||||
output_score_county_state_merged_df = self._create_score_data(
|
output_score_county_state_merged_df = self._create_score_data(
|
||||||
self.input_national_cbg_df,
|
self.input_national_tract_df,
|
||||||
transformed_counties,
|
transformed_counties,
|
||||||
transformed_states,
|
transformed_states,
|
||||||
transformed_score,
|
transformed_score,
|
||||||
|
|
|
@ -75,11 +75,9 @@ def score_pdf_initial(sample_data_dir):
|
||||||
def counties_transformed_expected():
|
def counties_transformed_expected():
|
||||||
return pd.DataFrame.from_dict(
|
return pd.DataFrame.from_dict(
|
||||||
data={
|
data={
|
||||||
"State Abbreviation": pd.Series(["AL", "AL"], dtype="string"),
|
"State Abbreviation": pd.Series(["AL"], dtype="string"),
|
||||||
"GEOID": pd.Series(["01001", "01003"], dtype="string"),
|
"GEOID": pd.Series(["01073"], dtype="string"),
|
||||||
"County Name": pd.Series(
|
"County Name": pd.Series(["Jefferson County"], dtype="object"),
|
||||||
["AutaugaCounty", "BaldwinCounty"], dtype="object"
|
|
||||||
),
|
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -105,11 +103,11 @@ def score_transformed_expected():
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture()
|
@pytest.fixture()
|
||||||
def national_cbg_df():
|
def national_tract_df():
|
||||||
return pd.DataFrame.from_dict(
|
return pd.DataFrame.from_dict(
|
||||||
data={
|
data={
|
||||||
"GEOID10": pd.Series(
|
"GEOID10_TRACT": pd.Series(
|
||||||
["010010201001", "010010201002"], dtype="string"
|
["01073001100", "01073001400"], dtype="string"
|
||||||
),
|
),
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
|
@ -1,3 +1,2 @@
|
||||||
USPS GEOID ANSICODE NAME POP10 HU10 ALAND AWATER ALAND_SQMI AWATER_SQMI INTPTLAT INTPTLONG
|
USPS GEOID ANSICODE NAME POP10 HU10 ALAND AWATER ALAND_SQMI AWATER_SQMI INTPTLAT INTPTLONG
|
||||||
AL 01001 00161526 AutaugaCounty 54571 22135 1539582278 25775735 594.436 9.952 32.536382 -86.644490
|
AL 01073 00161562 Jefferson County 658466 300552 2878192209 32474487 1111.276 12.538 33.553444 -86.896536
|
||||||
AL 01003 00161527 BaldwinCounty 182265 104061 4117521611 1133190229 1589.784 437.527 30.659218 -87.746067
|
|
||||||
|
|
|
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -29,7 +29,7 @@ def test_extract_states(etl, state_data_initial):
|
||||||
|
|
||||||
def test_extract_score(etl, score_data_initial):
|
def test_extract_score(etl, score_data_initial):
|
||||||
extracted = etl._extract_score(score_data_initial)
|
extracted = etl._extract_score(score_data_initial)
|
||||||
string_cols = ["GEOID10"]
|
string_cols = ["GEOID10_TRACT"]
|
||||||
assert all(ptypes.is_string_dtype(extracted[col]) for col in string_cols)
|
assert all(ptypes.is_string_dtype(extracted[col]) for col in string_cols)
|
||||||
|
|
||||||
|
|
||||||
|
@ -63,14 +63,14 @@ def test_transform_score(etl, score_data_initial, score_transformed_expected):
|
||||||
# pylint: disable=too-many-arguments
|
# pylint: disable=too-many-arguments
|
||||||
def test_create_score_data(
|
def test_create_score_data(
|
||||||
etl,
|
etl,
|
||||||
national_cbg_df,
|
national_tract_df,
|
||||||
counties_transformed_expected,
|
counties_transformed_expected,
|
||||||
states_transformed_expected,
|
states_transformed_expected,
|
||||||
score_transformed_expected,
|
score_transformed_expected,
|
||||||
score_data_expected,
|
score_data_expected,
|
||||||
):
|
):
|
||||||
score_data_actual = etl._create_score_data(
|
score_data_actual = etl._create_score_data(
|
||||||
national_cbg_df,
|
national_tract_df,
|
||||||
counties_transformed_expected,
|
counties_transformed_expected,
|
||||||
states_transformed_expected,
|
states_transformed_expected,
|
||||||
score_transformed_expected,
|
score_transformed_expected,
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue