mirror of https://github.com/DOI-DO/j40-cejst-2.git

commit 4ae7eff4c4 (parent 6c986adfe4)

    adding median income field and running black

6 changed files with 33 additions and 10 deletions
@@ -59,7 +59,9 @@ class ScoreETL(ExtractTransformLoad):
         # Load census data
         census_csv = self.DATA_PATH / "dataset" / "census_acs_2019" / "usa.csv"
         self.census_df = pd.read_csv(
-            census_csv, dtype={self.GEOID_FIELD_NAME: "string"}, low_memory=False,
+            census_csv,
+            dtype={self.GEOID_FIELD_NAME: "string"},
+            low_memory=False,
         )

         # Load housing and transportation data
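The reformatting in this and the following hunks is black's "magic trailing comma" at work: because each call already ended with a trailing comma, black explodes it to one argument per line rather than keeping it on one line. A minimal sketch (not repo code):

    # Input: fits on one line, but ends with a trailing comma...
    self.census_df = pd.read_csv(census_csv, dtype={self.GEOID_FIELD_NAME: "string"}, low_memory=False,)

    # ...so black rewrites it one argument per line, keeping the comma:
    self.census_df = pd.read_csv(
        census_csv,
        dtype={self.GEOID_FIELD_NAME: "string"},
        low_memory=False,
    )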
@@ -121,7 +123,8 @@ class ScoreETL(ExtractTransformLoad):

         # Define a named tuple that will be used for each data set input.
         DataSet = collections.namedtuple(
-            typename="DataSet", field_names=["input_field", "renamed_field", "bucket"],
+            typename="DataSet",
+            field_names=["input_field", "renamed_field", "bucket"],
         )

         data_sets = [
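For readers unfamiliar with collections.namedtuple: it builds a lightweight, immutable record class, which is why the DataSet entries below can be read by field name. A self-contained sketch using the same fields (values illustrative):

    import collections

    DataSet = collections.namedtuple(
        typename="DataSet",
        field_names=["input_field", "renamed_field", "bucket"],
    )

    ds = DataSet(input_field="ACSTOTPOP", renamed_field="Total population", bucket=None)
    print(ds.renamed_field)           # "Total population" -- attribute access
    print(ds._replace(bucket="pop"))  # returns a new tuple; ds itself is immutable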
@@ -138,7 +141,9 @@ class ScoreETL(ExtractTransformLoad):
                 bucket=None,
             ),
             DataSet(
-                input_field="ACSTOTPOP", renamed_field="Total population", bucket=None,
+                input_field="ACSTOTPOP",
+                renamed_field="Total population",
+                bucket=None,
             ),
             # The following data sets have buckets, because they're used in the score
             DataSet(
@@ -244,7 +249,9 @@ class ScoreETL(ExtractTransformLoad):
         }

         self.df.rename(
-            columns=renaming_dict, inplace=True, errors="raise",
+            columns=renaming_dict,
+            inplace=True,
+            errors="raise",
         )

         columns_to_keep = [data_set.renamed_field for data_set in data_sets]
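Note the errors="raise" on the rename: pandas then raises a KeyError if renaming_dict refers to a column the DataFrame doesn't have, instead of silently ignoring it. A toy sketch (column names hypothetical, not from the repo):

    import pandas as pd

    df = pd.DataFrame({"ACSTOTPOP": [100, 250]})
    df.rename(columns={"ACSTOTPOP": "Total population"}, errors="raise")  # fine
    df.rename(columns={"NO_SUCH_COLUMN": "x"}, errors="raise")            # KeyError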
@@ -46,7 +46,9 @@ class GeoScoreETL(ExtractTransformLoad):

         logger.info("Reading score CSV")
         self.score_usa_df = pd.read_csv(
-            self.TILE_SCORE_CSV, dtype={"GEOID10": "string"}, low_memory=False,
+            self.TILE_SCORE_CSV,
+            dtype={"GEOID10": "string"},
+            low_memory=False,
         )

     def transform(self) -> None:
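The dtype={"GEOID10": "string"} argument matters beyond style: GEOIDs begin with a two-digit state FIPS code that can start with zero, so letting pandas infer an integer dtype silently drops the leading zero. A small demonstration with synthetic data:

    import io
    import pandas as pd

    csv = io.StringIO("GEOID10,score\n010010201001,0.42\n")
    print(pd.read_csv(csv)["GEOID10"][0])  # 10010201001 -- leading zero lost
    csv.seek(0)
    print(pd.read_csv(csv, dtype={"GEOID10": "string"})["GEOID10"][0])  # "010010201001"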
@@ -68,7 +70,8 @@ class GeoScoreETL(ExtractTransformLoad):
         ].reset_index(drop=True)

         usa_simplified.rename(
-            columns={self.TARGET_SCORE_NAME: self.TARGET_SCORE_RENAME_TO}, inplace=True,
+            columns={self.TARGET_SCORE_NAME: self.TARGET_SCORE_RENAME_TO},
+            inplace=True,
         )

         logger.info("Aggregating into tracts (~5 minutes)")
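The tract aggregation itself isn't shown in this diff. For orientation only, a plausible geopandas sketch of how block groups roll up into tracts, assuming usa_simplified is a GeoDataFrame with a 12-digit block-group GEOID10 column (the derived column name and the mean aggfunc are assumptions, not the repo's code):

    # A block-group GEOID is 12 digits; its parent tract is the first 11.
    usa_simplified["GEOID10_TRACT"] = usa_simplified["GEOID10"].str[:11]

    # dissolve() unions geometries per tract and aggregates the data columns.
    usa_tracts = usa_simplified.dissolve(by="GEOID10_TRACT", aggfunc="mean")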
@@ -22,6 +22,8 @@ class CensusACSETL(ExtractTransformLoad):
             "C16002_010E",
             "C16002_013E",
         ]
+        self.MEDIAN_INCOME_FIELD = "B19013_001E"
+        self.MEDIAN_INCOME_FIELD_NAME = "Median household income in the past 12 months"
         self.df: pd.DataFrame

     def _fips_from_censusdata_censusgeo(self, censusgeo: censusdata.censusgeo) -> str:
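B19013_001E is the ACS variable behind the new field: the estimate of median household income in the past 12 months. With the censusdata package this ETL already uses, it is fetched like any other variable; a hedged single-county sketch (geography values illustrative):

    import censusdata

    df = censusdata.download(
        src="acs5",
        year=2019,
        geo=censusdata.censusgeo(
            [("state", "01"), ("county", "001"), ("block group", "*")]
        ),
        var=["B19013_001E"],
    )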
@@ -45,6 +47,7 @@ class CensusACSETL(ExtractTransformLoad):
                 # Employment fields
                 "B23025_005E",
                 "B23025_003E",
+                self.MEDIAN_INCOME_FIELD,
             ]
             + self.LINGUISTIC_ISOLATION_FIELDS,
         )
|
@ -59,6 +62,9 @@ class CensusACSETL(ExtractTransformLoad):
|
||||||
def transform(self) -> None:
|
def transform(self) -> None:
|
||||||
logger.info("Starting Census ACS Transform")
|
logger.info("Starting Census ACS Transform")
|
||||||
|
|
||||||
|
# Rename median income
|
||||||
|
self.df[self.MEDIAN_INCOME_FIELD_NAME] = self.df[self.MEDIAN_INCOME_FIELD]
|
||||||
|
|
||||||
# Calculate percent unemployment.
|
# Calculate percent unemployment.
|
||||||
# TODO: remove small-sample data that should be `None` instead of a high-variance fraction.
|
# TODO: remove small-sample data that should be `None` instead of a high-variance fraction.
|
||||||
self.df[self.UNEMPLOYED_FIELD_NAME] = self.df.B23025_005E / self.df.B23025_003E
|
self.df[self.UNEMPLOYED_FIELD_NAME] = self.df.B23025_005E / self.df.B23025_003E
|
||||||
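Two caveats around the new column and the TODO above: the ACS API encodes unavailable estimates as large negative sentinels (e.g. -666666666 for median income), and ratios built on tiny denominators are noise. A hedged sketch of both guards as they might appear in transform() (the cutoff of 30 is illustrative, not the repo's choice):

    import numpy as np

    # Replace the ACS "not available" sentinel in the new median income column.
    self.df[self.MEDIAN_INCOME_FIELD_NAME] = self.df[
        self.MEDIAN_INCOME_FIELD
    ].replace(-666666666, np.nan)

    # Null out unemployment fractions computed from very small labor forces.
    small_sample = self.df.B23025_003E < 30  # hypothetical threshold
    self.df.loc[small_sample, self.UNEMPLOYED_FIELD_NAME] = None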
@@ -91,6 +97,7 @@ class CensusACSETL(ExtractTransformLoad):
             self.GEOID_FIELD_NAME,
             self.UNEMPLOYED_FIELD_NAME,
             self.LINGUISTIC_ISOLATION_FIELD_NAME,
+            self.MEDIAN_INCOME_FIELD_NAME,
         ]

         self.df[columns_to_include].to_csv(
@@ -18,7 +18,8 @@ class EJScreenETL(ExtractTransformLoad):
     def extract(self) -> None:
         logger.info("Downloading EJScreen Data")
         super().extract(
-            self.EJSCREEN_FTP_URL, self.TMP_PATH,
+            self.EJSCREEN_FTP_URL,
+            self.TMP_PATH,
         )

     def transform(self) -> None:
@@ -34,7 +34,8 @@ class HudHousingETL(ExtractTransformLoad):
     def extract(self) -> None:
         logger.info("Extracting HUD Housing Data")
         super().extract(
-            self.HOUSING_FTP_URL, self.HOUSING_ZIP_FILE_DIR,
+            self.HOUSING_FTP_URL,
+            self.HOUSING_ZIP_FILE_DIR,
         )

     def transform(self) -> None:
@@ -48,7 +49,10 @@ class HudHousingETL(ExtractTransformLoad):
             / "140"
             / "Table8.csv"
         )
-        self.df = pd.read_csv(filepath_or_buffer=tmp_csv_file_path, encoding="latin-1",)
+        self.df = pd.read_csv(
+            filepath_or_buffer=tmp_csv_file_path,
+            encoding="latin-1",
+        )

         # Rename and reformat block group ID
         self.df.rename(columns={"geoid": self.GEOID_TRACT_FIELD_NAME}, inplace=True)
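The "reformat" mentioned in the comment is presumably needed because HUD CHAS geoids carry a summary-level prefix (tract rows look like "14000US01001020100"). One hedged way to strip that down to the bare 11-digit tract FIPS (the repo's exact approach may differ):

    # "14000US01001020100" -> "01001020100"
    self.df[self.GEOID_TRACT_FIELD_NAME] = self.df[
        self.GEOID_TRACT_FIELD_NAME
    ].str.replace(r"^.*US", "", regex=True)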
@@ -70,7 +70,8 @@ class TreeEquityScoreETL(ExtractTransformLoad):
         logger.info("Downloading Tree Equity Score Data")
         for state in self.states:
             super().extract(
-                f"{self.TES_URL}{state}.zip.zip", f"{self.TMP_PATH}/{state}",
+                f"{self.TES_URL}{state}.zip.zip",
+                f"{self.TMP_PATH}/{state}",
             )

     def transform(self) -> None: