adding median income field and running black

This commit is contained in:
lucasmbrown-usds 2021-08-09 20:47:51 -05:00
commit 4ae7eff4c4
6 changed files with 33 additions and 10 deletions

View file

@ -59,7 +59,9 @@ class ScoreETL(ExtractTransformLoad):
# Load census data # Load census data
census_csv = self.DATA_PATH / "dataset" / "census_acs_2019" / "usa.csv" census_csv = self.DATA_PATH / "dataset" / "census_acs_2019" / "usa.csv"
self.census_df = pd.read_csv( self.census_df = pd.read_csv(
census_csv, dtype={self.GEOID_FIELD_NAME: "string"}, low_memory=False, census_csv,
dtype={self.GEOID_FIELD_NAME: "string"},
low_memory=False,
) )
# Load housing and transportation data # Load housing and transportation data
@ -121,7 +123,8 @@ class ScoreETL(ExtractTransformLoad):
# Define a named tuple that will be used for each data set input. # Define a named tuple that will be used for each data set input.
DataSet = collections.namedtuple( DataSet = collections.namedtuple(
typename="DataSet", field_names=["input_field", "renamed_field", "bucket"], typename="DataSet",
field_names=["input_field", "renamed_field", "bucket"],
) )
data_sets = [ data_sets = [
@ -138,7 +141,9 @@ class ScoreETL(ExtractTransformLoad):
bucket=None, bucket=None,
), ),
DataSet( DataSet(
input_field="ACSTOTPOP", renamed_field="Total population", bucket=None, input_field="ACSTOTPOP",
renamed_field="Total population",
bucket=None,
), ),
# The following data sets have buckets, because they're used in the score # The following data sets have buckets, because they're used in the score
DataSet( DataSet(
@ -244,7 +249,9 @@ class ScoreETL(ExtractTransformLoad):
} }
self.df.rename( self.df.rename(
columns=renaming_dict, inplace=True, errors="raise", columns=renaming_dict,
inplace=True,
errors="raise",
) )
columns_to_keep = [data_set.renamed_field for data_set in data_sets] columns_to_keep = [data_set.renamed_field for data_set in data_sets]

View file

@ -46,7 +46,9 @@ class GeoScoreETL(ExtractTransformLoad):
logger.info("Reading score CSV") logger.info("Reading score CSV")
self.score_usa_df = pd.read_csv( self.score_usa_df = pd.read_csv(
self.TILE_SCORE_CSV, dtype={"GEOID10": "string"}, low_memory=False, self.TILE_SCORE_CSV,
dtype={"GEOID10": "string"},
low_memory=False,
) )
def transform(self) -> None: def transform(self) -> None:
@ -68,7 +70,8 @@ class GeoScoreETL(ExtractTransformLoad):
].reset_index(drop=True) ].reset_index(drop=True)
usa_simplified.rename( usa_simplified.rename(
columns={self.TARGET_SCORE_NAME: self.TARGET_SCORE_RENAME_TO}, inplace=True, columns={self.TARGET_SCORE_NAME: self.TARGET_SCORE_RENAME_TO},
inplace=True,
) )
logger.info("Aggregating into tracts (~5 minutes)") logger.info("Aggregating into tracts (~5 minutes)")

View file

@ -22,6 +22,8 @@ class CensusACSETL(ExtractTransformLoad):
"C16002_010E", "C16002_010E",
"C16002_013E", "C16002_013E",
] ]
self.MEDIAN_INCOME_FIELD = "B19013_001E"
self.MEDIAN_INCOME_FIELD_NAME = "Median household income in the past 12 months"
self.df: pd.DataFrame self.df: pd.DataFrame
def _fips_from_censusdata_censusgeo(self, censusgeo: censusdata.censusgeo) -> str: def _fips_from_censusdata_censusgeo(self, censusgeo: censusdata.censusgeo) -> str:
@ -45,6 +47,7 @@ class CensusACSETL(ExtractTransformLoad):
# Employment fields # Employment fields
"B23025_005E", "B23025_005E",
"B23025_003E", "B23025_003E",
self.MEDIAN_INCOME_FIELD,
] ]
+ self.LINGUISTIC_ISOLATION_FIELDS, + self.LINGUISTIC_ISOLATION_FIELDS,
) )
@ -59,6 +62,9 @@ class CensusACSETL(ExtractTransformLoad):
def transform(self) -> None: def transform(self) -> None:
logger.info("Starting Census ACS Transform") logger.info("Starting Census ACS Transform")
# Rename median income
self.df[self.MEDIAN_INCOME_FIELD_NAME] = self.df[self.MEDIAN_INCOME_FIELD]
# Calculate percent unemployment. # Calculate percent unemployment.
# TODO: remove small-sample data that should be `None` instead of a high-variance fraction. # TODO: remove small-sample data that should be `None` instead of a high-variance fraction.
self.df[self.UNEMPLOYED_FIELD_NAME] = self.df.B23025_005E / self.df.B23025_003E self.df[self.UNEMPLOYED_FIELD_NAME] = self.df.B23025_005E / self.df.B23025_003E
@ -91,6 +97,7 @@ class CensusACSETL(ExtractTransformLoad):
self.GEOID_FIELD_NAME, self.GEOID_FIELD_NAME,
self.UNEMPLOYED_FIELD_NAME, self.UNEMPLOYED_FIELD_NAME,
self.LINGUISTIC_ISOLATION_FIELD_NAME, self.LINGUISTIC_ISOLATION_FIELD_NAME,
self.MEDIAN_INCOME_FIELD_NAME,
] ]
self.df[columns_to_include].to_csv( self.df[columns_to_include].to_csv(

View file

@ -18,7 +18,8 @@ class EJScreenETL(ExtractTransformLoad):
def extract(self) -> None: def extract(self) -> None:
logger.info("Downloading EJScreen Data") logger.info("Downloading EJScreen Data")
super().extract( super().extract(
self.EJSCREEN_FTP_URL, self.TMP_PATH, self.EJSCREEN_FTP_URL,
self.TMP_PATH,
) )
def transform(self) -> None: def transform(self) -> None:

View file

@ -34,7 +34,8 @@ class HudHousingETL(ExtractTransformLoad):
def extract(self) -> None: def extract(self) -> None:
logger.info("Extracting HUD Housing Data") logger.info("Extracting HUD Housing Data")
super().extract( super().extract(
self.HOUSING_FTP_URL, self.HOUSING_ZIP_FILE_DIR, self.HOUSING_FTP_URL,
self.HOUSING_ZIP_FILE_DIR,
) )
def transform(self) -> None: def transform(self) -> None:
@ -48,7 +49,10 @@ class HudHousingETL(ExtractTransformLoad):
/ "140" / "140"
/ "Table8.csv" / "Table8.csv"
) )
self.df = pd.read_csv(filepath_or_buffer=tmp_csv_file_path, encoding="latin-1",) self.df = pd.read_csv(
filepath_or_buffer=tmp_csv_file_path,
encoding="latin-1",
)
# Rename and reformat block group ID # Rename and reformat block group ID
self.df.rename(columns={"geoid": self.GEOID_TRACT_FIELD_NAME}, inplace=True) self.df.rename(columns={"geoid": self.GEOID_TRACT_FIELD_NAME}, inplace=True)

View file

@ -70,7 +70,8 @@ class TreeEquityScoreETL(ExtractTransformLoad):
logger.info("Downloading Tree Equity Score Data") logger.info("Downloading Tree Equity Score Data")
for state in self.states: for state in self.states:
super().extract( super().extract(
f"{self.TES_URL}{state}.zip.zip", f"{self.TMP_PATH}/{state}", f"{self.TES_URL}{state}.zip.zip",
f"{self.TMP_PATH}/{state}",
) )
def transform(self) -> None: def transform(self) -> None: