Score F, testing methodology (#510)

* fixing dependency issue

* fixing more dependencies

* including fraction of state AMI

* wip

* nitpick whitespace

* etl working now

* wip on scoring

* fix rename error

* reducing metrics

* fixing score f

* fixing readme

* adding dependency

* passing tests;

* linting/black

* removing unnecessary sample

* fixing error

* adding verify flag on etl/base

Co-authored-by: Jorge Escobar <jorge.e.escobar@omb.eop.gov>
This commit is contained in:
Lucas Merrill Brown 2021-08-24 15:40:54 -05:00 committed by GitHub
commit 65ceb7900f
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
23 changed files with 557 additions and 153 deletions

View file

@ -9,16 +9,16 @@ class HudHousingETL(ExtractTransformLoad):
def __init__(self):
self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "hud_housing"
self.GEOID_TRACT_FIELD_NAME = "GEOID10_TRACT"
self.HOUSING_FTP_URL = (
"https://www.huduser.gov/portal/datasets/cp/2012thru2016-140-csv.zip"
)
self.HOUSING_FTP_URL = "https://www.huduser.gov/portal/datasets/cp/2012thru2016-140-csv.zip"
self.HOUSING_ZIP_FILE_DIR = self.TMP_PATH / "hud_housing"
# We measure households earning less than 80% of HUD Area Median Family Income by county
# and paying greater than 30% of their income to housing costs.
self.HOUSING_BURDEN_FIELD_NAME = "Housing burden (percent)"
self.HOUSING_BURDEN_NUMERATOR_FIELD_NAME = "HOUSING_BURDEN_NUMERATOR"
self.HOUSING_BURDEN_DENOMINATOR_FIELD_NAME = "HOUSING_BURDEN_DENOMINATOR"
self.HOUSING_BURDEN_DENOMINATOR_FIELD_NAME = (
"HOUSING_BURDEN_DENOMINATOR"
)
# Note: some variable definitions.
# HUD-adjusted median family income (HAMFI).
@ -55,7 +55,9 @@ class HudHousingETL(ExtractTransformLoad):
)
# Rename and reformat block group ID
self.df.rename(columns={"geoid": self.GEOID_TRACT_FIELD_NAME}, inplace=True)
self.df.rename(
columns={"geoid": self.GEOID_TRACT_FIELD_NAME}, inplace=True
)
# The CHAS data has census tract ids such as `14000US01001020100`
# Whereas the rest of our data uses, for the same tract, `01001020100`.
@ -273,7 +275,9 @@ class HudHousingETL(ExtractTransformLoad):
# TODO: add small sample size checks
self.df[self.HOUSING_BURDEN_FIELD_NAME] = self.df[
self.HOUSING_BURDEN_NUMERATOR_FIELD_NAME
].astype(float) / self.df[self.HOUSING_BURDEN_DENOMINATOR_FIELD_NAME].astype(
].astype(float) / self.df[
self.HOUSING_BURDEN_DENOMINATOR_FIELD_NAME
].astype(
float
)