Add median house value to Definition L (#882)

* Added house value to ETL

* Adding house value to score formula and comp tool
This commit is contained in:
Lucas Merrill Brown 2021-11-13 10:29:23 -05:00 committed by GitHub
parent 54bdda0f02
commit 05ebf9b48c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 90 additions and 48 deletions

View file

@ -312,6 +312,7 @@ class ScoreETL(ExtractTransformLoad):
field_names.HIGH_SCHOOL_ED_FIELD,
field_names.UNEMPLOYMENT_FIELD,
field_names.HT_INDEX_FIELD,
field_names.MEDIAN_HOUSE_VALUE_FIELD,
]
non_numeric_columns = [

View file

@ -50,6 +50,11 @@ class CensusACSETL(ExtractTransformLoad):
"Percent of individuals < 200% Federal Poverty Line"
)
self.MEDIAN_HOUSE_VALUE_FIELD = "B25077_001E"
self.MEDIAN_HOUSE_VALUE_FIELD_NAME = (
"Median value ($) of owner-occupied housing units"
)
self.STATE_GEOID_FIELD_NAME = "GEOID2"
self.df: pd.DataFrame
@ -78,7 +83,10 @@ class CensusACSETL(ExtractTransformLoad):
# Employment fields
"B23025_005E",
"B23025_003E",
# Income field
self.MEDIAN_INCOME_FIELD,
# House value
self.MEDIAN_HOUSE_VALUE_FIELD,
]
+ self.LINGUISTIC_ISOLATION_FIELDS
+ self.POVERTY_FIELDS,
@ -94,22 +102,27 @@ class CensusACSETL(ExtractTransformLoad):
def transform(self) -> None:
logger.info("Starting Census ACS Transform")
# Rename median income
self.df[self.MEDIAN_INCOME_FIELD_NAME] = self.df[
self.MEDIAN_INCOME_FIELD
]
# Rename two fields.
self.df = self.df.rename(
columns={
self.MEDIAN_HOUSE_VALUE_FIELD: self.MEDIAN_HOUSE_VALUE_FIELD_NAME,
self.MEDIAN_INCOME_FIELD: self.MEDIAN_INCOME_FIELD_NAME,
}
)
# Handle null values for CBG median income, which are `-666666666`.
missing_value_count = sum(
self.df[self.MEDIAN_INCOME_FIELD_NAME] == -666666666
)
# Handle null values for various fields, which are `-666666666`.
for field in [
self.MEDIAN_INCOME_FIELD_NAME,
self.MEDIAN_HOUSE_VALUE_FIELD_NAME,
]:
missing_value_count = sum(self.df[field] == -666666666)
logger.info(
f"There are {missing_value_count} ({int(100*missing_value_count/self.df[self.MEDIAN_INCOME_FIELD_NAME].count())}%) values of "
+ f"`{self.MEDIAN_INCOME_FIELD_NAME}` being marked as null values."
f"There are {missing_value_count} ({int(100*missing_value_count/self.df[field].count())}%) values of "
+ f"`{field}` being marked as null values."
)
self.df[field] = self.df[field].replace(
to_replace=-666666666, value=None
)
self.df[self.MEDIAN_INCOME_FIELD_NAME] = self.df[
self.MEDIAN_INCOME_FIELD_NAME
].replace(to_replace=-666666666, value=None)
# Calculate percent unemployment.
# TODO: remove small-sample data that should be `None` instead of a high-variance fraction.
@ -133,8 +146,6 @@ class CensusACSETL(ExtractTransformLoad):
/ self.df["C16002_001E"]
)
self.df[self.LINGUISTIC_ISOLATION_FIELD_NAME].describe()
# Calculate percent at different poverty thresholds
self.df[self.POVERTY_LESS_THAN_100_PERCENT_FPL_FIELD_NAME] = (
self.df["C17002_002E"] + self.df["C17002_003E"]
@ -170,6 +181,7 @@ class CensusACSETL(ExtractTransformLoad):
self.POVERTY_LESS_THAN_100_PERCENT_FPL_FIELD_NAME,
self.POVERTY_LESS_THAN_150_PERCENT_FPL_FIELD_NAME,
self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME,
self.MEDIAN_HOUSE_VALUE_FIELD_NAME,
]
self.df[columns_to_include].to_csv(

View file

@ -56,13 +56,17 @@ class CensusDecennialETL(ExtractTransformLoad):
self.MALE_HIGH_SCHOOL_ED_FIELD = "PBG026005"
self.MALE_HIGH_SCHOOL_ED_VI_FIELD = "PCT032011"
self.MALE_HIGH_SCHOOL_ED_FIELD_NAME = "Total!!Male!!High school graduate, GED, or alternative; "\
self.MALE_HIGH_SCHOOL_ED_FIELD_NAME = (
"Total!!Male!!High school graduate, GED, or alternative; "
"SEX BY EDUCATIONAL ATTAINMENT FOR THE POPULATION 25 YEARS AND OVER"
)
self.FEMALE_HIGH_SCHOOL_ED_FIELD = "PBG026012"
self.FEMALE_HIGH_SCHOOL_ED_VI_FIELD = "PCT032028"
self.FEMALE_HIGH_SCHOOL_ED_FIELD_NAME = "Total!!Female!!High school graduate, GED, or alternative; "\
self.FEMALE_HIGH_SCHOOL_ED_FIELD_NAME = (
"Total!!Female!!High school graduate, GED, or alternative; "
"SEX BY EDUCATIONAL ATTAINMENT FOR THE POPULATION 25 YEARS AND OVER"
)
self.PERCENTAGE_HIGH_SCHOOL_ED_FIELD_NAME = (
"PERCENTAGE_HIGH_SCHOOL_ED_FIELD_NAME"

View file

@ -3,6 +3,9 @@
{
"cell_type": "code",
"execution_count": null,
"id": "4899d2ef",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import censusdata\n",
@ -29,28 +32,34 @@
"# Some display settings to make pandas outputs more readable.\n",
"pd.set_option(\"display.expand_frame_repr\", False)\n",
"pd.set_option(\"display.precision\", 2)"
],
"outputs": [],
"metadata": {}
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4dd8feec",
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"# Following the tutorial at https://jtleider.github.io/censusdata/example1.html.\n",
"# Full list of fields is at https://www2.census.gov/programs-surveys/acs/summary_file/2019/documentation/user_tools/ACS2019_Table_Shells.xlsx\n",
"censusdata.printtable(\n",
" censusdata.censustable(src=\"acs5\", year=ACS_YEAR, table=\"B19013\")\n",
")"
],
"outputs": [],
"metadata": {
"scrolled": true
}
" censusdata.censustable(src=\"acs5\", year=ACS_YEAR, table=\"B25077\")\n",
")\n",
"\n",
"# censusdata.search(src=\"acs5\", year=ACS_YEAR, field='label', criterion='Owner-occupied units!!Median')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7b40afd3",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"def fips_from_censusdata_censusgeo(censusgeo: censusdata.censusgeo) -> str:\n",
" \"\"\"Create a FIPS code from the proprietary censusgeo index.\"\"\"\n",
@ -82,15 +91,16 @@
")\n",
"\n",
"df.head()"
],
"outputs": [],
"metadata": {
"scrolled": true
}
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "caa0b502",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"columns_to_include = [\"GEOID2\", \"Median household income (State)\"]\n",
"\n",
@ -103,18 +113,15 @@
")\n",
"\n",
"# df[columns_to_include].to_csv(path_or_buf= \"/Users/lucas/Documents/usds/repos/justice40-tool/data/data-pipeline/data_pipeline/data/needs_to_be_moved_to_s3/2014_to_2019_state_median_income.csv\", index=False)"
],
"outputs": [],
"metadata": {
"scrolled": true
}
]
},
{
"cell_type": "code",
"execution_count": null,
"source": [],
"id": "f2bddf6a",
"metadata": {},
"outputs": [],
"metadata": {}
"source": []
}
],
"metadata": {

View file

@ -86,6 +86,7 @@
"BAD_HEALTH_FIELD = (\n",
" \"Physical health not good for >=14 days among adults aged >=18 years\"\n",
")\n",
"MEDIAN_HOUSE_VALUE_FIELD = \"Median value ($) of owner-occupied housing units\"\n",
"\n",
"# Define some suffixes\n",
"POPULATION_SUFFIX = \" (priority population)\""
@ -186,6 +187,7 @@
" \"Particulate matter (PM2.5) (percentile)\",\n",
" \"Traffic proximity and volume (percentile)\",\n",
" \"Percent of individuals < 200% Federal Poverty Line (percentile)\",\n",
" MEDIAN_HOUSE_VALUE_FIELD,\n",
"]:\n",
" print(f\"\\n~~~~Analysis for field `{field}`~~~~\")\n",
" print(cejst_df[field].describe())\n",

View file

@ -114,6 +114,8 @@ OVER_64_FIELD = "Individuals over 64 years old"
# Urban Rural Map
URBAN_HERUISTIC_FIELD = "Urban Heuristic Flag"
# Housing value
MEDIAN_HOUSE_VALUE_FIELD = "Median value ($) of owner-occupied housing units"
# EJSCREEN Areas of Concern
EJSCREEN_AREAS_OF_CONCERN_NATIONAL_70TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (

View file

@ -11,6 +11,7 @@ class ScoreL(Score):
def __init__(self, df: pd.DataFrame) -> None:
self.LOW_INCOME_THRESHOLD: float = 0.65
self.ENVIRONMENTAL_BURDEN_THRESHOLD: float = 0.90
self.MEDIAN_HOUSE_VALUE_THRESHOLD: float = 0.90
super().__init__(df)
def add_columns(self) -> pd.DataFrame:
@ -135,8 +136,12 @@ class ScoreL(Score):
) & transportation_criteria
def _housing_factor(self) -> bool:
# (
# In Xth percentile or above for lead paint (Source: Census's American Community Survey's
# percent of housing units built pre-1960, used as an indicator of potential lead paint exposure in homes)
# AND
# In Yth percentile or below for Median House Value (Source: Census's American Community Survey)
# )
# or
# In Xth percentile or above for housing cost burden (Source: HUD's Comprehensive Housing Affordability Strategy dataset)
# AND
@ -144,11 +149,20 @@ class ScoreL(Score):
# of households where household income is less than or equal to twice the federal
# poverty level. Source: Census's American Community Survey]
housing_criteria = (
(
self.df[
field_names.LEAD_PAINT_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX
]
> self.ENVIRONMENTAL_BURDEN_THRESHOLD
)
& (
self.df[
field_names.MEDIAN_HOUSE_VALUE_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX
]
< self.MEDIAN_HOUSE_VALUE_THRESHOLD
)
) | (
self.df[
field_names.HOUSING_BURDEN_FIELD