mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-02-23 01:54:18 -08:00
Add median house value to Definition L (#882)
* Added house value to ETL * Adding house value to score formula and comp tool
This commit is contained in:
parent
54bdda0f02
commit
05ebf9b48c
7 changed files with 90 additions and 48 deletions
|
@ -312,6 +312,7 @@ class ScoreETL(ExtractTransformLoad):
|
||||||
field_names.HIGH_SCHOOL_ED_FIELD,
|
field_names.HIGH_SCHOOL_ED_FIELD,
|
||||||
field_names.UNEMPLOYMENT_FIELD,
|
field_names.UNEMPLOYMENT_FIELD,
|
||||||
field_names.HT_INDEX_FIELD,
|
field_names.HT_INDEX_FIELD,
|
||||||
|
field_names.MEDIAN_HOUSE_VALUE_FIELD,
|
||||||
]
|
]
|
||||||
|
|
||||||
non_numeric_columns = [
|
non_numeric_columns = [
|
||||||
|
|
|
@ -50,6 +50,11 @@ class CensusACSETL(ExtractTransformLoad):
|
||||||
"Percent of individuals < 200% Federal Poverty Line"
|
"Percent of individuals < 200% Federal Poverty Line"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
self.MEDIAN_HOUSE_VALUE_FIELD = "B25077_001E"
|
||||||
|
self.MEDIAN_HOUSE_VALUE_FIELD_NAME = (
|
||||||
|
"Median value ($) of owner-occupied housing units"
|
||||||
|
)
|
||||||
|
|
||||||
self.STATE_GEOID_FIELD_NAME = "GEOID2"
|
self.STATE_GEOID_FIELD_NAME = "GEOID2"
|
||||||
self.df: pd.DataFrame
|
self.df: pd.DataFrame
|
||||||
|
|
||||||
|
@ -78,7 +83,10 @@ class CensusACSETL(ExtractTransformLoad):
|
||||||
# Employment fields
|
# Employment fields
|
||||||
"B23025_005E",
|
"B23025_005E",
|
||||||
"B23025_003E",
|
"B23025_003E",
|
||||||
|
# Income field
|
||||||
self.MEDIAN_INCOME_FIELD,
|
self.MEDIAN_INCOME_FIELD,
|
||||||
|
# House value
|
||||||
|
self.MEDIAN_HOUSE_VALUE_FIELD,
|
||||||
]
|
]
|
||||||
+ self.LINGUISTIC_ISOLATION_FIELDS
|
+ self.LINGUISTIC_ISOLATION_FIELDS
|
||||||
+ self.POVERTY_FIELDS,
|
+ self.POVERTY_FIELDS,
|
||||||
|
@ -94,22 +102,27 @@ class CensusACSETL(ExtractTransformLoad):
|
||||||
def transform(self) -> None:
|
def transform(self) -> None:
|
||||||
logger.info("Starting Census ACS Transform")
|
logger.info("Starting Census ACS Transform")
|
||||||
|
|
||||||
# Rename median income
|
# Rename two fields.
|
||||||
self.df[self.MEDIAN_INCOME_FIELD_NAME] = self.df[
|
self.df = self.df.rename(
|
||||||
self.MEDIAN_INCOME_FIELD
|
columns={
|
||||||
]
|
self.MEDIAN_HOUSE_VALUE_FIELD: self.MEDIAN_HOUSE_VALUE_FIELD_NAME,
|
||||||
|
self.MEDIAN_INCOME_FIELD: self.MEDIAN_INCOME_FIELD_NAME,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
# Handle null values for CBG median income, which are `-666666666`.
|
# Handle null values for various fields, which are `-666666666`.
|
||||||
missing_value_count = sum(
|
for field in [
|
||||||
self.df[self.MEDIAN_INCOME_FIELD_NAME] == -666666666
|
self.MEDIAN_INCOME_FIELD_NAME,
|
||||||
)
|
self.MEDIAN_HOUSE_VALUE_FIELD_NAME,
|
||||||
|
]:
|
||||||
|
missing_value_count = sum(self.df[field] == -666666666)
|
||||||
logger.info(
|
logger.info(
|
||||||
f"There are {missing_value_count} ({int(100*missing_value_count/self.df[self.MEDIAN_INCOME_FIELD_NAME].count())}%) values of "
|
f"There are {missing_value_count} ({int(100*missing_value_count/self.df[field].count())}%) values of "
|
||||||
+ f"`{self.MEDIAN_INCOME_FIELD_NAME}` being marked as null values."
|
+ f"`{field}` being marked as null values."
|
||||||
|
)
|
||||||
|
self.df[field] = self.df[field].replace(
|
||||||
|
to_replace=-666666666, value=None
|
||||||
)
|
)
|
||||||
self.df[self.MEDIAN_INCOME_FIELD_NAME] = self.df[
|
|
||||||
self.MEDIAN_INCOME_FIELD_NAME
|
|
||||||
].replace(to_replace=-666666666, value=None)
|
|
||||||
|
|
||||||
# Calculate percent unemployment.
|
# Calculate percent unemployment.
|
||||||
# TODO: remove small-sample data that should be `None` instead of a high-variance fraction.
|
# TODO: remove small-sample data that should be `None` instead of a high-variance fraction.
|
||||||
|
@ -133,8 +146,6 @@ class CensusACSETL(ExtractTransformLoad):
|
||||||
/ self.df["C16002_001E"]
|
/ self.df["C16002_001E"]
|
||||||
)
|
)
|
||||||
|
|
||||||
self.df[self.LINGUISTIC_ISOLATION_FIELD_NAME].describe()
|
|
||||||
|
|
||||||
# Calculate percent at different poverty thresholds
|
# Calculate percent at different poverty thresholds
|
||||||
self.df[self.POVERTY_LESS_THAN_100_PERCENT_FPL_FIELD_NAME] = (
|
self.df[self.POVERTY_LESS_THAN_100_PERCENT_FPL_FIELD_NAME] = (
|
||||||
self.df["C17002_002E"] + self.df["C17002_003E"]
|
self.df["C17002_002E"] + self.df["C17002_003E"]
|
||||||
|
@ -170,6 +181,7 @@ class CensusACSETL(ExtractTransformLoad):
|
||||||
self.POVERTY_LESS_THAN_100_PERCENT_FPL_FIELD_NAME,
|
self.POVERTY_LESS_THAN_100_PERCENT_FPL_FIELD_NAME,
|
||||||
self.POVERTY_LESS_THAN_150_PERCENT_FPL_FIELD_NAME,
|
self.POVERTY_LESS_THAN_150_PERCENT_FPL_FIELD_NAME,
|
||||||
self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME,
|
self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME,
|
||||||
|
self.MEDIAN_HOUSE_VALUE_FIELD_NAME,
|
||||||
]
|
]
|
||||||
|
|
||||||
self.df[columns_to_include].to_csv(
|
self.df[columns_to_include].to_csv(
|
||||||
|
|
|
@ -56,13 +56,17 @@ class CensusDecennialETL(ExtractTransformLoad):
|
||||||
|
|
||||||
self.MALE_HIGH_SCHOOL_ED_FIELD = "PBG026005"
|
self.MALE_HIGH_SCHOOL_ED_FIELD = "PBG026005"
|
||||||
self.MALE_HIGH_SCHOOL_ED_VI_FIELD = "PCT032011"
|
self.MALE_HIGH_SCHOOL_ED_VI_FIELD = "PCT032011"
|
||||||
self.MALE_HIGH_SCHOOL_ED_FIELD_NAME = "Total!!Male!!High school graduate, GED, or alternative; "\
|
self.MALE_HIGH_SCHOOL_ED_FIELD_NAME = (
|
||||||
|
"Total!!Male!!High school graduate, GED, or alternative; "
|
||||||
"SEX BY EDUCATIONAL ATTAINMENT FOR THE POPULATION 25 YEARS AND OVER"
|
"SEX BY EDUCATIONAL ATTAINMENT FOR THE POPULATION 25 YEARS AND OVER"
|
||||||
|
)
|
||||||
|
|
||||||
self.FEMALE_HIGH_SCHOOL_ED_FIELD = "PBG026012"
|
self.FEMALE_HIGH_SCHOOL_ED_FIELD = "PBG026012"
|
||||||
self.FEMALE_HIGH_SCHOOL_ED_VI_FIELD = "PCT032028"
|
self.FEMALE_HIGH_SCHOOL_ED_VI_FIELD = "PCT032028"
|
||||||
self.FEMALE_HIGH_SCHOOL_ED_FIELD_NAME = "Total!!Female!!High school graduate, GED, or alternative; "\
|
self.FEMALE_HIGH_SCHOOL_ED_FIELD_NAME = (
|
||||||
|
"Total!!Female!!High school graduate, GED, or alternative; "
|
||||||
"SEX BY EDUCATIONAL ATTAINMENT FOR THE POPULATION 25 YEARS AND OVER"
|
"SEX BY EDUCATIONAL ATTAINMENT FOR THE POPULATION 25 YEARS AND OVER"
|
||||||
|
)
|
||||||
|
|
||||||
self.PERCENTAGE_HIGH_SCHOOL_ED_FIELD_NAME = (
|
self.PERCENTAGE_HIGH_SCHOOL_ED_FIELD_NAME = (
|
||||||
"PERCENTAGE_HIGH_SCHOOL_ED_FIELD_NAME"
|
"PERCENTAGE_HIGH_SCHOOL_ED_FIELD_NAME"
|
||||||
|
|
|
@ -3,6 +3,9 @@
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
|
"id": "4899d2ef",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"import pandas as pd\n",
|
"import pandas as pd\n",
|
||||||
"import censusdata\n",
|
"import censusdata\n",
|
||||||
|
@ -29,28 +32,34 @@
|
||||||
"# Some display settings to make pandas outputs more readable.\n",
|
"# Some display settings to make pandas outputs more readable.\n",
|
||||||
"pd.set_option(\"display.expand_frame_repr\", False)\n",
|
"pd.set_option(\"display.expand_frame_repr\", False)\n",
|
||||||
"pd.set_option(\"display.precision\", 2)"
|
"pd.set_option(\"display.precision\", 2)"
|
||||||
],
|
]
|
||||||
"outputs": [],
|
|
||||||
"metadata": {}
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
|
"id": "4dd8feec",
|
||||||
|
"metadata": {
|
||||||
|
"scrolled": false
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# Following the tutorial at https://jtleider.github.io/censusdata/example1.html.\n",
|
"# Following the tutorial at https://jtleider.github.io/censusdata/example1.html.\n",
|
||||||
"# Full list of fields is at https://www2.census.gov/programs-surveys/acs/summary_file/2019/documentation/user_tools/ACS2019_Table_Shells.xlsx\n",
|
"# Full list of fields is at https://www2.census.gov/programs-surveys/acs/summary_file/2019/documentation/user_tools/ACS2019_Table_Shells.xlsx\n",
|
||||||
"censusdata.printtable(\n",
|
"censusdata.printtable(\n",
|
||||||
" censusdata.censustable(src=\"acs5\", year=ACS_YEAR, table=\"B19013\")\n",
|
" censusdata.censustable(src=\"acs5\", year=ACS_YEAR, table=\"B25077\")\n",
|
||||||
")"
|
")\n",
|
||||||
],
|
"\n",
|
||||||
"outputs": [],
|
"# censusdata.search(src=\"acs5\", year=ACS_YEAR, field='label', criterion='Owner-occupied units!!Median')"
|
||||||
"metadata": {
|
]
|
||||||
"scrolled": true
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
|
"id": "7b40afd3",
|
||||||
|
"metadata": {
|
||||||
|
"scrolled": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def fips_from_censusdata_censusgeo(censusgeo: censusdata.censusgeo) -> str:\n",
|
"def fips_from_censusdata_censusgeo(censusgeo: censusdata.censusgeo) -> str:\n",
|
||||||
" \"\"\"Create a FIPS code from the proprietary censusgeo index.\"\"\"\n",
|
" \"\"\"Create a FIPS code from the proprietary censusgeo index.\"\"\"\n",
|
||||||
|
@ -82,15 +91,16 @@
|
||||||
")\n",
|
")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"df.head()"
|
"df.head()"
|
||||||
],
|
]
|
||||||
"outputs": [],
|
|
||||||
"metadata": {
|
|
||||||
"scrolled": true
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
|
"id": "caa0b502",
|
||||||
|
"metadata": {
|
||||||
|
"scrolled": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"columns_to_include = [\"GEOID2\", \"Median household income (State)\"]\n",
|
"columns_to_include = [\"GEOID2\", \"Median household income (State)\"]\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
@ -103,18 +113,15 @@
|
||||||
")\n",
|
")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# df[columns_to_include].to_csv(path_or_buf= \"/Users/lucas/Documents/usds/repos/justice40-tool/data/data-pipeline/data_pipeline/data/needs_to_be_moved_to_s3/2014_to_2019_state_median_income.csv\", index=False)"
|
"# df[columns_to_include].to_csv(path_or_buf= \"/Users/lucas/Documents/usds/repos/justice40-tool/data/data-pipeline/data_pipeline/data/needs_to_be_moved_to_s3/2014_to_2019_state_median_income.csv\", index=False)"
|
||||||
],
|
]
|
||||||
"outputs": [],
|
|
||||||
"metadata": {
|
|
||||||
"scrolled": true
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"source": [],
|
"id": "f2bddf6a",
|
||||||
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"metadata": {}
|
"source": []
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
|
|
|
@ -86,6 +86,7 @@
|
||||||
"BAD_HEALTH_FIELD = (\n",
|
"BAD_HEALTH_FIELD = (\n",
|
||||||
" \"Physical health not good for >=14 days among adults aged >=18 years\"\n",
|
" \"Physical health not good for >=14 days among adults aged >=18 years\"\n",
|
||||||
")\n",
|
")\n",
|
||||||
|
"MEDIAN_HOUSE_VALUE_FIELD = \"Median value ($) of owner-occupied housing units\"\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Define some suffixes\n",
|
"# Define some suffixes\n",
|
||||||
"POPULATION_SUFFIX = \" (priority population)\""
|
"POPULATION_SUFFIX = \" (priority population)\""
|
||||||
|
@ -186,6 +187,7 @@
|
||||||
" \"Particulate matter (PM2.5) (percentile)\",\n",
|
" \"Particulate matter (PM2.5) (percentile)\",\n",
|
||||||
" \"Traffic proximity and volume (percentile)\",\n",
|
" \"Traffic proximity and volume (percentile)\",\n",
|
||||||
" \"Percent of individuals < 200% Federal Poverty Line (percentile)\",\n",
|
" \"Percent of individuals < 200% Federal Poverty Line (percentile)\",\n",
|
||||||
|
" MEDIAN_HOUSE_VALUE_FIELD,\n",
|
||||||
"]:\n",
|
"]:\n",
|
||||||
" print(f\"\\n~~~~Analysis for field `{field}`~~~~\")\n",
|
" print(f\"\\n~~~~Analysis for field `{field}`~~~~\")\n",
|
||||||
" print(cejst_df[field].describe())\n",
|
" print(cejst_df[field].describe())\n",
|
||||||
|
|
|
@ -114,6 +114,8 @@ OVER_64_FIELD = "Individuals over 64 years old"
|
||||||
# Urban Rural Map
|
# Urban Rural Map
|
||||||
URBAN_HERUISTIC_FIELD = "Urban Heuristic Flag"
|
URBAN_HERUISTIC_FIELD = "Urban Heuristic Flag"
|
||||||
|
|
||||||
|
# Housing value
|
||||||
|
MEDIAN_HOUSE_VALUE_FIELD = "Median value ($) of owner-occupied housing units"
|
||||||
|
|
||||||
# EJSCREEN Areas of Concern
|
# EJSCREEN Areas of Concern
|
||||||
EJSCREEN_AREAS_OF_CONCERN_NATIONAL_70TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
|
EJSCREEN_AREAS_OF_CONCERN_NATIONAL_70TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
|
||||||
|
|
|
@ -11,6 +11,7 @@ class ScoreL(Score):
|
||||||
def __init__(self, df: pd.DataFrame) -> None:
|
def __init__(self, df: pd.DataFrame) -> None:
|
||||||
self.LOW_INCOME_THRESHOLD: float = 0.65
|
self.LOW_INCOME_THRESHOLD: float = 0.65
|
||||||
self.ENVIRONMENTAL_BURDEN_THRESHOLD: float = 0.90
|
self.ENVIRONMENTAL_BURDEN_THRESHOLD: float = 0.90
|
||||||
|
self.MEDIAN_HOUSE_VALUE_THRESHOLD: float = 0.90
|
||||||
super().__init__(df)
|
super().__init__(df)
|
||||||
|
|
||||||
def add_columns(self) -> pd.DataFrame:
|
def add_columns(self) -> pd.DataFrame:
|
||||||
|
@ -135,8 +136,12 @@ class ScoreL(Score):
|
||||||
) & transportation_criteria
|
) & transportation_criteria
|
||||||
|
|
||||||
def _housing_factor(self) -> bool:
|
def _housing_factor(self) -> bool:
|
||||||
|
# (
|
||||||
# In Xth percentile or above for lead paint (Source: Census's American Community Survey’s
|
# In Xth percentile or above for lead paint (Source: Census's American Community Survey’s
|
||||||
# percent of housing units built pre-1960, used as an indicator of potential lead paint exposure in homes)
|
# percent of housing units built pre-1960, used as an indicator of potential lead paint exposure in homes)
|
||||||
|
# AND
|
||||||
|
# In Yth percentile or below for Median House Value (Source: Census's American Community Survey)
|
||||||
|
# )
|
||||||
# or
|
# or
|
||||||
# In Xth percentile or above for housing cost burden (Source: HUD's Comprehensive Housing Affordability Strategy dataset
|
# In Xth percentile or above for housing cost burden (Source: HUD's Comprehensive Housing Affordability Strategy dataset
|
||||||
# AND
|
# AND
|
||||||
|
@ -144,11 +149,20 @@ class ScoreL(Score):
|
||||||
# of households where household income is less than or equal to twice the federal
|
# of households where household income is less than or equal to twice the federal
|
||||||
# poverty level. Source: Census's American Community Survey]
|
# poverty level. Source: Census's American Community Survey]
|
||||||
housing_criteria = (
|
housing_criteria = (
|
||||||
|
(
|
||||||
self.df[
|
self.df[
|
||||||
field_names.LEAD_PAINT_FIELD
|
field_names.LEAD_PAINT_FIELD
|
||||||
+ field_names.PERCENTILE_FIELD_SUFFIX
|
+ field_names.PERCENTILE_FIELD_SUFFIX
|
||||||
]
|
]
|
||||||
> self.ENVIRONMENTAL_BURDEN_THRESHOLD
|
> self.ENVIRONMENTAL_BURDEN_THRESHOLD
|
||||||
|
)
|
||||||
|
& (
|
||||||
|
self.df[
|
||||||
|
field_names.MEDIAN_HOUSE_VALUE_FIELD
|
||||||
|
+ field_names.PERCENTILE_FIELD_SUFFIX
|
||||||
|
]
|
||||||
|
< self.MEDIAN_HOUSE_VALUE_THRESHOLD
|
||||||
|
)
|
||||||
) | (
|
) | (
|
||||||
self.df[
|
self.df[
|
||||||
field_names.HOUSING_BURDEN_FIELD
|
field_names.HOUSING_BURDEN_FIELD
|
||||||
|
|
Loading…
Add table
Reference in a new issue