Add median house value to Definition L (#882)

* Added house value to ETL

* Adding house value to score formula and comp tool
This commit is contained in:
Lucas Merrill Brown 2021-11-13 10:29:23 -05:00 committed by GitHub
parent 54bdda0f02
commit 05ebf9b48c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 90 additions and 48 deletions

View file

@ -312,6 +312,7 @@ class ScoreETL(ExtractTransformLoad):
field_names.HIGH_SCHOOL_ED_FIELD, field_names.HIGH_SCHOOL_ED_FIELD,
field_names.UNEMPLOYMENT_FIELD, field_names.UNEMPLOYMENT_FIELD,
field_names.HT_INDEX_FIELD, field_names.HT_INDEX_FIELD,
field_names.MEDIAN_HOUSE_VALUE_FIELD,
] ]
non_numeric_columns = [ non_numeric_columns = [

View file

@ -50,6 +50,11 @@ class CensusACSETL(ExtractTransformLoad):
"Percent of individuals < 200% Federal Poverty Line" "Percent of individuals < 200% Federal Poverty Line"
) )
self.MEDIAN_HOUSE_VALUE_FIELD = "B25077_001E"
self.MEDIAN_HOUSE_VALUE_FIELD_NAME = (
"Median value ($) of owner-occupied housing units"
)
self.STATE_GEOID_FIELD_NAME = "GEOID2" self.STATE_GEOID_FIELD_NAME = "GEOID2"
self.df: pd.DataFrame self.df: pd.DataFrame
@ -78,7 +83,10 @@ class CensusACSETL(ExtractTransformLoad):
# Employment fields # Employment fields
"B23025_005E", "B23025_005E",
"B23025_003E", "B23025_003E",
# Income field
self.MEDIAN_INCOME_FIELD, self.MEDIAN_INCOME_FIELD,
# House value
self.MEDIAN_HOUSE_VALUE_FIELD,
] ]
+ self.LINGUISTIC_ISOLATION_FIELDS + self.LINGUISTIC_ISOLATION_FIELDS
+ self.POVERTY_FIELDS, + self.POVERTY_FIELDS,
@ -94,22 +102,27 @@ class CensusACSETL(ExtractTransformLoad):
def transform(self) -> None: def transform(self) -> None:
logger.info("Starting Census ACS Transform") logger.info("Starting Census ACS Transform")
# Rename median income # Rename two fields.
self.df[self.MEDIAN_INCOME_FIELD_NAME] = self.df[ self.df = self.df.rename(
self.MEDIAN_INCOME_FIELD columns={
] self.MEDIAN_HOUSE_VALUE_FIELD: self.MEDIAN_HOUSE_VALUE_FIELD_NAME,
self.MEDIAN_INCOME_FIELD: self.MEDIAN_INCOME_FIELD_NAME,
}
)
# Handle null values for CBG median income, which are `-666666666`. # Handle null values for various fields, which are `-666666666`.
missing_value_count = sum( for field in [
self.df[self.MEDIAN_INCOME_FIELD_NAME] == -666666666 self.MEDIAN_INCOME_FIELD_NAME,
) self.MEDIAN_HOUSE_VALUE_FIELD_NAME,
logger.info( ]:
f"There are {missing_value_count} ({int(100*missing_value_count/self.df[self.MEDIAN_INCOME_FIELD_NAME].count())}%) values of " missing_value_count = sum(self.df[field] == -666666666)
+ f"`{self.MEDIAN_INCOME_FIELD_NAME}` being marked as null values." logger.info(
) f"There are {missing_value_count} ({int(100*missing_value_count/self.df[field].count())}%) values of "
self.df[self.MEDIAN_INCOME_FIELD_NAME] = self.df[ + f"`{field}` being marked as null values."
self.MEDIAN_INCOME_FIELD_NAME )
].replace(to_replace=-666666666, value=None) self.df[field] = self.df[field].replace(
to_replace=-666666666, value=None
)
# Calculate percent unemployment. # Calculate percent unemployment.
# TODO: remove small-sample data that should be `None` instead of a high-variance fraction. # TODO: remove small-sample data that should be `None` instead of a high-variance fraction.
@ -133,8 +146,6 @@ class CensusACSETL(ExtractTransformLoad):
/ self.df["C16002_001E"] / self.df["C16002_001E"]
) )
self.df[self.LINGUISTIC_ISOLATION_FIELD_NAME].describe()
# Calculate percent at different poverty thresholds # Calculate percent at different poverty thresholds
self.df[self.POVERTY_LESS_THAN_100_PERCENT_FPL_FIELD_NAME] = ( self.df[self.POVERTY_LESS_THAN_100_PERCENT_FPL_FIELD_NAME] = (
self.df["C17002_002E"] + self.df["C17002_003E"] self.df["C17002_002E"] + self.df["C17002_003E"]
@ -170,6 +181,7 @@ class CensusACSETL(ExtractTransformLoad):
self.POVERTY_LESS_THAN_100_PERCENT_FPL_FIELD_NAME, self.POVERTY_LESS_THAN_100_PERCENT_FPL_FIELD_NAME,
self.POVERTY_LESS_THAN_150_PERCENT_FPL_FIELD_NAME, self.POVERTY_LESS_THAN_150_PERCENT_FPL_FIELD_NAME,
self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME, self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME,
self.MEDIAN_HOUSE_VALUE_FIELD_NAME,
] ]
self.df[columns_to_include].to_csv( self.df[columns_to_include].to_csv(

View file

@ -56,13 +56,17 @@ class CensusDecennialETL(ExtractTransformLoad):
self.MALE_HIGH_SCHOOL_ED_FIELD = "PBG026005" self.MALE_HIGH_SCHOOL_ED_FIELD = "PBG026005"
self.MALE_HIGH_SCHOOL_ED_VI_FIELD = "PCT032011" self.MALE_HIGH_SCHOOL_ED_VI_FIELD = "PCT032011"
self.MALE_HIGH_SCHOOL_ED_FIELD_NAME = "Total!!Male!!High school graduate, GED, or alternative; "\ self.MALE_HIGH_SCHOOL_ED_FIELD_NAME = (
"Total!!Male!!High school graduate, GED, or alternative; "
"SEX BY EDUCATIONAL ATTAINMENT FOR THE POPULATION 25 YEARS AND OVER" "SEX BY EDUCATIONAL ATTAINMENT FOR THE POPULATION 25 YEARS AND OVER"
)
self.FEMALE_HIGH_SCHOOL_ED_FIELD = "PBG026012" self.FEMALE_HIGH_SCHOOL_ED_FIELD = "PBG026012"
self.FEMALE_HIGH_SCHOOL_ED_VI_FIELD = "PCT032028" self.FEMALE_HIGH_SCHOOL_ED_VI_FIELD = "PCT032028"
self.FEMALE_HIGH_SCHOOL_ED_FIELD_NAME = "Total!!Female!!High school graduate, GED, or alternative; "\ self.FEMALE_HIGH_SCHOOL_ED_FIELD_NAME = (
"SEX BY EDUCATIONAL ATTAINMENT FOR THE POPULATION 25 YEARS AND OVER" "Total!!Female!!High school graduate, GED, or alternative; "
"SEX BY EDUCATIONAL ATTAINMENT FOR THE POPULATION 25 YEARS AND OVER"
)
self.PERCENTAGE_HIGH_SCHOOL_ED_FIELD_NAME = ( self.PERCENTAGE_HIGH_SCHOOL_ED_FIELD_NAME = (
"PERCENTAGE_HIGH_SCHOOL_ED_FIELD_NAME" "PERCENTAGE_HIGH_SCHOOL_ED_FIELD_NAME"

View file

@ -3,6 +3,9 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"id": "4899d2ef",
"metadata": {},
"outputs": [],
"source": [ "source": [
"import pandas as pd\n", "import pandas as pd\n",
"import censusdata\n", "import censusdata\n",
@ -29,28 +32,34 @@
"# Some display settings to make pandas outputs more readable.\n", "# Some display settings to make pandas outputs more readable.\n",
"pd.set_option(\"display.expand_frame_repr\", False)\n", "pd.set_option(\"display.expand_frame_repr\", False)\n",
"pd.set_option(\"display.precision\", 2)" "pd.set_option(\"display.precision\", 2)"
], ]
"outputs": [],
"metadata": {}
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"id": "4dd8feec",
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [ "source": [
"# Following the tutorial at https://jtleider.github.io/censusdata/example1.html.\n", "# Following the tutorial at https://jtleider.github.io/censusdata/example1.html.\n",
"# Full list of fields is at https://www2.census.gov/programs-surveys/acs/summary_file/2019/documentation/user_tools/ACS2019_Table_Shells.xlsx\n", "# Full list of fields is at https://www2.census.gov/programs-surveys/acs/summary_file/2019/documentation/user_tools/ACS2019_Table_Shells.xlsx\n",
"censusdata.printtable(\n", "censusdata.printtable(\n",
" censusdata.censustable(src=\"acs5\", year=ACS_YEAR, table=\"B19013\")\n", " censusdata.censustable(src=\"acs5\", year=ACS_YEAR, table=\"B25077\")\n",
")" ")\n",
], "\n",
"outputs": [], "# censusdata.search(src=\"acs5\", year=ACS_YEAR, field='label', criterion='Owner-occupied units!!Median')"
"metadata": { ]
"scrolled": true
}
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"id": "7b40afd3",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [ "source": [
"def fips_from_censusdata_censusgeo(censusgeo: censusdata.censusgeo) -> str:\n", "def fips_from_censusdata_censusgeo(censusgeo: censusdata.censusgeo) -> str:\n",
" \"\"\"Create a FIPS code from the proprietary censusgeo index.\"\"\"\n", " \"\"\"Create a FIPS code from the proprietary censusgeo index.\"\"\"\n",
@ -82,15 +91,16 @@
")\n", ")\n",
"\n", "\n",
"df.head()" "df.head()"
], ]
"outputs": [],
"metadata": {
"scrolled": true
}
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"id": "caa0b502",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [ "source": [
"columns_to_include = [\"GEOID2\", \"Median household income (State)\"]\n", "columns_to_include = [\"GEOID2\", \"Median household income (State)\"]\n",
"\n", "\n",
@ -103,18 +113,15 @@
")\n", ")\n",
"\n", "\n",
"# df[columns_to_include].to_csv(path_or_buf= \"/Users/lucas/Documents/usds/repos/justice40-tool/data/data-pipeline/data_pipeline/data/needs_to_be_moved_to_s3/2014_to_2019_state_median_income.csv\", index=False)" "# df[columns_to_include].to_csv(path_or_buf= \"/Users/lucas/Documents/usds/repos/justice40-tool/data/data-pipeline/data_pipeline/data/needs_to_be_moved_to_s3/2014_to_2019_state_median_income.csv\", index=False)"
], ]
"outputs": [],
"metadata": {
"scrolled": true
}
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"source": [], "id": "f2bddf6a",
"metadata": {},
"outputs": [], "outputs": [],
"metadata": {} "source": []
} }
], ],
"metadata": { "metadata": {

View file

@ -86,6 +86,7 @@
"BAD_HEALTH_FIELD = (\n", "BAD_HEALTH_FIELD = (\n",
" \"Physical health not good for >=14 days among adults aged >=18 years\"\n", " \"Physical health not good for >=14 days among adults aged >=18 years\"\n",
")\n", ")\n",
"MEDIAN_HOUSE_VALUE_FIELD = \"Median value ($) of owner-occupied housing units\"\n",
"\n", "\n",
"# Define some suffixes\n", "# Define some suffixes\n",
"POPULATION_SUFFIX = \" (priority population)\"" "POPULATION_SUFFIX = \" (priority population)\""
@ -186,6 +187,7 @@
" \"Particulate matter (PM2.5) (percentile)\",\n", " \"Particulate matter (PM2.5) (percentile)\",\n",
" \"Traffic proximity and volume (percentile)\",\n", " \"Traffic proximity and volume (percentile)\",\n",
" \"Percent of individuals < 200% Federal Poverty Line (percentile)\",\n", " \"Percent of individuals < 200% Federal Poverty Line (percentile)\",\n",
" MEDIAN_HOUSE_VALUE_FIELD,\n",
"]:\n", "]:\n",
" print(f\"\\n~~~~Analysis for field `{field}`~~~~\")\n", " print(f\"\\n~~~~Analysis for field `{field}`~~~~\")\n",
" print(cejst_df[field].describe())\n", " print(cejst_df[field].describe())\n",

View file

@ -114,6 +114,8 @@ OVER_64_FIELD = "Individuals over 64 years old"
# Urban Rural Map # Urban Rural Map
URBAN_HERUISTIC_FIELD = "Urban Heuristic Flag" URBAN_HERUISTIC_FIELD = "Urban Heuristic Flag"
# Housing value
MEDIAN_HOUSE_VALUE_FIELD = "Median value ($) of owner-occupied housing units"
# EJSCREEN Areas of Concern # EJSCREEN Areas of Concern
EJSCREEN_AREAS_OF_CONCERN_NATIONAL_70TH_PERCENTILE_COMMUNITIES_FIELD_NAME = ( EJSCREEN_AREAS_OF_CONCERN_NATIONAL_70TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (

View file

@ -11,6 +11,7 @@ class ScoreL(Score):
def __init__(self, df: pd.DataFrame) -> None: def __init__(self, df: pd.DataFrame) -> None:
self.LOW_INCOME_THRESHOLD: float = 0.65 self.LOW_INCOME_THRESHOLD: float = 0.65
self.ENVIRONMENTAL_BURDEN_THRESHOLD: float = 0.90 self.ENVIRONMENTAL_BURDEN_THRESHOLD: float = 0.90
self.MEDIAN_HOUSE_VALUE_THRESHOLD: float = 0.90
super().__init__(df) super().__init__(df)
def add_columns(self) -> pd.DataFrame: def add_columns(self) -> pd.DataFrame:
@ -135,8 +136,12 @@ class ScoreL(Score):
) & transportation_criteria ) & transportation_criteria
def _housing_factor(self) -> bool: def _housing_factor(self) -> bool:
# (
# In Xth percentile or above for lead paint (Source: Census's American Community Surveys # In Xth percentile or above for lead paint (Source: Census's American Community Surveys
# percent of housing units built pre-1960, used as an indicator of potential lead paint exposure in homes) # percent of housing units built pre-1960, used as an indicator of potential lead paint exposure in homes)
# AND
# In Yth percentile or below for Median House Value (Source: Census's American Community Survey)
# )
# or # or
# In Xth percentile or above for housing cost burden (Source: HUD's Comprehensive Housing Affordability Strategy dataset) # In Xth percentile or above for housing cost burden (Source: HUD's Comprehensive Housing Affordability Strategy dataset)
# AND # AND
@ -144,11 +149,20 @@ class ScoreL(Score):
# of households where household income is less than or equal to twice the federal # of households where household income is less than or equal to twice the federal
# poverty level. Source: Census's American Community Survey] # poverty level. Source: Census's American Community Survey]
housing_criteria = ( housing_criteria = (
self.df[ (
field_names.LEAD_PAINT_FIELD self.df[
+ field_names.PERCENTILE_FIELD_SUFFIX field_names.LEAD_PAINT_FIELD
] + field_names.PERCENTILE_FIELD_SUFFIX
> self.ENVIRONMENTAL_BURDEN_THRESHOLD ]
> self.ENVIRONMENTAL_BURDEN_THRESHOLD
)
& (
self.df[
field_names.MEDIAN_HOUSE_VALUE_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX
]
< self.MEDIAN_HOUSE_VALUE_THRESHOLD
)
) | ( ) | (
self.df[ self.df[
field_names.HOUSING_BURDEN_FIELD field_names.HOUSING_BURDEN_FIELD