Issue 242: Add HOLC Grades to data inputs (#978)

* Add mapping inequality data to data inputs

* Add mapping inequality data to comparison tool
This commit is contained in:
Lucas Merrill Brown 2021-12-04 12:23:01 -05:00 committed by GitHub
parent 1d101c93d2
commit c5dff6e5f7
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 317 additions and 15 deletions

View file

@ -49,6 +49,11 @@ DATASET_LIST = [
"module_dir": "geocorr",
"class_name": "GeoCorrETL",
},
{
"name": "mapping_inequality",
"module_dir": "mapping_inequality",
"class_name": "MappingInequalityETL",
},
{
"name": "persistent_poverty",
"module_dir": "persistent_poverty",

View file

@ -1,7 +1,9 @@
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.sources.census_acs.etl_utils import retrieve_census_acs_data
from data_pipeline.etl.sources.census_acs.etl_utils import (
retrieve_census_acs_data,
)
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)

View file

@ -9,9 +9,7 @@ from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
def _fips_from_censusdata_censusgeo(
censusgeo: censusdata.censusgeo
) -> str:
def _fips_from_censusdata_censusgeo(censusgeo: censusdata.censusgeo) -> str:
    """Build a FIPS code string from a `censusdata` censusgeo index entry.

    The pairs returned by ``censusgeo.params()`` are walked in order and
    their values are concatenated; the keys of each pair are ignored.
    """
    return "".join(part for _, part in censusgeo.params())
@ -19,12 +17,12 @@ def _fips_from_censusdata_censusgeo(
# pylint: disable=too-many-arguments
def retrieve_census_acs_data(
acs_year: int,
variables: List[str],
tract_output_field_name: str,
data_path_for_fips_codes: Path,
acs_type="acs5",
raise_errors: bool = False,
acs_year: int,
variables: List[str],
tract_output_field_name: str,
data_path_for_fips_codes: Path,
acs_type="acs5",
raise_errors: bool = False,
) -> pd.DataFrame:
"""Retrieves and combines census ACS data for a given year."""
dfs = []

View file

@ -1,7 +1,9 @@
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.sources.census_acs.etl_utils import retrieve_census_acs_data
from data_pipeline.etl.sources.census_acs.etl_utils import (
retrieve_census_acs_data,
)
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)

View file

@ -0,0 +1,40 @@
city,holc_id,HOLC Grade (manually mapped)
Providence,25,D
Providence,26,D
Oklahoma City,46R,D
Oklahoma City,47R,D
Oklahoma City,48R,D
Oklahoma City,49R,D
Oklahoma City,50R,D
Oklahoma City,51R,D
Oklahoma City,52R,D
Oklahoma City,53R,D
Oklahoma City,54R,D
Oklahoma City,55R,D
Oklahoma City,56R,D
Oklahoma City,57R,D
Oklahoma City,58R,D
Oklahoma City,59R,D
Oklahoma City,60R,D
Oklahoma City,61R,D
Oklahoma City,62B,D
Oklahoma City,63R,D
Oklahoma City,64R,D
Oklahoma City,65R,D
Oklahoma City,66R,D
Oklahoma City,67R,D
Oklahoma City,68R,D
Oklahoma City,69R,D
Oklahoma City,70R,D
Oklahoma City,80R,D
Oklahoma City,81R,D
Oklahoma City,85R,D
Oklahoma City,86R,D
Oklahoma City,87R,D
Oklahoma City,88R,D
Oklahoma City,89R,D
Oklahoma City,90R,D
Milwaukee Co.,S-D1,D
Milwaukee Co.,S-D2,D
Milwaukee Co.,S-D3,D
Milwaukee Co.,S-D4,D
1 city holc_id HOLC Grade (manually mapped)
2 Providence 25 D
3 Providence 26 D
4 Oklahoma City 46R D
5 Oklahoma City 47R D
6 Oklahoma City 48R D
7 Oklahoma City 49R D
8 Oklahoma City 50R D
9 Oklahoma City 51R D
10 Oklahoma City 52R D
11 Oklahoma City 53R D
12 Oklahoma City 54R D
13 Oklahoma City 55R D
14 Oklahoma City 56R D
15 Oklahoma City 57R D
16 Oklahoma City 58R D
17 Oklahoma City 59R D
18 Oklahoma City 60R D
19 Oklahoma City 61R D
20 Oklahoma City 62B D
21 Oklahoma City 63R D
22 Oklahoma City 64R D
23 Oklahoma City 65R D
24 Oklahoma City 66R D
25 Oklahoma City 67R D
26 Oklahoma City 68R D
27 Oklahoma City 69R D
28 Oklahoma City 70R D
29 Oklahoma City 80R D
30 Oklahoma City 81R D
31 Oklahoma City 85R D
32 Oklahoma City 86R D
33 Oklahoma City 87R D
34 Oklahoma City 88R D
35 Oklahoma City 89R D
36 Oklahoma City 90R D
37 Milwaukee Co. S-D1 D
38 Milwaukee Co. S-D2 D
39 Milwaukee Co. S-D3 D
40 Milwaukee Co. S-D4 D

View file

@ -0,0 +1,177 @@
import pathlib
import numpy as np
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.score import field_names
from data_pipeline.utils import download_file_from_url, get_module_logger
logger = get_module_logger(__name__)
class MappingInequalityETL(ExtractTransformLoad):
    """Load Mapping Inequality data and summarize HOLC grade D coverage by tract.

    Information on the source data is available at
    https://dsl.richmond.edu/panorama/redlining/.

    Information on the mapping of this data to census tracts is available at
    https://github.com/americanpanorama/Census_HOLC_Research.

    The pipeline downloads the tract/HOLC-polygon lookup CSV, determines which
    rows are HOLC grade D (either from the first character of the HOLC ID or
    from a small manually maintained mapping file), sums the tract proportion
    covered by grade D per tract, and writes one row per tract that has any
    grade D coverage.
    """

    def __init__(self):
        self.MAPPING_INEQUALITY_CSV_URL = (
            "https://raw.githubusercontent.com/americanpanorama/Census_HOLC_Research/"
            "main/2010_Census_Tracts/holc_tract_lookup.csv"
        )
        self.MAPPING_INEQUALITY_CSV = self.TMP_PATH / "holc_tract_lookup.csv"
        self.CSV_PATH = self.DATA_PATH / "dataset" / "mapping_inequality"

        # Hand-curated grades for HOLC IDs whose first character is not the
        # grade; the file lives next to this module.
        self.HOLC_MANUAL_MAPPING_CSV_PATH = (
            pathlib.Path(__file__).parent
            / "data"
            / "holc_grades_manually_mapped.csv"
        )

        # Some input field names. From documentation: 'Census Tracts were intersected
        # with HOLC Polygons. Census information can be joined via the "geoid" field.
        # There are two field "holc_prop" and "tract_prop" which give the proportion
        # of the HOLC polygon in the Census Tract and the proportion of Census Tract
        # in the HOLC Polygon respectively.'
        # https://github.com/americanpanorama/Census_HOLC_Research/blob/main/2010_Census_Tracts/README.md
        self.TRACT_INPUT_FIELD: str = "geoid"
        self.TRACT_PROPORTION_FIELD: str = "tract_prop"
        self.HOLC_GRADE_AND_ID_FIELD: str = "holc_id"
        self.CITY_INPUT_FIELD: str = "city"

        # Intermediate column names created during `transform`.
        self.HOLC_GRADE_D_FIELD: str = "HOLC Grade D"
        self.HOLC_GRADE_MANUAL_FIELD: str = "HOLC Grade (manually mapped)"
        self.HOLC_GRADE_DERIVED_FIELD: str = "HOLC Grade (derived)"

        # Columns written to the output CSV by `load`.
        self.COLUMNS_TO_KEEP = [
            self.GEOID_TRACT_FIELD_NAME,
            field_names.HOLC_GRADE_D_TRACT_PERCENT_FIELD,
            field_names.HOLC_GRADE_D_TRACT_20_PERCENT_FIELD,
            field_names.HOLC_GRADE_D_TRACT_50_PERCENT_FIELD,
            field_names.HOLC_GRADE_D_TRACT_75_PERCENT_FIELD,
        ]

        self.df: pd.DataFrame

    def extract(self) -> None:
        """Download the tract/HOLC lookup CSV to the temporary directory."""
        logger.info("Downloading Mapping Inequality Data")
        download_file_from_url(
            file_url=self.MAPPING_INEQUALITY_CSV_URL,
            download_file_name=self.MAPPING_INEQUALITY_CSV,
        )

    def transform(self) -> None:
        """Compute, per tract, the proportion that is HOLC grade D.

        The result is stored in ``self.df`` with one row per tract that has at
        least one grade D row in the input; tracts without any grade D
        coverage are not present in the output.
        """
        logger.info("Transforming Mapping Inequality Data")

        # Read the join keys explicitly as strings so that the `.str` accessor
        # and the merge below do not depend on pandas' dtype inference (HOLC
        # IDs such as "25" look numeric).
        df: pd.DataFrame = pd.read_csv(
            self.MAPPING_INEQUALITY_CSV,
            dtype={
                self.TRACT_INPUT_FIELD: "string",
                self.HOLC_GRADE_AND_ID_FIELD: str,
                self.CITY_INPUT_FIELD: str,
            },
            low_memory=False,
        )

        # rename Tract ID
        df = df.rename(
            columns={
                self.TRACT_INPUT_FIELD: self.GEOID_TRACT_FIELD_NAME,
            },
        )

        # Keep the first character, which is the HOLC grade (A, B, C, D).
        # TODO: investigate why this dataframe triggers these pylint errors.
        # pylint: disable=unsupported-assignment-operation, unsubscriptable-object
        df[self.HOLC_GRADE_DERIVED_FIELD] = df[
            self.HOLC_GRADE_AND_ID_FIELD
        ].str[0:1]

        # Remove nonsense when the field has no grade or invalid grades.
        valid_grades = ["A", "B", "C", "D"]
        df.loc[
            # pylint: disable=unsubscriptable-object
            ~df[self.HOLC_GRADE_DERIVED_FIELD].isin(valid_grades),
            self.HOLC_GRADE_DERIVED_FIELD,
        ] = None

        # Some data needs to be manually mapped to its grade.
        # TODO: Investigate more data that may need to be manually mapped.
        holc_manually_mapped_df = pd.read_csv(
            filepath_or_buffer=self.HOLC_MANUAL_MAPPING_CSV_PATH,
            dtype={
                self.HOLC_GRADE_AND_ID_FIELD: str,
                self.CITY_INPUT_FIELD: str,
            },
            low_memory=False,
        )

        # Join on the existing data
        merged_df = df.merge(
            right=holc_manually_mapped_df,
            on=[self.HOLC_GRADE_AND_ID_FIELD, self.CITY_INPUT_FIELD],
            how="left",
        )

        # Combine the 'derived' grade D field with the manually mapped grade D
        # field into a single, real boolean flag. (Comparisons against missing
        # values evaluate to False, so the flag never contains nulls.)
        derived_is_d = merged_df[self.HOLC_GRADE_DERIVED_FIELD] == "D"
        manual_is_d = merged_df[self.HOLC_GRADE_MANUAL_FIELD] == "D"
        merged_df[self.HOLC_GRADE_D_FIELD] = derived_is_d | manual_is_d

        # Sum the tract proportion separately for the grade D and the
        # non-grade-D parts of each tract.
        grouped_df = (
            merged_df.groupby(
                by=[
                    self.GEOID_TRACT_FIELD_NAME,
                    self.HOLC_GRADE_D_FIELD,
                ],
                # Do not silently drop rows whose tract ID is missing.
                dropna=False,
            )[self.TRACT_PROPORTION_FIELD]
            .sum()
            .reset_index()
        )

        # Create a field that is only the percent that is grade D.
        grouped_df[field_names.HOLC_GRADE_D_TRACT_PERCENT_FIELD] = np.where(
            grouped_df[self.HOLC_GRADE_D_FIELD],
            grouped_df[self.TRACT_PROPORTION_FIELD],
            0,
        )

        # Calculate some specific threshold cutoffs, for convenience.
        grouped_df[field_names.HOLC_GRADE_D_TRACT_20_PERCENT_FIELD] = (
            grouped_df[field_names.HOLC_GRADE_D_TRACT_PERCENT_FIELD] > 0.2
        )
        grouped_df[field_names.HOLC_GRADE_D_TRACT_50_PERCENT_FIELD] = (
            grouped_df[field_names.HOLC_GRADE_D_TRACT_PERCENT_FIELD] > 0.5
        )
        grouped_df[field_names.HOLC_GRADE_D_TRACT_75_PERCENT_FIELD] = (
            grouped_df[field_names.HOLC_GRADE_D_TRACT_PERCENT_FIELD] > 0.75
        )

        # Keep only the grade D rows -- we only want one row per tract for
        # future joins. Note this means not all tracts will be in this data.
        # Sorting returns a new frame, which avoids an in-place sort on a
        # filtered slice.
        self.df = grouped_df[grouped_df[self.HOLC_GRADE_D_FIELD]].sort_values(
            by=self.GEOID_TRACT_FIELD_NAME
        )

    def load(self) -> None:
        """Write the columns in ``COLUMNS_TO_KEEP`` to ``usa.csv``."""
        logger.info("Saving Mapping Inequality CSV")

        # write nationwide csv
        self.CSV_PATH.mkdir(parents=True, exist_ok=True)
        self.df[self.COLUMNS_TO_KEEP].to_csv(
            self.CSV_PATH / "usa.csv", index=False
        )

View file

@ -3,6 +3,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "71c4acd0",
"metadata": {
"scrolled": true
},
@ -48,6 +49,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "2ce3170c",
"metadata": {
"scrolled": true
},
@ -79,6 +81,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "8bd39090",
"metadata": {
"scrolled": true
},
@ -105,6 +108,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "a251a0fb",
"metadata": {},
"outputs": [],
"source": [
@ -138,6 +142,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "e43a9e23",
"metadata": {},
"outputs": [],
"source": [
@ -160,6 +165,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "38c0dc2f",
"metadata": {
"scrolled": false
},
@ -186,8 +192,9 @@
{
"cell_type": "code",
"execution_count": null,
"id": "8c3e462c",
"metadata": {
"scrolled": false
"scrolled": true
},
"outputs": [],
"source": [
@ -215,6 +222,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "d8ec43dc",
"metadata": {},
"outputs": [],
"source": [
@ -247,13 +255,43 @@
{
"cell_type": "code",
"execution_count": null,
"id": "81826d29",
"metadata": {},
"outputs": [],
"source": [
"# Load mapping inequality data\n",
"HOLC_FACTORS = [\n",
" field_names.HOLC_GRADE_D_TRACT_20_PERCENT_FIELD,\n",
" field_names.HOLC_GRADE_D_TRACT_50_PERCENT_FIELD,\n",
" field_names.HOLC_GRADE_D_TRACT_75_PERCENT_FIELD,\n",
"]\n",
"mapping_inequality_path = (\n",
" DATA_DIR / \"dataset\" / \"mapping_inequality\" / \"usa.csv\"\n",
")\n",
"mapping_inequality_df = pd.read_csv(\n",
" mapping_inequality_path,\n",
" dtype={ExtractTransformLoad.GEOID_TRACT_FIELD_NAME: \"string\"},\n",
")\n",
"\n",
"mapping_inequality_df"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "65659c26",
"metadata": {
"scrolled": false
"scrolled": true
},
"outputs": [],
"source": [
"# Join all dataframes that use tracts\n",
"census_tract_dfs = [cejst_df, calenviroscreen_df, persistent_poverty_df]\n",
"census_tract_dfs = [\n",
" cejst_df,\n",
" calenviroscreen_df,\n",
" persistent_poverty_df,\n",
" mapping_inequality_df,\n",
"]\n",
"\n",
"merged_df = functools.reduce(\n",
" lambda left, right: pd.merge(\n",
@ -281,6 +319,23 @@
{
"cell_type": "code",
"execution_count": null,
"id": "2de78f71",
"metadata": {},
"outputs": [],
"source": [
"# Special handling for HOLC.\n",
"# Fill in the null HOLC values as `False`. Otherwise the comparison tool will not run comparisons in states\n",
"# without HOLC scores, and for HOLC, we'd like to see it across the whole US.\n",
"for holc_factor in HOLC_FACTORS:\n",
" merged_df[holc_factor] = merged_df[holc_factor].fillna(False)\n",
"\n",
"merged_df[HOLC_FACTORS].head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "980c0f66",
"metadata": {
"scrolled": true
},
@ -377,6 +432,16 @@
" other_census_tract_fields_to_keep=[],\n",
" ),\n",
" ]\n",
" # Insert indices for each of the HOLC factors.\n",
" # Note: since these involve no renaming, we write them using list comprehension.\n",
" + [\n",
" Index(\n",
" method_name=factor,\n",
" priority_communities_field=factor,\n",
" other_census_tract_fields_to_keep=[],\n",
" )\n",
" for factor in HOLC_FACTORS\n",
" ]\n",
")\n",
"\n",
"\n",
@ -429,6 +494,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "4b510cb1",
"metadata": {
"scrolled": true
},
@ -711,6 +777,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "2bcbcabf",
"metadata": {},
"outputs": [],
"source": [
@ -816,6 +883,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "d1eec560",
"metadata": {
"scrolled": true
},
@ -1014,6 +1082,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "48005fad",
"metadata": {
"scrolled": true
},
@ -1190,6 +1259,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "7d095ebd",
"metadata": {},
"outputs": [],
"source": [

View file

@ -126,7 +126,9 @@ CENSUS_DECENNIAL_POVERTY_LESS_THAN_100_FPL_FIELD_2009 = (
"Percentage households below 100% of federal poverty line in 2009"
)
CENSUS_DECENNIAL_HIGH_SCHOOL_ED_FIELD_2009 = "Percent individuals age 25 or over with less than high school degree in 2009"
CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009 = "Unemployed civilians (percent) in 2009"
CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009 = (
"Unemployed civilians (percent) in 2009"
)
CENSUS_DECENNIAL_TOTAL_POPULATION_FIELD_2009 = "Total population in 2009"
# Fields from 2010 ACS (loaded for comparison with the territories)
@ -188,3 +190,9 @@ EJSCREEN_AREAS_OF_CONCERN_STATE_90TH_PERCENTILE_COMMUNITIES_FIELD = (
EJSCREEN_AREAS_OF_CONCERN_STATE_95TH_PERCENTILE_COMMUNITIES_FIELD = (
"EJSCREEN Areas of Concern, State, 95th percentile (communities)"
)
# Mapping inequality data.
# Output column names for the HOLC grade D summary produced per census tract.
# NOTE(review): despite "Percent" in the label, the threshold columns below are
# described as ">20%", ">50%" and ">75%", which suggests this value is compared
# as a 0-1 fraction -- confirm against the ETL that fills it.
HOLC_GRADE_D_TRACT_PERCENT_FIELD: str = "Percent of tract that is HOLC Grade D"
# Flag columns: whether the grade D share of the tract exceeds the named cutoff.
HOLC_GRADE_D_TRACT_20_PERCENT_FIELD: str = "Tract is >20% HOLC Grade D"
HOLC_GRADE_D_TRACT_50_PERCENT_FIELD: str = "Tract is >50% HOLC Grade D"
HOLC_GRADE_D_TRACT_75_PERCENT_FIELD: str = "Tract is >75% HOLC Grade D"