diff --git a/data/data-pipeline/data_pipeline/etl/constants.py b/data/data-pipeline/data_pipeline/etl/constants.py index 2c8b1141..36487964 100644 --- a/data/data-pipeline/data_pipeline/etl/constants.py +++ b/data/data-pipeline/data_pipeline/etl/constants.py @@ -49,6 +49,11 @@ DATASET_LIST = [ "module_dir": "geocorr", "class_name": "GeoCorrETL", }, + { + "name": "mapping_inequality", + "module_dir": "mapping_inequality", + "class_name": "MappingInequalityETL", + }, { "name": "persistent_poverty", "module_dir": "persistent_poverty", diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py index 8e4f8c8d..8d15e66c 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py @@ -1,7 +1,9 @@ import pandas as pd from data_pipeline.etl.base import ExtractTransformLoad -from data_pipeline.etl.sources.census_acs.etl_utils import retrieve_census_acs_data +from data_pipeline.etl.sources.census_acs.etl_utils import ( + retrieve_census_acs_data, +) from data_pipeline.utils import get_module_logger logger = get_module_logger(__name__) diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl_utils.py b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl_utils.py index ce3c901c..2997e84c 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl_utils.py +++ b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl_utils.py @@ -9,9 +9,7 @@ from data_pipeline.utils import get_module_logger logger = get_module_logger(__name__) -def _fips_from_censusdata_censusgeo( - censusgeo: censusdata.censusgeo -) -> str: +def _fips_from_censusdata_censusgeo(censusgeo: censusdata.censusgeo) -> str: """Create a FIPS code from the proprietary censusgeo index.""" fips = "".join([value for (key, value) in censusgeo.params()]) return fips @@ -19,12 +17,12 @@ def _fips_from_censusdata_censusgeo( # pylint: disable=too-many-arguments def retrieve_census_acs_data( - acs_year: int, - variables: List[str], - tract_output_field_name: str, - data_path_for_fips_codes: Path, - acs_type="acs5", - raise_errors: bool = False, + acs_year: int, + variables: List[str], + tract_output_field_name: str, + data_path_for_fips_codes: Path, + acs_type="acs5", + raise_errors: bool = False, ) -> pd.DataFrame: """Retrieves and combines census ACS data for a given year.""" dfs = [] diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_acs_2010/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census_acs_2010/etl.py index 870b8a7f..4095f60a 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/census_acs_2010/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/census_acs_2010/etl.py @@ -1,7 +1,9 @@ import pandas as pd from data_pipeline.etl.base import ExtractTransformLoad -from data_pipeline.etl.sources.census_acs.etl_utils import retrieve_census_acs_data +from data_pipeline.etl.sources.census_acs.etl_utils import ( + retrieve_census_acs_data, +) from data_pipeline.utils import get_module_logger logger = get_module_logger(__name__) diff --git a/data/data-pipeline/data_pipeline/etl/sources/doe_energy_burden/etl.py b/data/data-pipeline/data_pipeline/etl/sources/doe_energy_burden/etl.py index 358061ce..7889d2f6 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/doe_energy_burden/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/doe_energy_burden/etl.py @@ -43,7 +43,7 @@ class 
DOEEnergyBurden(ExtractTransformLoad): self.raw_df = pd.read_csv( filepath_or_buffer=self.TMP_PATH / "doe_energy_burden" - / "DOE_LEAD_with_EJSCREEN.csv", + / "DOE_LEAD_AMI_TRACT_2018_ALL.csv", # The following need to remain as strings for all of their digits, not get converted to numbers. dtype={ self.TRACT_INPUT_COLUMN_NAME: "string", @@ -56,7 +56,7 @@ class DOEEnergyBurden(ExtractTransformLoad): output_df = self.raw_df.rename( columns={ - "AvgEnergyBurden": self.ENERGY_BURDEN_FIELD_NAME, + "BURDEN": self.ENERGY_BURDEN_FIELD_NAME, self.TRACT_INPUT_COLUMN_NAME: self.GEOID_TRACT_FIELD_NAME, } ) @@ -78,7 +78,7 @@ class DOEEnergyBurden(ExtractTransformLoad): def load(self) -> None: logger.info("Saving DOE Energy Burden CSV") - + self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True) self.output_df[self.COLUMNS_TO_KEEP].to_csv( path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False diff --git a/data/data-pipeline/data_pipeline/etl/sources/mapping_inequality/README.md b/data/data-pipeline/data_pipeline/etl/sources/mapping_inequality/README.md new file mode 100644 index 00000000..e69de29b diff --git a/data/data-pipeline/data_pipeline/etl/sources/mapping_inequality/__init__.py b/data/data-pipeline/data_pipeline/etl/sources/mapping_inequality/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/data/data-pipeline/data_pipeline/etl/sources/mapping_inequality/data/holc_grades_manually_mapped.csv b/data/data-pipeline/data_pipeline/etl/sources/mapping_inequality/data/holc_grades_manually_mapped.csv new file mode 100644 index 00000000..b4bbe7d0 --- /dev/null +++ b/data/data-pipeline/data_pipeline/etl/sources/mapping_inequality/data/holc_grades_manually_mapped.csv @@ -0,0 +1,40 @@ +city,holc_id,HOLC Grade (manually mapped) +Providence,25,D +Providence,26,D +Oklahoma City,46R,D +Oklahoma City,47R,D +Oklahoma City,48R,D +Oklahoma City,49R,D +Oklahoma City,50R,D +Oklahoma City,51R,D +Oklahoma City,52R,D +Oklahoma City,53R,D +Oklahoma City,54R,D +Oklahoma City,55R,D +Oklahoma City,56R,D +Oklahoma City,57R,D +Oklahoma City,58R,D +Oklahoma City,59R,D +Oklahoma City,60R,D +Oklahoma City,61R,D +Oklahoma City,62B,D +Oklahoma City,63R,D +Oklahoma City,64R,D +Oklahoma City,65R,D +Oklahoma City,66R,D +Oklahoma City,67R,D +Oklahoma City,68R,D +Oklahoma City,69R,D +Oklahoma City,70R,D +Oklahoma City,80R,D +Oklahoma City,81R,D +Oklahoma City,85R,D +Oklahoma City,86R,D +Oklahoma City,87R,D +Oklahoma City,88R,D +Oklahoma City,89R,D +Oklahoma City,90R,D +Milwaukee Co.,S-D1,D +Milwaukee Co.,S-D2,D +Milwaukee Co.,S-D3,D +Milwaukee Co.,S-D4,D \ No newline at end of file diff --git a/data/data-pipeline/data_pipeline/etl/sources/mapping_inequality/etl.py b/data/data-pipeline/data_pipeline/etl/sources/mapping_inequality/etl.py new file mode 100644 index 00000000..732ab594 --- /dev/null +++ b/data/data-pipeline/data_pipeline/etl/sources/mapping_inequality/etl.py @@ -0,0 +1,177 @@ +import pathlib +import numpy as np +import pandas as pd + +from data_pipeline.etl.base import ExtractTransformLoad +from data_pipeline.score import field_names +from data_pipeline.utils import download_file_from_url, get_module_logger + +logger = get_module_logger(__name__) + + +class MappingInequalityETL(ExtractTransformLoad): + """Load Mapping Inequality data. + + Information on the source data is available at + https://dsl.richmond.edu/panorama/redlining/. + + Information on the mapping of this data to census tracts is available at + https://github.com/americanpanorama/Census_HOLC_Research. 
+ + """ + + def __init__(self): + self.MAPPING_INEQUALITY_CSV_URL = ( + "https://raw.githubusercontent.com/americanpanorama/Census_HOLC_Research/" + "main/2010_Census_Tracts/holc_tract_lookup.csv" + ) + self.MAPPING_INEQUALITY_CSV = self.TMP_PATH / "holc_tract_lookup.csv" + self.CSV_PATH = self.DATA_PATH / "dataset" / "mapping_inequality" + + self.HOLC_MANUAL_MAPPING_CSV_PATH = ( + pathlib.Path(__file__).parent + / "data" + / "holc_grades_manually_mapped.csv" + ) + + # Some input field names. From documentation: 'Census Tracts were intersected + # with HOLC Polygons. Census information can be joined via the "geoid" field. + # There are two field "holc_prop" and "tract_prop" which give the proportion + # of the HOLC polygon in the Census Tract and the proportion of Census Tract + # in the HOLC Polygon respectively.' + # https://github.com/americanpanorama/Census_HOLC_Research/blob/main/2010_Census_Tracts/README.md + self.TRACT_INPUT_FIELD: str = "geoid" + self.TRACT_PROPORTION_FIELD: str = "tract_prop" + self.HOLC_GRADE_AND_ID_FIELD: str = "holc_id" + self.CITY_INPUT_FIELD: str = "city" + + self.HOLC_GRADE_D_FIELD: str = "HOLC Grade D" + self.HOLC_GRADE_MANUAL_FIELD: str = "HOLC Grade (manually mapped)" + self.HOLC_GRADE_DERIVED_FIELD: str = "HOLC Grade (derived)" + + self.COLUMNS_TO_KEEP = [ + self.GEOID_TRACT_FIELD_NAME, + field_names.HOLC_GRADE_D_TRACT_PERCENT_FIELD, + field_names.HOLC_GRADE_D_TRACT_20_PERCENT_FIELD, + field_names.HOLC_GRADE_D_TRACT_50_PERCENT_FIELD, + field_names.HOLC_GRADE_D_TRACT_75_PERCENT_FIELD, + ] + + self.df: pd.DataFrame + + def extract(self) -> None: + logger.info("Downloading Mapping Inequality Data") + download_file_from_url( + file_url=self.MAPPING_INEQUALITY_CSV_URL, + download_file_name=self.MAPPING_INEQUALITY_CSV, + ) + + def transform(self) -> None: + logger.info("Transforming Mapping Inequality Data") + df: pd.DataFrame = pd.read_csv( + self.MAPPING_INEQUALITY_CSV, + dtype={self.TRACT_INPUT_FIELD: "string"}, + low_memory=False, + ) + + # rename Tract ID + df.rename( + columns={ + self.TRACT_INPUT_FIELD: self.GEOID_TRACT_FIELD_NAME, + }, + inplace=True, + ) + + # Keep the first character, which is the HOLC grade (A, B, C, D). + # TODO: investigate why this dataframe triggers these pylint errors. + # pylint: disable=unsupported-assignment-operation, unsubscriptable-object + df[self.HOLC_GRADE_DERIVED_FIELD] = df[ + self.HOLC_GRADE_AND_ID_FIELD + ].str[0:1] + + # Remove nonsense when the field has no grade or invalid grades. + valid_grades = ["A", "B", "C", "D"] + df.loc[ + # pylint: disable=unsubscriptable-object + ~df[self.HOLC_GRADE_DERIVED_FIELD].isin(valid_grades), + self.HOLC_GRADE_DERIVED_FIELD, + ] = None + + # Some data needs to be manually mapped to its grade. + # TODO: Investigate more data that may need to be manually mapped. + holc_manually_mapped_df = pd.read_csv( + filepath_or_buffer=self.HOLC_MANUAL_MAPPING_CSV_PATH, + low_memory=False, + ) + + # Join on the existing data + merged_df = df.merge( + right=holc_manually_mapped_df, + on=[self.HOLC_GRADE_AND_ID_FIELD, self.CITY_INPUT_FIELD], + how="left", + ) + + # Create a single field that combines the 'derived' grade D field with the + # manually mapped grade D field into a single grade D field. + merged_df[self.HOLC_GRADE_D_FIELD] = np.where( + (merged_df[self.HOLC_GRADE_DERIVED_FIELD] == "D") + | (merged_df[self.HOLC_GRADE_MANUAL_FIELD] == "D"), + True, + None, + ) + + # Start grouping by, to sum all of the grade D parts of each tract. 
+ grouped_df = ( + merged_df.groupby( + by=[ + self.GEOID_TRACT_FIELD_NAME, + self.HOLC_GRADE_D_FIELD, + ], + # Keep the nulls, so we know the non-D proportion. + dropna=False, + )[self.TRACT_PROPORTION_FIELD] + .sum() + .reset_index() + ) + + # Create a field that is only the percent that is grade D. + grouped_df[field_names.HOLC_GRADE_D_TRACT_PERCENT_FIELD] = np.where( + grouped_df[self.HOLC_GRADE_D_FIELD], + grouped_df[self.TRACT_PROPORTION_FIELD], + 0, + ) + + # Calculate some specific threshold cutoffs, for convenience. + grouped_df[field_names.HOLC_GRADE_D_TRACT_20_PERCENT_FIELD] = ( + grouped_df[field_names.HOLC_GRADE_D_TRACT_PERCENT_FIELD] > 0.2 + ) + grouped_df[field_names.HOLC_GRADE_D_TRACT_50_PERCENT_FIELD] = ( + grouped_df[field_names.HOLC_GRADE_D_TRACT_PERCENT_FIELD] > 0.5 + ) + grouped_df[field_names.HOLC_GRADE_D_TRACT_75_PERCENT_FIELD] = ( + grouped_df[field_names.HOLC_GRADE_D_TRACT_PERCENT_FIELD] > 0.75 + ) + + # Drop the non-True values of `self.HOLC_GRADE_D_FIELD` -- we only + # want one row per tract for future joins. + # Note this means not all tracts will be in this data. + # Note: this singleton comparison warning may be a pylint bug: + # https://stackoverflow.com/questions/51657715/pylint-pandas-comparison-to-true-should-be-just-expr-or-expr-is-true-sin#comment90876517_51657715 + # pylint: disable=singleton-comparison + grouped_df = grouped_df[ + grouped_df[self.HOLC_GRADE_D_FIELD] == True # noqa: E712 + ] + + # Sort for convenience. + grouped_df.sort_values(by=self.GEOID_TRACT_FIELD_NAME, inplace=True) + + # Save to self. + self.df = grouped_df + + def load(self) -> None: + logger.info("Saving Mapping Inequality CSV") + # write nationwide csv + self.CSV_PATH.mkdir(parents=True, exist_ok=True) + self.df[self.COLUMNS_TO_KEEP].to_csv( + self.CSV_PATH / "usa.csv", index=False + ) diff --git a/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb b/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb index 6930a8cb..5903550e 100644 --- a/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb +++ b/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb @@ -3,6 +3,7 @@ { "cell_type": "code", "execution_count": null, + "id": "71c4acd0", "metadata": { "scrolled": true }, @@ -48,6 +49,7 @@ { "cell_type": "code", "execution_count": null, + "id": "2ce3170c", "metadata": { "scrolled": true }, @@ -79,6 +81,7 @@ { "cell_type": "code", "execution_count": null, + "id": "8bd39090", "metadata": { "scrolled": true }, @@ -105,6 +108,7 @@ { "cell_type": "code", "execution_count": null, + "id": "a251a0fb", "metadata": {}, "outputs": [], "source": [ @@ -138,6 +142,7 @@ { "cell_type": "code", "execution_count": null, + "id": "e43a9e23", "metadata": {}, "outputs": [], "source": [ @@ -160,6 +165,7 @@ { "cell_type": "code", "execution_count": null, + "id": "38c0dc2f", "metadata": { "scrolled": false }, @@ -186,8 +192,9 @@ { "cell_type": "code", "execution_count": null, + "id": "8c3e462c", "metadata": { - "scrolled": false + "scrolled": true }, "outputs": [], "source": [ @@ -215,6 +222,7 @@ { "cell_type": "code", "execution_count": null, + "id": "d8ec43dc", "metadata": {}, "outputs": [], "source": [ @@ -247,13 +255,43 @@ { "cell_type": "code", "execution_count": null, + "id": "81826d29", + "metadata": {}, + "outputs": [], + "source": [ + "# Load mapping inequality data\n", + "HOLC_FACTORS = [\n", + " field_names.HOLC_GRADE_D_TRACT_20_PERCENT_FIELD,\n", + " field_names.HOLC_GRADE_D_TRACT_50_PERCENT_FIELD,\n", + " 
field_names.HOLC_GRADE_D_TRACT_75_PERCENT_FIELD,\n", + "]\n", + "mapping_inequality_path = (\n", + " DATA_DIR / \"dataset\" / \"mapping_inequality\" / \"usa.csv\"\n", + ")\n", + "mapping_inequality_df = pd.read_csv(\n", + " mapping_inequality_path,\n", + " dtype={ExtractTransformLoad.GEOID_TRACT_FIELD_NAME: \"string\"},\n", + ")\n", + "\n", + "mapping_inequality_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "65659c26", "metadata": { - "scrolled": false + "scrolled": true }, "outputs": [], "source": [ "# Join all dataframes that use tracts\n", - "census_tract_dfs = [cejst_df, calenviroscreen_df, persistent_poverty_df]\n", + "census_tract_dfs = [\n", + " cejst_df,\n", + " calenviroscreen_df,\n", + " persistent_poverty_df,\n", + " mapping_inequality_df,\n", + "]\n", "\n", "merged_df = functools.reduce(\n", " lambda left, right: pd.merge(\n", @@ -281,6 +319,23 @@ { "cell_type": "code", "execution_count": null, + "id": "2de78f71", + "metadata": {}, + "outputs": [], + "source": [ + "# Special handling for HOLC.\n", + "# Fill in the null HOLC values as `False`. Otherwise the comparison tool will not run comparisons in states\n", + "# without HOLC scores, and for HOLC, we'd like to see it across the whole US.\n", + "for holc_factor in HOLC_FACTORS:\n", + " merged_df[holc_factor] = merged_df[holc_factor].fillna(False)\n", + "\n", + "merged_df[HOLC_FACTORS].head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "980c0f66", "metadata": { "scrolled": true }, @@ -377,6 +432,16 @@ " other_census_tract_fields_to_keep=[],\n", " ),\n", " ]\n", + " # Insert indices for each of the HOLC factors.\n", + " # Note: since these involve no renaming, we write them using list comprehension.\n", + " + [\n", + " Index(\n", + " method_name=factor,\n", + " priority_communities_field=factor,\n", + " other_census_tract_fields_to_keep=[],\n", + " )\n", + " for factor in HOLC_FACTORS\n", + " ]\n", ")\n", "\n", "\n", @@ -429,6 +494,7 @@ { "cell_type": "code", "execution_count": null, + "id": "4b510cb1", "metadata": { "scrolled": true }, @@ -711,6 +777,7 @@ { "cell_type": "code", "execution_count": null, + "id": "2bcbcabf", "metadata": {}, "outputs": [], "source": [ @@ -816,6 +883,7 @@ { "cell_type": "code", "execution_count": null, + "id": "d1eec560", "metadata": { "scrolled": true }, @@ -1014,6 +1082,7 @@ { "cell_type": "code", "execution_count": null, + "id": "48005fad", "metadata": { "scrolled": true }, @@ -1190,6 +1259,7 @@ { "cell_type": "code", "execution_count": null, + "id": "7d095ebd", "metadata": {}, "outputs": [], "source": [ diff --git a/data/data-pipeline/data_pipeline/score/field_names.py b/data/data-pipeline/data_pipeline/score/field_names.py index 1c845c49..8c462952 100644 --- a/data/data-pipeline/data_pipeline/score/field_names.py +++ b/data/data-pipeline/data_pipeline/score/field_names.py @@ -126,7 +126,9 @@ CENSUS_DECENNIAL_POVERTY_LESS_THAN_100_FPL_FIELD_2009 = ( "Percentage households below 100% of federal poverty line in 2009" ) CENSUS_DECENNIAL_HIGH_SCHOOL_ED_FIELD_2009 = "Percent individuals age 25 or over with less than high school degree in 2009" -CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009 = "Unemployed civilians (percent) in 2009" +CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009 = ( + "Unemployed civilians (percent) in 2009" +) CENSUS_DECENNIAL_TOTAL_POPULATION_FIELD_2009 = "Total population in 2009" # Fields from 2010 ACS (loaded for comparison with the territories) @@ -188,3 +190,9 @@ EJSCREEN_AREAS_OF_CONCERN_STATE_90TH_PERCENTILE_COMMUNITIES_FIELD = ( 
EJSCREEN_AREAS_OF_CONCERN_STATE_95TH_PERCENTILE_COMMUNITIES_FIELD = ( "EJSCREEN Areas of Concern, State, 95th percentile (communities)" ) + +# Mapping inequality data. +HOLC_GRADE_D_TRACT_PERCENT_FIELD: str = "Percent of tract that is HOLC Grade D" +HOLC_GRADE_D_TRACT_20_PERCENT_FIELD: str = "Tract is >20% HOLC Grade D" +HOLC_GRADE_D_TRACT_50_PERCENT_FIELD: str = "Tract is >50% HOLC Grade D" +HOLC_GRADE_D_TRACT_75_PERCENT_FIELD: str = "Tract is >75% HOLC Grade D"
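
For anyone who wants to exercise the new dataset in isolation, here is a minimal sketch (not part of this diff) of running the ETL class directly. It assumes the data_pipeline package is installed and its usual TMP_PATH / DATA_PATH directories are writable; these are the same extract/transform/load steps the pipeline runner triggers via the new "mapping_inequality" entry in DATASET_LIST.

# Minimal sketch -- assumes data_pipeline is installed and its standard
# temporary and dataset directories exist; not part of this change.
from data_pipeline.etl.sources.mapping_inequality.etl import MappingInequalityETL

etl = MappingInequalityETL()
etl.extract()    # downloads holc_tract_lookup.csv into TMP_PATH
etl.transform()  # derives the HOLC grade D tract percent and threshold fields
etl.load()       # writes DATA_PATH/dataset/mapping_inequality/usa.csv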
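The core of transform() is the proportion roll-up: sum the tract proportions covered by grade-D HOLC polygons for each tract, then flag tracts above the 20%/50%/75% cutoffs. A toy illustration follows, using made-up tract IDs, proportions, and hypothetical column names (not the pipeline's), and a simplified filter-then-sum in place of the groupby with dropna=False used above.

import pandas as pd

# Made-up example rows: proportion of each tract covered by individual HOLC polygons.
toy = pd.DataFrame(
    {
        "tract_id": ["0100102", "0100102", "0100102", "0100203"],
        "is_grade_d": [True, True, None, None],
        "tract_prop": [0.15, 0.40, 0.45, 1.00],
    }
)

# Sum grade-D proportions per tract (simplified version of the groupby/sum in transform()).
grade_d_share = (
    toy[toy["is_grade_d"] == True]  # noqa: E712
    .groupby("tract_id")["tract_prop"]
    .sum()
)

print(grade_d_share)         # tract 0100102 -> 0.55
print(grade_d_share > 0.50)  # tract 0100102 -> True (would set the >50% flag)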