diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score.py b/data/data-pipeline/data_pipeline/etl/score/etl_score.py index 34803ea4..086697b1 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py @@ -276,7 +276,7 @@ class ScoreETL(ExtractTransformLoad): field_names.LIFE_EXPECTANCY_FIELD, field_names.ENERGY_BURDEN_FIELD, field_names.FEMA_RISK_FIELD, - field_names.URBAN_HERUISTIC_FIELD, + field_names.URBAN_HEURISTIC_FIELD, field_names.AIR_TOXICS_CANCER_RISK_FIELD, field_names.RESPITORY_HAZARD_FIELD, field_names.DIESEL_FIELD, diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py b/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py index 075dd072..fa786510 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py @@ -110,7 +110,7 @@ class PostScoreETL(ExtractTransformLoad): new_df_copy = new_df.rename( columns={"USPS": "State Abbreviation", "NAME": "County Name"}, - inplace=False + inplace=False, ) return new_df_copy diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py index 81ab218f..b22fdf3a 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py @@ -97,7 +97,6 @@ class CensusACSETL(ExtractTransformLoad): f"Could not download data for state/territory with FIPS code {fips}" ) - self.df = pd.concat(dfs) self.df[self.GEOID_TRACT_FIELD_NAME] = self.df.index.to_series().apply( diff --git a/data/data-pipeline/data_pipeline/etl/sources/ejscreen_areas_of_concern/etl.py b/data/data-pipeline/data_pipeline/etl/sources/ejscreen_areas_of_concern/etl.py index 0bb036d5..57749cc2 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/ejscreen_areas_of_concern/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/ejscreen_areas_of_concern/etl.py @@ -32,7 +32,7 @@ class EJSCREENAreasOfConcernETL(ExtractTransformLoad): To enable the ETL code for EJSCREEN AoCs to run appropriately whether or not the person running it has access to that data, the following method checks whether the source file exists. - If it does exist, code can and should include to this data. If it does not exist, code should + If it does exist, code can and should include this data. If it does not exist, code should not reference this data. """ diff --git a/data/data-pipeline/data_pipeline/etl/sources/geocorr/etl.py b/data/data-pipeline/data_pipeline/etl/sources/geocorr/etl.py index c9c45993..06233b44 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/geocorr/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/geocorr/etl.py @@ -17,7 +17,7 @@ class GeoCorrETL(ExtractTransformLoad): # Need to change hyperlink to S3 self.GEOCORR_PLACES_URL = "https://justice40-data.s3.amazonaws.com/data-sources/geocorr_urban_rural.csv.zip" self.GEOCORR_GEOID_FIELD_NAME = "GEOID10_TRACT" - self.URBAN_HERUISTIC_FIELD_NAME = "Urban Heuristic Flag" + self.URBAN_HEURISTIC_FIELD_NAME = "Urban Heuristic Flag" self.df: pd.DataFrame @@ -47,7 +47,7 @@ class GeoCorrETL(ExtractTransformLoad): self.df.rename( columns={ - "urban_heuristic_flag": self.URBAN_HERUISTIC_FIELD_NAME, + "urban_heuristic_flag": self.URBAN_HEURISTIC_FIELD_NAME, }, inplace=True, ) diff --git a/data/data-pipeline/data_pipeline/etl/sources/persistent_poverty/etl.py b/data/data-pipeline/data_pipeline/etl/sources/persistent_poverty/etl.py index 24e2df18..17d398dc 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/persistent_poverty/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/persistent_poverty/etl.py @@ -26,7 +26,7 @@ class PersistentPovertyETL(ExtractTransformLoad): # self.GEOCORR_PLACES_URL = "https://justice40-data.s3.amazonaws.com/data-sources/persistent_poverty_urban_rural.csv.zip" self.GEOID_TRACT_INPUT_FIELD_NAME_1 = "TRTID10" self.GEOID_TRACT_INPUT_FIELD_NAME_2 = "tractid" - # self.URBAN_HERUISTIC_FIELD_NAME = "Urban Heuristic Flag" + # self.URBAN_HEURISTIC_FIELD_NAME = "Urban Heuristic Flag" self.POVERTY_PREFIX = "Individuals in Poverty (percent)" self.PERSISTENT_POVERTY_FIELD = "Persistent Poverty Census Tract" diff --git a/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb b/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb index ac34f0fc..12c4d518 100644 --- a/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb +++ b/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb @@ -33,11 +33,15 @@ " sys.path.append(module_path)\n", "\n", "from data_pipeline.utils import remove_all_from_dir, get_excel_column_name\n", + "from data_pipeline.etl.base import ExtractTransformLoad\n", "from data_pipeline.etl.sources.census.etl_utils import get_state_information\n", "from data_pipeline.etl.sources.ejscreen_areas_of_concern.etl import (\n", " EJSCREENAreasOfConcernETL,\n", ")\n", "\n", + "\n", + "from data_pipeline.score import field_names\n", + "\n", "# Turn on TQDM for pandas so that we can have progress bars when running `apply`.\n", "tqdm_notebook.pandas()" ] @@ -67,26 +71,8 @@ "\n", "# Name fields using variables. (This makes it easy to reference the same fields frequently without using strings\n", "# and introducing the risk of misspelling the field name.)\n", - "\n", - "GEOID_FIELD_NAME = \"GEOID10\"\n", - "GEOID_TRACT_FIELD_NAME = \"GEOID10_TRACT\"\n", "GEOID_STATE_FIELD_NAME = \"GEOID10_STATE\"\n", "COUNTRY_FIELD_NAME = \"Country\"\n", - "CENSUS_BLOCK_GROUP_POPULATION_FIELD = \"Total population\"\n", - "URBAN_HEURISTIC_FIELD = \"Urban Heuristic Flag\"\n", - "\n", - "CEJST_SCORE_FIELD = \"cejst_score\"\n", - "CEJST_PERCENTILE_FIELD = \"cejst_percentile\"\n", - "CEJST_PRIORITY_COMMUNITY_FIELD = \"cejst_priority_community\"\n", - "\n", - "LIFE_EXPECTANCY_FIELD = \"Life expectancy (years)\"\n", - "HEALTH_INSURANCE_FIELD = (\n", - " \"Current lack of health insurance among adults aged 18-64 years\"\n", - ")\n", - "BAD_HEALTH_FIELD = (\n", - " \"Physical health not good for >=14 days among adults aged >=18 years\"\n", - ")\n", - "MEDIAN_HOUSE_VALUE_FIELD = \"Median value ($) of owner-occupied housing units\"\n", "\n", "# Define some suffixes\n", "POPULATION_SUFFIX = \" (priority population)\"" @@ -103,17 +89,17 @@ "source": [ "# Load CEJST score data\n", "cejst_data_path = DATA_DIR / \"score\" / \"csv\" / \"full\" / \"usa.csv\"\n", - "cejst_df = pd.read_csv(cejst_data_path, dtype={GEOID_FIELD_NAME: \"string\"})\n", - "\n", - "# Create the CBG's Census Tract ID by dropping the last number from the FIPS CODE of the CBG.\n", - "# The CBG ID is the last one character.\n", - "# For more information, see https://www.census.gov/programs-surveys/geography/guidance/geo-identifiers.html.\n", - "cejst_df.loc[:, GEOID_TRACT_FIELD_NAME] = (\n", - " cejst_df.loc[:, GEOID_FIELD_NAME].astype(str).str[:-1]\n", + "cejst_df = pd.read_csv(\n", + " cejst_data_path,\n", + " dtype={ExtractTransformLoad.GEOID_TRACT_FIELD_NAME: \"string\"},\n", ")\n", "\n", + "# Create the state ID by taking the first two digits of the FIPS CODE of the tract.\n", + "# For more information, see https://www.census.gov/programs-surveys/geography/guidance/geo-identifiers.html.\n", "cejst_df.loc[:, GEOID_STATE_FIELD_NAME] = (\n", - " cejst_df.loc[:, GEOID_FIELD_NAME].astype(str).str[0:2]\n", + " cejst_df.loc[:, ExtractTransformLoad.GEOID_TRACT_FIELD_NAME]\n", + " .astype(str)\n", + " .str[0:2]\n", ")\n", "\n", "cejst_df.head()" @@ -130,6 +116,11 @@ "\n", "# Load EJ Screen Areas of Concern\n", "# Before attempting, check whether or not the EJSCREEN AoC data is available locally.\n", + "# Note: this data is provided privately and is not currently publicly available.\n", + "# To enable the ETL code for EJSCREEN AoCs to run appropriately whether or not the person\n", + "# running it has access to that data, `ejscreen_areas_of_concern_data_exists` checks whether the source file exists.\n", + "# If it does exist, code can and should include this data. If it does not exist, code should\n", + "# not reference this data.\n", "ejscreen_areas_of_concern_df: pd.DataFrame = None\n", "\n", "if EJSCREENAreasOfConcernETL.ejscreen_areas_of_concern_data_exists():\n", @@ -139,7 +130,7 @@ " )\n", " ejscreen_areas_of_concern_df = pd.read_csv(\n", " ejscreen_areas_of_concern_csv,\n", - " dtype={GEOID_FIELD_NAME: \"string\"},\n", + " dtype={ExtractTransformLoad.GEOID_FIELD_NAME: \"string\"},\n", " low_memory=False,\n", " )\n", "else:\n", @@ -159,9 +150,12 @@ "# Before attempting, check whether or not the EJSCREEN AoC data is available locally.\n", "if EJSCREENAreasOfConcernETL.ejscreen_areas_of_concern_data_exists():\n", " # If available, merge EJSCREEN AoC data into CBG dfs.\n", - " cejst_df = cejst_df.merge(\n", - " ejscreen_areas_of_concern_df, on=GEOID_FIELD_NAME, how=\"outer\"\n", - " )\n", + " # TODO: When we get AoC data at the tract level, fix this.\n", + " # Right now commenting this out to avoid merging CBG-level areas of concern on a tract-level CEJST definition.\n", + " # cejst_df = cejst_df.merge(\n", + " # ejscreen_areas_of_concern_df, on=GEOID_FIELD_NAME, how=\"outer\"\n", + " # )\n", + " pass\n", "else:\n", " pass\n", "\n", @@ -182,12 +176,7 @@ "quantile = 0.9\n", "\n", "for field in [\n", - " \"Linguistic isolation (percent)\",\n", - " \"Diesel particulate matter (percentile)\",\n", - " \"Particulate matter (PM2.5) (percentile)\",\n", - " \"Traffic proximity and volume (percentile)\",\n", - " \"Percent of individuals < 200% Federal Poverty Line (percentile)\",\n", - " MEDIAN_HOUSE_VALUE_FIELD,\n", + " field_names.MEDIAN_HOUSE_VALUE_FIELD,\n", "]:\n", " print(f\"\\n~~~~Analysis for field `{field}`~~~~\")\n", " print(cejst_df[field].describe())\n", @@ -218,7 +207,8 @@ " DATA_DIR / \"dataset\" / \"calenviroscreen4\" / \"data06.csv\"\n", ")\n", "calenviroscreen_df = pd.read_csv(\n", - " calenviroscreen_data_path, dtype={GEOID_TRACT_FIELD_NAME: \"string\"}\n", + " calenviroscreen_data_path,\n", + " dtype={ExtractTransformLoad.GEOID_TRACT_FIELD_NAME: \"string\"},\n", ")\n", "\n", "# Convert priority community field to a bool.\n", @@ -241,7 +231,8 @@ " DATA_DIR / \"dataset\" / \"persistent_poverty\" / \"usa.csv\"\n", ")\n", "persistent_poverty_df = pd.read_csv(\n", - " persistent_poverty_path, dtype={GEOID_TRACT_FIELD_NAME: \"string\"}\n", + " persistent_poverty_path,\n", + " dtype={ExtractTransformLoad.GEOID_TRACT_FIELD_NAME: \"string\"},\n", ")\n", "\n", "# Since \"Persistent Poverty Census Tract\" is labeled in both the score file (at the CBG level) and this tract file,\n", @@ -271,60 +262,38 @@ "outputs": [], "source": [ "# Join all dataframes that use tracts\n", - "census_tract_dfs = [calenviroscreen_df, persistent_poverty_df]\n", + "census_tract_dfs = [cejst_df, calenviroscreen_df, persistent_poverty_df]\n", "\n", - "census_tract_df = functools.reduce(\n", + "merged_df = functools.reduce(\n", " lambda left, right: pd.merge(\n", - " left=left, right=right, on=GEOID_TRACT_FIELD_NAME, how=\"outer\"\n", + " left=left,\n", + " right=right,\n", + " on=ExtractTransformLoad.GEOID_TRACT_FIELD_NAME,\n", + " how=\"outer\",\n", " ),\n", " census_tract_dfs,\n", ")\n", "\n", - "tract_values = census_tract_df[GEOID_TRACT_FIELD_NAME].str.len().unique()\n", + "tract_values = (\n", + " merged_df[ExtractTransformLoad.GEOID_TRACT_FIELD_NAME].str.len().unique()\n", + ")\n", "if any(tract_values != [11]):\n", " print(tract_values)\n", " raise ValueError(\"Some of the census tract data has the wrong length.\")\n", "\n", - "if len(census_tract_df) > 74134:\n", + "if len(merged_df) > ExtractTransformLoad.EXPECTED_MAX_CENSUS_TRACTS:\n", " raise ValueError(\"Too many rows in the join.\")\n", "\n", - "census_tract_df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8da016db", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "# Join tract indices and CEJST data.\n", - "# Note: we're joining on the census *tract*, so there will be multiple CBG entries joined to the same census tract row from CES,\n", - "# creating multiple rows of the same CES data.\n", - "merged_df = cejst_df.merge(\n", - " census_tract_df,\n", - " how=\"left\",\n", - " on=GEOID_TRACT_FIELD_NAME,\n", - ")\n", - "\n", - "if len(merged_df) > 220405:\n", - " raise ValueError(f\"Too many rows in the join: {len(merged_df)}\")\n", - "\n", - "merged_df.head()\n", - "\n", - "\n", - "# merged_df.to_csv(\n", - "# path_or_buf=COMPARISON_OUTPUTS_DIR / \"merged.csv\", na_rep=\"\", index=False\n", - "# )" + "merged_df.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "274f6bc6", - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "# Define a namedtuple for indices.\n", @@ -333,6 +302,7 @@ " field_names=[\n", " \"method_name\",\n", " \"priority_communities_field\",\n", + " # TODO: remove this field??\n", " # Note: this field only used by indices defined at the census tract level.\n", " \"other_census_tract_fields_to_keep\",\n", " ],\n", @@ -341,23 +311,23 @@ "# Define the indices used for CEJST scoring (`census_block_group_indices`) as well as comparison\n", "# (`census_tract_indices`).\n", "definition_l_factors = [\n", - " \"Climate Factor (Definition L)\",\n", - " \"Energy Factor (Definition L)\",\n", - " \"Transportation Factor (Definition L)\",\n", - " \"Housing Factor (Definition L)\",\n", - " \"Pollution Factor (Definition L)\",\n", - " \"Water Factor (Definition L)\",\n", - " \"Health Factor (Definition L)\",\n", - " \"Workforce Factor (Definition L)\",\n", + " field_names.L_CLIMATE,\n", + " field_names.L_ENERGY,\n", + " field_names.L_TRANSPORTATION,\n", + " field_names.L_HOUSING,\n", + " field_names.L_POLLUTION,\n", + " field_names.L_WATER,\n", + " field_names.L_HEALTH,\n", + " field_names.L_WORKFORCE,\n", " # Also include a combined factor for all the non-workforce elements.\n", - " \"Any Non-Workforce Factor (Definition L)\",\n", + " field_names.L_NON_WORKFORCE,\n", "]\n", "\n", - "census_block_group_indices = (\n", + "census_tract_indices = (\n", " [\n", " Index(\n", " method_name=\"Definition L\",\n", - " priority_communities_field=\"Definition L (communities)\",\n", + " priority_communities_field=field_names.SCORE_L_COMMUNITIES,\n", " other_census_tract_fields_to_keep=[],\n", " ),\n", " ]\n", @@ -375,7 +345,7 @@ " Index(\n", " # Note: we're renaming Score G as NMTC Modified for clarity, since that's what Score G is under the hood.\n", " method_name=\"NMTC Modified\",\n", - " priority_communities_field=\"Score G (communities)\",\n", + " priority_communities_field=field_names.SCORE_G_COMMUNITIES,\n", " other_census_tract_fields_to_keep=[],\n", " ),\n", " Index(\n", @@ -400,14 +370,22 @@ " ),\n", " Index(\n", " method_name=\"Score F\",\n", - " priority_communities_field=\"Score F (communities)\",\n", + " priority_communities_field=field_names.SCORE_F_COMMUNITIES,\n", " other_census_tract_fields_to_keep=[],\n", " ),\n", " Index(\n", - " method_name=\"Persistent Poverty (CBG)\",\n", - " priority_communities_field=PERSISTENT_POVERTY_CBG_LEVEL_FIELD,\n", + " method_name=\"CalEnviroScreen 4.0\",\n", + " priority_communities_field=\"calenviroscreen_priority_community\",\n", + " other_census_tract_fields_to_keep=[\n", + " CALENVIROSCREEN_SCORE_FIELD,\n", + " CALENVIROSCREEN_PERCENTILE_FIELD,\n", + " ],\n", + " ),\n", + " Index(\n", + " method_name=\"Persistent Poverty\",\n", + " priority_communities_field=PERSISTENT_POVERTY_TRACT_LEVEL_FIELD,\n", " other_census_tract_fields_to_keep=[],\n", - " )\n", + " ),\n", " ]\n", ")\n", "\n", @@ -415,17 +393,17 @@ "ejscreen_areas_of_concern_census_block_group_indices = [\n", " Index(\n", " method_name=\"EJSCREEN Areas of Concern, National, 80th percentile\",\n", - " priority_communities_field=\"EJSCREEN Areas of Concern, National, 80th percentile (communities)\",\n", + " priority_communities_field=field_names.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_80TH_PERCENTILE_COMMUNITIES_FIELD_NAME,\n", " other_census_tract_fields_to_keep=[],\n", " ),\n", " Index(\n", " method_name=\"EJSCREEN Areas of Concern, National, 90th percentile\",\n", - " priority_communities_field=\"EJSCREEN Areas of Concern, National, 90th percentile (communities)\",\n", + " priority_communities_field=field_names.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_90TH_PERCENTILE_COMMUNITIES_FIELD_NAME,\n", " other_census_tract_fields_to_keep=[],\n", " ),\n", " Index(\n", " method_name=\"EJSCREEN Areas of Concern, National, 95th percentile\",\n", - " priority_communities_field=\"EJSCREEN Areas of Concern, National, 95th percentile (communities)\",\n", + " priority_communities_field=field_names.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_95TH_PERCENTILE_COMMUNITIES_FIELD_NAME,\n", " other_census_tract_fields_to_keep=[],\n", " ),\n", "]\n", @@ -433,42 +411,28 @@ "# Before including EJSCREEN AoC indicators are included, check whether or not the EJSCREEN AoC data is available locally.\n", "if EJSCREENAreasOfConcernETL.ejscreen_areas_of_concern_data_exists():\n", " # Add EJSCREEN AoCs to all of the CBG indices.\n", - " census_block_group_indices.extend(\n", - " ejscreen_areas_of_concern_census_block_group_indices\n", - " )\n", + " # TODO: When we get AoC data at the tract level, fix this.\n", + " # Right now commenting this out to avoid merging CBG-level areas of concern on a tract-level CEJST definition.\n", + " # census_block_group_indices.extend(\n", + " # ejscreen_areas_of_concern_census_block_group_indices\n", + " # )\n", + " pass\n", "else:\n", " pass\n", "\n", - "census_tract_indices = [\n", - " Index(\n", - " method_name=\"CalEnviroScreen 4.0\",\n", - " priority_communities_field=\"calenviroscreen_priority_community\",\n", - " other_census_tract_fields_to_keep=[\n", - " CALENVIROSCREEN_SCORE_FIELD,\n", - " CALENVIROSCREEN_PERCENTILE_FIELD,\n", - " ],\n", - " ),\n", - " Index(\n", - " method_name=\"Persistent Poverty\",\n", - " priority_communities_field=PERSISTENT_POVERTY_TRACT_LEVEL_FIELD,\n", - " other_census_tract_fields_to_keep=[],\n", - " ),\n", - "]\n", - "\n", "# These fields will be used for statistical comparisons.\n", "comparison_fields = [\n", - " \"Percent of individuals < 100% Federal Poverty Line\",\n", - " \"Percent of individuals < 200% Federal Poverty Line\",\n", - " \"Median household income (% of AMI)\",\n", - " \"Percent of households in linguistic isolation\",\n", - " \"Percent individuals age 25 or over with less than high school degree\",\n", - " \"Linguistic isolation (percent)\",\n", - " \"Unemployed civilians (percent)\",\n", - " \"Median household income in the past 12 months\",\n", - " URBAN_HEURISTIC_FIELD,\n", - " LIFE_EXPECTANCY_FIELD,\n", - " HEALTH_INSURANCE_FIELD,\n", - " BAD_HEALTH_FIELD,\n", + " field_names.POVERTY_LESS_THAN_100_FPL_FIELD,\n", + " field_names.POVERTY_LESS_THAN_200_FPL_FIELD,\n", + " field_names.MEDIAN_INCOME_PERCENT_AMI_FIELD,\n", + " field_names.LINGUISTIC_ISO_FIELD,\n", + " field_names.UNEMPLOYMENT_FIELD,\n", + " field_names.HIGH_SCHOOL_ED_FIELD,\n", + " field_names.MEDIAN_INCOME_FIELD,\n", + " field_names.URBAN_HEURISTIC_FIELD,\n", + " field_names.LIFE_EXPECTANCY_FIELD,\n", + " field_names.HEALTH_INSURANCE_FIELD,\n", + " field_names.PHYS_HEALTH_NOT_GOOD_FIELD,\n", "]" ] }, @@ -477,7 +441,7 @@ "execution_count": null, "id": "bfae9cf5", "metadata": { - "scrolled": false + "scrolled": true }, "outputs": [], "source": [ @@ -491,10 +455,9 @@ " if df[priority_communities_field].dtype != bool:\n", " print(f\"Converting {priority_communities_field} to boolean.\")\n", "\n", - " # Calculate the population included as priority communities per CBG. Will either be 0 or the population.\n", + " # Calculate the population included as priority communities per tract. Will either be 0 or the population.\n", " df[f\"{priority_communities_field}{POPULATION_SUFFIX}\"] = (\n", - " df[priority_communities_field]\n", - " * df[CENSUS_BLOCK_GROUP_POPULATION_FIELD]\n", + " df[priority_communities_field] * df[field_names.TOTAL_POP_FIELD]\n", " )\n", "\n", " def calculate_state_comparison(\n", @@ -532,13 +495,11 @@ " summary_dict[\"division\"] = division_id\n", " summary_dict[\"Geography name\"] = division_id\n", "\n", - " total_cbgs_in_geography = len(frame)\n", - " total_population_in_geography = frame[\n", - " CENSUS_BLOCK_GROUP_POPULATION_FIELD\n", - " ].sum()\n", + " total_tracts_in_geography = len(frame)\n", + " total_population_in_geography = frame[field_names.TOTAL_POP_FIELD].sum()\n", "\n", - " if geography_field == URBAN_HEURISTIC_FIELD:\n", - " urban_flag = frame[URBAN_HEURISTIC_FIELD].unique()[0]\n", + " if geography_field == field_names.URBAN_HEURISTIC_FIELD:\n", + " urban_flag = frame[field_names.URBAN_HEURISTIC_FIELD].unique()[0]\n", " summary_dict[\"Urban vs Rural\"] = \"Urban\" if urban_flag else \"Rural\"\n", " summary_dict[\"Geography name\"] = summary_dict[\"Urban vs Rural\"]\n", "\n", @@ -547,14 +508,14 @@ " f\"{priority_communities_field}{POPULATION_SUFFIX}\"\n", " ] = frame[f\"{priority_communities_field}{POPULATION_SUFFIX}\"].sum()\n", "\n", - " summary_dict[f\"{priority_communities_field} (total CBGs)\"] = frame[\n", - " f\"{priority_communities_field}\"\n", - " ].sum()\n", + " summary_dict[\n", + " f\"{priority_communities_field} (total tracts)\"\n", + " ] = frame[f\"{priority_communities_field}\"].sum()\n", "\n", " # Calculate some combinations of other variables.\n", - " summary_dict[f\"{priority_communities_field} (percent CBGs)\"] = (\n", - " summary_dict[f\"{priority_communities_field} (total CBGs)\"]\n", - " / total_cbgs_in_geography\n", + " summary_dict[f\"{priority_communities_field} (percent tracts)\"] = (\n", + " summary_dict[f\"{priority_communities_field} (total tracts)\"]\n", + " / total_tracts_in_geography\n", " )\n", "\n", " summary_dict[\n", @@ -566,7 +527,7 @@ "\n", " unwanted_keys = [\n", " f\"{priority_communities_field}{POPULATION_SUFFIX}\",\n", - " f\"{priority_communities_field} (total CBGs)\",\n", + " f\"{priority_communities_field} (total tracts)\",\n", " ]\n", "\n", " # Remove unneeded columns:\n", @@ -621,12 +582,12 @@ " )\n", "\n", " # Next, run the comparison by urban/rural\n", - " urban_grouped_df = df.groupby(URBAN_HEURISTIC_FIELD)\n", + " urban_grouped_df = df.groupby(field_names.URBAN_HEURISTIC_FIELD)\n", "\n", " # Run the comparison function on the groups.\n", " urban_grouped_df = urban_grouped_df.progress_apply(\n", " lambda frame: calculate_state_comparison(\n", - " frame, geography_field=URBAN_HEURISTIC_FIELD\n", + " frame, geography_field=field_names.URBAN_HEURISTIC_FIELD\n", " )\n", " )\n", "\n", @@ -716,8 +677,7 @@ "\n", "\n", "fields_to_analyze = [\n", - " index.priority_communities_field\n", - " for index in census_block_group_indices + census_tract_indices\n", + " index.priority_communities_field for index in census_tract_indices\n", "]\n", "\n", "# Convert all indices to boolean\n", @@ -742,7 +702,7 @@ " priority_communities_fields=fields_to_analyze,\n", ")\n", "\n", - "file_prefix = \"Priority CBGs – Different geographic groupings\"\n", + "file_prefix = \"Priority Tracts – Different geographic groupings\"\n", "\n", "state_distribution_df.to_csv(\n", " path_or_buf=COMPARISON_OUTPUTS_DIR / f\"{file_prefix}.csv\",\n", @@ -753,13 +713,7 @@ "write_state_distribution_excel(\n", " state_distribution_df=state_distribution_df,\n", " file_path=COMPARISON_OUTPUTS_DIR / f\"{file_prefix}.xlsx\",\n", - ")\n", - "\n", - "# Note: this is helpful because this file is extremely long-running, so it alerts the user when the first step\n", - "# of data analysis is done. Can be removed when converted into scripts. -LMB.\n", - "import os\n", - "\n", - "os.system(\"say 'state analysis is written.'\")" + ")" ] }, { @@ -769,7 +723,7 @@ "metadata": {}, "outputs": [], "source": [ - "directory = COMPARISON_OUTPUTS_DIR / \"cbg_basic_stats\"\n", + "directory = COMPARISON_OUTPUTS_DIR / \"tracts_basic_stats\"\n", "directory.mkdir(parents=True, exist_ok=True)\n", "\n", "# TODO: this Excel-writing function is extremely similar to other Excel-writing functions in this notebook.\n", @@ -846,14 +800,13 @@ " writer.save()\n", "\n", "\n", - "for index in census_block_group_indices:\n", + "for index in census_tract_indices:\n", " print(f\"Basic stats for {index.method_name}\")\n", " temp_df = merged_df\n", " temp_df[index.priority_communities_field] = (\n", " temp_df[index.priority_communities_field] == True\n", " )\n", "\n", - " # print(sum(temp_df[\"is_a_priority_cbg\"]))\n", " grouped_df = (\n", " temp_df.groupby(index.priority_communities_field).mean().reset_index()\n", " )\n", @@ -878,34 +831,34 @@ }, "outputs": [], "source": [ - "# Compare CBG scores to each other, running secondary analysis on\n", - "# characteristics of CBGs prioritized by one but not the other.\n", - "def get_cbg_score_comparison_df(\n", + "# Compare census tract scores to each other, running secondary analysis on\n", + "# characteristics of census tracts prioritized by one but not the other.\n", + "def get_census_tracts_score_comparison_df(\n", " df: pd.DataFrame,\n", - " method_a_priority_census_block_groups_field: str,\n", - " method_b_priority_census_block_groups_field: str,\n", + " method_a_priority_census_tracts_field: str,\n", + " method_b_priority_census_tracts_field: str,\n", " comparison_fields: typing.List[str],\n", ") -> pd.DataFrame:\n", - " \"\"\"Compare CBG scores to each other.\n", + " \"\"\"Compare tract scores to each other.\n", "\n", - " This comparison method analyzes characteristics of those census block groups, based on whether or not they are prioritized\n", + " This comparison method analyzes characteristics of those census tracts, based on whether or not they are prioritized\n", " or not by Method A and/or Method B.\n", "\n", - " E.g., it might show that CBGs prioritized by A but not B have a higher average income,\n", - " or that CBGs prioritized by B but not A have a lower percent of unemployed people.\n", + " E.g., it might show that tracts prioritized by A but not B have a higher average income,\n", + " or that tracts prioritized by B but not A have a lower percent of unemployed people.\n", " \"\"\"\n", " df_subset = df[\n", " [\n", - " method_a_priority_census_block_groups_field,\n", - " method_b_priority_census_block_groups_field,\n", + " method_a_priority_census_tracts_field,\n", + " method_b_priority_census_tracts_field,\n", " ]\n", " + comparison_fields\n", " ]\n", "\n", " grouped_df = df_subset.groupby(\n", " [\n", - " method_a_priority_census_block_groups_field,\n", - " method_b_priority_census_block_groups_field,\n", + " method_a_priority_census_tracts_field,\n", + " method_b_priority_census_tracts_field,\n", " ],\n", " dropna=False,\n", " )\n", @@ -915,10 +868,10 @@ "\n", " criteria_description_field_name = \"Description of criteria\"\n", " comparison_df[criteria_description_field_name] = comparison_df.apply(\n", - " func=lambda row: f\"CBGs that are {'not' if row[method_a_priority_census_block_groups_field] is False else ''} \"\n", - " + f\"prioritized by {method_a_priority_census_block_groups_field} \"\n", - " + f\"and are {'not' if row[method_b_priority_census_block_groups_field] is False else ''} \"\n", - " + f\"prioritized by {method_b_priority_census_block_groups_field}\",\n", + " func=lambda row: f\"Tracts that are {'not ' if row[method_a_priority_census_tracts_field] is False else ''}\"\n", + " + f\"prioritized by {method_a_priority_census_tracts_field} \"\n", + " + f\"and are {'not ' if row[method_b_priority_census_tracts_field] is False else ''}\"\n", + " + f\"prioritized by {method_b_priority_census_tracts_field}\",\n", " axis=1,\n", " )\n", "\n", @@ -934,7 +887,7 @@ " # Rename fields to reflect the mean aggregation\n", " comparison_df.rename(\n", " mapper={\n", - " comparison_field: f\"{comparison_field} (mean of CBGs)\"\n", + " comparison_field: f\"{comparison_field} (mean of tracts)\"\n", " for comparison_field in comparison_fields\n", " },\n", " axis=1,\n", @@ -944,8 +897,9 @@ " return comparison_df\n", "\n", "\n", - "def write_cbg_score_comparison_excel(\n", - " cbg_score_comparison_df: pd.DataFrame, file_path: pathlib.PosixPath\n", + "def write_census_tracts_score_comparison_excel(\n", + " census_tracts_score_comparison_df: pd.DataFrame,\n", + " file_path: pathlib.PosixPath,\n", ") -> None:\n", " \"\"\"Write the dataframe to excel with special formatting.\"\"\"\n", " # Create a Pandas Excel writer using XlsxWriter as the engine.\n", @@ -953,22 +907,27 @@ "\n", " # Convert the dataframe to an XlsxWriter Excel object. We also turn off the\n", " # index column at the left of the output dataframe.\n", - " cbg_score_comparison_df.to_excel(writer, sheet_name=\"Sheet1\", index=False)\n", + " census_tracts_score_comparison_df.to_excel(\n", + " writer, sheet_name=\"Sheet1\", index=False\n", + " )\n", "\n", " # Get the xlsxwriter workbook and worksheet objects.\n", " workbook = writer.book\n", " worksheet = writer.sheets[\"Sheet1\"]\n", " worksheet.autofilter(\n", - " 0, 0, cbg_score_comparison_df.shape[0], cbg_score_comparison_df.shape[1]\n", + " 0,\n", + " 0,\n", + " census_tracts_score_comparison_df.shape[0],\n", + " census_tracts_score_comparison_df.shape[1],\n", " )\n", "\n", " # Set a width parameter for all columns\n", " # Note: this is parameterized because every call to `set_column` requires setting the width.\n", " column_width = 15\n", "\n", - " for column in cbg_score_comparison_df.columns:\n", + " for column in census_tracts_score_comparison_df.columns:\n", " # Turn the column index into excel ranges (e.g., column #95 is \"CR\" and the range may be \"CR2:CR53\").\n", - " column_index = cbg_score_comparison_df.columns.get_loc(column)\n", + " column_index = census_tracts_score_comparison_df.columns.get_loc(column)\n", " column_character = get_excel_column_name(column_index)\n", "\n", " # Set all columns to larger width\n", @@ -977,7 +936,7 @@ " )\n", "\n", " # Add green to red conditional formatting.\n", - " column_ranges = f\"{column_character}2:{column_character}{len(cbg_score_comparison_df)+1}\"\n", + " column_ranges = f\"{column_character}2:{column_character}{len(census_tracts_score_comparison_df)+1}\"\n", " worksheet.conditional_format(\n", " column_ranges,\n", " # Min: green, max: red.\n", @@ -1010,13 +969,15 @@ " # Overwrite both the value and the format of each header cell\n", " # This is because xlsxwriter / pandas has a known bug where it can't wrap text for a dataframe.\n", " # See https://stackoverflow.com/questions/42562977/xlsxwriter-text-wrap-not-working.\n", - " for col_num, value in enumerate(cbg_score_comparison_df.columns.values):\n", + " for col_num, value in enumerate(\n", + " census_tracts_score_comparison_df.columns.values\n", + " ):\n", " worksheet.write(0, col_num, value, header_format)\n", "\n", " writer.save()\n", "\n", "\n", - "def compare_cbg_scores(\n", + "def compare_census_tracts_scores(\n", " df: pd.DataFrame,\n", " index_a: Index,\n", " index_b: Index,\n", @@ -1024,39 +985,39 @@ " comparison_fields: typing.List[str],\n", "):\n", " # Secondary comparison DF\n", - " cbg_score_comparison_df = get_cbg_score_comparison_df(\n", + " census_tracts_score_comparison_df = get_census_tracts_score_comparison_df(\n", " df=df,\n", - " method_a_priority_census_block_groups_field=index_a.priority_communities_field,\n", - " method_b_priority_census_block_groups_field=index_b.priority_communities_field,\n", + " method_a_priority_census_tracts_field=index_a.priority_communities_field,\n", + " method_b_priority_census_tracts_field=index_b.priority_communities_field,\n", " comparison_fields=comparison_fields,\n", " )\n", "\n", " # Write secondary comparison to CSV.\n", - " file_name_part = f\"CBG Comparison Output - {index_a.method_name} and {index_b.method_name}\"\n", + " file_name_part = f\"Census tracts comparison output - {index_a.method_name} and {index_b.method_name}\"\n", " output_dir.mkdir(parents=True, exist_ok=True)\n", " file_path = output_dir / (file_name_part + \".csv\")\n", " file_path_xlsx = output_dir / (file_name_part + \".xlsx\")\n", "\n", - " cbg_score_comparison_df.to_csv(\n", + " census_tracts_score_comparison_df.to_csv(\n", " path_or_buf=file_path,\n", " na_rep=\"\",\n", " index=False,\n", " )\n", "\n", - " write_cbg_score_comparison_excel(\n", - " cbg_score_comparison_df=cbg_score_comparison_df,\n", + " write_census_tracts_score_comparison_excel(\n", + " census_tracts_score_comparison_df=census_tracts_score_comparison_df,\n", " file_path=file_path_xlsx,\n", " )\n", "\n", "\n", - "for (index_a, index_b) in itertools.combinations(census_block_group_indices, 2):\n", + "for (index_a, index_b) in itertools.combinations(census_tract_indices, 2):\n", " print(f\"Comparing {index_a} and {index_b}.\")\n", - " compare_cbg_scores(\n", + " compare_census_tracts_scores(\n", " df=merged_df,\n", " index_a=index_a,\n", " index_b=index_b,\n", " comparison_fields=comparison_fields,\n", - " output_dir=COMPARISON_OUTPUTS_DIR / \"cbg_score_comparisons\",\n", + " output_dir=COMPARISON_OUTPUTS_DIR / \"census_tracts_score_comparisons\",\n", " )" ] }, @@ -1094,35 +1055,44 @@ " return docx_file_path\n", "\n", "\n", - "def get_markdown_comparing_census_block_group_indices(\n", - " census_block_group_indices=typing.List[Index],\n", + "def get_markdown_comparing_census_tract_indices(\n", + " census_tract_indices=typing.List[Index],\n", " df=pd.DataFrame,\n", " state_field=GEOID_STATE_FIELD_NAME,\n", ") -> str:\n", - " \"\"\"Generate a Markdown string of analysis of multiple CBG indices.\"\"\"\n", - " count_field_name = \"Count of CBGs\"\n", - "\n", - " # List of all states/territories in their FIPS codes:\n", - " state_ids = sorted(df[state_field].unique())\n", - " state_names = \", \".join(\n", - " [us.states.lookup(state_id).name for state_id in state_ids]\n", - " )\n", + " \"\"\"Generate a Markdown string of analysis of multiple census tract indices.\"\"\"\n", + " count_field_name = \"Count of census tracts\"\n", "\n", " # Create markdown content for comparisons.\n", " markdown_content = f\"\"\"\n", - "# Comparing multiple indices at the census block group level\n", - " \n", + "# Comparing multiple indices at the census tract level\n", + "\n", "(This report was calculated on {datetime.today().strftime('%Y-%m-%d')}.)\n", "\n", - "This report compares the following indices: {\", \".join([index.method_name for index in census_block_group_indices])}.\n", - "\n", - "This report analyzes the following US states and territories: {state_names}.\n", + "This report compares the following indices: {\", \".join([index.method_name for index in census_tract_indices])}.\n", "\n", "\"\"\"\n", "\n", - " for (index1, index2) in itertools.combinations(\n", - " census_block_group_indices, 2\n", - " ):\n", + " for (index1, index2) in itertools.combinations(census_tract_indices, 2):\n", + " # First, find out geographic overlap in indices by finding all state and territory\n", + " # names where both indices are not null.\n", + " df_subset_for_states = df[\n", + " [\n", + " state_field,\n", + " index1.priority_communities_field,\n", + " index2.priority_communities_field,\n", + " ]\n", + " ]\n", + "\n", + " df_subset_for_states = df_subset_for_states.dropna()\n", + "\n", + " # List of all states/territories in their FIPS codes:\n", + " # TODO: move \"This report analyzes the following US states and territories\" inside the comparison?\n", + " state_ids = sorted(df_subset_for_states[state_field].unique())\n", + " this_comparison_state_names = \", \".join(\n", + " [us.states.lookup(state_id).name for state_id in state_ids]\n", + " )\n", + "\n", " # Group all data by their different values on Priority Communities Field for Index1 vs Priority Communities Field for Index2.\n", " count_df = (\n", " df.groupby(\n", @@ -1130,99 +1100,100 @@ " index1.priority_communities_field,\n", " index2.priority_communities_field,\n", " ]\n", - " )[GEOID_FIELD_NAME]\n", + " )[ExtractTransformLoad.GEOID_TRACT_FIELD_NAME]\n", " .count()\n", " .reset_index(name=count_field_name)\n", " )\n", "\n", - " total_cbgs = count_df[count_field_name].sum()\n", + " total_census_tracts = count_df[count_field_name].sum()\n", "\n", " # Returns a series\n", - " true_true_cbgs_series = count_df.loc[\n", + " true_true_census_tracts_series = count_df.loc[\n", " count_df[index1.priority_communities_field]\n", " & count_df[index2.priority_communities_field],\n", " count_field_name,\n", " ]\n", - " true_false_cbgs_series = count_df.loc[\n", + " true_false_census_tracts_series = count_df.loc[\n", " count_df[index1.priority_communities_field]\n", " & ~count_df[index2.priority_communities_field],\n", " count_field_name,\n", " ]\n", - " false_true_cbgs_series = count_df.loc[\n", + " false_true_census_tracts_series = count_df.loc[\n", " ~count_df[index1.priority_communities_field]\n", " & count_df[index2.priority_communities_field],\n", " count_field_name,\n", " ]\n", - " false_false_cbgs_series = count_df.loc[\n", + " false_false_census_tracts_series = count_df.loc[\n", " ~count_df[index1.priority_communities_field]\n", " & ~count_df[index2.priority_communities_field],\n", " count_field_name,\n", " ]\n", "\n", " # Convert from series to a scalar value, including accounting for if no data exists for that pairing.\n", - " true_true_cbgs = (\n", - " true_true_cbgs_series.iloc[0]\n", - " if len(true_true_cbgs_series) > 0\n", + " true_true_census_tracts = (\n", + " true_true_census_tracts_series.iloc[0]\n", + " if len(true_true_census_tracts_series) > 0\n", " else 0\n", " )\n", - " true_false_cbgs = (\n", - " true_false_cbgs_series.iloc[0]\n", - " if len(true_false_cbgs_series) > 0\n", + " true_false_census_tracts = (\n", + " true_false_census_tracts_series.iloc[0]\n", + " if len(true_false_census_tracts_series) > 0\n", " else 0\n", " )\n", - " false_true_cbgs = (\n", - " false_true_cbgs_series.iloc[0]\n", - " if len(false_true_cbgs_series) > 0\n", + " false_true_census_tracts = (\n", + " false_true_census_tracts_series.iloc[0]\n", + " if len(false_true_census_tracts_series) > 0\n", " else 0\n", " )\n", - " false_false_cbgs = (\n", - " false_false_cbgs_series.iloc[0]\n", - " if len(false_false_cbgs_series) > 0\n", + " false_false_census_tracts = (\n", + " false_false_census_tracts_series.iloc[0]\n", + " if len(false_false_census_tracts_series) > 0\n", " else 0\n", " )\n", "\n", " markdown_content += (\n", " \"*** \\n\\n\"\n", " \"There are \"\n", - " f\"{true_true_cbgs} ({true_true_cbgs / total_cbgs:.0%}) \"\n", - " f\"census block groups that are both {index1.method_name} priority communities and {index2.method_name} priority communities.\\n\\n\"\n", + " f\"{true_true_census_tracts} ({true_true_census_tracts / total_census_tracts:.0%}) \"\n", + " f\"census tracts that are both {index1.method_name} priority communities and {index2.method_name} priority communities.\\n\\n\"\n", " \"There are \"\n", - " f\"{true_false_cbgs} ({true_false_cbgs / total_cbgs:.0%}) \"\n", - " f\"census block groups that are {index1.method_name} priority communities but not {index2.method_name} priority communities.\\n\\n\"\n", + " f\"{true_false_census_tracts} ({true_false_census_tracts / total_census_tracts:.0%}) \"\n", + " f\"census tracts that are {index1.method_name} priority communities but not {index2.method_name} priority communities.\\n\\n\"\n", " \"There are \"\n", - " f\"{false_true_cbgs} ({false_true_cbgs / total_cbgs:.0%}) \"\n", - " f\"census block groups that are not {index1.method_name} priority communities but are {index2.method_name} priority communities.\\n\\n\"\n", + " f\"{false_true_census_tracts} ({false_true_census_tracts / total_census_tracts:.0%}) \"\n", + " f\"census tracts that are not {index1.method_name} priority communities but are {index2.method_name} priority communities.\\n\\n\"\n", " \"There are \"\n", - " f\"{false_false_cbgs} ({false_false_cbgs / total_cbgs:.0%}) \"\n", - " f\"census block groups that are neither {index1.method_name} priority communities nor {index2.method_name} priority communities.\\n\\n\"\n", + " f\"{false_false_census_tracts} ({false_false_census_tracts / total_census_tracts:.0%}) \"\n", + " f\"census tracts that are neither {index1.method_name} priority communities nor {index2.method_name} priority communities.\\n\\n\"\n", + " f\"This comparison analyzed the following US states and territories: {this_comparison_state_names}.\\n\\n\"\n", " \"\\n\\n\"\n", " )\n", "\n", " return markdown_content\n", "\n", "\n", - "def get_comparison_census_block_group_indices(\n", - " census_block_group_indices=typing.List[Index],\n", + "def get_comparison_census_tract_indices(\n", + " census_tract_indices=typing.List[Index],\n", " df=pd.DataFrame,\n", " state_field=GEOID_STATE_FIELD_NAME,\n", ") -> pathlib.PosixPath:\n", - " markdown_content = get_markdown_comparing_census_block_group_indices(\n", - " census_block_group_indices=census_block_group_indices,\n", - " df=merged_with_state_information_df,\n", + " markdown_content = get_markdown_comparing_census_tract_indices(\n", + " census_tract_indices=census_tract_indices,\n", + " df=df,\n", " )\n", "\n", " comparison_docx_file_path = write_markdown_and_docx_content(\n", " markdown_content=markdown_content,\n", " file_dir=COMPARISON_OUTPUTS_DIR,\n", - " file_name_without_extension=f\"Comparison report - All CBG indices\",\n", + " file_name_without_extension=f\"Comparison report - All census tract indices\",\n", " )\n", "\n", " return comparison_docx_file_path\n", "\n", "\n", - "# Compare multiple scores at the CBG level\n", - "get_comparison_census_block_group_indices(\n", - " census_block_group_indices=census_block_group_indices,\n", + "# Compare multiple scores at the census tract level\n", + "get_comparison_census_tract_indices(\n", + " census_tract_indices=census_tract_indices,\n", " df=merged_with_state_information_df,\n", ")" ] @@ -1230,478 +1201,15 @@ { "cell_type": "code", "execution_count": null, - "id": "4f44426c", + "id": "983abcea", "metadata": {}, "outputs": [], "source": [ - "# This cell defines a variety of comparison functions. It does not run them.\n", + "# Note: this is helpful because this file is long-running, so it alerts the user when the\n", + "# data analysis is done. Can be removed when converted into scripts. -LMB.\n", + "import os\n", "\n", - "# Define a namedtuple for column names, which need to be shared between multiple parts of this comparison pipeline.\n", - "# Named tuples are useful here because they provide guarantees that for each instance, all properties are defined and\n", - "# can be accessed as properties (rather than as strings).\n", - "\n", - "# Note: if you'd like to add a field used throughout the comparison process, add it in three places.\n", - "# For an example `new_field`,\n", - "# 1. in this namedtuple, add the field as a string in `field_names` (e.g., `field_names=[..., \"new_field\"])`)\n", - "# 2. in the function `get_comparison_field_names`, define how the field name should be created from input data\n", - "# (e.g., `...new_field=f\"New field compares {method_a_name} to {method_b_name}\")\n", - "# 3. In the function `get_comparison_markdown_content`, add some reporting on the new field to the markdown content.\n", - "# (e.g., `The statistics indicate that {calculation_based_on_new_field} percent of census tracts are different between scores.`)\n", - "ComparisonFieldNames = collections.namedtuple(\n", - " typename=\"ComparisonFieldNames\",\n", - " field_names=[\n", - " \"any_tract_has_at_least_one_method_a_cbg\",\n", - " \"method_b_tract_has_at_least_one_method_a_cbg\",\n", - " \"method_b_tract_has_100_percent_method_a_cbg\",\n", - " \"method_b_non_priority_tract_has_at_least_one_method_a_cbg\",\n", - " \"method_b_non_priority_tract_has_100_percent_method_a_cbg\",\n", - " ],\n", - ")\n", - "\n", - "\n", - "def get_comparison_field_names(\n", - " method_a_name: str,\n", - " method_b_name: str,\n", - ") -> ComparisonFieldNames:\n", - " comparison_field_names = ComparisonFieldNames(\n", - " any_tract_has_at_least_one_method_a_cbg=(\n", - " f\"Any tract has at least one {method_a_name} Priority CBG?\"\n", - " ),\n", - " method_b_tract_has_at_least_one_method_a_cbg=(\n", - " f\"{method_b_name} priority tract has at least one {method_a_name} CBG?\"\n", - " ),\n", - " method_b_tract_has_100_percent_method_a_cbg=(\n", - " f\"{method_b_name} tract has 100% {method_a_name} priority CBGs?\"\n", - " ),\n", - " method_b_non_priority_tract_has_at_least_one_method_a_cbg=(\n", - " f\"Non-priority {method_b_name} tract has at least one {method_a_name} priority CBG?\"\n", - " ),\n", - " method_b_non_priority_tract_has_100_percent_method_a_cbg=(\n", - " f\"Non-priority {method_b_name} tract has 100% {method_a_name} priority CBGs?\"\n", - " ),\n", - " )\n", - " return comparison_field_names\n", - "\n", - "\n", - "def get_df_with_only_shared_states(\n", - " df: pd.DataFrame,\n", - " field_a: str,\n", - " field_b: str,\n", - " state_field=GEOID_STATE_FIELD_NAME,\n", - ") -> pd.DataFrame:\n", - " \"\"\"\n", - " Useful for looking at shared geographies across two fields.\n", - "\n", - " For a data frame and two fields, return a data frame only for states where there are non-null\n", - " values for both fields in that state (or territory).\n", - "\n", - " This is useful, for example, when running a comparison of CalEnviroScreen (only in California) against\n", - " a draft score that's national, and returning only the data for California for the entire data frame.\n", - " \"\"\"\n", - " field_a_states = df.loc[df[field_a].notnull(), state_field].unique()\n", - " field_b_states = df.loc[df[field_b].notnull(), state_field].unique()\n", - "\n", - " shared_states = list(set(field_a_states) & set(field_b_states))\n", - "\n", - " df = df.loc[df[state_field].isin(shared_states), :]\n", - "\n", - " return df\n", - "\n", - "\n", - "def get_comparison_df(\n", - " df: pd.DataFrame,\n", - " method_a_priority_census_block_groups_field: str,\n", - " method_b_priority_census_tracts_field: str,\n", - " other_census_tract_fields_to_keep: typing.Optional[typing.List[str]],\n", - " comparison_field_names: ComparisonFieldNames,\n", - " output_dir: pathlib.PosixPath,\n", - ") -> None:\n", - " \"\"\"Produces a comparison report for any two given boolean columns representing priority fields.\n", - "\n", - " Args:\n", - " df: a pandas dataframe including the data for this comparison.\n", - " method_a_priority_census_block_groups_field: the name of a boolean column in `df`, such as the CEJST priority\n", - " community field that defines communities at the level of census block groups (CBGs).\n", - " method_b_priority_census_tracts_field: the name of a boolean column in `df`, such as the CalEnviroScreen priority\n", - " community field that defines communities at the level of census tracts.\n", - " other_census_tract_fields_to_keep (optional): a list of field names to preserve at the census tract level\n", - "\n", - " Returns:\n", - " df: a pandas dataframe with one row with the results of this comparison\n", - " \"\"\"\n", - "\n", - " def calculate_comparison(frame: pd.DataFrame) -> pd.DataFrame:\n", - " \"\"\"\n", - " This method will be applied to a `group_by` object.\n", - "\n", - " Note: It inherits from outer scope `method_a_priority_census_block_groups_field`, `method_b_priority_census_tracts_field`,\n", - " and `other_census_tract_fields_to_keep`.\n", - " \"\"\"\n", - " # Keep all the tract values at the Census Tract Level\n", - " for field in other_census_tract_fields_to_keep:\n", - " if len(frame[field].unique()) != 1:\n", - " raise ValueError(\n", - " f\"There are different values per CBG for field {field}.\"\n", - " \"`other_census_tract_fields_to_keep` can only be used for fields at the census tract level.\"\n", - " )\n", - "\n", - " df = frame.loc[\n", - " frame.index[0],\n", - " [\n", - " GEOID_TRACT_FIELD_NAME,\n", - " method_b_priority_census_tracts_field,\n", - " ]\n", - " + other_census_tract_fields_to_keep,\n", - " ]\n", - "\n", - " # Convenience constant for whether the tract is or is not a method B priority community.\n", - " is_a_method_b_priority_tract = frame.loc[\n", - " frame.index[0], [method_b_priority_census_tracts_field]\n", - " ][0]\n", - "\n", - " # Recall that NaN values are not falsy, so we need to check if `is_a_method_b_priority_tract` is True.\n", - " is_a_method_b_priority_tract = is_a_method_b_priority_tract is True\n", - "\n", - " # Calculate whether the tract (whether or not it is a comparison priority tract) includes CBGs that are priority\n", - " # according to the current CBG score.\n", - " df[comparison_field_names.any_tract_has_at_least_one_method_a_cbg] = (\n", - " frame.loc[:, method_a_priority_census_block_groups_field].sum() > 0\n", - " )\n", - "\n", - " # Calculate comparison\n", - " # A comparison priority tract has at least one CBG that is a priority CBG.\n", - " df[\n", - " comparison_field_names.method_b_tract_has_at_least_one_method_a_cbg\n", - " ] = (\n", - " frame.loc[:, method_a_priority_census_block_groups_field].sum() > 0\n", - " if is_a_method_b_priority_tract\n", - " else None\n", - " )\n", - "\n", - " # A comparison priority tract has all of its contained CBGs as CBG priority CBGs.\n", - " df[\n", - " comparison_field_names.method_b_tract_has_100_percent_method_a_cbg\n", - " ] = (\n", - " frame.loc[:, method_a_priority_census_block_groups_field].mean()\n", - " == 1\n", - " if is_a_method_b_priority_tract\n", - " else None\n", - " )\n", - "\n", - " # Calculate the inverse\n", - " # A tract that is _not_ a comparison priority has at least one CBG priority CBG.\n", - " df[\n", - " comparison_field_names.method_b_non_priority_tract_has_at_least_one_method_a_cbg\n", - " ] = (\n", - " frame.loc[:, method_a_priority_census_block_groups_field].sum() > 0\n", - " if not is_a_method_b_priority_tract\n", - " else None\n", - " )\n", - "\n", - " # A tract that is _not_ a comparison priority has all of its contained CBGs as CBG priority CBGs.\n", - " df[\n", - " comparison_field_names.method_b_non_priority_tract_has_100_percent_method_a_cbg\n", - " ] = (\n", - " frame.loc[:, method_a_priority_census_block_groups_field].mean()\n", - " == 1\n", - " if not is_a_method_b_priority_tract\n", - " else None\n", - " )\n", - "\n", - " # For all remaining fields, calculate the average\n", - " # TODO: refactor to vectorize to make faster.\n", - " for field in [\n", - " \"Poverty (Less than 200% of federal poverty line)\",\n", - " \"Percent of households in linguistic isolation\",\n", - " \"Percent individuals age 25 or over with less than high school degree\",\n", - " \"Unemployed civilians (percent)\",\n", - " ]:\n", - " df[f\"{field} (average of CBGs)\"] = frame.loc[:, field].mean()\n", - "\n", - " return df\n", - "\n", - " # Group all data by the census tract.\n", - " grouped_df = df.groupby(GEOID_TRACT_FIELD_NAME)\n", - "\n", - " # Run the comparison function on the groups.\n", - " comparison_df = grouped_df.progress_apply(calculate_comparison)\n", - "\n", - " return comparison_df\n", - "\n", - "\n", - "def get_comparison_markdown_content(\n", - " original_df: pd.DataFrame,\n", - " comparison_df: pd.DataFrame,\n", - " comparison_field_names: ComparisonFieldNames,\n", - " method_a_name: str,\n", - " method_b_name: str,\n", - " method_a_priority_census_block_groups_field: str,\n", - " method_b_priority_census_tracts_field: str,\n", - " state_field: str = GEOID_STATE_FIELD_NAME,\n", - ") -> str:\n", - " # Prepare some constants for use in the following Markdown content.\n", - " total_cbgs = len(original_df)\n", - "\n", - " # List of all states/territories in their FIPS codes:\n", - " state_ids = sorted(original_df[state_field].unique())\n", - " state_names = \", \".join(\n", - " [us.states.lookup(state_id).name for state_id in state_ids]\n", - " )\n", - "\n", - " # Note: using squeeze throughout do reduce result of `sum()` to a scalar.\n", - " # TODO: investigate why sums are sometimes series and sometimes scalar.\n", - " method_a_priority_cbgs = (\n", - " original_df.loc[:, method_a_priority_census_block_groups_field]\n", - " .sum()\n", - " .squeeze()\n", - " )\n", - " method_a_priority_cbgs_percent = (\n", - " f\"{method_a_priority_cbgs / total_cbgs:.0%}\"\n", - " )\n", - "\n", - " total_tracts_count = len(comparison_df)\n", - "\n", - " method_b_priority_tracts_count = comparison_df.loc[\n", - " :, method_b_priority_census_tracts_field\n", - " ].sum()\n", - "\n", - " method_b_priority_tracts_count_percent = (\n", - " f\"{method_b_priority_tracts_count / total_tracts_count:.0%}\"\n", - " )\n", - " method_b_non_priority_tracts_count = (\n", - " total_tracts_count - method_b_priority_tracts_count\n", - " )\n", - "\n", - " method_a_tracts_count = (\n", - " comparison_df.loc[\n", - " :, comparison_field_names.any_tract_has_at_least_one_method_a_cbg\n", - " ]\n", - " .sum()\n", - " .squeeze()\n", - " )\n", - " method_a_tracts_count_percent = (\n", - " f\"{method_a_tracts_count / total_tracts_count:.0%}\"\n", - " )\n", - "\n", - " # Method A priority community stats\n", - " method_b_tracts_with_at_least_one_method_a_cbg = comparison_df.loc[\n", - " :, comparison_field_names.method_b_tract_has_at_least_one_method_a_cbg\n", - " ].sum()\n", - " method_b_tracts_with_at_least_one_method_a_cbg_percent = f\"{method_b_tracts_with_at_least_one_method_a_cbg / method_b_priority_tracts_count:.0%}\"\n", - "\n", - " method_b_tracts_with_at_100_percent_method_a_cbg = comparison_df.loc[\n", - " :, comparison_field_names.method_b_tract_has_100_percent_method_a_cbg\n", - " ].sum()\n", - " method_b_tracts_with_at_100_percent_method_a_cbg_percent = f\"{method_b_tracts_with_at_100_percent_method_a_cbg / method_b_priority_tracts_count:.0%}\"\n", - "\n", - " # Method A non-priority community stats\n", - " method_b_non_priority_tracts_with_at_least_one_method_a_cbg = comparison_df.loc[\n", - " :,\n", - " comparison_field_names.method_b_non_priority_tract_has_at_least_one_method_a_cbg,\n", - " ].sum()\n", - "\n", - " method_b_non_priority_tracts_with_at_least_one_method_a_cbg_percent = f\"{method_b_non_priority_tracts_with_at_least_one_method_a_cbg / method_b_non_priority_tracts_count:.0%}\"\n", - "\n", - " method_b_non_priority_tracts_with_100_percent_method_a_cbg = comparison_df.loc[\n", - " :,\n", - " comparison_field_names.method_b_non_priority_tract_has_100_percent_method_a_cbg,\n", - " ].sum()\n", - " method_b_non_priority_tracts_with_100_percent_method_a_cbg_percent = f\"{method_b_non_priority_tracts_with_100_percent_method_a_cbg / method_b_non_priority_tracts_count:.0%}\"\n", - "\n", - " # Create markdown content for comparisons.\n", - " markdown_content = f\"\"\"\n", - "# {method_a_name} compared to {method_b_name}\n", - "\n", - "(This report was calculated on {datetime.today().strftime('%Y-%m-%d')}.)\n", - "\n", - "This report analyzes the following US states and territories: {state_names}.\n", - "\n", - "Recall that census tracts contain one or more census block groups, with up to nine census block groups per tract.\n", - "\n", - "Within the geographic area analyzed, there are {method_b_priority_tracts_count} census tracts designated as priority communities by {method_b_name}, out of {total_tracts_count} total tracts ({method_b_priority_tracts_count_percent}). \n", - "\n", - "Within the geographic region analyzed, there are {method_a_priority_cbgs} census block groups considered as priority communities by {method_a_name}, out of {total_cbgs} CBGs ({method_a_priority_cbgs_percent}). They occupy {method_a_tracts_count} census tracts ({method_a_tracts_count_percent}) of the geographic area analyzed.\n", - "\n", - "Out of every {method_b_name} priority census tract, {method_b_tracts_with_at_least_one_method_a_cbg} ({method_b_tracts_with_at_least_one_method_a_cbg_percent}) of these census tracts have at least one census block group within them that is considered a priority community by {method_a_name}.\n", - "\n", - "Out of every {method_b_name} priority census tract, {method_b_tracts_with_at_100_percent_method_a_cbg} ({method_b_tracts_with_at_100_percent_method_a_cbg_percent}) of these census tracts have 100% of the included census block groups within them considered priority communities by {method_a_name}.\n", - "\n", - "Out of every census tract that is __not__ marked as a priority community by {method_b_name}, {method_b_non_priority_tracts_with_at_least_one_method_a_cbg} ({method_b_non_priority_tracts_with_at_least_one_method_a_cbg_percent}) of these census tracts have at least one census block group within them that is considered a priority community by the current version of the CEJST score.\n", - "\n", - "Out of every census tract that is __not__ marked as a priority community by {method_b_name}, {method_b_non_priority_tracts_with_100_percent_method_a_cbg} ({method_b_non_priority_tracts_with_100_percent_method_a_cbg_percent}) of these census tracts have 100% of the included census block groups within them considered priority communities by the current version of the CEJST score.\n", - "\"\"\"\n", - "\n", - " return markdown_content\n", - "\n", - "\n", - "def get_secondary_comparison_df(\n", - " comparison_df: pd.DataFrame,\n", - " comparison_field_names: ComparisonFieldNames,\n", - " method_b_priority_census_tracts_field: str,\n", - ") -> pd.DataFrame:\n", - " \"\"\"A secondary level of comparison.\n", - "\n", - " The first level of comparison identifies census tracts prioritized by Method A,\n", - " compared to whether or not they're prioritized by Method B.\n", - "\n", - " This comparison method analyzes characteristics of those census tracts, based on whether or not they are prioritized\n", - " or not by Method A and/or Method B.\n", - "\n", - "\n", - " E.g., it might show that tracts prioritized by A but not B have a higher average income,\n", - " or that tracts prioritized by B but not A have a lower percent of unemployed people.\"\"\"\n", - " grouped_df = comparison_df.groupby(\n", - " [\n", - " method_b_priority_census_tracts_field,\n", - " comparison_field_names.method_b_tract_has_at_least_one_method_a_cbg,\n", - " comparison_field_names.method_b_non_priority_tract_has_at_least_one_method_a_cbg,\n", - " ],\n", - " dropna=False,\n", - " )\n", - "\n", - " # Run the comparison function on the groups.\n", - " secondary_comparison_df = grouped_df.mean().reset_index()\n", - "\n", - " return secondary_comparison_df\n", - "\n", - "\n", - "def execute_comparison(\n", - " df: pd.DataFrame,\n", - " method_a_name: str,\n", - " method_b_name: str,\n", - " method_a_priority_census_block_groups_field: str,\n", - " method_b_priority_census_tracts_field: str,\n", - " other_census_tract_fields_to_keep: typing.Optional[typing.List[str]],\n", - ") -> pathlib.PosixPath:\n", - " \"\"\"Execute an individual comparison by creating the data frame and writing the report.\n", - "\n", - " Args:\n", - " df: a pandas dataframe including the data for this comparison.\n", - " method_a_priority_census_block_groups_field: the name of a boolean column in `df`, such as the CEJST priority\n", - " community field that defines communities at the level of census block groups (CBGs).\n", - " method_b_priority_census_tracts_field: the name of a boolean column in `df`, such as the CalEnviroScreen priority\n", - " community field that defines communities at the level of census tracts.\n", - " other_census_tract_fields_to_keep (optional): a list of field names to preserve at the census tract level\n", - "\n", - " Returns:\n", - " df: a pandas dataframe with one row with the results of this comparison\n", - "\n", - " \"\"\"\n", - " comparison_field_names = get_comparison_field_names(\n", - " method_a_name=method_a_name, method_b_name=method_b_name\n", - " )\n", - "\n", - " # Create or use a directory for outputs grouped by Method A.\n", - " output_dir = COMPARISON_OUTPUTS_DIR / method_a_name\n", - " output_dir.mkdir(parents=True, exist_ok=True)\n", - "\n", - " df_with_only_shared_states = get_df_with_only_shared_states(\n", - " df=df,\n", - " field_a=method_a_priority_census_block_groups_field,\n", - " field_b=method_b_priority_census_tracts_field,\n", - " )\n", - "\n", - " comparison_df = get_comparison_df(\n", - " df=df_with_only_shared_states,\n", - " method_a_priority_census_block_groups_field=method_a_priority_census_block_groups_field,\n", - " method_b_priority_census_tracts_field=method_b_priority_census_tracts_field,\n", - " comparison_field_names=comparison_field_names,\n", - " other_census_tract_fields_to_keep=other_census_tract_fields_to_keep,\n", - " output_dir=output_dir,\n", - " )\n", - "\n", - " # Write comparison to CSV.\n", - " file_path = (\n", - " output_dir\n", - " / f\"Comparison Output - {method_a_name} and {method_b_name}.csv\"\n", - " )\n", - " comparison_df.to_csv(\n", - " path_or_buf=file_path,\n", - " na_rep=\"\",\n", - " index=False,\n", - " )\n", - "\n", - " # Secondary comparison DF\n", - " secondary_comparison_df = get_secondary_comparison_df(\n", - " comparison_df=comparison_df,\n", - " comparison_field_names=comparison_field_names,\n", - " method_b_priority_census_tracts_field=method_b_priority_census_tracts_field,\n", - " )\n", - "\n", - " # Write secondary comparison to CSV.\n", - " file_path = (\n", - " output_dir\n", - " / f\"Secondary Comparison Output - {method_a_name} and {method_b_name}.csv\"\n", - " )\n", - " secondary_comparison_df.to_csv(\n", - " path_or_buf=file_path,\n", - " na_rep=\"\",\n", - " index=False,\n", - " )\n", - "\n", - " markdown_content = get_comparison_markdown_content(\n", - " original_df=df_with_only_shared_states,\n", - " comparison_df=comparison_df,\n", - " comparison_field_names=comparison_field_names,\n", - " method_a_name=method_a_name,\n", - " method_b_name=method_b_name,\n", - " method_a_priority_census_block_groups_field=method_a_priority_census_block_groups_field,\n", - " method_b_priority_census_tracts_field=method_b_priority_census_tracts_field,\n", - " )\n", - "\n", - " comparison_docx_file_path = write_markdown_and_docx_content(\n", - " markdown_content=markdown_content,\n", - " file_dir=output_dir,\n", - " file_name_without_extension=f\"Comparison report - {method_a_name} and {method_b_name}\",\n", - " )\n", - "\n", - " return comparison_docx_file_path\n", - "\n", - "\n", - "def execute_comparisons(\n", - " df: pd.DataFrame,\n", - " census_block_group_indices: typing.List[Index],\n", - " census_tract_indices: typing.List[Index],\n", - "):\n", - " \"\"\"Create multiple comparison reports.\"\"\"\n", - " comparison_docx_file_paths = []\n", - " for cbg_index in census_block_group_indices:\n", - " for census_tract_index in census_tract_indices:\n", - " print(\n", - " f\"Running comparisons for {cbg_index.method_name} against {census_tract_index.method_name}...\"\n", - " )\n", - "\n", - " comparison_docx_file_path = execute_comparison(\n", - " df=df,\n", - " method_a_name=cbg_index.method_name,\n", - " method_b_name=census_tract_index.method_name,\n", - " method_a_priority_census_block_groups_field=cbg_index.priority_communities_field,\n", - " method_b_priority_census_tracts_field=census_tract_index.priority_communities_field,\n", - " other_census_tract_fields_to_keep=census_tract_index.other_census_tract_fields_to_keep,\n", - " )\n", - "\n", - " comparison_docx_file_paths.append(comparison_docx_file_path)\n", - "\n", - " return comparison_docx_file_paths" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "908e0ad4", - "metadata": { - "scrolled": false - }, - "outputs": [], - "source": [ - "# Actually execute the functions\n", - "file_paths = execute_comparisons(\n", - " df=merged_df,\n", - " census_block_group_indices=census_block_group_indices,\n", - " census_tract_indices=census_tract_indices,\n", - ")\n", - "\n", - "print(file_paths)" + "os.system(\"say 'data analysis is written.'\")" ] } ], @@ -1721,7 +1229,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.4" + "version": "3.9.6" } }, "nbformat": 4, diff --git a/data/data-pipeline/data_pipeline/score/field_names.py b/data/data-pipeline/data_pipeline/score/field_names.py index 14220883..caacaa6e 100644 --- a/data/data-pipeline/data_pipeline/score/field_names.py +++ b/data/data-pipeline/data_pipeline/score/field_names.py @@ -118,7 +118,7 @@ UNDER_5_FIELD = "Individuals under 5 years old" OVER_64_FIELD = "Individuals over 64 years old" # Urban Rural Map -URBAN_HERUISTIC_FIELD = "Urban Heuristic Flag" +URBAN_HEURISTIC_FIELD = "Urban Heuristic Flag" # Housing value MEDIAN_HOUSE_VALUE_FIELD = "Median value ($) of owner-occupied housing units"