mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-07-26 18:51:17 -07:00
Adding VA and CO ETL from mapping for environmental justice (#1177)
Adding the mapping for environmental justice data, which contains information about VA and CO, to the ETL pipeline.
This commit is contained in:
parent
1d399d3ca9
commit
6a00b29f5d
6 changed files with 209 additions and 115 deletions
|
@ -1,4 +1,9 @@
|
||||||
DATASET_LIST = [
|
DATASET_LIST = [
|
||||||
|
{
|
||||||
|
"name": "mapping_for_ej",
|
||||||
|
"module_dir": "mapping_for_ej",
|
||||||
|
"class_name": "MappingForEJETL",
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"name": "census_acs",
|
"name": "census_acs",
|
||||||
"module_dir": "census_acs",
|
"module_dir": "census_acs",
|
||||||
|
|
|
@ -0,0 +1,99 @@
|
||||||
|
import pandas as pd
|
||||||
|
import geopandas as gpd
|
||||||
|
|
||||||
|
from data_pipeline.etl.base import ExtractTransformLoad
|
||||||
|
from data_pipeline.utils import get_module_logger
|
||||||
|
from data_pipeline.score import field_names
|
||||||
|
from data_pipeline.config import settings
|
||||||
|
|
||||||
|
logger = get_module_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class MappingForEJETL(ExtractTransformLoad):
    """ETL for the Mapping for Environmental Justice dataset (VA and CO).

    Downloads one zipped shapefile per state, concatenates them into a single
    tract-level dataframe, computes a priority-community flag from the final
    percentile, and writes the selected columns out as ``co_va.csv``.
    """

    def __init__(self):
        # Output directory for the final CSV.
        self.CSV_PATH = self.DATA_PATH / "dataset" / "mapping_for_ej"

        # Per-state source archives hosted on the Justice40 data bucket.
        self.MAPPING_FOR_EJ_VA_URL = (
            settings.AWS_JUSTICE40_DATASOURCES_URL + "/VA_mej.zip"
        )
        self.MAPPING_FOR_EJ_CO_URL = (
            settings.AWS_JUSTICE40_DATASOURCES_URL + "/CO_mej.zip"
        )
        # Shapefiles extracted by `extract()` into the temporary directory.
        self.VA_SHP_FILE_PATH = self.TMP_PATH / "mej_virginia_7_1.shp"
        self.CO_SHP_FILE_PATH = self.TMP_PATH / "mej_colorado_final.shp"

        # Defining variables
        self.COLUMNS_TO_KEEP = [
            self.GEOID_TRACT_FIELD_NAME,
            field_names.MAPPING_FOR_EJ_FINAL_PERCENTILE_FIELD,
            field_names.MAPPING_FOR_EJ_FINAL_SCORE_FIELD,
            field_names.MAPPING_FOR_EJ_PRIORITY_COMMUNITY_FIELD,
        ]

        # Choosing constants.
        # In our current score implementation, about 17% of CO and 20% of VA tracts are
        # identified as disadvantaged. Consequently, the rank-based threshold is 20%.
        # Using the scores to calculate which are priority communities doesn't quite track
        # with this distribution, and so I've opted to choose roughly 20% of both states.
        self.MAPPING_FOR_EJ_PRIORITY_COMMUNITY_PERCENTILE_THRESHOLD = 80

        # Populated by `transform()`; declared here for type clarity.
        self.df: pd.DataFrame

    def extract(self) -> None:
        """Download and unzip the VA and CO archives into the temp directory."""
        logger.info("Downloading Mapping for EJ Data")
        super().extract(
            self.MAPPING_FOR_EJ_VA_URL,
            self.TMP_PATH,
        )
        super().extract(
            self.MAPPING_FOR_EJ_CO_URL,
            self.TMP_PATH,
        )

    def transform(self) -> None:
        """Concatenate the state shapefiles and derive the tract-level fields."""
        logger.info("Transforming Mapping for EJ Data")

        # Join (here, it's just concatenating) the two dataframes from
        # CO and VA
        self.df = pd.concat(
            [
                gpd.read_file(self.VA_SHP_FILE_PATH),
                gpd.read_file(self.CO_SHP_FILE_PATH),
            ]
        )

        # Fill Census tract to get it to be 11 digits, incl. leading 0s
        # Note that VA and CO should never have leading 0s, so this isn't
        # strictly necessary, but if in the future, there are more states
        # this seems like a reasonable thing to include.
        self.df[self.GEOID_TRACT_FIELD_NAME] = (
            self.df["fips_tract"].astype(str).str.zfill(11)
        )

        # Note that there are tracts in this dataset that do not have a final ranking
        # because they are missing data. I've retained them to be consistent with other ETLs.
        self.df = self.df.rename(
            columns={
                "fin_rank": field_names.MAPPING_FOR_EJ_FINAL_PERCENTILE_FIELD,
                "fin_score": field_names.MAPPING_FOR_EJ_FINAL_SCORE_FIELD,
            }
        )

        # Calculate prioritized communities based on percentile, only
        # for tracts that have complete data
        self.df[field_names.MAPPING_FOR_EJ_PRIORITY_COMMUNITY_FIELD] = (
            self.df[field_names.MAPPING_FOR_EJ_FINAL_PERCENTILE_FIELD]
            >= self.MAPPING_FOR_EJ_PRIORITY_COMMUNITY_PERCENTILE_THRESHOLD
        )

    def load(self) -> None:
        """Write the selected columns for CO and VA to the dataset CSV."""
        logger.info("Saving Mapping for EJ CSV")
        # write selected states csv
        self.CSV_PATH.mkdir(parents=True, exist_ok=True)
        self.df[self.COLUMNS_TO_KEEP].to_csv(
            self.CSV_PATH / "co_va.csv", index=False
        )

    def validate(self) -> None:
        """Validation hook; no checks are implemented yet, only a log line."""
        logger.info("Validating Mapping For EJ Data")
|
|
@ -43,6 +43,7 @@
|
||||||
"\n",
|
"\n",
|
||||||
"from data_pipeline.score import field_names\n",
|
"from data_pipeline.score import field_names\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
"%load_ext lab_black\n",
|
||||||
"# Turn on TQDM for pandas so that we can have progress bars when running `apply`.\n",
|
"# Turn on TQDM for pandas so that we can have progress bars when running `apply`.\n",
|
||||||
"tqdm_notebook.pandas()"
|
"tqdm_notebook.pandas()"
|
||||||
]
|
]
|
||||||
|
@ -101,9 +102,7 @@
|
||||||
"# Create the state ID by taking the first two digits of the FIPS CODE of the tract.\n",
|
"# Create the state ID by taking the first two digits of the FIPS CODE of the tract.\n",
|
||||||
"# For more information, see https://www.census.gov/programs-surveys/geography/guidance/geo-identifiers.html.\n",
|
"# For more information, see https://www.census.gov/programs-surveys/geography/guidance/geo-identifiers.html.\n",
|
||||||
"cejst_df.loc[:, GEOID_STATE_FIELD_NAME] = (\n",
|
"cejst_df.loc[:, GEOID_STATE_FIELD_NAME] = (\n",
|
||||||
" cejst_df.loc[:, ExtractTransformLoad.GEOID_TRACT_FIELD_NAME]\n",
|
" cejst_df.loc[:, ExtractTransformLoad.GEOID_TRACT_FIELD_NAME].astype(str).str[0:2]\n",
|
||||||
" .astype(str)\n",
|
|
||||||
" .str[0:2]\n",
|
|
||||||
")\n",
|
")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"cejst_df.head()"
|
"cejst_df.head()"
|
||||||
|
@ -113,9 +112,7 @@
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"id": "a251a0fb",
|
"id": "a251a0fb",
|
||||||
"metadata": {
|
"metadata": {},
|
||||||
"scrolled": false
|
|
||||||
},
|
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# Load EJSCREEN Areas of Concern data.\n",
|
"# Load EJSCREEN Areas of Concern data.\n",
|
||||||
|
@ -149,9 +146,7 @@
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"id": "e43a9e23",
|
"id": "e43a9e23",
|
||||||
"metadata": {
|
"metadata": {},
|
||||||
"scrolled": false
|
|
||||||
},
|
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# Merge EJSCREEN AoCs into CEJST data.\n",
|
"# Merge EJSCREEN AoCs into CEJST data.\n",
|
||||||
|
@ -174,9 +169,7 @@
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"id": "38c0dc2f",
|
"id": "38c0dc2f",
|
||||||
"metadata": {
|
"metadata": {},
|
||||||
"scrolled": false
|
|
||||||
},
|
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# Analyze one field at a time (useful for setting thresholds)\n",
|
"# Analyze one field at a time (useful for setting thresholds)\n",
|
||||||
|
@ -214,35 +207,71 @@
|
||||||
"CALENVIROSCREEN_PERCENTILE_FIELD = \"calenviroscreen_percentile\"\n",
|
"CALENVIROSCREEN_PERCENTILE_FIELD = \"calenviroscreen_percentile\"\n",
|
||||||
"CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD = \"calenviroscreen_priority_community\"\n",
|
"CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD = \"calenviroscreen_priority_community\"\n",
|
||||||
"\n",
|
"\n",
|
||||||
"calenviroscreen_data_path = (\n",
|
"calenviroscreen_data_path = DATA_DIR / \"dataset\" / \"calenviroscreen4\" / \"data06.csv\"\n",
|
||||||
" DATA_DIR / \"dataset\" / \"calenviroscreen4\" / \"data06.csv\"\n",
|
|
||||||
")\n",
|
|
||||||
"calenviroscreen_df = pd.read_csv(\n",
|
"calenviroscreen_df = pd.read_csv(\n",
|
||||||
" calenviroscreen_data_path,\n",
|
" calenviroscreen_data_path,\n",
|
||||||
" dtype={ExtractTransformLoad.GEOID_TRACT_FIELD_NAME: \"string\"},\n",
|
" dtype={ExtractTransformLoad.GEOID_TRACT_FIELD_NAME: \"string\"},\n",
|
||||||
")\n",
|
")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Convert priority community field to a bool.\n",
|
"# Convert priority community field to a bool.\n",
|
||||||
"calenviroscreen_df[\n",
|
"calenviroscreen_df[CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD] = calenviroscreen_df[\n",
|
||||||
" CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD\n",
|
" CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD\n",
|
||||||
"] = calenviroscreen_df[CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD].astype(bool)\n",
|
"].astype(bool)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"calenviroscreen_df.head()"
|
"calenviroscreen_df.head()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "b1ac2854-80c8-42a8-85e8-84c5684bbe43",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Mapping for EJ\n",
|
||||||
|
"mapping_for_ej_path = DATA_DIR / \"dataset\" / \"mapping_for_ej\" / \"co_va.csv\"\n",
|
||||||
|
"\n",
|
||||||
|
"mapping_for_ej_df = pd.read_csv(\n",
|
||||||
|
" mapping_for_ej_path,\n",
|
||||||
|
" dtype={\n",
|
||||||
|
" ExtractTransformLoad.GEOID_TRACT_FIELD_NAME: \"string\",\n",
|
||||||
|
" field_names.MAPPING_FOR_EJ_PRIORITY_COMMUNITY_FIELD: \"bool\",\n",
|
||||||
|
" },\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"mapping_for_ej_df.head()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "b1ac2854-80c8-42a8-85e8-84c5684bbe43",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Mapping for EJ\n",
|
||||||
|
"mapping_for_ej_path = DATA_DIR / \"dataset\" / \"mapping_for_ej\" / \"co_va.csv\"\n",
|
||||||
|
"\n",
|
||||||
|
"mapping_for_ej_df = pd.read_csv(\n",
|
||||||
|
" mapping_for_ej_path,\n",
|
||||||
|
" dtype={\n",
|
||||||
|
" ExtractTransformLoad.GEOID_TRACT_FIELD_NAME: \"string\",\n",
|
||||||
|
" field_names.MAPPING_FOR_EJ_PRIORITY_COMMUNITY_FIELD: \"bool\",\n",
|
||||||
|
" },\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"mapping_for_ej_df.head()"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"id": "d8ec43dc",
|
"id": "d8ec43dc",
|
||||||
"metadata": {
|
"metadata": {},
|
||||||
"scrolled": false
|
|
||||||
},
|
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# Load persistent poverty data\n",
|
"# Load persistent poverty data\n",
|
||||||
"persistent_poverty_path = (\n",
|
"persistent_poverty_path = DATA_DIR / \"dataset\" / \"persistent_poverty\" / \"usa.csv\"\n",
|
||||||
" DATA_DIR / \"dataset\" / \"persistent_poverty\" / \"usa.csv\"\n",
|
|
||||||
")\n",
|
|
||||||
"persistent_poverty_df = pd.read_csv(\n",
|
"persistent_poverty_df = pd.read_csv(\n",
|
||||||
" persistent_poverty_path,\n",
|
" persistent_poverty_path,\n",
|
||||||
" dtype={ExtractTransformLoad.GEOID_TRACT_FIELD_NAME: \"string\"},\n",
|
" dtype={ExtractTransformLoad.GEOID_TRACT_FIELD_NAME: \"string\"},\n",
|
||||||
|
@ -255,9 +284,7 @@
|
||||||
"PERSISTENT_POVERTY_CBG_LEVEL_FIELD = \"Persistent Poverty Census Tract\"\n",
|
"PERSISTENT_POVERTY_CBG_LEVEL_FIELD = \"Persistent Poverty Census Tract\"\n",
|
||||||
"\n",
|
"\n",
|
||||||
"persistent_poverty_df.rename(\n",
|
"persistent_poverty_df.rename(\n",
|
||||||
" columns={\n",
|
" columns={PERSISTENT_POVERTY_CBG_LEVEL_FIELD: PERSISTENT_POVERTY_TRACT_LEVEL_FIELD},\n",
|
||||||
" PERSISTENT_POVERTY_CBG_LEVEL_FIELD: PERSISTENT_POVERTY_TRACT_LEVEL_FIELD\n",
|
|
||||||
" },\n",
|
|
||||||
" inplace=True,\n",
|
" inplace=True,\n",
|
||||||
" errors=\"raise\",\n",
|
" errors=\"raise\",\n",
|
||||||
")\n",
|
")\n",
|
||||||
|
@ -269,9 +296,7 @@
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"id": "81826d29",
|
"id": "81826d29",
|
||||||
"metadata": {
|
"metadata": {},
|
||||||
"scrolled": false
|
|
||||||
},
|
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# Load mapping inequality data\n",
|
"# Load mapping inequality data\n",
|
||||||
|
@ -280,9 +305,7 @@
|
||||||
" field_names.HOLC_GRADE_D_TRACT_50_PERCENT_FIELD,\n",
|
" field_names.HOLC_GRADE_D_TRACT_50_PERCENT_FIELD,\n",
|
||||||
" field_names.HOLC_GRADE_D_TRACT_75_PERCENT_FIELD,\n",
|
" field_names.HOLC_GRADE_D_TRACT_75_PERCENT_FIELD,\n",
|
||||||
"]\n",
|
"]\n",
|
||||||
"mapping_inequality_path = (\n",
|
"mapping_inequality_path = DATA_DIR / \"dataset\" / \"mapping_inequality\" / \"usa.csv\"\n",
|
||||||
" DATA_DIR / \"dataset\" / \"mapping_inequality\" / \"usa.csv\"\n",
|
|
||||||
")\n",
|
|
||||||
"mapping_inequality_df = pd.read_csv(\n",
|
"mapping_inequality_df = pd.read_csv(\n",
|
||||||
" mapping_inequality_path,\n",
|
" mapping_inequality_path,\n",
|
||||||
" dtype={ExtractTransformLoad.GEOID_TRACT_FIELD_NAME: \"string\"},\n",
|
" dtype={ExtractTransformLoad.GEOID_TRACT_FIELD_NAME: \"string\"},\n",
|
||||||
|
@ -329,9 +352,7 @@
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"id": "605af1ff",
|
"id": "605af1ff",
|
||||||
"metadata": {
|
"metadata": {},
|
||||||
"scrolled": false
|
|
||||||
},
|
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# Load alternative energy-related definition\n",
|
"# Load alternative energy-related definition\n",
|
||||||
|
@ -350,9 +371,7 @@
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"id": "fe4a2939",
|
"id": "fe4a2939",
|
||||||
"metadata": {
|
"metadata": {},
|
||||||
"scrolled": false
|
|
||||||
},
|
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# Load Michigan EJSCREEN\n",
|
"# Load Michigan EJSCREEN\n",
|
||||||
|
@ -404,6 +423,7 @@
|
||||||
" energy_definition_alternative_draft_df,\n",
|
" energy_definition_alternative_draft_df,\n",
|
||||||
" michigan_ejscreen_df,\n",
|
" michigan_ejscreen_df,\n",
|
||||||
" cdc_svi_index_df,\n",
|
" cdc_svi_index_df,\n",
|
||||||
|
" mapping_for_ej_df,\n",
|
||||||
"]\n",
|
"]\n",
|
||||||
"\n",
|
"\n",
|
||||||
"merged_df = functools.reduce(\n",
|
"merged_df = functools.reduce(\n",
|
||||||
|
@ -416,9 +436,7 @@
|
||||||
" census_tract_dfs,\n",
|
" census_tract_dfs,\n",
|
||||||
")\n",
|
")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"tract_values = (\n",
|
"tract_values = merged_df[ExtractTransformLoad.GEOID_TRACT_FIELD_NAME].str.len().unique()\n",
|
||||||
" merged_df[ExtractTransformLoad.GEOID_TRACT_FIELD_NAME].str.len().unique()\n",
|
|
||||||
")\n",
|
|
||||||
"if any(tract_values != [11]):\n",
|
"if any(tract_values != [11]):\n",
|
||||||
" print(tract_values)\n",
|
" print(tract_values)\n",
|
||||||
" raise ValueError(\"Some of the census tract data has the wrong length.\")\n",
|
" raise ValueError(\"Some of the census tract data has the wrong length.\")\n",
|
||||||
|
@ -433,9 +451,7 @@
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"id": "2de78f71",
|
"id": "2de78f71",
|
||||||
"metadata": {
|
"metadata": {},
|
||||||
"scrolled": false
|
|
||||||
},
|
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# Special handling for HOLC.\n",
|
"# Special handling for HOLC.\n",
|
||||||
|
@ -555,6 +571,10 @@
|
||||||
" priority_communities_field=\"calenviroscreen_priority_community\",\n",
|
" priority_communities_field=\"calenviroscreen_priority_community\",\n",
|
||||||
" ),\n",
|
" ),\n",
|
||||||
" Index(\n",
|
" Index(\n",
|
||||||
|
" method_name=\"Mapping for EJ\",\n",
|
||||||
|
" priority_communities_field=field_names.MAPPING_FOR_EJ_PRIORITY_COMMUNITY_FIELD,\n",
|
||||||
|
" ),\n",
|
||||||
|
" Index(\n",
|
||||||
" method_name=\"EPA RSEI Aggregate Microdata\",\n",
|
" method_name=\"EPA RSEI Aggregate Microdata\",\n",
|
||||||
" priority_communities_field=field_names.EPA_RSEI_SCORE_THRESHOLD_FIELD,\n",
|
" priority_communities_field=field_names.EPA_RSEI_SCORE_THRESHOLD_FIELD,\n",
|
||||||
" ),\n",
|
" ),\n",
|
||||||
|
@ -708,13 +728,13 @@
|
||||||
" summary_dict[\"Geography name\"] = summary_dict[\"Urban vs Rural\"]\n",
|
" summary_dict[\"Geography name\"] = summary_dict[\"Urban vs Rural\"]\n",
|
||||||
"\n",
|
"\n",
|
||||||
" for priority_communities_field in priority_communities_fields:\n",
|
" for priority_communities_field in priority_communities_fields:\n",
|
||||||
" summary_dict[\n",
|
" summary_dict[f\"{priority_communities_field}{POPULATION_SUFFIX}\"] = frame[\n",
|
||||||
" f\"{priority_communities_field}{POPULATION_SUFFIX}\"\n",
|
" f\"{priority_communities_field}{POPULATION_SUFFIX}\"\n",
|
||||||
" ] = frame[f\"{priority_communities_field}{POPULATION_SUFFIX}\"].sum()\n",
|
" ].sum()\n",
|
||||||
"\n",
|
"\n",
|
||||||
" summary_dict[\n",
|
" summary_dict[f\"{priority_communities_field} (total tracts)\"] = frame[\n",
|
||||||
" f\"{priority_communities_field} (total tracts)\"\n",
|
" f\"{priority_communities_field}\"\n",
|
||||||
" ] = frame[f\"{priority_communities_field}\"].sum()\n",
|
" ].sum()\n",
|
||||||
"\n",
|
"\n",
|
||||||
" # Calculate some combinations of other variables.\n",
|
" # Calculate some combinations of other variables.\n",
|
||||||
" summary_dict[f\"{priority_communities_field} (percent tracts)\"] = (\n",
|
" summary_dict[f\"{priority_communities_field} (percent tracts)\"] = (\n",
|
||||||
|
@ -722,9 +742,7 @@
|
||||||
" / total_tracts_in_geography\n",
|
" / total_tracts_in_geography\n",
|
||||||
" )\n",
|
" )\n",
|
||||||
"\n",
|
"\n",
|
||||||
" summary_dict[\n",
|
" summary_dict[f\"{priority_communities_field} (percent population)\"] = (\n",
|
||||||
" f\"{priority_communities_field} (percent population)\"\n",
|
|
||||||
" ] = (\n",
|
|
||||||
" summary_dict[f\"{priority_communities_field}{POPULATION_SUFFIX}\"]\n",
|
" summary_dict[f\"{priority_communities_field}{POPULATION_SUFFIX}\"]\n",
|
||||||
" / total_population_in_geography\n",
|
" / total_population_in_geography\n",
|
||||||
" )\n",
|
" )\n",
|
||||||
|
@ -770,9 +788,7 @@
|
||||||
"\n",
|
"\n",
|
||||||
" # Run the comparison function on the groups.\n",
|
" # Run the comparison function on the groups.\n",
|
||||||
" region_distribution_df = region_grouped_df.progress_apply(\n",
|
" region_distribution_df = region_grouped_df.progress_apply(\n",
|
||||||
" lambda frame: calculate_state_comparison(\n",
|
" lambda frame: calculate_state_comparison(frame, geography_field=\"region\")\n",
|
||||||
" frame, geography_field=\"region\"\n",
|
|
||||||
" )\n",
|
|
||||||
" )\n",
|
" )\n",
|
||||||
"\n",
|
"\n",
|
||||||
" # Next, run the comparison by division\n",
|
" # Next, run the comparison by division\n",
|
||||||
|
@ -780,9 +796,7 @@
|
||||||
"\n",
|
"\n",
|
||||||
" # Run the comparison function on the groups.\n",
|
" # Run the comparison function on the groups.\n",
|
||||||
" division_distribution_df = division_grouped_df.progress_apply(\n",
|
" division_distribution_df = division_grouped_df.progress_apply(\n",
|
||||||
" lambda frame: calculate_state_comparison(\n",
|
" lambda frame: calculate_state_comparison(frame, geography_field=\"division\")\n",
|
||||||
" frame, geography_field=\"division\"\n",
|
|
||||||
" )\n",
|
|
||||||
" )\n",
|
" )\n",
|
||||||
"\n",
|
"\n",
|
||||||
" # Next, run the comparison by urban/rural\n",
|
" # Next, run the comparison by urban/rural\n",
|
||||||
|
@ -837,9 +851,7 @@
|
||||||
" column_character = get_excel_column_name(column_index)\n",
|
" column_character = get_excel_column_name(column_index)\n",
|
||||||
"\n",
|
"\n",
|
||||||
" # Set all columns to larger width\n",
|
" # Set all columns to larger width\n",
|
||||||
" worksheet.set_column(\n",
|
" worksheet.set_column(f\"{column_character}:{column_character}\", column_width)\n",
|
||||||
" f\"{column_character}:{column_character}\", column_width\n",
|
|
||||||
" )\n",
|
|
||||||
"\n",
|
"\n",
|
||||||
" # Special formatting for all percent columns\n",
|
" # Special formatting for all percent columns\n",
|
||||||
" # Note: we can't just search for `percent`, because that's included in the word `percentile`.\n",
|
" # Note: we can't just search for `percent`, because that's included in the word `percentile`.\n",
|
||||||
|
@ -854,7 +866,9 @@
|
||||||
"\n",
|
"\n",
|
||||||
" # Special formatting for columns that capture the percent of population considered priority.\n",
|
" # Special formatting for columns that capture the percent of population considered priority.\n",
|
||||||
" if \"(percent population)\" in column:\n",
|
" if \"(percent population)\" in column:\n",
|
||||||
" column_ranges = f\"{column_character}2:{column_character}{len(state_distribution_df)+1}\"\n",
|
" column_ranges = (\n",
|
||||||
|
" f\"{column_character}2:{column_character}{len(state_distribution_df)+1}\"\n",
|
||||||
|
" )\n",
|
||||||
"\n",
|
"\n",
|
||||||
" # Add green to red conditional formatting.\n",
|
" # Add green to red conditional formatting.\n",
|
||||||
" worksheet.conditional_format(\n",
|
" worksheet.conditional_format(\n",
|
||||||
|
@ -880,18 +894,14 @@
|
||||||
" writer.save()\n",
|
" writer.save()\n",
|
||||||
"\n",
|
"\n",
|
||||||
"\n",
|
"\n",
|
||||||
"fields_to_analyze = [\n",
|
"fields_to_analyze = [index.priority_communities_field for index in census_tract_indices]\n",
|
||||||
" index.priority_communities_field for index in census_tract_indices\n",
|
|
||||||
"]\n",
|
|
||||||
"\n",
|
"\n",
|
||||||
"# Convert all indices to boolean\n",
|
"# Convert all indices to boolean\n",
|
||||||
"for field_to_analyze in fields_to_analyze:\n",
|
"for field_to_analyze in fields_to_analyze:\n",
|
||||||
" if \"Areas of Concern\" in field_to_analyze:\n",
|
" if \"Areas of Concern\" in field_to_analyze:\n",
|
||||||
" print(f\"Converting {field_to_analyze} to boolean.\")\n",
|
" print(f\"Converting {field_to_analyze} to boolean.\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
" merged_df[field_to_analyze] = merged_df[field_to_analyze].fillna(\n",
|
" merged_df[field_to_analyze] = merged_df[field_to_analyze].fillna(value=0)\n",
|
||||||
" value=0\n",
|
|
||||||
" )\n",
|
|
||||||
" merged_df[field_to_analyze] = merged_df[field_to_analyze].astype(bool)\n",
|
" merged_df[field_to_analyze] = merged_df[field_to_analyze].astype(bool)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
@ -924,9 +934,7 @@
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"id": "2bcbcabf",
|
"id": "2bcbcabf",
|
||||||
"metadata": {
|
"metadata": {},
|
||||||
"scrolled": false
|
|
||||||
},
|
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"directory = COMPARISON_OUTPUTS_DIR / \"tracts_basic_stats\"\n",
|
"directory = COMPARISON_OUTPUTS_DIR / \"tracts_basic_stats\"\n",
|
||||||
|
@ -960,14 +968,10 @@
|
||||||
" column_character = get_excel_column_name(column_index)\n",
|
" column_character = get_excel_column_name(column_index)\n",
|
||||||
"\n",
|
"\n",
|
||||||
" # Set all columns to larger width\n",
|
" # Set all columns to larger width\n",
|
||||||
" worksheet.set_column(\n",
|
" worksheet.set_column(f\"{column_character}:{column_character}\", column_width)\n",
|
||||||
" f\"{column_character}:{column_character}\", column_width\n",
|
|
||||||
" )\n",
|
|
||||||
"\n",
|
"\n",
|
||||||
" # Add green to red conditional formatting.\n",
|
" # Add green to red conditional formatting.\n",
|
||||||
" column_ranges = (\n",
|
" column_ranges = f\"{column_character}2:{column_character}{len(basic_stats_df)+1}\"\n",
|
||||||
" f\"{column_character}2:{column_character}{len(basic_stats_df)+1}\"\n",
|
|
||||||
" )\n",
|
|
||||||
" worksheet.conditional_format(\n",
|
" worksheet.conditional_format(\n",
|
||||||
" column_ranges,\n",
|
" column_ranges,\n",
|
||||||
" # Min: green, max: red.\n",
|
" # Min: green, max: red.\n",
|
||||||
|
@ -980,11 +984,7 @@
|
||||||
"\n",
|
"\n",
|
||||||
" # Special formatting for all percent columns\n",
|
" # Special formatting for all percent columns\n",
|
||||||
" # Note: we can't just search for `percent`, because that's included in the word `percentile`.\n",
|
" # Note: we can't just search for `percent`, because that's included in the word `percentile`.\n",
|
||||||
" if (\n",
|
" if \"percent \" in column or \"(percent)\" in column or \"Percent \" in column:\n",
|
||||||
" \"percent \" in column\n",
|
|
||||||
" or \"(percent)\" in column\n",
|
|
||||||
" or \"Percent \" in column\n",
|
|
||||||
" ):\n",
|
|
||||||
" # Make these columns percentages.\n",
|
" # Make these columns percentages.\n",
|
||||||
" percentage_format = workbook.add_format({\"num_format\": \"0%\"})\n",
|
" percentage_format = workbook.add_format({\"num_format\": \"0%\"})\n",
|
||||||
" worksheet.set_column(\n",
|
" worksheet.set_column(\n",
|
||||||
|
@ -1013,15 +1013,9 @@
|
||||||
" temp_df[index.priority_communities_field] == True\n",
|
" temp_df[index.priority_communities_field] == True\n",
|
||||||
" )\n",
|
" )\n",
|
||||||
"\n",
|
"\n",
|
||||||
" grouped_df = (\n",
|
" grouped_df = temp_df.groupby(index.priority_communities_field).mean().reset_index()\n",
|
||||||
" temp_df.groupby(index.priority_communities_field).mean().reset_index()\n",
|
" result_df = grouped_df[[index.priority_communities_field] + comparison_fields]\n",
|
||||||
" )\n",
|
" result_df.to_csv(directory / f\"{index.method_name} Basic Stats.csv\", index=False)\n",
|
||||||
" result_df = grouped_df[\n",
|
|
||||||
" [index.priority_communities_field] + comparison_fields\n",
|
|
||||||
" ]\n",
|
|
||||||
" result_df.to_csv(\n",
|
|
||||||
" directory / f\"{index.method_name} Basic Stats.csv\", index=False\n",
|
|
||||||
" )\n",
|
|
||||||
" write_basic_stats_excel(\n",
|
" write_basic_stats_excel(\n",
|
||||||
" basic_stats_df=result_df,\n",
|
" basic_stats_df=result_df,\n",
|
||||||
" file_path=directory / f\"{index.method_name} Basic Stats.xlsx\",\n",
|
" file_path=directory / f\"{index.method_name} Basic Stats.xlsx\",\n",
|
||||||
|
@ -1070,9 +1064,7 @@
|
||||||
"\n",
|
"\n",
|
||||||
" # Also add in the count of census tracts.\n",
|
" # Also add in the count of census tracts.\n",
|
||||||
" count_field_name = \"Count of census tracts\"\n",
|
" count_field_name = \"Count of census tracts\"\n",
|
||||||
" comparison_df[count_field_name] = grouped_df.size().to_frame(\n",
|
" comparison_df[count_field_name] = grouped_df.size().to_frame(count_field_name)\n",
|
||||||
" count_field_name\n",
|
|
||||||
" )\n",
|
|
||||||
"\n",
|
"\n",
|
||||||
" comparison_df = comparison_df.reset_index()\n",
|
" comparison_df = comparison_df.reset_index()\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
@ -1087,9 +1079,7 @@
|
||||||
"\n",
|
"\n",
|
||||||
" # Put criteria description column first.\n",
|
" # Put criteria description column first.\n",
|
||||||
" columns_to_put_first = (\n",
|
" columns_to_put_first = (\n",
|
||||||
" [criteria_description_field_name]\n",
|
" [criteria_description_field_name] + fields_to_group_by + [count_field_name]\n",
|
||||||
" + fields_to_group_by\n",
|
|
||||||
" + [count_field_name]\n",
|
|
||||||
" )\n",
|
" )\n",
|
||||||
" new_column_order = columns_to_put_first + [\n",
|
" new_column_order = columns_to_put_first + [\n",
|
||||||
" col for col in comparison_df.columns if col not in columns_to_put_first\n",
|
" col for col in comparison_df.columns if col not in columns_to_put_first\n",
|
||||||
|
@ -1120,9 +1110,7 @@
|
||||||
"\n",
|
"\n",
|
||||||
" # Convert the dataframe to an XlsxWriter Excel object. We also turn off the\n",
|
" # Convert the dataframe to an XlsxWriter Excel object. We also turn off the\n",
|
||||||
" # index column at the left of the output dataframe.\n",
|
" # index column at the left of the output dataframe.\n",
|
||||||
" census_tracts_score_comparison_df.to_excel(\n",
|
" census_tracts_score_comparison_df.to_excel(writer, sheet_name=\"Sheet1\", index=False)\n",
|
||||||
" writer, sheet_name=\"Sheet1\", index=False\n",
|
|
||||||
" )\n",
|
|
||||||
"\n",
|
"\n",
|
||||||
" # Get the xlsxwriter workbook and worksheet objects.\n",
|
" # Get the xlsxwriter workbook and worksheet objects.\n",
|
||||||
" workbook = writer.book\n",
|
" workbook = writer.book\n",
|
||||||
|
@ -1144,9 +1132,7 @@
|
||||||
" column_character = get_excel_column_name(column_index)\n",
|
" column_character = get_excel_column_name(column_index)\n",
|
||||||
"\n",
|
"\n",
|
||||||
" # Set all columns to larger width\n",
|
" # Set all columns to larger width\n",
|
||||||
" worksheet.set_column(\n",
|
" worksheet.set_column(f\"{column_character}:{column_character}\", column_width)\n",
|
||||||
" f\"{column_character}:{column_character}\", column_width\n",
|
|
||||||
" )\n",
|
|
||||||
"\n",
|
"\n",
|
||||||
" # Add green to red conditional formatting.\n",
|
" # Add green to red conditional formatting.\n",
|
||||||
" column_ranges = f\"{column_character}2:{column_character}{len(census_tracts_score_comparison_df)+1}\"\n",
|
" column_ranges = f\"{column_character}2:{column_character}{len(census_tracts_score_comparison_df)+1}\"\n",
|
||||||
|
@ -1162,11 +1148,7 @@
|
||||||
"\n",
|
"\n",
|
||||||
" # Special formatting for all percent columns\n",
|
" # Special formatting for all percent columns\n",
|
||||||
" # Note: we can't just search for `percent`, because that's included in the word `percentile`.\n",
|
" # Note: we can't just search for `percent`, because that's included in the word `percentile`.\n",
|
||||||
" if (\n",
|
" if \"percent \" in column or \"(percent)\" in column or \"Percent \" in column:\n",
|
||||||
" \"percent \" in column\n",
|
|
||||||
" or \"(percent)\" in column\n",
|
|
||||||
" or \"Percent \" in column\n",
|
|
||||||
" ):\n",
|
|
||||||
" # Make these columns percentages.\n",
|
" # Make these columns percentages.\n",
|
||||||
" percentage_format = workbook.add_format({\"num_format\": \"0%\"})\n",
|
" percentage_format = workbook.add_format({\"num_format\": \"0%\"})\n",
|
||||||
" worksheet.set_column(\n",
|
" worksheet.set_column(\n",
|
||||||
|
@ -1182,9 +1164,7 @@
|
||||||
" # Overwrite both the value and the format of each header cell\n",
|
" # Overwrite both the value and the format of each header cell\n",
|
||||||
" # This is because xlsxwriter / pandas has a known bug where it can't wrap text for a dataframe.\n",
|
" # This is because xlsxwriter / pandas has a known bug where it can't wrap text for a dataframe.\n",
|
||||||
" # See https://stackoverflow.com/questions/42562977/xlsxwriter-text-wrap-not-working.\n",
|
" # See https://stackoverflow.com/questions/42562977/xlsxwriter-text-wrap-not-working.\n",
|
||||||
" for col_num, value in enumerate(\n",
|
" for col_num, value in enumerate(census_tracts_score_comparison_df.columns.values):\n",
|
||||||
" census_tracts_score_comparison_df.columns.values\n",
|
|
||||||
" ):\n",
|
|
||||||
" worksheet.write(0, col_num, value, header_format)\n",
|
" worksheet.write(0, col_num, value, header_format)\n",
|
||||||
"\n",
|
"\n",
|
||||||
" writer.save()\n",
|
" writer.save()\n",
|
||||||
|
@ -1415,9 +1395,7 @@
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"id": "7d095ebd",
|
"id": "7d095ebd",
|
||||||
"metadata": {
|
"metadata": {},
|
||||||
"scrolled": false
|
|
||||||
},
|
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# Note: this is helpful because this file is long-running, so it alerts the user when the\n",
|
"# Note: this is helpful because this file is long-running, so it alerts the user when the\n",
|
||||||
|
@ -1444,7 +1422,7 @@
|
||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.9.6"
|
"version": "3.9.10"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
|
|
@ -553,5 +553,17 @@ FPL_200_SERIES = "Is low income?"
|
||||||
FPL_200_AND_COLLEGE_ATTENDANCE_SERIES = (
|
FPL_200_AND_COLLEGE_ATTENDANCE_SERIES = (
|
||||||
"Is low income and has a low percent of higher ed students?"
|
"Is low income and has a low percent of higher ed students?"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Column names for the Mapping for Environmental Justice dataset.
MAPPING_FOR_EJ_FINAL_PERCENTILE_FIELD = "Mapping for Environmental Justice Final Percentile"
MAPPING_FOR_EJ_FINAL_SCORE_FIELD = "Mapping for Environmental Justice Final Score"
MAPPING_FOR_EJ_PRIORITY_COMMUNITY_FIELD = "Mapping for Environmental Justice Priority Community"
|
||||||
|
|
||||||
# End of names for individual factors being exceeded
|
# End of names for individual factors being exceeded
|
||||||
####
|
####
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue