diff --git a/data/data-pipeline/data_pipeline/etl/base.py b/data/data-pipeline/data_pipeline/etl/base.py index ccd69fc0..ca32bb65 100644 --- a/data/data-pipeline/data_pipeline/etl/base.py +++ b/data/data-pipeline/data_pipeline/etl/base.py @@ -22,8 +22,9 @@ class ExtractTransformLoad: FILES_PATH: Path = settings.APP_ROOT / "files" GEOID_FIELD_NAME: str = "GEOID10" GEOID_TRACT_FIELD_NAME: str = "GEOID10_TRACT" - # TODO: investigate. Census says there are only 217,740 CBGs in the US. + # TODO: investigate. Census says there are only 217,740 CBGs in the US. This might be because the count includes CBGs from different time periods. EXPECTED_MAX_CENSUS_BLOCK_GROUPS: int = 220405 + EXPECTED_MAX_CENSUS_TRACTS: int = 73076 def get_yaml_config(self) -> None: """Reads the YAML configuration file for the dataset and stores diff --git a/data/data-pipeline/data_pipeline/etl/constants.py b/data/data-pipeline/data_pipeline/etl/constants.py index 4077aaaf..84652b1a 100644 --- a/data/data-pipeline/data_pipeline/etl/constants.py +++ b/data/data-pipeline/data_pipeline/etl/constants.py @@ -64,6 +64,11 @@ DATASET_LIST = [ "module_dir": "geocorr", "class_name": "GeoCorrETL", }, + { + "name": "persistent_poverty", + "module_dir": "persistent_poverty", + "class_name": "PersistentPovertyETL", + }, { "name": "ejscreen_areas_of_concern", "module_dir": "ejscreen_areas_of_concern", diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score.py b/data/data-pipeline/data_pipeline/etl/score/etl_score.py index 2959c99e..c9cc2498 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py @@ -83,19 +83,46 @@ class ScoreETL(ExtractTransformLoad): # Urban Rural Map self.URBAN_HERUISTIC_FIELD_NAME = "Urban Heuristic Flag" + # Persistent poverty + self.PERSISTENT_POVERTY_FIELD = "Persistent Poverty Census Tract" + # EJ Areas of Concern - self.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_70TH_PERCENTILE_COMMUNITIES_FIELD_NAME = "EJSCREEN Areas of Concern, National, 70th percentile (communities)" - self.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_75TH_PERCENTILE_COMMUNITIES_FIELD_NAME = "EJSCREEN Areas of Concern, National, 75th percentile (communities)" - self.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_80TH_PERCENTILE_COMMUNITIES_FIELD_NAME = "EJSCREEN Areas of Concern, National, 80th percentile (communities)" - self.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_85TH_PERCENTILE_COMMUNITIES_FIELD_NAME = "EJSCREEN Areas of Concern, National, 85th percentile (communities)" - self.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_90TH_PERCENTILE_COMMUNITIES_FIELD_NAME = "EJSCREEN Areas of Concern, National, 90th percentile (communities)" - self.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_95TH_PERCENTILE_COMMUNITIES_FIELD_NAME = "EJSCREEN Areas of Concern, National, 95th percentile (communities)" - self.EJSCREEN_AREAS_OF_CONCERN_STATE_70TH_PERCENTILE_COMMUNITIES_FIELD_NAME = "EJSCREEN Areas of Concern, National, 70th percentile (communities)" - self.EJSCREEN_AREAS_OF_CONCERN_STATE_75TH_PERCENTILE_COMMUNITIES_FIELD_NAME = "EJSCREEN Areas of Concern, National, 75th percentile (communities)" - self.EJSCREEN_AREAS_OF_CONCERN_STATE_80TH_PERCENTILE_COMMUNITIES_FIELD_NAME = "EJSCREEN Areas of Concern, National, 80th percentile (communities)" - self.EJSCREEN_AREAS_OF_CONCERN_STATE_85TH_PERCENTILE_COMMUNITIES_FIELD_NAME = "EJSCREEN Areas of Concern, National, 85th percentile (communities)" - self.EJSCREEN_AREAS_OF_CONCERN_STATE_90TH_PERCENTILE_COMMUNITIES_FIELD_NAME = "EJSCREEN Areas of Concern, National, 90th percentile (communities)" -
self.EJSCREEN_AREAS_OF_CONCERN_STATE_95TH_PERCENTILE_COMMUNITIES_FIELD_NAME = "EJSCREEN Areas of Concern, National, 95th percentile (communities)" + self.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_70TH_PERCENTILE_COMMUNITIES_FIELD_NAME = ( + "EJSCREEN Areas of Concern, National, 70th percentile (communities)" + ) + self.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_75TH_PERCENTILE_COMMUNITIES_FIELD_NAME = ( + "EJSCREEN Areas of Concern, National, 75th percentile (communities)" + ) + self.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_80TH_PERCENTILE_COMMUNITIES_FIELD_NAME = ( + "EJSCREEN Areas of Concern, National, 80th percentile (communities)" + ) + self.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_85TH_PERCENTILE_COMMUNITIES_FIELD_NAME = ( + "EJSCREEN Areas of Concern, National, 85th percentile (communities)" + ) + self.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_90TH_PERCENTILE_COMMUNITIES_FIELD_NAME = ( + "EJSCREEN Areas of Concern, National, 90th percentile (communities)" + ) + self.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_95TH_PERCENTILE_COMMUNITIES_FIELD_NAME = ( + "EJSCREEN Areas of Concern, National, 95th percentile (communities)" + ) + # Note: the old field names assigned "National" strings to the State fields, which looks like a copy-paste error; the renamed values below say "State". + self.EJSCREEN_AREAS_OF_CONCERN_STATE_70TH_PERCENTILE_COMMUNITIES_FIELD_NAME = ( + "EJSCREEN Areas of Concern, State, 70th percentile (communities)" + ) + self.EJSCREEN_AREAS_OF_CONCERN_STATE_75TH_PERCENTILE_COMMUNITIES_FIELD_NAME = ( + "EJSCREEN Areas of Concern, State, 75th percentile (communities)" + ) + self.EJSCREEN_AREAS_OF_CONCERN_STATE_80TH_PERCENTILE_COMMUNITIES_FIELD_NAME = ( + "EJSCREEN Areas of Concern, State, 80th percentile (communities)" + ) + self.EJSCREEN_AREAS_OF_CONCERN_STATE_85TH_PERCENTILE_COMMUNITIES_FIELD_NAME = ( + "EJSCREEN Areas of Concern, State, 85th percentile (communities)" + ) + self.EJSCREEN_AREAS_OF_CONCERN_STATE_90TH_PERCENTILE_COMMUNITIES_FIELD_NAME = ( + "EJSCREEN Areas of Concern, State, 90th percentile (communities)" + ) + self.EJSCREEN_AREAS_OF_CONCERN_STATE_95TH_PERCENTILE_COMMUNITIES_FIELD_NAME = ( + "EJSCREEN Areas of Concern, State, 95th percentile (communities)" + ) # dataframes self.df: pd.DataFrame @@ -109,6 +136,7 @@ class ScoreETL(ExtractTransformLoad): self.doe_energy_burden_df: pd.DataFrame self.national_risk_index_df: pd.DataFrame self.geocorr_urban_rural_df: pd.DataFrame + self.persistent_poverty_df: pd.DataFrame self.ejscreen_areas_of_concern_df: pd.DataFrame def data_sets(self) -> list: @@ -221,6 +249,11 @@ class ScoreETL(ExtractTransformLoad): renamed_field=self.URBAN_HERUISTIC_FIELD_NAME, bucket=None, ), + DataSet( + input_field=self.PERSISTENT_POVERTY_FIELD, + renamed_field=self.PERSISTENT_POVERTY_FIELD, + bucket=None, + ), DataSet( input_field=self.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_70TH_PERCENTILE_COMMUNITIES_FIELD_NAME, renamed_field=self.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_70TH_PERCENTILE_COMMUNITIES_FIELD_NAME, @@ -281,7 +314,6 @@ class ScoreETL(ExtractTransformLoad): renamed_field=self.EJSCREEN_AREAS_OF_CONCERN_STATE_95TH_PERCENTILE_COMMUNITIES_FIELD_NAME, bucket=None, ), - # The following data sets have buckets, because they're used in Score C DataSet( input_field="CANCER", @@ -481,6 +513,16 @@ class ScoreETL(ExtractTransformLoad): low_memory=False, ) + # Load persistent poverty + persistent_poverty_csv = ( + self.DATA_PATH / "dataset" / "persistent_poverty" / "usa.csv" + ) + self.persistent_poverty_df = pd.read_csv( + persistent_poverty_csv, + dtype={self.GEOID_TRACT_FIELD_NAME: "string"}, + low_memory=False, + ) + # Load EJ Screen Areas of Concern ejscreen_areas_of_concern_csv = ( self.DATA_PATH / "dataset" /
"ejscreen_areas_of_concern" / "usa.csv" @@ -779,6 +821,7 @@ class ScoreETL(ExtractTransformLoad): self.cdc_life_expectancy_df, self.doe_energy_burden_df, self.geocorr_urban_rural_df, + self.persistent_poverty_df, ] census_tract_df = self._join_tract_dfs(census_tract_dfs) @@ -830,7 +873,11 @@ class ScoreETL(ExtractTransformLoad): # TODO do this at the same time as calculating percentiles in future refactor for data_set in data_sets: # Skip GEOID_FIELD_NAME, because it's a string. - if data_set.renamed_field == self.GEOID_FIELD_NAME: + # Skip `PERSISTENT_POVERTY_FIELD` because it's a straight pass-through. + if data_set.renamed_field in ( + self.GEOID_FIELD_NAME, + self.PERSISTENT_POVERTY_FIELD, + ): continue df[data_set.renamed_field] = pd.to_numeric( diff --git a/data/data-pipeline/data_pipeline/etl/sources/persistent_poverty/README.md b/data/data-pipeline/data_pipeline/etl/sources/persistent_poverty/README.md new file mode 100644 index 00000000..e69de29b diff --git a/data/data-pipeline/data_pipeline/etl/sources/persistent_poverty/__init__.py b/data/data-pipeline/data_pipeline/etl/sources/persistent_poverty/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/data/data-pipeline/data_pipeline/etl/sources/persistent_poverty/etl.py b/data/data-pipeline/data_pipeline/etl/sources/persistent_poverty/etl.py new file mode 100644 index 00000000..24e2df18 --- /dev/null +++ b/data/data-pipeline/data_pipeline/etl/sources/persistent_poverty/etl.py @@ -0,0 +1,174 @@ +import functools +import pandas as pd + +from data_pipeline.config import settings +from data_pipeline.etl.base import ExtractTransformLoad +from data_pipeline.utils import ( + get_module_logger, + unzip_file_from_url, +) + +logger = get_module_logger(__name__) + + +class PersistentPovertyETL(ExtractTransformLoad): + """Persistent poverty data. + + Loaded from `https://s4.ad.brown.edu/Projects/Diversity/Researcher/LTDB.htm`. + + Codebook: `https://s4.ad.brown.edu/Projects/Diversity/Researcher/LTBDDload/Dfiles/codebooks.pdf`. + """ + + def __init__(self): + self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "persistent_poverty" + + # Need to change hyperlink to S3 + # self.GEOCORR_PLACES_URL = "https://justice40-data.s3.amazonaws.com/data-sources/persistent_poverty_urban_rural.csv.zip" + self.GEOID_TRACT_INPUT_FIELD_NAME_1 = "TRTID10" + self.GEOID_TRACT_INPUT_FIELD_NAME_2 = "tractid" + # self.URBAN_HERUISTIC_FIELD_NAME = "Urban Heuristic Flag" + + self.POVERTY_PREFIX = "Individuals in Poverty (percent)" + self.PERSISTENT_POVERTY_FIELD = "Persistent Poverty Census Tract" + + self.COLUMNS_TO_KEEP = [ + self.GEOID_TRACT_FIELD_NAME, + f"{self.POVERTY_PREFIX} (1990)", + f"{self.POVERTY_PREFIX} (2000)", + f"{self.POVERTY_PREFIX} (2010)", + self.PERSISTENT_POVERTY_FIELD, + ] + + self.df: pd.DataFrame + + def _join_input_dfs(self, dfs: list) -> pd.DataFrame: + df = functools.reduce( + lambda df_a, df_b: pd.merge( + left=df_a, + right=df_b, + # All data frames will now have this field for tract. + on=self.GEOID_TRACT_FIELD_NAME, + how="outer", + ), + dfs, + ) + + # Left-pad the tracts with 0s + expected_length_of_census_tract_field = 11 + df[self.GEOID_TRACT_FIELD_NAME] = ( + df[self.GEOID_TRACT_FIELD_NAME] + .astype(str) + .apply(lambda x: x.zfill(expected_length_of_census_tract_field)) + ) + + # Sanity check the join. + if len(df[self.GEOID_TRACT_FIELD_NAME].str.len().unique()) != 1: + raise ValueError( + f"One of the input CSVs uses {self.GEOID_TRACT_FIELD_NAME} with a different length." 
+ ) + + if len(df) > self.EXPECTED_MAX_CENSUS_TRACTS: + raise ValueError(f"Too many rows in the join: {len(df)}") + + return df + + def extract(self) -> None: + logger.info("Starting to download 86MB persistent poverty file.") + + unzipped_file_path = self.TMP_PATH / "persistent_poverty" + + unzip_file_from_url( + file_url=settings.AWS_JUSTICE40_DATASOURCES_URL + + "/LTDB_Std_All_Sample.zip", + download_path=self.TMP_PATH, + unzipped_file_path=unzipped_file_path, + ) + + file_names = [ + "ltdb_std_1990_sample.csv", + "ltdb_std_2000_sample.csv", + "ltdb_std_2010_sample.csv", + ] + + temporary_input_dfs = [] + + for file_name in file_names: + temporary_input_df = pd.read_csv( + filepath_or_buffer=unzipped_file_path + / f"ltdb_std_all_sample/{file_name}", + dtype={ + self.GEOID_TRACT_INPUT_FIELD_NAME_1: "string", + self.GEOID_TRACT_INPUT_FIELD_NAME_2: "string", + }, + low_memory=False, + encoding="latin1", + ) + + # Some CSVs have self.GEOID_TRACT_INPUT_FIELD_NAME_1 as the name of the tract field, + # and some have self.GEOID_TRACT_INPUT_FIELD_NAME_2. Rename them both to the same tract name. + temporary_input_df.rename( + columns={ + self.GEOID_TRACT_INPUT_FIELD_NAME_1: self.GEOID_TRACT_FIELD_NAME, + self.GEOID_TRACT_INPUT_FIELD_NAME_2: self.GEOID_TRACT_FIELD_NAME, + }, + inplace=True, + # Ignore errors because of the different field names in different CSVs. + errors="ignore", + ) + + temporary_input_dfs.append(temporary_input_df) + + self.df = self._join_input_dfs(temporary_input_dfs) + + def transform(self) -> None: + logger.info("Starting persistent poverty transform") + transformed_df = self.df + + # Note: the fields are defined as follows. + # dpovXX Description: persons for whom poverty status is determined + # npovXX Description: persons in poverty + transformed_df[f"{self.POVERTY_PREFIX} (1990)"] = ( + transformed_df["NPOV90"] / transformed_df["DPOV90"] + ) + transformed_df[f"{self.POVERTY_PREFIX} (2000)"] = ( + transformed_df["NPOV00"] / transformed_df["DPOV00"] + )
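+ # The 20% threshold applied across 1990, 2000, and 2010 below presumably
+ # mirrors the commonly used federal definition of persistent poverty (a
+ # poverty rate of at least 20 percent over roughly 30 years), though this
+ # dataset's codebook is not explicit about it.
+ # Note: for 2010, they use ACS data ending in 2012 that has 2010 as its midpoint year.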
+ transformed_df[f"{self.POVERTY_PREFIX} (2010)"] = ( + transformed_df["npov12"] / transformed_df["dpov12"] + ) + + poverty_threshold = 0.2 + + transformed_df[self.PERSISTENT_POVERTY_FIELD] = ( + ( + transformed_df[f"{self.POVERTY_PREFIX} (1990)"] + >= poverty_threshold + ) + & ( + transformed_df[f"{self.POVERTY_PREFIX} (2000)"] + >= poverty_threshold + ) + & ( + transformed_df[f"{self.POVERTY_PREFIX} (2010)"] + >= poverty_threshold + ) + ) + + self.df = transformed_df + + def load(self) -> None: + logger.info("Saving persistent poverty data.") + + # Make the output directory, if it doesn't already exist. + self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True) + + self.df[self.COLUMNS_TO_KEEP].to_csv( + path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False + ) + + def validate(self) -> None: + logger.info("Validating persistent poverty data.") diff --git a/data/data-pipeline/data_pipeline/ipython/ACS Validate.ipynb b/data/data-pipeline/data_pipeline/ipython/ACS Validate.ipynb index ac5baca4..cb89a067 100644 --- a/data/data-pipeline/data_pipeline/ipython/ACS Validate.ipynb +++ b/data/data-pipeline/data_pipeline/ipython/ACS Validate.ipynb @@ -36,12 +36,8 @@ "DATA_PATH = Path.cwd().parent / \"data\"\n", "TMP_PATH: Path = DATA_PATH / \"tmp\"\n", "ACS_YEAR = \"2019\"\n", - "OUTPUT_PATH = (\n", - " DATA_PATH / \"dataset\" / f\"census_acs_{ACS_YEAR}\"\n", - " )\n", - "CENSUS_USA_CSV = (\n", - " DATA_PATH / \"census\" / \"csv\" / \"us.csv\"\n", - " )" + "OUTPUT_PATH = DATA_PATH / \"dataset\" / f\"census_acs_{ACS_YEAR}\"\n", + "CENSUS_USA_CSV = DATA_PATH / \"census\" / \"csv\" / \"us.csv\"" ] }, { @@ -52,12 +48,12 @@ "outputs": [], "source": [ "cbg_usa_df = pd.read_csv(\n", - " CENSUS_USA_CSV,\n", - " names=['GEOID10'],\n", - " dtype={\"GEOID10\": \"string\"},\n", - " low_memory=False,\n", - " header=None\n", - " )" + " CENSUS_USA_CSV,\n", + " names=[\"GEOID10\"],\n", + " dtype={\"GEOID10\": \"string\"},\n", + " low_memory=False,\n", + " header=None,\n", + ")" ] }, { @@ -163,10 +159,10 @@ "outputs": [], "source": [ "acs_df = pd.read_csv(\n", - " OUTPUT_PATH / \"usa.csv\",\n", - " dtype={\"GEOID10\": \"string\"},\n", - " low_memory=False,\n", - " )" + " OUTPUT_PATH / \"usa.csv\",\n", + " dtype={\"GEOID10\": \"string\"},\n", + " low_memory=False,\n", + ")" ] }, { @@ -292,9 +288,7 @@ "metadata": {}, "outputs": [], "source": [ - "merged_df = cbg_usa_df.merge(\n", - " acs_df, on=\"GEOID10\", how=\"left\"\n", - " )" + "merged_df = cbg_usa_df.merge(acs_df, on=\"GEOID10\", how=\"left\")" ] }, { diff --git a/data/data-pipeline/data_pipeline/ipython/EJScreen Validate.ipynb b/data/data-pipeline/data_pipeline/ipython/EJScreen Validate.ipynb index 4c2826d0..0546e39c 100644 --- a/data/data-pipeline/data_pipeline/ipython/EJScreen Validate.ipynb +++ b/data/data-pipeline/data_pipeline/ipython/EJScreen Validate.ipynb @@ -35,12 +35,8 @@ "source": [ "DATA_PATH = Path.cwd().parent / \"data\"\n", "TMP_PATH: Path = DATA_PATH / \"tmp\"\n", - "OUTPUT_PATH = (\n", - " DATA_PATH / \"dataset\" / \"ejscreen_2019\"\n", - " )\n", - "CENSUS_USA_CSV = (\n", - " DATA_PATH / \"census\" / \"csv\" / \"us.csv\"\n", - " )" + "OUTPUT_PATH = DATA_PATH / \"dataset\" / \"ejscreen_2019\"\n", + "CENSUS_USA_CSV = DATA_PATH / \"census\" / \"csv\" / \"us.csv\"" ] }, { @@ -51,12 +47,12 @@ "outputs": [], "source": [ "cbg_usa_df = pd.read_csv(\n", - " CENSUS_USA_CSV,\n", - " names=['GEOID10'],\n", - " dtype={\"GEOID10\": \"string\"},\n", - " low_memory=False,\n", - " header=None\n", - " )" + " CENSUS_USA_CSV,\n", + " names=[\"GEOID10\"],\n", + " dtype={\"GEOID10\": \"string\"},\n", +
" low_memory=False,\n", + " header=None,\n", + ")" ] }, { @@ -162,10 +158,10 @@ "outputs": [], "source": [ "ejscreen_df = pd.read_csv(\n", - " OUTPUT_PATH / \"usa.csv\",\n", - " dtype={\"ID\": \"string\"},\n", - " low_memory=False,\n", - " )" + " OUTPUT_PATH / \"usa.csv\",\n", + " dtype={\"ID\": \"string\"},\n", + " low_memory=False,\n", + ")" ] }, { @@ -176,9 +172,9 @@ "outputs": [], "source": [ "ejscreen_df.rename(\n", - " columns={\"ID\": \"GEOID10\"},\n", - " inplace=True,\n", - " )" + " columns={\"ID\": \"GEOID10\"},\n", + " inplace=True,\n", + ")" ] }, { @@ -458,9 +454,7 @@ "metadata": {}, "outputs": [], "source": [ - "merged_df = cbg_usa_df.merge(\n", - " ejscreen_df, on=\"GEOID10\", how=\"left\"\n", - " )" + "merged_df = cbg_usa_df.merge(ejscreen_df, on=\"GEOID10\", how=\"left\")" ] }, { @@ -1092,9 +1086,7 @@ "id": "d1a7b71d", "metadata": {}, "outputs": [], - "source": [ - "\n" - ] + "source": [] } ], "metadata": { diff --git a/data/data-pipeline/data_pipeline/ipython/Score Validate.ipynb b/data/data-pipeline/data_pipeline/ipython/Score Validate.ipynb index aa65eafe..5520c4ad 100644 --- a/data/data-pipeline/data_pipeline/ipython/Score Validate.ipynb +++ b/data/data-pipeline/data_pipeline/ipython/Score Validate.ipynb @@ -35,12 +35,8 @@ "source": [ "DATA_PATH = Path.cwd().parent / \"data\"\n", "TMP_PATH: Path = DATA_PATH / \"tmp\"\n", - "OUTPUT_PATH = (\n", - " DATA_PATH / \"score\" / \"csv\" / \"tiles\"\n", - " )\n", - "CENSUS_USA_CSV = (\n", - " DATA_PATH / \"census\" / \"csv\" / \"us.csv\"\n", - " )" + "OUTPUT_PATH = DATA_PATH / \"score\" / \"csv\" / \"tiles\"\n", + "CENSUS_USA_CSV = DATA_PATH / \"census\" / \"csv\" / \"us.csv\"" ] }, { @@ -51,12 +47,12 @@ "outputs": [], "source": [ "cbg_usa_df = pd.read_csv(\n", - " CENSUS_USA_CSV,\n", - " names=['GEOID10'],\n", - " dtype={\"GEOID10\": \"string\"},\n", - " low_memory=False,\n", - " header=None\n", - " )" + " CENSUS_USA_CSV,\n", + " names=[\"GEOID10\"],\n", + " dtype={\"GEOID10\": \"string\"},\n", + " low_memory=False,\n", + " header=None,\n", + ")" ] }, { @@ -162,10 +158,10 @@ "outputs": [], "source": [ "score_df = pd.read_csv(\n", - " OUTPUT_PATH / \"usa.csv\",\n", - " dtype={\"GEOID10\": \"string\"},\n", - " low_memory=False,\n", - " )" + " OUTPUT_PATH / \"usa.csv\",\n", + " dtype={\"GEOID10\": \"string\"},\n", + " low_memory=False,\n", + ")" ] }, { @@ -381,9 +377,7 @@ "metadata": {}, "outputs": [], "source": [ - "merged_df = cbg_usa_df.merge(\n", - " score_df, on=\"GEOID10\", how=\"left\"\n", - " )" + "merged_df = cbg_usa_df.merge(score_df, on=\"GEOID10\", how=\"left\")" ] }, { diff --git a/data/data-pipeline/data_pipeline/ipython/Score_Dissolve_Script.ipynb b/data/data-pipeline/data_pipeline/ipython/Score_Dissolve_Script.ipynb index 8e57da6d..52ac54f3 100644 --- a/data/data-pipeline/data_pipeline/ipython/Score_Dissolve_Script.ipynb +++ b/data/data-pipeline/data_pipeline/ipython/Score_Dissolve_Script.ipynb @@ -33,7 +33,9 @@ "source": [ "def merge_and_simplify_file(file_name: str, usa_df: pd.DataFrame):\n", " state_gdf = gpd.read_file(file_name)\n", - " state_repr = state_gdf.to_crs(\"+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs\")\n", + " state_repr = state_gdf.to_crs(\n", + " \"+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs\"\n", + " )\n", " state_merged = state_repr.merge(usa_df, on=\"GEOID10\", how=\"left\")\n", " state_merged_simplified = state_merged[\n", " [\"GEOID10\", \"Score D (percentile)\", \"geometry\"]\n", @@ -67,9 +69,9 @@ "\n", "def aggregate_buckets(state_tracts: pd.DataFrame, agg_func: str):\n", " # 
dissolve tracts by bucket\n", - " state_attr = state_tracts[[\"D_SCORE\", \"D_SCORE_bucket\", \"geometry\"]].reset_index(\n", - " drop=True\n", - " )\n", + " state_attr = state_tracts[\n", + " [\"D_SCORE\", \"D_SCORE_bucket\", \"geometry\"]\n", + " ].reset_index(drop=True)\n", " state_dissolve = state_attr.dissolve(by=\"D_SCORE_bucket\", aggfunc=agg_func)\n", " return state_dissolve\n", "\n", @@ -91,10 +93,12 @@ " gdf_compressed = gpd.GeoDataFrame(\n", " compressed, columns=[\"D_SCORE\", \"geometry\"], crs=\"EPSG:4326\"\n", " )\n", - " gdf_compressed.to_file(CENSUS_GEOJSON_DIR / f\"{file_name}_low.geojson\", driver=\"GeoJSON\")\n", + " gdf_compressed.to_file(\n", + " CENSUS_GEOJSON_DIR / f\"{file_name}_low.geojson\", driver=\"GeoJSON\"\n", + " )\n", "\n", "\n", - "def process_file(file_name: str, usa_df: pd.DataFrame, num_buckets:int):\n", + "def process_file(file_name: str, usa_df: pd.DataFrame, num_buckets: int):\n", " print(f\"Processing file {file_name}...\")\n", " state_merged_simplified = merge_and_simplify_file(file_name, usa_df)\n", " state_tracts = aggregate_to_tracts(state_merged_simplified)\n", @@ -115,7 +119,9 @@ "DATA_DIR = pathlib.Path.cwd().parent / \"data\"\n", "CENSUS_GEOJSON_DIR = DATA_DIR / \"census\" / \"geojson\"\n", "CEJST_DATA_PATH = DATA_DIR / \"score\" / \"csv\" / \"tiles\" / \"usa.csv\"\n", - "score_df = pd.read_csv(CEJST_DATA_PATH, dtype={\"GEOID10\": \"object\"}, low_memory=False)" + "score_df = pd.read_csv(\n", + " CEJST_DATA_PATH, dtype={\"GEOID10\": \"object\"}, low_memory=False\n", + ")" ] }, { @@ -185,9 +191,9 @@ }, "outputs": [], "source": [ - "for file_name in CENSUS_GEOJSON_DIR.rglob('*.json'):\n", - " state_gdf = gpd.read_file(file_name)\n", - " master_df = master_df.append(state_gdf)" + "for file_name in CENSUS_GEOJSON_DIR.rglob(\"*.json\"):\n", + " state_gdf = gpd.read_file(file_name)\n", + " master_df = master_df.append(state_gdf)" ] }, { @@ -672,7 +678,9 @@ }, "outputs": [], "source": [ - "usa_merged_compressed.to_file(CENSUS_GEOJSON_DIR / \"usa_merged.geojson\", driver=\"GeoJSON\")" + "usa_merged_compressed.to_file(\n", + " CENSUS_GEOJSON_DIR / \"usa_merged.geojson\", driver=\"GeoJSON\"\n", + ")" ] }, { @@ -684,8 +692,8 @@ "outputs": [], "source": [ "usa_simplified = usa_merged[\n", - " [\"GEOID10\", \"Score D (percentile)\", \"geometry\"]\n", - " ].reset_index(drop=True)" + " [\"GEOID10\", \"Score D (percentile)\", \"geometry\"]\n", + "].reset_index(drop=True)" ] }, { @@ -696,9 +704,7 @@ }, "outputs": [], "source": [ - "usa_simplified.rename(\n", - " columns={\"Score D (percentile)\": \"D_SCORE\"}, inplace=True\n", - " )" + "usa_simplified.rename(columns={\"Score D (percentile)\": \"D_SCORE\"}, inplace=True)" ] }, { @@ -714,8 +720,8 @@ "outputs": [], "source": [ "usa_cbg_compressed = gpd.GeoDataFrame(\n", - " usa_simplified, columns=[\"D_SCORE\", \"geometry\"], crs=\"EPSG:4326\"\n", - " )" + " usa_simplified, columns=[\"D_SCORE\", \"geometry\"], crs=\"EPSG:4326\"\n", + ")" ] }, { @@ -726,7 +732,9 @@ }, "outputs": [], "source": [ - "usa_cbg_compressed.to_file(CENSUS_GEOJSON_DIR / \"usa_cbg_scoreD.geojson\", driver=\"GeoJSON\")" + "usa_cbg_compressed.to_file(\n", + " CENSUS_GEOJSON_DIR / \"usa_cbg_scoreD.geojson\", driver=\"GeoJSON\"\n", + ")" ] }, { @@ -764,8 +772,8 @@ "outputs": [], "source": [ "tracts_compressed = gpd.GeoDataFrame(\n", - " usa_tracts, columns=[\"D_SCORE\", \"geometry\"], crs=\"EPSG:4326\"\n", - " )" + " usa_tracts, columns=[\"D_SCORE\", \"geometry\"], crs=\"EPSG:4326\"\n", + ")" ] }, { @@ -776,7 +784,9 @@ }, "outputs": [], 
"source": [ - "tracts_compressed.to_file(CENSUS_GEOJSON_DIR / \"usa_tracts_score.geojson\", driver=\"GeoJSON\")" + "tracts_compressed.to_file(\n", + " CENSUS_GEOJSON_DIR / \"usa_tracts_score.geojson\", driver=\"GeoJSON\"\n", + ")" ] }, { @@ -877,8 +887,8 @@ "outputs": [], "source": [ "gdf_compressed = gpd.GeoDataFrame(\n", - " compressed, columns=[\"D_SCORE\", \"geometry\"], crs=\"EPSG:4326\"\n", - " )" + " compressed, columns=[\"D_SCORE\", \"geometry\"], crs=\"EPSG:4326\"\n", + ")" ] }, { @@ -917,7 +927,9 @@ }, "outputs": [], "source": [ - "gdf_compressed.to_file(CENSUS_GEOJSON_DIR / f\"usa_low.geojson\", driver=\"GeoJSON\")" + "gdf_compressed.to_file(\n", + " CENSUS_GEOJSON_DIR / f\"usa_low.geojson\", driver=\"GeoJSON\"\n", + ")" ] } ], diff --git a/data/data-pipeline/data_pipeline/ipython/census_explore.ipynb b/data/data-pipeline/data_pipeline/ipython/census_explore.ipynb index abb5dea7..731b7371 100644 --- a/data/data-pipeline/data_pipeline/ipython/census_explore.ipynb +++ b/data/data-pipeline/data_pipeline/ipython/census_explore.ipynb @@ -39,7 +39,9 @@ "source": [ "# Following the tutorial at https://jtleider.github.io/censusdata/example1.html.\n", "# Full list of fields is at https://www2.census.gov/programs-surveys/acs/summary_file/2019/documentation/user_tools/ACS2019_Table_Shells.xlsx\n", - "censusdata.printtable(censusdata.censustable(src=\"acs5\", year=ACS_YEAR, table=\"B19013\"))" + "censusdata.printtable(\n", + " censusdata.censustable(src=\"acs5\", year=ACS_YEAR, table=\"B19013\")\n", + ")" ], "outputs": [], "metadata": { @@ -65,8 +67,8 @@ " year=ACS_YEAR,\n", " geo=censusdata.censusgeo(\n", " [\n", - " (\"state\", fips) \n", - " #, (\"county\", \"*\"), (\"block group\", \"*\")\n", + " (\"state\", fips)\n", + " # , (\"county\", \"*\"), (\"block group\", \"*\")\n", " ]\n", " ),\n", " var=[\"B23025_005E\", \"B23025_003E\", \"B19013_001E\"],\n", @@ -75,7 +77,9 @@ "\n", "df = pd.concat(dfs)\n", "\n", - "df[GEOID_FIELD_NAME] = df.index.to_series().apply(func=fips_from_censusdata_censusgeo)\n", + "df[GEOID_FIELD_NAME] = df.index.to_series().apply(\n", + " func=fips_from_censusdata_censusgeo\n", + ")\n", "\n", "df.head()" ], @@ -90,7 +94,13 @@ "source": [ "columns_to_include = [\"GEOID2\", \"Median household income (State)\"]\n", "\n", - "df.rename(columns={\"GEOID10\": \"GEOID2\", \"B19013_001E\": \"Median household income (State)\"}, inplace=True)\n", + "df.rename(\n", + " columns={\n", + " \"GEOID10\": \"GEOID2\",\n", + " \"B19013_001E\": \"Median household income (State)\",\n", + " },\n", + " inplace=True,\n", + ")\n", "\n", "# df[columns_to_include].to_csv(path_or_buf= \"/Users/lucas/Documents/usds/repos/justice40-tool/data/data-pipeline/data_pipeline/data/needs_to_be_moved_to_s3/2014_to_2019_state_median_income.csv\", index=False)" ], diff --git a/data/data-pipeline/data_pipeline/ipython/county_lookup.ipynb b/data/data-pipeline/data_pipeline/ipython/county_lookup.ipynb index 5cc1ab1a..6140f318 100644 --- a/data/data-pipeline/data_pipeline/ipython/county_lookup.ipynb +++ b/data/data-pipeline/data_pipeline/ipython/county_lookup.ipynb @@ -20,7 +20,7 @@ "module_path = os.path.abspath(os.path.join(\"..\"))\n", "if module_path not in sys.path:\n", " sys.path.append(module_path)\n", - " \n", + "\n", "from data_pipeline.utils import unzip_file_from_url\n", "from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes" ], @@ -57,9 +57,16 @@ "cell_type": "code", "execution_count": null, "source": [ - "counties_df = pd.read_csv(CENSUS_COUNTIES_TXT, sep=\"\\t\", 
dtype={\"GEOID\": \"string\", \"USPS\": \"string\"}, low_memory=False)\n", - "counties_df = counties_df[['USPS', 'GEOID', 'NAME']]\n", - "counties_df.rename(columns={\"USPS\": \"State Abbreviation\", \"NAME\": \"County Name\"}, inplace=True)\n", + "counties_df = pd.read_csv(\n", + " CENSUS_COUNTIES_TXT,\n", + " sep=\"\\t\",\n", + " dtype={\"GEOID\": \"string\", \"USPS\": \"string\"},\n", + " low_memory=False,\n", + ")\n", + "counties_df = counties_df[[\"USPS\", \"GEOID\", \"NAME\"]]\n", + "counties_df.rename(\n", + " columns={\"USPS\": \"State Abbreviation\", \"NAME\": \"County Name\"}, inplace=True\n", + ")\n", "counties_df.head()" ], "outputs": [], @@ -69,8 +76,17 @@ "cell_type": "code", "execution_count": null, "source": [ - "states_df = pd.read_csv(STATE_CSV, dtype={\"fips\": \"string\", \"state_abbreviation\": \"string\"})\n", - "states_df.rename(columns={\"fips\": \"State Code\", \"state_name\": \"State Name\", \"state_abbreviation\": \"State Abbreviation\"}, inplace=True)\n", + "states_df = pd.read_csv(\n", + " STATE_CSV, dtype={\"fips\": \"string\", \"state_abbreviation\": \"string\"}\n", + ")\n", + "states_df.rename(\n", + " columns={\n", + " \"fips\": \"State Code\",\n", + " \"state_name\": \"State Name\",\n", + " \"state_abbreviation\": \"State Abbreviation\",\n", + " },\n", + " inplace=True,\n", + ")\n", "states_df.head()" ], "outputs": [], @@ -80,7 +96,7 @@ "cell_type": "code", "execution_count": null, "source": [ - "county_state_merged = counties_df.join(states_df, rsuffix=' Other')\n", + "county_state_merged = counties_df.join(states_df, rsuffix=\" Other\")\n", "del county_state_merged[\"State Abbreviation Other\"]\n", "county_state_merged.head()" ], @@ -102,7 +118,7 @@ "cell_type": "code", "execution_count": null, "source": [ - "score_county_state_merged = score_df.join(county_state_merged, rsuffix='_OTHER')\n", + "score_county_state_merged = score_df.join(county_state_merged, rsuffix=\"_OTHER\")\n", "del score_county_state_merged[\"GEOID_OTHER\"]\n", "score_county_state_merged.head()" ], diff --git a/data/data-pipeline/data_pipeline/ipython/score_explore.ipynb b/data/data-pipeline/data_pipeline/ipython/score_explore.ipynb index eb5b55bf..204dd05c 100644 --- a/data/data-pipeline/data_pipeline/ipython/score_explore.ipynb +++ b/data/data-pipeline/data_pipeline/ipython/score_explore.ipynb @@ -35,7 +35,7 @@ "from data_pipeline.etl.sources.census.etl_utils import get_state_information\n", "\n", "# Turn on TQDM for pandas so that we can have progress bars when running `apply`.\n", - "tqdm_notebook.pandas()\n" + "tqdm_notebook.pandas()" ] }, { @@ -89,14 +89,12 @@ " \"Poverty (Less than 200% of federal poverty line)\",\n", " \"Percent individuals age 25 or over with less than high school degree\",\n", " \"Unemployed civilians (percent)\",\n", - " \"Linguistic isolation (percent)\"\n", + " \"Linguistic isolation (percent)\",\n", "]\n", "\n", "column_to_plot = columns_to_plot[0]\n", "print(f\"Plotting {column_to_plot}\")\n", - "print(cejst_df[\n", - " column_to_plot\n", - "].hist())" + "print(cejst_df[column_to_plot].hist())" ] }, { diff --git a/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb b/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb index 0a50968a..6e76e46b 100644 --- a/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb +++ b/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb @@ -152,15 +152,17 @@ "CALENVIROSCREEN_PERCENTILE_FIELD = \"calenviroscreen_percentile\"\n", "CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD = 
\"calenviroscreen_priority_community\"\n", "\n", - "calenviroscreen_data_path = DATA_DIR / \"dataset\" / \"calenviroscreen4\" / \"data06.csv\"\n", + "calenviroscreen_data_path = (\n", + " DATA_DIR / \"dataset\" / \"calenviroscreen4\" / \"data06.csv\"\n", + ")\n", "calenviroscreen_df = pd.read_csv(\n", " calenviroscreen_data_path, dtype={GEOID_TRACT_FIELD_NAME: \"string\"}\n", ")\n", "\n", "# Convert priority community field to a bool.\n", - "calenviroscreen_df[CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD] = calenviroscreen_df[\n", + "calenviroscreen_df[\n", " CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD\n", - "].astype(bool)\n", + "] = calenviroscreen_df[CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD].astype(bool)\n", "\n", "calenviroscreen_df.head()" ] @@ -168,19 +170,33 @@ { "cell_type": "code", "execution_count": null, - "id": "1bf54af1", - "metadata": { - "scrolled": true - }, + "id": "df458f08", + "metadata": {}, "outputs": [], "source": [ - "# Load HUD data\n", - "hud_recap_data_path = DATA_DIR / \"dataset\" / \"hud_recap\" / \"usa.csv\"\n", - "hud_recap_df = pd.read_csv(\n", - " hud_recap_data_path, dtype={GEOID_TRACT_FIELD_NAME: \"string\"}\n", + "# Load persistent poverty data\n", + "persistent_poverty_path = (\n", + " DATA_DIR / \"dataset\" / \"persistent_poverty\" / \"usa.csv\"\n", + ")\n", + "persistent_poverty_df = pd.read_csv(\n", + " persistent_poverty_path, dtype={GEOID_TRACT_FIELD_NAME: \"string\"}\n", ")\n", "\n", - "hud_recap_df.head()" + "# Since \"Persistent Poverty Census Tract\" is labeled in both the score file (at the CBG level) and this tract file,\n", + "# rename this field so it's easy to access the tract-level scores directly.\n", + "\n", + "PERSISTENT_POVERTY_TRACT_LEVEL_FIELD = \"Persistent Poverty, Tract Level\"\n", + "PERSISTENT_POVERTY_CBG_LEVEL_FIELD = \"Persistent Poverty Census Tract\"\n", + "\n", + "persistent_poverty_df.rename(\n", + " columns={\n", + " PERSISTENT_POVERTY_CBG_LEVEL_FIELD: PERSISTENT_POVERTY_TRACT_LEVEL_FIELD\n", + " },\n", + " inplace=True,\n", + " errors=\"raise\",\n", + ")\n", + "\n", + "persistent_poverty_df" ] }, { @@ -193,7 +209,7 @@ "outputs": [], "source": [ "# Join all dataframes that use tracts\n", - "census_tract_dfs = [calenviroscreen_df, hud_recap_df]\n", + "census_tract_dfs = [calenviroscreen_df, persistent_poverty_df]\n", "\n", "census_tract_df = functools.reduce(\n", " lambda left, right: pd.merge(\n", @@ -231,7 +247,6 @@ " on=GEOID_TRACT_FIELD_NAME,\n", ")\n", "\n", - "\n", "if len(merged_df) > 220405:\n", " raise ValueError(f\"Too many rows in the join: {len(merged_df)}\")\n", "\n", @@ -317,6 +332,11 @@ " other_census_tract_fields_to_keep=[],\n", " ),\n", " Index(\n", + " method_name=\"Persistent Poverty (CBG)\",\n", + " priority_communities_field=PERSISTENT_POVERTY_CBG_LEVEL_FIELD,\n", + " other_census_tract_fields_to_keep=[],\n", + " ),\n", + " Index(\n", " method_name=\"EJSCREEN Areas of Concern, National, 80th percentile (communities)\",\n", " priority_communities_field=\"EJSCREEN Areas of Concern, National, 80th percentile (communities)\",\n", " other_census_tract_fields_to_keep=[],\n", @@ -325,6 +345,11 @@ "\n", "census_tract_indices = [\n", " Index(\n", + " method_name=\"Persistent Poverty\",\n", + " priority_communities_field=PERSISTENT_POVERTY_TRACT_LEVEL_FIELD,\n", + " other_census_tract_fields_to_keep=[],\n", + " ),\n", + " Index(\n", " method_name=\"CalEnviroScreen 4.0\",\n", " priority_communities_field=\"calenviroscreen_priority_community\",\n", " other_census_tract_fields_to_keep=[\n", @@ -332,11 +357,6 @@ " 
CALENVIROSCREEN_PERCENTILE_FIELD,\n", " ],\n", " ),\n", - " Index(\n", - " method_name=\"HUD RECAP\",\n", - " priority_communities_field=\"hud_recap_priority_community\",\n", - " other_census_tract_fields_to_keep=[],\n", - " ),\n", "]" ] }, @@ -361,7 +381,8 @@ "\n", " # Calculate the population included as priority communities per CBG. Will either be 0 or the population.\n", " df[f\"{priority_communities_field}{POPULATION_SUFFIX}\"] = (\n", - " df[priority_communities_field] * df[CENSUS_BLOCK_GROUP_POPULATION_FIELD]\n", + " df[priority_communities_field]\n", + " * df[CENSUS_BLOCK_GROUP_POPULATION_FIELD]\n", " )\n", "\n", " def calculate_state_comparison(\n", @@ -400,7 +421,9 @@ " summary_dict[\"Geography name\"] = division_id\n", "\n", " total_cbgs_in_geography = len(frame)\n", - " total_population_in_geography = frame[CENSUS_BLOCK_GROUP_POPULATION_FIELD].sum()\n", + " total_population_in_geography = frame[\n", + " CENSUS_BLOCK_GROUP_POPULATION_FIELD\n", + " ].sum()\n", "\n", " if geography_field == URBAN_HEURISTIC_FIELD:\n", " urban_flag = frame[URBAN_HEURISTIC_FIELD].unique()[0]\n", @@ -408,9 +431,9 @@ " summary_dict[\"Geography name\"] = summary_dict[\"Urban vs Rural\"]\n", "\n", " for priority_communities_field in priority_communities_fields:\n", - " summary_dict[f\"{priority_communities_field}{POPULATION_SUFFIX}\"] = frame[\n", + " summary_dict[\n", " f\"{priority_communities_field}{POPULATION_SUFFIX}\"\n", - " ].sum()\n", + " ] = frame[f\"{priority_communities_field}{POPULATION_SUFFIX}\"].sum()\n", "\n", " summary_dict[f\"{priority_communities_field} (total CBGs)\"] = frame[\n", " f\"{priority_communities_field}\"\n", @@ -422,7 +445,9 @@ " / total_cbgs_in_geography\n", " )\n", "\n", - " summary_dict[f\"{priority_communities_field} (percent population)\"] = (\n", + " summary_dict[\n", + " f\"{priority_communities_field} (percent population)\"\n", + " ] = (\n", " summary_dict[f\"{priority_communities_field}{POPULATION_SUFFIX}\"]\n", " / total_population_in_geography\n", " )\n", @@ -468,7 +493,9 @@ "\n", " # Run the comparison function on the groups.\n", " region_distribution_df = region_grouped_df.progress_apply(\n", - " lambda frame: calculate_state_comparison(frame, geography_field=\"region\")\n", + " lambda frame: calculate_state_comparison(\n", + " frame, geography_field=\"region\"\n", + " )\n", " )\n", "\n", " # Next, run the comparison by division\n", @@ -476,7 +503,9 @@ "\n", " # Run the comparison function on the groups.\n", " division_distribution_df = division_grouped_df.progress_apply(\n", - " lambda frame: calculate_state_comparison(frame, geography_field=\"division\")\n", + " lambda frame: calculate_state_comparison(\n", + " frame, geography_field=\"division\"\n", + " )\n", " )\n", "\n", " # Next, run the comparison by urban/rural\n", @@ -531,7 +560,9 @@ " column_character = get_excel_column_name(column_index)\n", "\n", " # Set all columns to larger width\n", - " worksheet.set_column(f\"{column_character}:{column_character}\", column_width)\n", + " worksheet.set_column(\n", + " f\"{column_character}:{column_character}\", column_width\n", + " )\n", "\n", " # Special formatting for all percent columns\n", " # Note: we can't just search for `percent`, because that's included in the word `percentile`.\n", @@ -546,9 +577,7 @@ "\n", " # Special formatting for columns that capture the percent of population considered priority.\n", " if \"(percent population)\" in column:\n", - " column_ranges = (\n", - " 
f\"{column_character}2:{column_character}{len(state_distribution_df)+1}\"\n", - " )\n", + " column_ranges = f\"{column_character}2:{column_character}{len(state_distribution_df)+1}\"\n", "\n", " # Add green to red conditional formatting.\n", " worksheet.conditional_format(\n", @@ -661,7 +690,9 @@ "\n", " # Put criteria description column first.\n", " new_column_order = [criteria_description_field_name] + [\n", - " col for col in comparison_df.columns if col != criteria_description_field_name\n", + " col\n", + " for col in comparison_df.columns\n", + " if col != criteria_description_field_name\n", " ]\n", "\n", " comparison_df = comparison_df[new_column_order]\n", @@ -707,12 +738,12 @@ " column_character = get_excel_column_name(column_index)\n", "\n", " # Set all columns to larger width\n", - " worksheet.set_column(f\"{column_character}:{column_character}\", column_width)\n", + " worksheet.set_column(\n", + " f\"{column_character}:{column_character}\", column_width\n", + " )\n", "\n", " # Add green to red conditional formatting.\n", - " column_ranges = (\n", - " f\"{column_character}2:{column_character}{len(cbg_score_comparison_df)+1}\"\n", - " )\n", + " column_ranges = f\"{column_character}2:{column_character}{len(cbg_score_comparison_df)+1}\"\n", " worksheet.conditional_format(\n", " column_ranges,\n", " # Min: green, max: red.\n", @@ -725,7 +756,11 @@ "\n", " # Special formatting for all percent columns\n", " # Note: we can't just search for `percent`, because that's included in the word `percentile`.\n", - " if \"percent \" in column or \"(percent)\" in column or \"Percent \" in column:\n", + " if (\n", + " \"percent \" in column\n", + " or \"(percent)\" in column\n", + " or \"Percent \" in column\n", + " ):\n", " # Make these columns percentages.\n", " percentage_format = workbook.add_format({\"num_format\": \"0%\"})\n", " worksheet.set_column(\n", @@ -763,9 +798,7 @@ " )\n", "\n", " # Write secondary comparison to CSV.\n", - " file_name_part = (\n", - " f\"CBG Comparison Output - {index_a.method_name} and {index_b.method_name}\"\n", - " )\n", + " file_name_part = f\"CBG Comparison Output - {index_a.method_name} and {index_b.method_name}\"\n", " output_dir.mkdir(parents=True, exist_ok=True)\n", " file_path = output_dir / (file_name_part + \".csv\")\n", " file_path_xlsx = output_dir / (file_name_part + \".xlsx\")\n", @@ -777,7 +810,8 @@ " )\n", "\n", " write_cbg_score_comparison_excel(\n", - " cbg_score_comparison_df=cbg_score_comparison_df, file_path=file_path_xlsx\n", + " cbg_score_comparison_df=cbg_score_comparison_df,\n", + " file_path=file_path_xlsx,\n", " )\n", "\n", "\n", @@ -808,11 +842,15 @@ "cell_type": "code", "execution_count": null, "id": "eeb9699d", - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "def write_markdown_and_docx_content(\n", - " markdown_content: str, file_dir: pathlib.PosixPath, file_name_without_extension: str\n", + " markdown_content: str,\n", + " file_dir: pathlib.PosixPath,\n", + " file_name_without_extension: str,\n", ") -> pathlib.PosixPath:\n", " \"\"\"Write Markdown content to both .md and .docx files.\"\"\"\n", " # Set the file paths for both files.\n", @@ -844,7 +882,9 @@ "\n", " # List of all states/territories in their FIPS codes:\n", " state_ids = sorted(df[state_field].unique())\n", - " state_names = \", \".join([us.states.lookup(state_id).name for state_id in state_ids])\n", + " state_names = \", \".join(\n", + " [us.states.lookup(state_id).name for state_id in state_ids]\n", + " )\n", "\n", " # Create 
markdown content for comparisons.\n", " markdown_content = f\"\"\"\n", @@ -858,11 +898,16 @@ "\n", "\"\"\"\n", "\n", - " for (index1, index2) in itertools.combinations(census_block_group_indices, 2):\n", + " for (index1, index2) in itertools.combinations(\n", + " census_block_group_indices, 2\n", + " ):\n", " # Group all data by their different values on Priority Communities Field for Index1 vs Priority Communities Field for Index2.\n", " count_df = (\n", " df.groupby(\n", - " [index1.priority_communities_field, index2.priority_communities_field]\n", + " [\n", + " index1.priority_communities_field,\n", + " index2.priority_communities_field,\n", + " ]\n", " )[GEOID_FIELD_NAME]\n", " .count()\n", " .reset_index(name=count_field_name)\n", @@ -894,16 +939,24 @@ "\n", " # Convert from series to a scalar value, including accounting for if no data exists for that pairing.\n", " true_true_cbgs = (\n", - " true_true_cbgs_series.iloc[0] if len(true_true_cbgs_series) > 0 else 0\n", + " true_true_cbgs_series.iloc[0]\n", + " if len(true_true_cbgs_series) > 0\n", + " else 0\n", " )\n", " true_false_cbgs = (\n", - " true_false_cbgs_series.iloc[0] if len(true_false_cbgs_series) > 0 else 0\n", + " true_false_cbgs_series.iloc[0]\n", + " if len(true_false_cbgs_series) > 0\n", + " else 0\n", " )\n", " false_true_cbgs = (\n", - " false_true_cbgs_series.iloc[0] if len(false_true_cbgs_series) > 0 else 0\n", + " false_true_cbgs_series.iloc[0]\n", + " if len(false_true_cbgs_series) > 0\n", + " else 0\n", " )\n", " false_false_cbgs = (\n", - " false_false_cbgs_series.iloc[0] if len(false_false_cbgs_series) > 0 else 0\n", + " false_false_cbgs_series.iloc[0]\n", + " if len(false_false_cbgs_series) > 0\n", + " else 0\n", " )\n", "\n", " markdown_content += (\n", @@ -1095,15 +1148,20 @@ "\n", " # Calculate comparison\n", " # A comparison priority tract has at least one CBG that is a priority CBG.\n", - " df[comparison_field_names.method_b_tract_has_at_least_one_method_a_cbg] = (\n", + " df[\n", + " comparison_field_names.method_b_tract_has_at_least_one_method_a_cbg\n", + " ] = (\n", " frame.loc[:, method_a_priority_census_block_groups_field].sum() > 0\n", " if is_a_method_b_priority_tract\n", " else None\n", " )\n", "\n", " # A comparison priority tract has all of its contained CBGs as CBG priority CBGs.\n", - " df[comparison_field_names.method_b_tract_has_100_percent_method_a_cbg] = (\n", - " frame.loc[:, method_a_priority_census_block_groups_field].mean() == 1\n", + " df[\n", + " comparison_field_names.method_b_tract_has_100_percent_method_a_cbg\n", + " ] = (\n", + " frame.loc[:, method_a_priority_census_block_groups_field].mean()\n", + " == 1\n", " if is_a_method_b_priority_tract\n", " else None\n", " )\n", @@ -1122,7 +1180,8 @@ " df[\n", " comparison_field_names.method_b_non_priority_tract_has_100_percent_method_a_cbg\n", " ] = (\n", - " frame.loc[:, method_a_priority_census_block_groups_field].mean() == 1\n", + " frame.loc[:, method_a_priority_census_block_groups_field].mean()\n", + " == 1\n", " if not is_a_method_b_priority_tract\n", " else None\n", " )\n", @@ -1163,14 +1222,20 @@ "\n", " # List of all states/territories in their FIPS codes:\n", " state_ids = sorted(original_df[state_field].unique())\n", - " state_names = \", \".join([us.states.lookup(state_id).name for state_id in state_ids])\n", + " state_names = \", \".join(\n", + " [us.states.lookup(state_id).name for state_id in state_ids]\n", + " )\n", "\n", " # Note: using squeeze throughout do reduce result of `sum()` to a scalar.\n", " # TODO: 
investigate why sums are sometimes series and sometimes scalar.\n", " method_a_priority_cbgs = (\n", - " original_df.loc[:, method_a_priority_census_block_groups_field].sum().squeeze()\n", + " original_df.loc[:, method_a_priority_census_block_groups_field]\n", + " .sum()\n", + " .squeeze()\n", + " )\n", + " method_a_priority_cbgs_percent = (\n", + " f\"{method_a_priority_cbgs / total_cbgs:.0%}\"\n", " )\n", - " method_a_priority_cbgs_percent = f\"{method_a_priority_cbgs / total_cbgs:.0%}\"\n", "\n", " total_tracts_count = len(comparison_df)\n", "\n", @@ -1192,7 +1257,9 @@ " .sum()\n", " .squeeze()\n", " )\n", - " method_a_tracts_count_percent = f\"{method_a_tracts_count / total_tracts_count:.0%}\"\n", + " method_a_tracts_count_percent = (\n", + " f\"{method_a_tracts_count / total_tracts_count:.0%}\"\n", + " )\n", "\n", " # Method A priority community stats\n", " method_b_tracts_with_at_least_one_method_a_cbg = comparison_df.loc[\n", @@ -1323,7 +1390,8 @@ "\n", " # Write comparison to CSV.\n", " file_path = (\n", - " output_dir / f\"Comparison Output - {method_a_name} and {method_b_name}.csv\"\n", + " output_dir\n", + " / f\"Comparison Output - {method_a_name} and {method_b_name}.csv\"\n", " )\n", " comparison_df.to_csv(\n", " path_or_buf=file_path,\n", diff --git a/data/data-pipeline/data_pipeline/ipython/urban_vs_rural.ipynb b/data/data-pipeline/data_pipeline/ipython/urban_vs_rural.ipynb index 0eda22b5..0785a189 100644 --- a/data/data-pipeline/data_pipeline/ipython/urban_vs_rural.ipynb +++ b/data/data-pipeline/data_pipeline/ipython/urban_vs_rural.ipynb @@ -101,17 +101,25 @@ "outputs": [], "source": [ "geocorr_urban_rural_map = pd.read_csv(\n", - " os.path.join(GEOCORR_DATA_DIR, 'geocorr2014_2125804280.csv'),\n", - " encoding = \"ISO-8859-1\",\n", + " os.path.join(GEOCORR_DATA_DIR, \"geocorr2014_2125804280.csv\"),\n", + " encoding=\"ISO-8859-1\",\n", " skiprows=[1],\n", - " dtype='str',\n", + " dtype=\"str\",\n", ")\n", "\n", - "geocorr_urban_rural_map['pop10'] = pd.to_numeric(geocorr_urban_rural_map['pop10'])\n", - "geocorr_urban_rural_map['afact'] = pd.to_numeric(geocorr_urban_rural_map['afact'])\n", + "geocorr_urban_rural_map[\"pop10\"] = pd.to_numeric(\n", + " geocorr_urban_rural_map[\"pop10\"]\n", + ")\n", + "geocorr_urban_rural_map[\"afact\"] = pd.to_numeric(\n", + " geocorr_urban_rural_map[\"afact\"]\n", + ")\n", "\n", - "geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME] = geocorr_urban_rural_map['county'] + geocorr_urban_rural_map['tract'] # + geocorr_urban_rural_map['bg']\n", - "geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME] = geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME].str.replace('.', '', regex=False)" + "geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME] = (\n", + " geocorr_urban_rural_map[\"county\"] + geocorr_urban_rural_map[\"tract\"]\n", + ") # + geocorr_urban_rural_map['bg']\n", + "geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME] = geocorr_urban_rural_map[\n", + " GEOID_TRACT_FIELD_NAME\n", + "].str.replace(\".\", \"\", regex=False)" ] }, { @@ -139,15 +147,9 @@ "metadata": {}, "outputs": [], "source": [ - "geocorr_urban_rural_map = geocorr_urban_rural_map[[\n", - " GEOID_TRACT_FIELD_NAME,\n", - " 'ur',\n", - " 'ua',\n", - " 'cntyname',\n", - " 'uaname',\n", - " 'pop10',\n", - " 'afact'\n", - "]]" + "geocorr_urban_rural_map = geocorr_urban_rural_map[\n", + " [GEOID_TRACT_FIELD_NAME, \"ur\", \"ua\", \"cntyname\", \"uaname\", \"pop10\", \"afact\"]\n", + "]" ] }, { @@ -165,7 +167,9 @@ "metadata": {}, "outputs": [], "source": [ - 
"geocorr_urban_rural_map.groupby([GEOID_TRACT_FIELD_NAME, 'ur', 'ua'], dropna=False).size().sort_values(ascending=False)" + "geocorr_urban_rural_map.groupby(\n", + " [GEOID_TRACT_FIELD_NAME, \"ur\", \"ua\"], dropna=False\n", + ").size().sort_values(ascending=False)" ] }, { @@ -175,7 +179,9 @@ "metadata": {}, "outputs": [], "source": [ - "geocorr_urban_rural_map.loc[geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME] == '36117020302']" + "geocorr_urban_rural_map.loc[\n", + " geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME] == \"36117020302\"\n", + "]" ] }, { @@ -185,8 +191,12 @@ "metadata": {}, "outputs": [], "source": [ - "total_geo_population = geocorr_urban_rural_map.groupby(GEOID_TRACT_FIELD_NAME).agg({'pop10': np.sum}).reset_index()\n", - "total_geo_population.rename(columns={'pop10': 'total_population'}, inplace=True)\n", + "total_geo_population = (\n", + " geocorr_urban_rural_map.groupby(GEOID_TRACT_FIELD_NAME)\n", + " .agg({\"pop10\": np.sum})\n", + " .reset_index()\n", + ")\n", + "total_geo_population.rename(columns={\"pop10\": \"total_population\"}, inplace=True)\n", "total_geo_population.head()" ] }, @@ -197,8 +207,16 @@ "metadata": {}, "outputs": [], "source": [ - "geocorr_urban_rural_with_total_pop_map = geocorr_urban_rural_map.groupby([GEOID_TRACT_FIELD_NAME, 'ur']).agg({'pop10': np.sum}).reset_index()\n", - "geocorr_urban_rural_with_total_pop_map = geocorr_urban_rural_with_total_pop_map.merge(total_geo_population, how='inner', on=GEOID_TRACT_FIELD_NAME)\n", + "geocorr_urban_rural_with_total_pop_map = (\n", + " geocorr_urban_rural_map.groupby([GEOID_TRACT_FIELD_NAME, \"ur\"])\n", + " .agg({\"pop10\": np.sum})\n", + " .reset_index()\n", + ")\n", + "geocorr_urban_rural_with_total_pop_map = (\n", + " geocorr_urban_rural_with_total_pop_map.merge(\n", + " total_geo_population, how=\"inner\", on=GEOID_TRACT_FIELD_NAME\n", + " )\n", + ")\n", "geocorr_urban_rural_with_total_pop_map.head()" ] }, @@ -209,7 +227,10 @@ "metadata": {}, "outputs": [], "source": [ - "geocorr_urban_rural_with_total_pop_map['afact'] = geocorr_urban_rural_with_total_pop_map['pop10'] / geocorr_urban_rural_with_total_pop_map['total_population']" + "geocorr_urban_rural_with_total_pop_map[\"afact\"] = (\n", + " geocorr_urban_rural_with_total_pop_map[\"pop10\"]\n", + " / geocorr_urban_rural_with_total_pop_map[\"total_population\"]\n", + ")" ] }, { @@ -229,7 +250,10 @@ "metadata": {}, "outputs": [], "source": [ - "geocorr_urban_rural_with_total_pop_map.loc[geocorr_urban_rural_with_total_pop_map[GEOID_TRACT_FIELD_NAME] == '01001020200']" + "geocorr_urban_rural_with_total_pop_map.loc[\n", + " geocorr_urban_rural_with_total_pop_map[GEOID_TRACT_FIELD_NAME]\n", + " == \"01001020200\"\n", + "]" ] }, { @@ -239,12 +263,16 @@ "metadata": {}, "outputs": [], "source": [ - "urban_rural_map = geocorr_urban_rural_with_total_pop_map.pivot(index=GEOID_TRACT_FIELD_NAME, columns='ur', values=['pop10', 'afact'])\n", - "urban_rural_map.columns = ['_'.join(col).strip() for col in urban_rural_map.columns.values]\n", + "urban_rural_map = geocorr_urban_rural_with_total_pop_map.pivot(\n", + " index=GEOID_TRACT_FIELD_NAME, columns=\"ur\", values=[\"pop10\", \"afact\"]\n", + ")\n", + "urban_rural_map.columns = [\n", + " \"_\".join(col).strip() for col in urban_rural_map.columns.values\n", + "]\n", "urban_rural_map.reset_index(inplace=True)\n", - "urban_rural_map['urban_heuristic_flag'] = 0\n", - "mask = urban_rural_map['afact_U'] >= 0.5\n", - "urban_rural_map.loc[mask, 'urban_heuristic_flag'] = 1" + "urban_rural_map[\"urban_heuristic_flag\"] = 
0\n", + "mask = urban_rural_map[\"afact_U\"] >= 0.5\n", + "urban_rural_map.loc[mask, \"urban_heuristic_flag\"] = 1" ] }, { @@ -256,12 +284,13 @@ "source": [ "urban_rural_map.rename(\n", " columns={\n", - " 'pop10_R': 'population_in_rural_areas',\n", - " 'pop10_U': 'population_in_urban_areas',\n", - " 'afact_R': 'perc_population_in_rural_areas',\n", - " 'afact_U': 'perc_population_in_urban_areas',\n", - " }, \n", - " inplace=True)" + " \"pop10_R\": \"population_in_rural_areas\",\n", + " \"pop10_U\": \"population_in_urban_areas\",\n", + " \"afact_R\": \"perc_population_in_rural_areas\",\n", + " \"afact_U\": \"perc_population_in_urban_areas\",\n", + " },\n", + " inplace=True,\n", + ")" ] }, {