diff --git a/data/data-pipeline/data_pipeline/etl/constants.py b/data/data-pipeline/data_pipeline/etl/constants.py
index 59dafe2b..f0b2dbe0 100644
--- a/data/data-pipeline/data_pipeline/etl/constants.py
+++ b/data/data-pipeline/data_pipeline/etl/constants.py
@@ -89,6 +89,11 @@ DATASET_LIST = [
         "module_dir": "hud_recap",
         "class_name": "HudRecapETL",
     },
+    {
+        "name": "energy_definition_alternative_draft",
+        "module_dir": "energy_definition_alternative_draft",
+        "class_name": "EnergyDefinitionAlternativeDraft",
+    },
     {
         "name": "tree_equity_score",
         "module_dir": "tree_equity_score",
diff --git a/data/data-pipeline/data_pipeline/etl/sources/energy_definition_alternative_draft/README.md b/data/data-pipeline/data_pipeline/etl/sources/energy_definition_alternative_draft/README.md
new file mode 100644
index 00000000..e69de29b
diff --git a/data/data-pipeline/data_pipeline/etl/sources/energy_definition_alternative_draft/__init__.py b/data/data-pipeline/data_pipeline/etl/sources/energy_definition_alternative_draft/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/data/data-pipeline/data_pipeline/etl/sources/energy_definition_alternative_draft/etl.py b/data/data-pipeline/data_pipeline/etl/sources/energy_definition_alternative_draft/etl.py
new file mode 100644
index 00000000..c8a95fe1
--- /dev/null
+++ b/data/data-pipeline/data_pipeline/etl/sources/energy_definition_alternative_draft/etl.py
@@ -0,0 +1,127 @@
+from pathlib import Path
+import pandas as pd
+
+from data_pipeline.config import settings
+from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.score import field_names
+from data_pipeline.utils import get_module_logger, unzip_file_from_url
+
+logger = get_module_logger(__name__)
+
+
+class EnergyDefinitionAlternativeDraft(ExtractTransformLoad):
+    """ETL for the draft alternative, energy-related definition of DACs.
+
+    Downloads a zipped CSV, renames its columns to the shared `field_names`
+    constants, and writes the columns in `COLUMNS_TO_KEEP` to `usa.csv`.
+    """
+
+    def __init__(self):
+        self.DEFINITION_ALTERNATIVE_FILE_URL = (
+            settings.AWS_JUSTICE40_DATASOURCES_URL
+            + "/alternative DAC definition.csv.zip"
+        )
+
+        self.OUTPUT_PATH: Path = (
+            self.DATA_PATH / "dataset" / "energy_definition_alternative_draft"
+        )
+
+        # Column names as they appear in the source CSV.
+        self.TRACT_INPUT_COLUMN_NAME = "GEOID"
+        self.ALTERNATIVE_DEFINITION_INPUT_COLUMN_NAME = "J40_DAC"
+
+        # Constants for output
+        self.COLUMNS_TO_KEEP = [
+            self.GEOID_TRACT_FIELD_NAME,
+            field_names.ENERGY_RELATED_COMMUNITIES_DEFINITION_ALTERNATIVE,
+            field_names.COAL_EMPLOYMENT,
+            field_names.OUTAGE_EVENTS,
+            field_names.HOMELESSNESS,
+            field_names.DISABLED_POPULATION,
+            field_names.OUTAGE_DURATION,
+            field_names.JOB_ACCESS,
+            field_names.FOSSIL_ENERGY_EMPLOYMENT,
+            field_names.FOOD_DESERT,
+            field_names.INCOMPLETE_PLUMBING,
+            field_names.NON_GRID_CONNECTED_HEATING_FUEL,
+            field_names.PARKS,
+            field_names.GREATER_THAN_30_MIN_COMMUTE,
+            field_names.INTERNET_ACCESS,
+            field_names.MOBILE_HOME,
+            field_names.SINGLE_PARENT,
+            field_names.TRANSPORTATION_COSTS,
+        ]
+
+        self.df: pd.DataFrame
+
+    def extract(self) -> None:
+        """Download and unzip the source file, then read it into `self.df`."""
+        logger.info("Starting data download.")
+
+        unzip_file_from_url(
+            file_url=self.DEFINITION_ALTERNATIVE_FILE_URL,
+            download_path=self.TMP_PATH,
+            unzipped_file_path=self.TMP_PATH
+            / "energy_definition_alternative_draft",
+        )
+
+        self.df = pd.read_csv(
+            filepath_or_buffer=self.TMP_PATH
+            / "energy_definition_alternative_draft"
+            / "J40 alternative DAC definition.csv",
+            # The following need to remain as strings for all of their digits, not get converted to numbers.
+            dtype={
+                self.TRACT_INPUT_COLUMN_NAME: "string",
+            },
+            low_memory=False,
+        )
+
+    def transform(self) -> None:
+        """Rename the source columns and make the definition column boolean."""
+        logger.info("Starting transforms.")
+
+        self.df = self.df.rename(
+            columns={
+                self.TRACT_INPUT_COLUMN_NAME: self.GEOID_TRACT_FIELD_NAME,
+                self.ALTERNATIVE_DEFINITION_INPUT_COLUMN_NAME: field_names.ENERGY_RELATED_COMMUNITIES_DEFINITION_ALTERNATIVE,
+                "Coal_Emp_Ratio": field_names.COAL_EMPLOYMENT,
+                "COUNT_Outage_Events": field_names.OUTAGE_EVENTS,
+                "den_hmls_pop": field_names.HOMELESSNESS,
+                "disability_pct": field_names.DISABLED_POPULATION,
+                "Duration_in_Minutes": field_names.OUTAGE_DURATION,
+                "emp_ovrll_ndx": field_names.JOB_ACCESS,
+                "FE_Emp_Ratio": field_names.FOSSIL_ENERGY_EMPLOYMENT,
+                "Food_LAhalfand10": field_names.FOOD_DESERT,
+                "incomplete_plumbing_pct": field_names.INCOMPLETE_PLUMBING,
+                "nongrid_heat_pct": field_names.NON_GRID_CONNECTED_HEATING_FUEL,
+                "num_parks": field_names.PARKS,
+                "Per_MoT_Dur_gte30": field_names.GREATER_THAN_30_MIN_COMMUTE,
+                "Per_NoInt": field_names.INTERNET_ACCESS,
+                "population_mobile_home_pct": field_names.MOBILE_HOME,
+                "single_parent_pct": field_names.SINGLE_PARENT,
+                "t_ami": field_names.TRANSPORTATION_COSTS,
+            }
+        )
+
+        # Convert to boolean. A bare `astype("bool")` turns NaN into True,
+        # which would flag a tract with a missing value as disadvantaged, so
+        # missing values are explicitly mapped to False.
+        definition_column = (
+            field_names.ENERGY_RELATED_COMMUNITIES_DEFINITION_ALTERNATIVE
+        )
+        self.df[definition_column] = self.df[definition_column].notna() & (
+            self.df[definition_column].astype("bool")
+        )
+
+    def validate(self) -> None:
+        """Log the validation step; no checks are implemented yet."""
+        logger.info("Validating data")
+
+    def load(self) -> None:
+        """Write the `COLUMNS_TO_KEEP` columns of `self.df` to `usa.csv`."""
+        logger.info("Saving CSV")
+
+        self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
+        self.df[self.COLUMNS_TO_KEEP].to_csv(
+            path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False
+        )
diff --git a/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb b/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb
index e5e02974..fa7b46bd 100644
--- a/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb
+++ b/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb
@@ -276,6 +276,25 @@
     "mapping_inequality_df"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "605af1ff",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load alternative energy-related definition \n",
+    "energy_definition_alternative_draft_path = (\n",
+    " DATA_DIR / \"dataset\" / \"energy_definition_alternative_draft\" / \"usa.csv\"\n",
+    ")\n",
+    "energy_definition_alternative_draft_df = pd.read_csv(\n",
+    " energy_definition_alternative_draft_path,\n",
+    " dtype={ExtractTransformLoad.GEOID_TRACT_FIELD_NAME: \"string\"},\n",
+    ")\n",
+    "\n",
+    "energy_definition_alternative_draft_df"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -291,6 +310,7 @@
     " calenviroscreen_df,\n",
     " persistent_poverty_df,\n",
     " mapping_inequality_df,\n",
+    " energy_definition_alternative_draft_df,\n",
     "]\n",
     "\n",
     "merged_df = functools.reduce(\n",
@@ -431,6 +451,11 @@
     " priority_communities_field=PERSISTENT_POVERTY_TRACT_LEVEL_FIELD,\n",
     " other_census_tract_fields_to_keep=[],\n",
     " ),\n",
+    " Index(\n",
+    " method_name=field_names.ENERGY_RELATED_COMMUNITIES_DEFINITION_ALTERNATIVE,\n",
+    " priority_communities_field=field_names.ENERGY_RELATED_COMMUNITIES_DEFINITION_ALTERNATIVE,\n",
+    " other_census_tract_fields_to_keep=[],\n",
+    " ),\n",
     " ]\n",
     " # Insert indices for each of the HOLC factors.\n",
     " # Note: since these involve no renaming, we write them using list comprehension.\n",
diff --git a/data/data-pipeline/data_pipeline/score/field_names.py b/data/data-pipeline/data_pipeline/score/field_names.py
index bdca25d5..935643a8 100644
--- a/data/data-pipeline/data_pipeline/score/field_names.py
+++ b/data/data-pipeline/data_pipeline/score/field_names.py
@@ -230,6 +230,27 @@ IMPENETRABLE_SURFACES_FIELD = "Percent impenetrable surface areas"
 READING_FIELD = "Third grade reading proficiency"
 LOW_READING_FIELD = "Low third grade reading proficiency"
 
+# Alternative energy-related definition of DACs
+ENERGY_RELATED_COMMUNITIES_DEFINITION_ALTERNATIVE = (
+    "Energy-related alternative definition of communities"
+)
+COAL_EMPLOYMENT = "Coal employment"
+OUTAGE_EVENTS = "Outage Events"
+HOMELESSNESS = "Homelessness"
+DISABLED_POPULATION = "Disabled population"
+OUTAGE_DURATION = "Outage Duration"
+JOB_ACCESS = "JobAccess"
+FOSSIL_ENERGY_EMPLOYMENT = "Fossil energy employment"
+FOOD_DESERT = "Food Desert"
+INCOMPLETE_PLUMBING = "Incomplete Plumbing"
+NON_GRID_CONNECTED_HEATING_FUEL = "Non-grid-connected heating fuel"
+PARKS = "Parks"
+GREATER_THAN_30_MIN_COMMUTE = "Greater than 30 min commute"
+INTERNET_ACCESS = "Internet Access"
+MOBILE_HOME = "Mobile Home"
+SINGLE_PARENT = "Single Parent"
+TRANSPORTATION_COSTS = "Transportation Costs"
+
 #####
 # Names for individual factors being exceeded
 # Climate Change