diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py b/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py index 084ab495..a745e66b 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py @@ -17,6 +17,9 @@ from . import constants logger = get_module_logger(__name__) +# Define the DAC variable +DISADVANTAGED_COMMUNITIES_FIELD = field_names.SCORE_M_COMMUNITIES + class PostScoreETL(ExtractTransformLoad): """ @@ -184,17 +187,9 @@ class PostScoreETL(ExtractTransformLoad): merged_df["Total population"].fillna(0.0).astype(int) ) - # list the null score tracts - null_tract_df = merged_df[ - merged_df[field_names.SCORE_L_COMMUNITIES].isnull() - ] - - # subtract data sets - # this follows the XOR pattern outlined here: - # https://stackoverflow.com/a/37313953 - de_duplicated_df = pd.concat( - [merged_df, null_tract_df, null_tract_df] - ).drop_duplicates(keep=False) + de_duplicated_df = merged_df.dropna( + subset=[DISADVANTAGED_COMMUNITIES_FIELD] + ) # set the score to the new df return de_duplicated_df @@ -333,7 +328,7 @@ class PostScoreETL(ExtractTransformLoad): # Rename score column downloadable_df_copy = downloadable_df.rename( columns={ - field_names.SCORE_M_COMMUNITIES: "Identified as disadvantaged (v0.1)" + DISADVANTAGED_COMMUNITIES_FIELD: "Identified as disadvantaged (v0.1)" }, inplace=False, ) diff --git a/data/data-pipeline/data_pipeline/score/score_m.py b/data/data-pipeline/data_pipeline/score/score_m.py index 7ee94645..0138a19a 100644 --- a/data/data-pipeline/data_pipeline/score/score_m.py +++ b/data/data-pipeline/data_pipeline/score/score_m.py @@ -1,3 +1,4 @@ +from typing import Tuple import numpy as np import pandas as pd @@ -27,7 +28,7 @@ class ScoreM(Score): column_from_decennial_census: str, combined_column_name: str, threshold_cutoff_for_island_areas: float, - ) -> (pd.DataFrame, str): + ) -> Tuple[pd.DataFrame, str]: """Steps to set thresholds for island areas. This function is fairly logically complicated. It takes the following steps: