From 2b35a8937ac5b310c51152ed9eebfb511428782b Mon Sep 17 00:00:00 2001
From: Jorge Escobar <83969469+esfoobar-usds@users.noreply.github.com>
Date: Thu, 27 Jan 2022 17:22:39 -0500
Subject: [PATCH] Hot fix for Score M (#1182)

* fixes

* pr feedback

* tuple
---
 .../data_pipeline/etl/score/etl_score_post.py | 19 +++++++------------
 .../data_pipeline/score/score_m.py            |  3 ++-
 2 files changed, 9 insertions(+), 13 deletions(-)

diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py b/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py
index 084ab495..a745e66b 100644
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py
@@ -17,6 +17,9 @@ from . import constants
 
 logger = get_module_logger(__name__)
 
+# Define the DAC variable
+DISADVANTAGED_COMMUNITIES_FIELD = field_names.SCORE_M_COMMUNITIES
+
 
 class PostScoreETL(ExtractTransformLoad):
     """
@@ -184,17 +187,9 @@ class PostScoreETL(ExtractTransformLoad):
             merged_df["Total population"].fillna(0.0).astype(int)
         )
 
-        # list the null score tracts
-        null_tract_df = merged_df[
-            merged_df[field_names.SCORE_L_COMMUNITIES].isnull()
-        ]
-
-        # subtract data sets
-        # this follows the XOR pattern outlined here:
-        # https://stackoverflow.com/a/37313953
-        de_duplicated_df = pd.concat(
-            [merged_df, null_tract_df, null_tract_df]
-        ).drop_duplicates(keep=False)
+        de_duplicated_df = merged_df.dropna(
+            subset=[DISADVANTAGED_COMMUNITIES_FIELD]
+        )
 
         # set the score to the new df
         return de_duplicated_df
@@ -333,7 +328,7 @@ class PostScoreETL(ExtractTransformLoad):
         # Rename score column
         downloadable_df_copy = downloadable_df.rename(
             columns={
-                field_names.SCORE_M_COMMUNITIES: "Identified as disadvantaged (v0.1)"
+                DISADVANTAGED_COMMUNITIES_FIELD: "Identified as disadvantaged (v0.1)"
             },
             inplace=False,
         )
diff --git a/data/data-pipeline/data_pipeline/score/score_m.py b/data/data-pipeline/data_pipeline/score/score_m.py
index 7ee94645..0138a19a 100644
--- a/data/data-pipeline/data_pipeline/score/score_m.py
+++ b/data/data-pipeline/data_pipeline/score/score_m.py
@@ -1,3 +1,4 @@
+from typing import Tuple
 import numpy as np
 import pandas as pd
 
@@ -27,7 +28,7 @@ class ScoreM(Score):
         column_from_decennial_census: str,
         combined_column_name: str,
         threshold_cutoff_for_island_areas: float,
-    ) -> (pd.DataFrame, str):
+    ) -> Tuple[pd.DataFrame, str]:
         """Steps to set thresholds for island areas.
 
         This function is fairly logically complicated. It takes the following steps: