From 29419dd2fd77d2213b9ba0cc4f5777f04a4edf83 Mon Sep 17 00:00:00 2001
From: Emma Nechamkin <97977170+emma-nechamkin@users.noreply.github.com>
Date: Tue, 2 Aug 2022 16:28:05 -0400
Subject: [PATCH] Rescaling linguistic isolation (#1750)

Rescales linguistic isolation to drop Puerto Rico
---
 .../data_pipeline/etl/score/etl_score.py      | 113 +++++++++---------
 1 file changed, 57 insertions(+), 56 deletions(-)

diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score.py b/data/data-pipeline/data_pipeline/etl/score/etl_score.py
index c736e3fc..5d528dd6 100644
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py
@@ -264,6 +264,7 @@ class ScoreETL(ExtractTransformLoad):
         df: pd.DataFrame,
         input_column_name: str,
         output_column_name_root: str,
+        drop_tracts: list = None,
         ascending: bool = True,
     ) -> pd.DataFrame:
         """Creates percentiles.
@@ -298,10 +299,15 @@ class ScoreETL(ExtractTransformLoad):
         something like "3rd grade reading proficiency" and `output_column_name_root`
         may be something like "Low 3rd grade reading proficiency".
         """
-        if (
-            output_column_name_root
-            != field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD
-        ):
+
+        # There are two ways to calculate percentiles.
+        # For the vast majority of columns, we simply calculate percentiles over all tracts.
+        # However, for Linguistic Isolation and Agricultural Value Loss, certain tracts must
+        # be dropped from consideration in the percentile calculation. For those columns, the
+        # caller supplies the list of tracts to exclude via `drop_tracts` (see the call site).
+        # These special cases supersede the urban / rural percentiles, which are now
+        # deprecated and have been removed.
+        if not drop_tracts:
             # Create the "basic" percentile.
             df[
                 f"{output_column_name_root}"
@@ -309,62 +315,23 @@ class ScoreETL(ExtractTransformLoad):
                 f"{field_names.PERCENTILE_FIELD_SUFFIX}"
             ] = df[input_column_name].rank(pct=True, ascending=ascending)

         else:
-            # For agricultural loss, we are using whether there is value at all to determine percentile and then
-            # filling places where the value is False with 0
+            tmp_series = df[input_column_name].where(
+                ~df[field_names.GEOID_TRACT_FIELD].isin(drop_tracts),
+                np.nan,
+            )
+            logger.info(
+                f"Creating special case column for percentiles from {input_column_name}"
+            )
             df[
                 f"{output_column_name_root}"
                 f"{field_names.PERCENTILE_FIELD_SUFFIX}"
-            ] = (
-                df.where(
-                    df[field_names.AGRICULTURAL_VALUE_BOOL_FIELD].astype(float)
-                    == 1.0
-                )[input_column_name]
-                .rank(ascending=ascending, pct=True)
-                .fillna(
-                    df[field_names.AGRICULTURAL_VALUE_BOOL_FIELD].astype(float)
-                )
-            )
+            ] = tmp_series.rank(ascending=ascending, pct=True)

-            # Create the urban/rural percentiles.
-            urban_rural_percentile_fields_to_combine = []
-            for (urban_or_rural_string, urban_heuristic_bool) in [
-                ("urban", True),
-                ("rural", False),
-            ]:
-                # Create a field with only those values
-                this_category_only_value_field = (
-                    f"{input_column_name} (value {urban_or_rural_string} only)"
-                )
-                df[this_category_only_value_field] = np.where(
-                    df[field_names.URBAN_HEURISTIC_FIELD] == urban_heuristic_bool,
-                    df[input_column_name],
-                    None,
-                )
-
-                # Calculate the percentile for only this category
-                this_category_only_percentile_field = (
-                    f"{output_column_name_root} "
-                    f"(percentile {urban_or_rural_string} only)"
-                )
-                df[this_category_only_percentile_field] = df[
-                    this_category_only_value_field
-                ].rank(
-                    pct=True,
-                    # Set ascending to the parameter value.
-                    ascending=ascending,
-                )
-
-                # Add the field name to this list. Later, we'll combine this list.
-                urban_rural_percentile_fields_to_combine.append(
-                    this_category_only_percentile_field
-                )
-
-            # Combine both urban and rural into one field:
-            df[
-                f"{output_column_name_root}{field_names.PERCENTILE_URBAN_RURAL_FIELD_SUFFIX}"
-            ] = df[urban_rural_percentile_fields_to_combine].mean(
-                axis=1, skipna=True
-            )
+            # Sanity check that every tract in `drop_tracts` was excluded from the percentile.
+            assert df[df[field_names.GEOID_TRACT_FIELD].isin(drop_tracts)][
+                f"{output_column_name_root}"
+                f"{field_names.PERCENTILE_FIELD_SUFFIX}"
+            ].isna().sum() == len(drop_tracts), "Not all tracts were dropped"

         return df

@@ -523,13 +490,47 @@ class ScoreETL(ExtractTransformLoad):
         df_copy[numeric_columns] = df_copy[numeric_columns].apply(pd.to_numeric)

         # Convert all columns to numeric and do math
+        # Note that we have a few special conditions here that we handle explicitly.
+        # For *Linguistic Isolation*, we do NOT want to include Puerto Rico in the
+        # percentile calculation, because linguistic isolation is not a meaningful
+        # category in Puerto Rico, where Spanish is a recognized language. Thus, we
+        # construct a list of tracts to drop from the percentile calculation.
+        #
+        # For *Expected Agricultural Loss*, we only want the percentile to include
+        # tracts in which there is some agricultural value. This lets us discern which
+        # tracts are truly at the 90th percentile, since many tracts have zero value.
         for numeric_column in numeric_columns:
+            drop_tracts = []
+            if (
+                numeric_column
+                == field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD
+            ):
+                drop_tracts = df_copy[
+                    ~df_copy[field_names.AGRICULTURAL_VALUE_BOOL_FIELD]
+                    .astype(bool)
+                    .fillna(False)
+                ][field_names.GEOID_TRACT_FIELD].to_list()
+                logger.info(
+                    f"Dropping {len(drop_tracts)} tracts from Agricultural Value Loss"
+                )
+
+            elif numeric_column == field_names.LINGUISTIC_ISO_FIELD:
+                drop_tracts = df_copy[
+                    # 72 is the FIPS code for Puerto Rico
+                    df_copy[field_names.GEOID_TRACT_FIELD].str.startswith("72")
+                ][field_names.GEOID_TRACT_FIELD].to_list()
+                logger.info(
+                    f"Dropping {len(drop_tracts)} tracts from Linguistic Isolation"
+                )
+
             df_copy = self._add_percentiles_to_df(
                 df=df_copy,
                 input_column_name=numeric_column,
                 # For this use case, the input name and output name root are the same.
                 output_column_name_root=numeric_column,
                 ascending=True,
+                drop_tracts=drop_tracts,
             )

         # Min-max normalization:
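
Reviewer note: below is a minimal, standalone sketch of the masking-and-ranking
approach this patch introduces. The column names ("tract_id",
"linguistic_isolation") and data are toy placeholders, not the pipeline's
field_names constants; only the where/isin/rank pattern mirrors the patch.

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(
        {
            # GEOIDs beginning with "72" are Puerto Rico tracts.
            "tract_id": ["01001", "01002", "36001", "72001", "72002"],
            "linguistic_isolation": [0.10, 0.25, 0.05, 0.90, 0.80],
        }
    )

    # Build the list of tracts to exclude, mirroring the str.startswith("72")
    # filter in the patch.
    drop_tracts = df[df["tract_id"].str.startswith("72")]["tract_id"].to_list()

    # Mask excluded tracts to NaN so that .rank() skips them entirely, then
    # compute percentiles over the remaining tracts only.
    tmp_series = df["linguistic_isolation"].where(
        ~df["tract_id"].isin(drop_tracts), np.nan
    )
    df["linguistic_isolation (percentile)"] = tmp_series.rank(
        ascending=True, pct=True
    )

    # Dropped tracts receive NaN percentiles; all others are ranked among
    # themselves, as the assert in _add_percentiles_to_df verifies.
    assert (
        df[df["tract_id"].isin(drop_tracts)]["linguistic_isolation (percentile)"]
        .isna()
        .all()
    )
    print(df)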