Cleaning up quick code (#1349)

Made some quick, mostly cosmetic changes to the quick launch code. This mostly entailed replacing hard-coded strings with constants and tidying the code.

Changes: PR AMI, updating agricultural loss percentiles, and dropping PR from some threshold counts.
Emma Nechamkin 2022-03-02 16:50:04 -05:00 committed by GitHub
commit aea49cbb5a
6 changed files with 341 additions and 348 deletions


@ -22,7 +22,6 @@ DATA_CENSUS_CSV_DIR = DATA_CENSUS_DIR / "csv"
DATA_CENSUS_CSV_FILE_PATH = DATA_CENSUS_CSV_DIR / "us.csv"
DATA_CENSUS_CSV_STATE_FILE_PATH = DATA_CENSUS_CSV_DIR / "fips_states_2010.csv"
# Score paths
DATA_SCORE_DIR = DATA_PATH / "score"
@ -66,6 +65,9 @@ CENSUS_COUNTIES_COLUMNS = ["USPS", "GEOID", "NAME"]
# Drop FIPS codes from map
DROP_FIPS_CODES = ["66", "78"]
+# Drop FIPS codes from incrementing
+DROP_FIPS_FROM_NON_WTD_THRESHOLDS = "72"
# Percent prefixes for rounding
PERCENT_PREFIXES_SUFFIXES = [
"Percent",

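For context on how a FIPS constant like this gets applied: US census tract GEOIDs begin with the two-digit state/territory FIPS code, so a string-prefix check is enough to exclude a territory. A minimal sketch with hypothetical sample GEOIDs (the real pipeline uses its own field-name constants):

import pandas as pd

DROP_FIPS_FROM_NON_WTD_THRESHOLDS = "72"  # Puerto Rico

# Toy tract list: Alabama, Puerto Rico, Guam.
tracts = pd.DataFrame(
    {"GEOID10_TRACT": ["01073000100", "72001956300", "66010950100"]}
)

# Keep only tracts whose GEOID does not start with the skipped FIPS prefix.
is_skipped = tracts["GEOID10_TRACT"].str.startswith(
    DROP_FIPS_FROM_NON_WTD_THRESHOLDS
)
print(tracts[~is_skipped])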

@ -298,33 +298,20 @@ class ScoreETL(ExtractTransformLoad):
] = df[input_column_name].rank(pct=True, ascending=ascending)
else:
-# For agricultural loss, we are using whether there is value at all to determine percentile
-# This is not the most thoughtfully written code, but it works.
-# Take only rows with agrivalue
-tmp_df = df[df[field_names.AGRICULTURAL_VALUE_BOOL_FIELD] == 1][
-    [input_column_name, field_names.GEOID_TRACT_FIELD]
-].copy()
-# Construct a percentile only among those tracts
-tmp_df["temporary_ranking"] = tmp_df[input_column_name].transform(
-    lambda x: x.rank(pct=True, ascending=True)
-)
-# # Create a map for just those tracts and map it onto the df
-temporary_ranking = tmp_df.set_index(field_names.GEOID_TRACT_FIELD)[
-    "temporary_ranking"
-].to_dict()
+# For agricultural loss, we are using whether there is value at all to determine percentile and then
+# filling places where the value is False with 0
df[
    f"{output_column_name_root}"
    f"{field_names.PERCENTILE_FIELD_SUFFIX}"
-] = np.where(
-    df[field_names.AGRICULTURAL_VALUE_BOOL_FIELD].isna(),
-    np.nan,
-    df[field_names.GEOID_TRACT_FIELD]
-    .map(temporary_ranking)
-    .fillna(0),
+] = (
+    df.where(
+        df[field_names.AGRICULTURAL_VALUE_BOOL_FIELD].astype(float)
+        == 1.0
+    )[input_column_name]
+    .rank(ascending=ascending, pct=True)
+    .fillna(
+        df[field_names.AGRICULTURAL_VALUE_BOOL_FIELD].astype(float)
+    )
)
# Create the urban/rural percentiles.
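The new one-expression approach is worth unpacking: `.where()` blanks out every row whose agricultural-value boolean is not 1, `.rank(pct=True)` then computes percentiles only over the remaining rows (NaNs are skipped), and `.fillna()` with the boolean column itself writes 0 where the boolean is 0 and leaves NaN where the boolean is missing. A minimal sketch of the same pattern with toy column names (not the pipeline's real field names):

import numpy as np
import pandas as pd

df = pd.DataFrame(
    {
        "agrivalue_bool": [1.0, 1.0, 0.0, np.nan, 1.0],
        "agriloss": [10.0, 30.0, 0.0, np.nan, 20.0],
    }
)

# Rows where the boolean is not 1.0 become NaN and drop out of the ranking.
df["agriloss (percentile)"] = (
    df.where(df["agrivalue_bool"] == 1.0)["agriloss"]
    .rank(ascending=True, pct=True)
    # Boolean 0.0 fills in as percentile 0.0; a missing boolean stays NaN.
    .fillna(df["agrivalue_bool"])
)
print(df)  # percentiles: 1/3, 3/3, 0.0, NaN, 2/3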


@ -264,7 +264,8 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
low_memory=False,
)
logger.info("Pulling PR info down.")
logger.info("Pulling PR tract list down.")
# This step is necessary because PR is not in geocorr at the level that gets joined
pr_file = self.get_tmp_path() / "pr_tracts" / "pr_tracts.csv"
download_file_from_url(
file_url=self.PUERTO_RICO_S3_LINK, download_file_name=pr_file
@ -273,11 +274,11 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
filepath_or_buffer=self.get_tmp_path()
/ "pr_tracts"
/ "pr_tracts.csv",
-# Skip second row, which has descriptions.
+# The following need to remain as strings for all of their digits, not get converted to numbers.
dtype={"GEOID10_TRACT": str},
low_memory=False,
)
self.pr_tracts["State Abbreviation"] = "PR"
# Download MSA median incomes
logger.info("Starting download of MSA median incomes.")
@ -298,12 +299,10 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
msa_median_incomes_df = self._transform_msa_median_incomes()
state_median_incomes_df = self._transform_state_median_incomes()
-# Adds 945 PR tracts
-geocorr_df_plus_pr = geocorr_df.merge(
-    self.pr_tracts, how="outer", indicator=True
-)
+# Adds 945 PR tracts to the geocorr dataframe
+geocorr_df_plus_pr = geocorr_df.merge(self.pr_tracts, how="outer")
-# Join tracts on MSA incomes (this is where we lose PR)
+# Join tracts on MSA incomes
merged_df = geocorr_df_plus_pr.merge(
msa_median_incomes_df, on=self.MSA_ID_FIELD_NAME, how="left"
)
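Two details here are easy to miss: reading GEOIDs with `dtype=str` preserves the leading zeros a numeric parse would drop, and the outer merge keeps the 945 PR tracts even though geocorr lacks them (their MSA ID is simply NaN, so the subsequent left join on MSA incomes attaches no income). A rough sketch with hypothetical toy frames and column names:

import pandas as pd

# Leading zeros survive because the GEOID column is read as a string.
geocorr_df = pd.DataFrame(
    {"GEOID10_TRACT": ["01073000100"], "MSA_ID": ["13820"]}
)
pr_tracts = pd.DataFrame({"GEOID10_TRACT": ["72001956300"]})
pr_tracts["State Abbreviation"] = "PR"

# Outer merge on the shared GEOID column: the PR tract is kept, MSA_ID is NaN.
geocorr_df_plus_pr = geocorr_df.merge(pr_tracts, how="outer")
print(geocorr_df_plus_pr)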


@ -5,6 +5,7 @@ import pandas as pd
from data_pipeline.score.score import Score
import data_pipeline.score.field_names as field_names
from data_pipeline.utils import get_module_logger
+import data_pipeline.etl.score.constants as constants
logger = get_module_logger(__name__)
@ -195,7 +196,8 @@ class ScoreM(Score):
)
self._increment_total_eligibility_exceeded(
-climate_eligibility_columns, skip_fips=("72")
+climate_eligibility_columns,
+skip_fips=constants.DROP_FIPS_FROM_NON_WTD_THRESHOLDS,
)
return self.df[climate_eligibility_columns].any(axis="columns")
@ -239,7 +241,8 @@ class ScoreM(Score):
)
self._increment_total_eligibility_exceeded(
-energy_eligibility_columns, skip_fips=("72")
+energy_eligibility_columns,
+skip_fips=constants.DROP_FIPS_FROM_NON_WTD_THRESHOLDS,
)
return self.df[energy_eligibility_columns].any(axis="columns")
@ -290,7 +293,8 @@ class ScoreM(Score):
)
self._increment_total_eligibility_exceeded(
-transportion_eligibility_columns, skip_fips=("72")
+transportion_eligibility_columns,
+skip_fips=constants.DROP_FIPS_FROM_NON_WTD_THRESHOLDS,
)
return self.df[transportion_eligibility_columns].any(axis="columns")
@ -351,7 +355,8 @@ class ScoreM(Score):
)
self._increment_total_eligibility_exceeded(
-housing_eligibility_columns, skip_fips=("72")
+housing_eligibility_columns,
+skip_fips=constants.DROP_FIPS_FROM_NON_WTD_THRESHOLDS,
)
return self.df[housing_eligibility_columns].any(axis="columns")
@ -402,7 +407,8 @@ class ScoreM(Score):
)
self._increment_total_eligibility_exceeded(
-pollution_eligibility_columns, skip_fips=("72")
+pollution_eligibility_columns,
+skip_fips=constants.DROP_FIPS_FROM_NON_WTD_THRESHOLDS,
)
return self.df[pollution_eligibility_columns].any(axis="columns")
@ -432,7 +438,7 @@ class ScoreM(Score):
self._increment_total_eligibility_exceeded(
[field_names.WASTEWATER_DISCHARGE_LOW_INCOME_LOW_HIGHER_ED_FIELD],
skip_fips=("72"),
skip_fips=constants.DROP_FIPS_FROM_NON_WTD_THRESHOLDS,
)
return self.df[
@ -510,7 +516,8 @@ class ScoreM(Score):
)
self._increment_total_eligibility_exceeded(
-health_eligibility_columns, skip_fips=("72")
+health_eligibility_columns,
+skip_fips=constants.DROP_FIPS_FROM_NON_WTD_THRESHOLDS,
)
return self.df[health_eligibility_columns].any(axis="columns")
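Note that the old `skip_fips=("72")` was already a plain string (the parentheses do not make a tuple), so swapping in the string constant is behavior-preserving. The real `_increment_total_eligibility_exceeded` is defined elsewhere in the codebase; purely as a hypothetical sketch of what a `skip_fips` parameter like this typically does, with made-up column names:

import pandas as pd

DROP_FIPS_FROM_NON_WTD_THRESHOLDS = "72"

def increment_total_eligibility_exceeded(
    df: pd.DataFrame, columns: list, skip_fips: str = ""
) -> None:
    # Count how many eligibility columns each tract exceeds.
    increment = df[columns].sum(axis=1)
    if skip_fips:
        # Zero out tracts in the skipped state/territory so they
        # never contribute to the running threshold count.
        in_skipped = df["GEOID10_TRACT"].str.startswith(skip_fips)
        increment = increment.where(~in_skipped, 0)
    df["THRESHOLD_COUNT"] = df.get("THRESHOLD_COUNT", 0) + increment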