Download column order completed (#1077)

* Download column order completed

* Kameron changes

* Lucas and Beth column order changes

* cdc_places update

* passing score

* pandas error

* checkpoint

* score passing

* rounding complete - percentages still showing one decimal

* fixing tests

* fixing percentages

* updating comment

* int percentages! 🎉🎉

* forgot to pass back to df

* passing tests

Co-authored-by: lucasmbrown-usds <lucas.m.brown@omb.eop.gov>
This commit is contained in:
Jorge Escobar 2022-01-13 15:04:16 -05:00 committed by GitHub
commit d686bb856e
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
13 changed files with 232 additions and 133 deletions

View file

@@ -2,6 +2,7 @@ import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.utils import get_module_logger, download_file_from_url
from data_pipeline.score import field_names
logger = get_module_logger(__name__)
@@ -49,6 +50,20 @@ class CDCPlacesETL(ExtractTransformLoad):
values=self.CDC_VALUE_FIELD_NAME,
)
# rename columns to be used in score
rename_fields = {
"Current asthma among adults aged >=18 years": field_names.ASTHMA_FIELD,
"Coronary heart disease among adults aged >=18 years": field_names.HEART_DISEASE_FIELD,
"Cancer (excluding skin cancer) among adults aged >=18 years": field_names.CANCER_FIELD,
"Diagnosed diabetes among adults aged >=18 years": field_names.DIABETES_FIELD,
"Physical health not good for >=14 days among adults aged >=18 years": field_names.PHYS_HEALTH_NOT_GOOD_FIELD,
}
self.df.rename(
columns=rename_fields,
inplace=True,
errors="raise",
)
# Make the index (the census tract ID) a column, not the index.
self.df.reset_index(inplace=True)

View file

@@ -5,6 +5,7 @@ from data_pipeline.etl.sources.census_acs.etl_utils import (
retrieve_census_acs_data,
)
from data_pipeline.utils import get_module_logger
from data_pipeline.score import field_names
logger = get_module_logger(__name__)
@@ -22,7 +23,7 @@ class CensusACSETL(ExtractTransformLoad):
self.TOTAL_UNEMPLOYED_FIELD,
self.TOTAL_IN_LABOR_FORCE,
]
self.UNEMPLOYED_FIELD_NAME = "Unemployed civilians (percent)"
self.UNEMPLOYED_FIELD_NAME = "Unemployment (percent)"
self.LINGUISTIC_ISOLATION_FIELD_NAME = "Linguistic isolation (percent)"
self.LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME = (
@@ -353,18 +354,29 @@ class CensusACSETL(ExtractTransformLoad):
+ df[self.COLLEGE_ATTENDANCE_FEMALE_ENROLLED_PRIVATE]
) / df[self.COLLEGE_ATTENDANCE_TOTAL_POPULATION_ASKED]
# strip columns
df = df[self.COLUMNS_TO_KEEP]
# Save results to self.
self.df = df
# rename columns to be used in score
rename_fields = {
"Percent of individuals < 200% Federal Poverty Line": field_names.POVERTY_LESS_THAN_200_FPL_FIELD,
}
self.df.rename(
columns=rename_fields,
inplace=True,
errors="raise",
)
def load(self) -> None:
logger.info("Saving Census ACS Data")
# mkdir census
self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
self.df[self.COLUMNS_TO_KEEP].to_csv(
path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False
)
self.df.to_csv(path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False)
def validate(self) -> None:
logger.info("Validating Census ACS Data")

View file

@@ -5,6 +5,7 @@ from data_pipeline.etl.sources.census_acs.etl_utils import (
retrieve_census_acs_data,
)
from data_pipeline.utils import get_module_logger
from data_pipeline.score import field_names
logger = get_module_logger(__name__)
@@ -73,7 +74,7 @@ class CensusACS2010ETL(ExtractTransformLoad):
self.EMPLOYMENT_COLLEGE_IN_LABOR_FORCE,
]
self.UNEMPLOYED_FIELD_NAME = "Unemployed civilians (percent)"
self.UNEMPLOYED_FIELD_NAME = "Unemployment (percent)"
self.POVERTY_FIELDS = [
"C17002_001E", # Estimate!!Total,
@@ -149,15 +150,6 @@ class CensusACS2010ETL(ExtractTransformLoad):
+ df["C17002_007E"]
) / df["C17002_001E"]
# Save results to self.
self.df = df
def load(self) -> None:
logger.info("Saving Census ACS Data")
# mkdir census
self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
columns_to_include = [
self.GEOID_TRACT_FIELD_NAME,
self.UNEMPLOYED_FIELD_NAME,
@@ -166,7 +158,7 @@ class CensusACS2010ETL(ExtractTransformLoad):
self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME,
]
output_df = self.df[columns_to_include]
output_df = df[columns_to_include]
# Add the year to the end of every column, so when it's all joined in the
# score df, it's obvious which year this data is from.
@@ -178,7 +170,26 @@ class CensusACS2010ETL(ExtractTransformLoad):
}
)
output_df.to_csv(path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False)
# rename columns to be used in score
rename_fields = {
"Percent of individuals < 100% Federal Poverty Line in 2010": field_names.CENSUS_POVERTY_LESS_THAN_100_FPL_FIELD_2010,
}
output_df.rename(
columns=rename_fields,
inplace=True,
errors="raise",
)
# Save results to self.
self.df = output_df
def load(self) -> None:
logger.info("Saving Census ACS Data")
# mkdir census
self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
self.df.to_csv(path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False)
def validate(self) -> None:
logger.info("Validating Census ACS Data")

View file

@@ -6,6 +6,7 @@ import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.utils import get_module_logger
from data_pipeline.score import field_names
pd.options.mode.chained_assignment = "raise"
@@ -141,7 +142,9 @@ class CensusDecennialETL(ExtractTransformLoad):
"PBG036014" # Total!!Female!!In labor force!!Civilian!!Unemployed
)
self.UNEMPLOYMENT_FIELD_NAME = "Unemployed civilians (percent) in 2009"
self.UNEMPLOYMENT_FIELD_NAME = (
field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009
)
var_list = [
self.MEDIAN_INCOME_FIELD,