From f284d75098b69a41d01912e611da7487e012b26f Mon Sep 17 00:00:00 2001 From: lucasmbrown-usds Date: Wed, 21 Sep 2022 14:54:27 -0400 Subject: [PATCH] renaming geocorr to geocorr_urban --- data/data-pipeline/data_pipeline/etl/constants.py | 6 +++--- .../data-pipeline/data_pipeline/etl/score/etl_score.py | 2 +- .../etl/sources/census_acs_median_income/etl.py | 10 +++++----- .../etl/sources/{geocorr => geocorr_urban}/README.md | 0 .../etl/sources/{geocorr => geocorr_urban}/__init__.py | 0 .../etl/sources/{geocorr => geocorr_urban}/etl.py | 8 ++++---- .../data_pipeline/ipython/urban_vs_rural.ipynb | 2 +- .../data_pipeline/tests/score/fixtures.py | 2 +- .../data_pipeline/tests/sources/geocorr/test_etl.py | 6 +++--- 9 files changed, 18 insertions(+), 18 deletions(-) rename data/data-pipeline/data_pipeline/etl/sources/{geocorr => geocorr_urban}/README.md (100%) rename data/data-pipeline/data_pipeline/etl/sources/{geocorr => geocorr_urban}/__init__.py (100%) rename data/data-pipeline/data_pipeline/etl/sources/{geocorr => geocorr_urban}/etl.py (92%) diff --git a/data/data-pipeline/data_pipeline/etl/constants.py b/data/data-pipeline/data_pipeline/etl/constants.py index f0d5b171..b246dcc1 100644 --- a/data/data-pipeline/data_pipeline/etl/constants.py +++ b/data/data-pipeline/data_pipeline/etl/constants.py @@ -90,9 +90,9 @@ DATASET_LIST = [ "is_memory_intensive": False, }, { - "name": "geocorr", - "module_dir": "geocorr", - "class_name": "GeoCorrETL", + "name": "geocorr_urban", + "module_dir": "geocorr_urban", + "class_name": "GeoCorrUrbanETL", "is_memory_intensive": False, }, { diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score.py b/data/data-pipeline/data_pipeline/etl/score/etl_score.py index 5a865d5f..e708920d 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py @@ -153,7 +153,7 @@ class ScoreETL(ExtractTransformLoad): # Load GeoCorr Urban Rural Map geocorr_urban_rural_csv = ( - 
constants.DATA_PATH / "dataset" / "geocorr" / "usa.csv" + constants.DATA_PATH / "dataset" / "geocorr_urban" / "usa.csv" ) self.geocorr_urban_rural_df = pd.read_csv( geocorr_urban_rural_csv, diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_acs_median_income/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census_acs_median_income/etl.py index a39f8891..fa07f830 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/census_acs_median_income/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/census_acs_median_income/etl.py @@ -76,7 +76,7 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad): self.pr_tracts: pd.DataFrame def _transform_geocorr(self) -> pd.DataFrame: - # Transform the geocorr data + # Transform the geocorr_urban data geocorr_df = self.raw_geocorr_df # Strip the unnecessary period from the tract ID: @@ -244,12 +244,12 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad): file_url=settings.AWS_JUSTICE40_DATASOURCES_URL + "/geocorr2014_all_states_tracts_only.csv.zip", download_path=self.get_tmp_path(), - unzipped_file_path=self.get_tmp_path() / "geocorr", + unzipped_file_path=self.get_tmp_path() / "geocorr_urban", ) self.raw_geocorr_df = pd.read_csv( filepath_or_buffer=self.get_tmp_path() - / "geocorr" + / "geocorr_urban" / "geocorr2014_all_states_tracts_only.csv", # Skip second row, which has descriptions. 
skiprows=[1], @@ -265,7 +265,7 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad): ) logger.info("Pulling PR tract list down.") - # This step is necessary because PR is not in geocorr at the level that gets joined + # This step is necessary because PR is not in geocorr_urban at the level that gets joined pr_file = self.get_tmp_path() / "pr_tracts" / "pr_tracts.csv" download_file_from_url( file_url=self.PUERTO_RICO_S3_LINK, download_file_name=pr_file @@ -307,7 +307,7 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad): msa_median_incomes_df = self._transform_msa_median_incomes() state_median_incomes_df = self._transform_state_median_incomes() - # Adds 945 PR tracts to the geocorr dataframe + # Adds 945 PR tracts to the geocorr_urban dataframe geocorr_df_plus_pr = geocorr_df.merge(self.pr_tracts, how="outer") # Join tracts on MSA incomes diff --git a/data/data-pipeline/data_pipeline/etl/sources/geocorr/README.md b/data/data-pipeline/data_pipeline/etl/sources/geocorr_urban/README.md similarity index 100% rename from data/data-pipeline/data_pipeline/etl/sources/geocorr/README.md rename to data/data-pipeline/data_pipeline/etl/sources/geocorr_urban/README.md diff --git a/data/data-pipeline/data_pipeline/etl/sources/geocorr/__init__.py b/data/data-pipeline/data_pipeline/etl/sources/geocorr_urban/__init__.py similarity index 100% rename from data/data-pipeline/data_pipeline/etl/sources/geocorr/__init__.py rename to data/data-pipeline/data_pipeline/etl/sources/geocorr_urban/__init__.py diff --git a/data/data-pipeline/data_pipeline/etl/sources/geocorr/etl.py b/data/data-pipeline/data_pipeline/etl/sources/geocorr_urban/etl.py similarity index 92% rename from data/data-pipeline/data_pipeline/etl/sources/geocorr/etl.py rename to data/data-pipeline/data_pipeline/etl/sources/geocorr_urban/etl.py index c8e30d10..4d775ed4 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/geocorr/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/geocorr_urban/etl.py @@ 
-10,13 +10,13 @@ from data_pipeline.utils import ( logger = get_module_logger(__name__) -class GeoCorrETL(ExtractTransformLoad): - NAME = "geocorr" +class GeoCorrUrbanETL(ExtractTransformLoad): + NAME = "geocorr_urban" GEO_LEVEL: ValidGeoLevel = ValidGeoLevel.CENSUS_TRACT PUERTO_RICO_EXPECTED_IN_DATA = False def __init__(self): - self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "geocorr" + self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "geocorr_urban" # Need to change hyperlink to S3 @@ -37,7 +37,7 @@ class GeoCorrETL(ExtractTransformLoad): def extract(self) -> None: logger.info( - "Starting to download 2MB GeoCorr Urban Rural Census Tract Map file." + "Starting to download 2MB GeoCorr Urban Rural Census Tract Map file." ) unzip_file_from_url( file_url=settings.AWS_JUSTICE40_DATASOURCES_URL diff --git a/data/data-pipeline/data_pipeline/ipython/urban_vs_rural.ipynb b/data/data-pipeline/data_pipeline/ipython/urban_vs_rural.ipynb index 1059229a..bd509350 100644 --- a/data/data-pipeline/data_pipeline/ipython/urban_vs_rural.ipynb +++ b/data/data-pipeline/data_pipeline/ipython/urban_vs_rural.ipynb @@ -334,7 +334,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.12" + "version": "3.9.6" } }, "nbformat": 4, diff --git a/data/data-pipeline/data_pipeline/tests/score/fixtures.py b/data/data-pipeline/data_pipeline/tests/score/fixtures.py index 744ebfa6..b3a5f4d8 100644 --- a/data/data-pipeline/data_pipeline/tests/score/fixtures.py +++ b/data/data-pipeline/data_pipeline/tests/score/fixtures.py @@ -161,7 +161,7 @@ def fuds_df(): @pytest.fixture() def geocorr_urban_rural_df(): geocorr_urban_rural_csv = ( - constants.DATA_PATH / "dataset" / "geocorr" / "usa.csv" + constants.DATA_PATH / "dataset" / "geocorr_urban" / "usa.csv" ) return pd.read_csv( geocorr_urban_rural_csv, diff --git a/data/data-pipeline/data_pipeline/tests/sources/geocorr/test_etl.py b/data/data-pipeline/data_pipeline/tests/sources/geocorr/test_etl.py index 
bb065aac..57564972 100644 --- a/data/data-pipeline/data_pipeline/tests/sources/geocorr/test_etl.py +++ b/data/data-pipeline/data_pipeline/tests/sources/geocorr/test_etl.py @@ -1,15 +1,15 @@ import pathlib from data_pipeline.tests.sources.example.test_etl import TestETL -from data_pipeline.etl.sources.geocorr.etl import GeoCorrETL +from data_pipeline.etl.sources.geocorr_urban.etl import GeoCorrUrbanETL class TestGeoCorrETL(TestETL): - _ETL_CLASS = GeoCorrETL + _ETL_CLASS = GeoCorrUrbanETL _SAMPLE_DATA_PATH = pathlib.Path(__file__).parents[0] / "data" _SAMPLE_DATA_FILE_NAME = "geocorr_urban_rural.csv" _SAMPLE_DATA_ZIP_FILE_NAME = "geocorr_urban_rural.csv.zip" - _EXTRACT_TMP_FOLDER_NAME = "GeoCorrETL" + _EXTRACT_TMP_FOLDER_NAME = "GeoCorrUrbanETL" def setup_method(self, _method, filename=__file__): """Invoke `setup_method` from Parent, but using the current file name.