diff --git a/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py b/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py
index 6325f972..e0d51952 100644
--- a/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py
@@ -10,6 +10,8 @@ from data_pipeline.etl.score.etl_utils import (
 from data_pipeline.score import field_names
 from data_pipeline.utils import download_file_from_url
 from data_pipeline.utils import get_module_logger
+from data_pipeline.config import settings
+
 
 logger = get_module_logger(__name__)
@@ -20,7 +22,11 @@ class CDCLifeExpectancy(ExtractTransformLoad):
 
     NAME = "cdc_life_expectancy"
 
-    USA_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/US_A.CSV"
+    if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
+        USA_FILE_URL = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/US_A.CSV"
+    else:
+        USA_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/US_A.CSV"
+
     LOAD_YAML_CONFIG: bool = False
     LIFE_EXPECTANCY_FIELD_NAME = "Life expectancy (years)"
     INPUT_GEOID_TRACT_FIELD_NAME = "Tract ID"
@@ -29,8 +35,12 @@ class CDCLifeExpectancy(ExtractTransformLoad):
 
     # For some reason, LEEP does not include Maine or Wisconsin in its "All of
     # USA" file. Load these separately.
-    WISCONSIN_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/WI_A.CSV"
-    MAINE_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/ME_A.CSV"
+    if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
+        WISCONSIN_FILE_URL: str = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/WI_A.CSV"
+        MAINE_FILE_URL: str = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/ME_A.CSV"
+    else:
+        WISCONSIN_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/WI_A.CSV"
+        MAINE_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/ME_A.CSV"
 
     TRACT_INPUT_COLUMN_NAME = "Tract ID"
     STATE_INPUT_COLUMN_NAME = "STATE2KX"
diff --git a/data/data-pipeline/data_pipeline/etl/sources/cdc_places/etl.py b/data/data-pipeline/data_pipeline/etl/sources/cdc_places/etl.py
index 62c94981..fc5589ce 100644
--- a/data/data-pipeline/data_pipeline/etl/sources/cdc_places/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/cdc_places/etl.py
@@ -6,6 +6,7 @@ from data_pipeline.etl.base import ValidGeoLevel
 from data_pipeline.score import field_names
 from data_pipeline.utils import download_file_from_url
 from data_pipeline.utils import get_module_logger
+from data_pipeline.config import settings
 
 logger = get_module_logger(__name__)
@@ -22,7 +23,14 @@ class CDCPlacesETL(ExtractTransformLoad):
     def __init__(self):
         self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "cdc_places"
-        self.CDC_PLACES_URL = "https://chronicdata.cdc.gov/api/views/cwsq-ngmh/rows.csv?accessType=DOWNLOAD"
+        if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
+            self.CDC_PLACES_URL = (
+                f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
+                "cdc_places/PLACES__Local_Data_for_Better_Health__Census_Tract_Data_2021_release.csv"
+            )
+        else:
+            self.CDC_PLACES_URL = "https://chronicdata.cdc.gov/api/views/cwsq-ngmh/rows.csv?accessType=DOWNLOAD"
+
         self.COLUMNS_TO_KEEP: typing.List[str] = [
             self.GEOID_TRACT_FIELD_NAME,
             field_names.DIABETES_FIELD,
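Note that the new toggle is evaluated at two different times in the files above: CDCLifeExpectancy reads settings.DATASOURCE_RETRIEVAL_FROM_AWS in the class body, so the branch is taken once at import time, while CDCPlacesETL reads it in __init__, once per instance. Either way the URL is fixed before any download begins. The same if/else recurs in every file in this changeset; a possible consolidation (a hypothetical get_source_url helper, not part of this diff) could resolve the choice in one place:

from data_pipeline.config import settings


def get_source_url(aws_path: str, upstream_url: str) -> str:
    # Hypothetical helper, not in this PR: return the AWS mirror URL when
    # DATASOURCE_RETRIEVAL_FROM_AWS is true, otherwise the original source.
    if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
        return f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/{aws_path}"
    return upstream_url

Each ETL class would then declare only its two endpoints, e.g. SOURCE_URL = get_source_url("cdc_svi_index/SVI2018_US.csv", "https://svi.cdc.gov/Documents/Data/2018_SVI_Data/CSV/SVI2018_US.csv").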
diff --git a/data/data-pipeline/data_pipeline/etl/sources/cdc_svi_index/etl.py b/data/data-pipeline/data_pipeline/etl/sources/cdc_svi_index/etl.py
index 6379ceda..c4f9853e 100644
--- a/data/data-pipeline/data_pipeline/etl/sources/cdc_svi_index/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/cdc_svi_index/etl.py
@@ -3,6 +3,7 @@ import pandas as pd
 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.score import field_names
 from data_pipeline.utils import get_module_logger
+from data_pipeline.config import settings
 
 logger = get_module_logger(__name__)
@@ -16,7 +17,13 @@ class CDCSVIIndex(ExtractTransformLoad):
     def __init__(self):
         self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "cdc_svi_index"
-        self.CDC_SVI_INDEX_URL = "https://svi.cdc.gov/Documents/Data/2018_SVI_Data/CSV/SVI2018_US.csv"
+        if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
+            self.CDC_SVI_INDEX_URL = (
+                f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
+                "cdc_svi_index/SVI2018_US.csv"
+            )
+        else:
+            self.CDC_SVI_INDEX_URL = "https://svi.cdc.gov/Documents/Data/2018_SVI_Data/CSV/SVI2018_US.csv"
 
         self.CDC_RPL_THEMES_THRESHOLD = 0.90
diff --git a/data/data-pipeline/data_pipeline/etl/sources/child_opportunity_index/etl.py b/data/data-pipeline/data_pipeline/etl/sources/child_opportunity_index/etl.py
index 770e4a5e..c3ecb5fb 100644
--- a/data/data-pipeline/data_pipeline/etl/sources/child_opportunity_index/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/child_opportunity_index/etl.py
@@ -4,6 +4,7 @@ import pandas as pd
 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.etl.base import ValidGeoLevel
 from data_pipeline.utils import get_module_logger
+from data_pipeline.config import settings
 
 logger = get_module_logger(__name__)
@@ -37,10 +38,16 @@ class ChildOpportunityIndex(ExtractTransformLoad):
     PUERTO_RICO_EXPECTED_IN_DATA = False
 
     def __init__(self):
-        self.SOURCE_URL = (
-            "https://data.diversitydatakids.org/datastore/zip/f16fff12-b1e5-4f60-85d3-"
-            "3a0ededa30a0?format=csv"
-        )
+        if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
+            self.SOURCE_URL = (
+                f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
+                "child_opportunity_index/raw.zip"
+            )
+        else:
+            self.SOURCE_URL = (
+                "https://data.diversitydatakids.org/datastore/zip/f16fff12-b1e5-4f60-85d3-"
+                "3a0ededa30a0?format=csv"
+            )
 
         # TODO: Decide about nixing this
         self.TRACT_INPUT_COLUMN_NAME = self.INPUT_GEOID_TRACT_FIELD_NAME
diff --git a/data/data-pipeline/data_pipeline/etl/sources/dot_travel_composite/etl.py b/data/data-pipeline/data_pipeline/etl/sources/dot_travel_composite/etl.py
index 62622c3e..f68c2d78 100644
--- a/data/data-pipeline/data_pipeline/etl/sources/dot_travel_composite/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/dot_travel_composite/etl.py
@@ -5,6 +5,7 @@ import pandas as pd
 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.etl.base import ValidGeoLevel
 from data_pipeline.utils import get_module_logger
+from data_pipeline.config import settings
 
 logger = get_module_logger(__name__)
@@ -13,10 +14,15 @@ class TravelCompositeETL(ExtractTransformLoad):
     """ETL class for the DOT Travel Disadvantage Dataset"""
 
     NAME = "travel_composite"
-    # Commenting below temporarily to get stating going. In the next PR we'll have env vars to control
-    # data source endpoints to be either "source" or "aws"
-    # SOURCE_URL = "https://www.transportation.gov/sites/dot.gov/files/Shapefile_and_Metadata.zip"
-    SOURCE_URL = "https://justice40-data.s3.amazonaws.com/data-sources/raw-data-sources/dot_travel_composite/Shapefile_and_Metadata.zip"
+
+    if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
+        SOURCE_URL = (
+            f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
+            "dot_travel_composite/Shapefile_and_Metadata.zip"
+        )
+    else:
+        SOURCE_URL = "https://www.transportation.gov/sites/dot.gov/files/Shapefile_and_Metadata.zip"
+
     GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
     PUERTO_RICO_EXPECTED_IN_DATA = False
     LOAD_YAML_CONFIG: bool = True
diff --git a/data/data-pipeline/data_pipeline/etl/sources/epa_rsei/etl.py b/data/data-pipeline/data_pipeline/etl/sources/epa_rsei/etl.py
index a6294c36..1788b835 100644
--- a/data/data-pipeline/data_pipeline/etl/sources/epa_rsei/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/epa_rsei/etl.py
@@ -5,6 +5,7 @@ from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.score import field_names
 from data_pipeline.utils import get_module_logger
 from data_pipeline.utils import unzip_file_from_url
+from data_pipeline.config import settings
 
 logger = get_module_logger(__name__)
@@ -21,7 +22,17 @@ class EPARiskScreeningEnvironmentalIndicatorsETL(ExtractTransformLoad):
     """
 
     def __init__(self):
-        self.AGGREGATED_RSEI_SCORE_FILE_URL = "http://abt-rsei.s3.amazonaws.com/microdata2019/census_agg/CensusMicroTracts2019_2019_aggregated.zip"
+
+        if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
+            self.AGGREGATED_RSEI_SCORE_FILE_URL = (
+                f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
+                "epa_rsei/CensusMicroTracts2019_2019_aggregated.zip"
+            )
+        else:
+            self.AGGREGATED_RSEI_SCORE_FILE_URL = (
+                "http://abt-rsei.s3.amazonaws.com/microdata2019/"
+                "census_agg/CensusMicroTracts2019_2019_aggregated.zip"
+            )
 
         self.OUTPUT_PATH: Path = self.DATA_PATH / "dataset" / "epa_rsei"
         self.EPA_RSEI_SCORE_THRESHOLD_CUTOFF = 0.75
diff --git a/data/data-pipeline/data_pipeline/etl/sources/hud_housing/etl.py b/data/data-pipeline/data_pipeline/etl/sources/hud_housing/etl.py
index 5c00e24a..0e08d225 100644
--- a/data/data-pipeline/data_pipeline/etl/sources/hud_housing/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/hud_housing/etl.py
@@ -2,6 +2,7 @@ import pandas as pd
 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.etl.base import ValidGeoLevel
 from data_pipeline.utils import get_module_logger
+from data_pipeline.config import settings
 
 logger = get_module_logger(__name__)
@@ -12,7 +13,15 @@ class HudHousingETL(ExtractTransformLoad):
     def __init__(self):
         self.GEOID_TRACT_FIELD_NAME = "GEOID10_TRACT"
-        self.HOUSING_FTP_URL = "https://www.huduser.gov/portal/datasets/cp/2014thru2018-140-csv.zip"
+
+        if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
+            self.HOUSING_FTP_URL = (
+                f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
+                "hud_housing/2014thru2018-140-csv.zip"
+            )
+        else:
+            self.HOUSING_FTP_URL = "https://www.huduser.gov/portal/datasets/cp/2014thru2018-140-csv.zip"
+
         self.HOUSING_ZIP_FILE_DIR = self.get_tmp_path()
 
         # We measure households earning less than 80% of HUD Area Median Family Income by county
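Because HudHousingETL (like the other classes that branch in __init__) picks its URL at construction time, both branches can be exercised in a test by flipping the setting first. A minimal sketch, assuming Dynaconf's settings.set() override API and that constructing the ETL class performs no network I/O (this test is illustrative, not part of the diff):

import pytest

from data_pipeline.config import settings
from data_pipeline.etl.sources.hud_housing.etl import HudHousingETL


@pytest.mark.parametrize("use_aws", [True, False])
def test_housing_url_follows_toggle(use_aws):
    # Flip the flag before __init__ reads it. This would not work for the
    # class-attribute variants (e.g. CDCLifeExpectancy), which take the
    # branch once when their module is first imported.
    settings.set("DATASOURCE_RETRIEVAL_FROM_AWS", use_aws)
    etl = HudHousingETL()
    if use_aws:
        assert etl.HOUSING_FTP_URL.startswith(
            settings.AWS_JUSTICE40_DATASOURCES_URL
        )
    else:
        assert etl.HOUSING_FTP_URL.startswith("https://www.huduser.gov")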
diff --git a/data/data-pipeline/data_pipeline/etl/sources/hud_recap/etl.py b/data/data-pipeline/data_pipeline/etl/sources/hud_recap/etl.py
index 9c32c3a8..447202f3 100644
--- a/data/data-pipeline/data_pipeline/etl/sources/hud_recap/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/hud_recap/etl.py
@@ -4,13 +4,24 @@ from data_pipeline.config import settings
 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.utils import get_module_logger
 
+
 logger = get_module_logger(__name__)
 
 
 class HudRecapETL(ExtractTransformLoad):
     def __init__(self):
-        # pylint: disable=line-too-long
-        self.HUD_RECAP_CSV_URL = "https://opendata.arcgis.com/api/v3/datasets/56de4edea8264fe5a344da9811ef5d6e_0/downloads/data?format=csv&spatialRefId=4326"  # noqa: E501
+
+        if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
+            self.HUD_RECAP_CSV_URL = (
+                f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
+                "hud_recap/Racially_or_Ethnically_Concentrated_Areas_of_Poverty__R_ECAPs_.csv"
+            )
+        else:
+            self.HUD_RECAP_CSV_URL = (
+                "https://opendata.arcgis.com/api/v3/datasets/"
+                "56de4edea8264fe5a344da9811ef5d6e_0/downloads/data?format=csv&spatialRefId=4326"
+            )
+
         self.HUD_RECAP_CSV = (
             self.get_tmp_path()
             / "Racially_or_Ethnically_Concentrated_Areas_of_Poverty__R_ECAPs_.csv"
diff --git a/data/data-pipeline/data_pipeline/etl/sources/mapping_inequality/etl.py b/data/data-pipeline/data_pipeline/etl/sources/mapping_inequality/etl.py
index ee6bc0c6..d9caac8d 100644
--- a/data/data-pipeline/data_pipeline/etl/sources/mapping_inequality/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/mapping_inequality/etl.py
@@ -6,6 +6,7 @@ from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.score import field_names
 from data_pipeline.utils import download_file_from_url
 from data_pipeline.utils import get_module_logger
+from data_pipeline.config import settings
 
 logger = get_module_logger(__name__)
@@ -22,10 +23,16 @@ class MappingInequalityETL(ExtractTransformLoad):
     """
 
     def __init__(self):
-        self.MAPPING_INEQUALITY_CSV_URL = (
-            "https://raw.githubusercontent.com/americanpanorama/Census_HOLC_Research/"
-            "main/2010_Census_Tracts/holc_tract_lookup.csv"
-        )
+        if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
+            self.MAPPING_INEQUALITY_CSV_URL = (
+                f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
+                "mapping_inequality/holc_tract_lookup.csv"
+            )
+        else:
+            self.MAPPING_INEQUALITY_CSV_URL = (
+                "https://raw.githubusercontent.com/americanpanorama/Census_HOLC_Research/"
+                "main/2010_Census_Tracts/holc_tract_lookup.csv"
+            )
         self.MAPPING_INEQUALITY_CSV = (
             self.get_tmp_path() / "holc_tract_lookup.csv"
         )
"https://hazards.fema.gov/nri/Content/StaticDocuments/DataDownload/" + "NRI_Table_CensusTracts/NRI_Table_CensusTracts.zip" + ) + GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT PUERTO_RICO_EXPECTED_IN_DATA = False LOAD_YAML_CONFIG: bool = True diff --git a/data/data-pipeline/data_pipeline/etl/sources/us_army_fuds/etl.py b/data/data-pipeline/data_pipeline/etl/sources/us_army_fuds/etl.py index 3b46fcc8..e915d7d9 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/us_army_fuds/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/us_army_fuds/etl.py @@ -8,6 +8,7 @@ from data_pipeline.etl.base import ValidGeoLevel from data_pipeline.etl.sources.geo_utils import add_tracts_for_geometries from data_pipeline.utils import download_file_from_url from data_pipeline.utils import get_module_logger +from data_pipeline.config import settings logger = get_module_logger(__name__) @@ -26,11 +27,19 @@ class USArmyFUDS(ExtractTransformLoad): ISLAND_AREAS_EXPECTED_IN_DATA = True def __init__(self): - self.FILE_URL: str = ( - "https://opendata.arcgis.com/api/v3/datasets/" - "3f8354667d5b4b1b8ad7a6e00c3cf3b1_1/downloads/" - "data?format=geojson&spatialRefId=4326&where=1%3D1" - ) + + if settings.DATASOURCE_RETRIEVAL_FROM_AWS: + self.FILE_URL = ( + f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/" + "us_army_fuds/Formerly_Used_Defense_Sites_(FUDS)_" + "all_data_reported_to_Congress_in_FY2020.geojson" + ) + else: + self.FILE_URL: str = ( + "https://opendata.arcgis.com/api/v3/datasets/" + "3f8354667d5b4b1b8ad7a6e00c3cf3b1_1/downloads/" + "data?format=geojson&spatialRefId=4326&where=1%3D1" + ) self.OUTPUT_PATH: Path = self.DATA_PATH / "dataset" / "us_army_fuds" diff --git a/data/data-pipeline/poetry.lock b/data/data-pipeline/poetry.lock index 8e2776d8..793c0702 100644 --- a/data/data-pipeline/poetry.lock +++ b/data/data-pipeline/poetry.lock @@ -346,7 +346,7 @@ conda = ["pyyaml"] [[package]] name = "dynaconf" -version = "3.1.9" +version = "3.1.11" description = "The dynamic configurator for your Python Project" category = "main" optional = false diff --git a/data/data-pipeline/settings.toml b/data/data-pipeline/settings.toml index ce67e23e..d6e88d76 100644 --- a/data/data-pipeline/settings.toml +++ b/data/data-pipeline/settings.toml @@ -1,6 +1,7 @@ [default] AWS_JUSTICE40_DATASOURCES_URL = "https://justice40-data.s3.amazonaws.com/data-sources" AWS_JUSTICE40_DATAPIPELINE_URL = "https://justice40-data.s3.amazonaws.com/data-pipeline" +DATASOURCE_RETRIEVAL_FROM_AWS = true [development]