Data source location (#2015)

* data source location

* toml

* cdc_places

* cdc_svi_index

* url updates

* child oppy and dot travel

* up to hud_recap

* completed ticket

* cache bust

* hud_recap

* us_army_fuds
Jorge Escobar, 2022-10-20 22:49:42 -04:00, committed by GitHub
commit d975118388 (parent 841a26d566)
13 changed files with 126 additions and 28 deletions
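
The pattern below repeats across every changed ETL module: when the new DATASOURCE_RETRIEVAL_FROM_AWS setting is true, raw files are pulled from the Justice40 S3 mirror under AWS_JUSTICE40_DATASOURCES_URL; otherwise they come from the original upstream host. A minimal sketch of the idea with illustrative names (ExampleETL and both URLs are hypothetical, not part of this commit):

from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad

class ExampleETL(ExtractTransformLoad):
    # The branch is evaluated once, at class-definition (or __init__) time,
    # so the rest of the ETL never needs to know which host was chosen.
    if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
        SOURCE_URL = (
            f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
            "example_source/data.csv"  # hypothetical mirror path
        )
    else:
        SOURCE_URL = "https://example.org/upstream/data.csv"  # hypothetical upstream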


@@ -10,6 +10,8 @@ from data_pipeline.etl.score.etl_utils import (
 from data_pipeline.score import field_names
 from data_pipeline.utils import download_file_from_url
 from data_pipeline.utils import get_module_logger
+from data_pipeline.config import settings
 logger = get_module_logger(__name__)
@@ -20,7 +22,11 @@ class CDCLifeExpectancy(ExtractTransformLoad):
     NAME = "cdc_life_expectancy"
-    USA_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/US_A.CSV"
+    if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
+        USA_FILE_URL = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/US_A.CSV"
+    else:
+        USA_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/US_A.CSV"
     LOAD_YAML_CONFIG: bool = False
     LIFE_EXPECTANCY_FIELD_NAME = "Life expectancy (years)"
     INPUT_GEOID_TRACT_FIELD_NAME = "Tract ID"
@@ -29,8 +35,12 @@ class CDCLifeExpectancy(ExtractTransformLoad):
     # For some reason, LEEP does not include Maine or Wisconsin in its "All of
     # USA" file. Load these separately.
-    WISCONSIN_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/WI_A.CSV"
-    MAINE_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/ME_A.CSV"
+    if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
+        WISCONSIN_FILE_URL: str = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/WI_A.CSV"
+        MAINE_FILE_URL: str = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/ME_A.CSV"
+    else:
+        WISCONSIN_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/WI_A.CSV"
+        MAINE_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/ME_A.CSV"
     TRACT_INPUT_COLUMN_NAME = "Tract ID"
     STATE_INPUT_COLUMN_NAME = "STATE2KX"
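
Whichever URL wins the toggle is consumed unchanged by the extract step. A hedged sketch of how CDCLifeExpectancy's extract might use it (download_file_from_url and get_tmp_path both appear elsewhere in this diff, but their exact signatures are assumed here, not shown in this commit):

    def extract(self) -> None:
        # USA_FILE_URL already points at either the S3 mirror or the CDC
        # host, decided above by the settings flag.
        download_file_from_url(
            file_url=self.USA_FILE_URL,
            download_file_name=self.get_tmp_path() / "US_A.CSV",
        )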


@@ -6,6 +6,7 @@ from data_pipeline.etl.base import ValidGeoLevel
 from data_pipeline.score import field_names
 from data_pipeline.utils import download_file_from_url
 from data_pipeline.utils import get_module_logger
+from data_pipeline.config import settings
 logger = get_module_logger(__name__)
@@ -22,7 +23,14 @@ class CDCPlacesETL(ExtractTransformLoad):
     def __init__(self):
         self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "cdc_places"
-        self.CDC_PLACES_URL = "https://chronicdata.cdc.gov/api/views/cwsq-ngmh/rows.csv?accessType=DOWNLOAD"
+        if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
+            self.CDC_PLACES_URL = (
+                f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
+                "cdc_places/PLACES__Local_Data_for_Better_Health__Census_Tract_Data_2021_release.csv"
+            )
+        else:
+            self.CDC_PLACES_URL = "https://chronicdata.cdc.gov/api/views/cwsq-ngmh/rows.csv?accessType=DOWNLOAD"
         self.COLUMNS_TO_KEEP: typing.List[str] = [
             self.GEOID_TRACT_FIELD_NAME,
             field_names.DIABETES_FIELD,


@@ -3,6 +3,7 @@ import pandas as pd
 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.score import field_names
 from data_pipeline.utils import get_module_logger
+from data_pipeline.config import settings
 logger = get_module_logger(__name__)
@@ -16,7 +17,13 @@ class CDCSVIIndex(ExtractTransformLoad):
     def __init__(self):
         self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "cdc_svi_index"
-        self.CDC_SVI_INDEX_URL = "https://svi.cdc.gov/Documents/Data/2018_SVI_Data/CSV/SVI2018_US.csv"
+        if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
+            self.CDC_SVI_INDEX_URL = (
+                f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
+                "cdc_svi_index/SVI2018_US.csv"
+            )
+        else:
+            self.CDC_SVI_INDEX_URL = "https://svi.cdc.gov/Documents/Data/2018_SVI_Data/CSV/SVI2018_US.csv"
         self.CDC_RPL_THEMES_THRESHOLD = 0.90


@@ -4,6 +4,7 @@ import pandas as pd
 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.etl.base import ValidGeoLevel
 from data_pipeline.utils import get_module_logger
+from data_pipeline.config import settings
 logger = get_module_logger(__name__)
@@ -37,10 +38,16 @@ class ChildOpportunityIndex(ExtractTransformLoad):
     PUERTO_RICO_EXPECTED_IN_DATA = False
     def __init__(self):
-        self.SOURCE_URL = (
-            "https://data.diversitydatakids.org/datastore/zip/f16fff12-b1e5-4f60-85d3-"
-            "3a0ededa30a0?format=csv"
-        )
+        if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
+            self.SOURCE_URL = (
+                f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
+                "child_opportunity_index/raw.zip"
+            )
+        else:
+            self.SOURCE_URL = (
+                "https://data.diversitydatakids.org/datastore/zip/f16fff12-b1e5-4f60-85d3-"
+                "3a0ededa30a0?format=csv"
+            )
         # TODO: Decide about nixing this
         self.TRACT_INPUT_COLUMN_NAME = self.INPUT_GEOID_TRACT_FIELD_NAME


@@ -5,6 +5,7 @@ import pandas as pd
 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.etl.base import ValidGeoLevel
 from data_pipeline.utils import get_module_logger
+from data_pipeline.config import settings
 logger = get_module_logger(__name__)
@@ -13,10 +14,15 @@ class TravelCompositeETL(ExtractTransformLoad):
     """ETL class for the DOT Travel Disadvantage Dataset"""
     NAME = "travel_composite"
-    # Commenting below temporarily to get stating going. In the next PR we'll have env vars to control
-    # data source endpoints to be either "source" or "aws"
-    # SOURCE_URL = "https://www.transportation.gov/sites/dot.gov/files/Shapefile_and_Metadata.zip"
-    SOURCE_URL = "https://justice40-data.s3.amazonaws.com/data-sources/raw-data-sources/dot_travel_composite/Shapefile_and_Metadata.zip"
+    if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
+        SOURCE_URL = (
+            f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
+            "dot_travel_composite/Shapefile_and_Metadata.zip"
+        )
+    else:
+        SOURCE_URL = "https://www.transportation.gov/sites/dot.gov/files/Shapefile_and_Metadata.zip"
     GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
     PUERTO_RICO_EXPECTED_IN_DATA = False
     LOAD_YAML_CONFIG: bool = True


@@ -5,6 +5,7 @@ from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.score import field_names
 from data_pipeline.utils import get_module_logger
 from data_pipeline.utils import unzip_file_from_url
+from data_pipeline.config import settings
 logger = get_module_logger(__name__)
@@ -21,7 +22,17 @@ class EPARiskScreeningEnvironmentalIndicatorsETL(ExtractTransformLoad):
     """
     def __init__(self):
-        self.AGGREGATED_RSEI_SCORE_FILE_URL = "http://abt-rsei.s3.amazonaws.com/microdata2019/census_agg/CensusMicroTracts2019_2019_aggregated.zip"
+        if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
+            self.AGGREGATED_RSEI_SCORE_FILE_URL = (
+                f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
+                "epa_rsei/CensusMicroTracts2019_2019_aggregated.zip"
+            )
+        else:
+            self.AGGREGATED_RSEI_SCORE_FILE_URL = (
+                "http://abt-rsei.s3.amazonaws.com/microdata2019/"
+                "census_agg/CensusMicroTracts2019_2019_aggregated.zip"
+            )
         self.OUTPUT_PATH: Path = self.DATA_PATH / "dataset" / "epa_rsei"
         self.EPA_RSEI_SCORE_THRESHOLD_CUTOFF = 0.75


@@ -2,6 +2,7 @@ import pandas as pd
 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.etl.base import ValidGeoLevel
 from data_pipeline.utils import get_module_logger
+from data_pipeline.config import settings
 logger = get_module_logger(__name__)
@@ -12,7 +13,15 @@ class HudHousingETL(ExtractTransformLoad):
     def __init__(self):
         self.GEOID_TRACT_FIELD_NAME = "GEOID10_TRACT"
-        self.HOUSING_FTP_URL = "https://www.huduser.gov/portal/datasets/cp/2014thru2018-140-csv.zip"
+        if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
+            self.HOUSING_FTP_URL = (
+                f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
+                "hud_housing/2014thru2018-140-csv.zip"
+            )
+        else:
+            self.HOUSING_FTP_URL = "https://www.huduser.gov/portal/datasets/cp/2014thru2018-140-csv.zip"
         self.HOUSING_ZIP_FILE_DIR = self.get_tmp_path()
         # We measure households earning less than 80% of HUD Area Median Family Income by county


@@ -4,13 +4,24 @@ from data_pipeline.config import settings
 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.utils import get_module_logger
 logger = get_module_logger(__name__)
 class HudRecapETL(ExtractTransformLoad):
     def __init__(self):
-        # pylint: disable=line-too-long
-        self.HUD_RECAP_CSV_URL = "https://opendata.arcgis.com/api/v3/datasets/56de4edea8264fe5a344da9811ef5d6e_0/downloads/data?format=csv&spatialRefId=4326"  # noqa: E501
+        if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
+            self.HUD_RECAP_CSV_URL = (
+                f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
+                "hud_recap/Racially_or_Ethnically_Concentrated_Areas_of_Poverty__R_ECAPs_.csv"
+            )
+        else:
+            self.HUD_RECAP_CSV_URL = (
+                "https://opendata.arcgis.com/api/v3/datasets/"
+                "56de4edea8264fe5a344da9811ef5d6e_0/downloads/data?format=csv&spatialRefId=4326"
+            )
         self.HUD_RECAP_CSV = (
             self.get_tmp_path()
             / "Racially_or_Ethnically_Concentrated_Areas_of_Poverty__R_ECAPs_.csv"


@@ -6,6 +6,7 @@ from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.score import field_names
 from data_pipeline.utils import download_file_from_url
 from data_pipeline.utils import get_module_logger
+from data_pipeline.config import settings
 logger = get_module_logger(__name__)
@@ -22,10 +23,16 @@ class MappingInequalityETL(ExtractTransformLoad):
     """
     def __init__(self):
-        self.MAPPING_INEQUALITY_CSV_URL = (
-            "https://raw.githubusercontent.com/americanpanorama/Census_HOLC_Research/"
-            "main/2010_Census_Tracts/holc_tract_lookup.csv"
-        )
+        if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
+            self.MAPPING_INEQUALITY_CSV_URL = (
+                f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
+                "mapping_inequality/holc_tract_lookup.csv"
+            )
+        else:
+            self.MAPPING_INEQUALITY_CSV_URL = (
+                "https://raw.githubusercontent.com/americanpanorama/Census_HOLC_Research/"
+                "main/2010_Census_Tracts/holc_tract_lookup.csv"
+            )
         self.MAPPING_INEQUALITY_CSV = (
             self.get_tmp_path() / "holc_tract_lookup.csv"
         )


@@ -6,6 +6,7 @@ import pandas as pd
 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.etl.base import ValidGeoLevel
 from data_pipeline.utils import get_module_logger
+from data_pipeline.config import settings
 logger = get_module_logger(__name__)
@@ -14,7 +15,18 @@ class NationalRiskIndexETL(ExtractTransformLoad):
     """ETL class for the FEMA National Risk Index dataset"""
     NAME = "national_risk_index"
-    SOURCE_URL = "https://hazards.fema.gov/nri/Content/StaticDocuments/DataDownload//NRI_Table_CensusTracts/NRI_Table_CensusTracts.zip"
+    if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
+        SOURCE_URL = (
+            f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
+            "national_risk_index/NRI_Table_CensusTracts.zip"
+        )
+    else:
+        SOURCE_URL = (
+            "https://hazards.fema.gov/nri/Content/StaticDocuments/DataDownload/"
+            "NRI_Table_CensusTracts/NRI_Table_CensusTracts.zip"
+        )
     GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
     PUERTO_RICO_EXPECTED_IN_DATA = False
     LOAD_YAML_CONFIG: bool = True


@@ -8,6 +8,7 @@ from data_pipeline.etl.base import ValidGeoLevel
 from data_pipeline.etl.sources.geo_utils import add_tracts_for_geometries
 from data_pipeline.utils import download_file_from_url
 from data_pipeline.utils import get_module_logger
+from data_pipeline.config import settings
 logger = get_module_logger(__name__)
@@ -26,11 +27,19 @@ class USArmyFUDS(ExtractTransformLoad):
     ISLAND_AREAS_EXPECTED_IN_DATA = True
     def __init__(self):
-        self.FILE_URL: str = (
-            "https://opendata.arcgis.com/api/v3/datasets/"
-            "3f8354667d5b4b1b8ad7a6e00c3cf3b1_1/downloads/"
-            "data?format=geojson&spatialRefId=4326&where=1%3D1"
-        )
+        if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
+            self.FILE_URL = (
+                f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
+                "us_army_fuds/Formerly_Used_Defense_Sites_(FUDS)_"
+                "all_data_reported_to_Congress_in_FY2020.geojson"
+            )
+        else:
+            self.FILE_URL: str = (
+                "https://opendata.arcgis.com/api/v3/datasets/"
+                "3f8354667d5b4b1b8ad7a6e00c3cf3b1_1/downloads/"
+                "data?format=geojson&spatialRefId=4326&where=1%3D1"
+            )
         self.OUTPUT_PATH: Path = self.DATA_PATH / "dataset" / "us_army_fuds"


@@ -346,7 +346,7 @@ conda = ["pyyaml"]
 [[package]]
 name = "dynaconf"
-version = "3.1.9"
+version = "3.1.11"
 description = "The dynamic configurator for your Python Project"
 category = "main"
 optional = false


@@ -1,6 +1,7 @@
 [default]
 AWS_JUSTICE40_DATASOURCES_URL = "https://justice40-data.s3.amazonaws.com/data-sources"
 AWS_JUSTICE40_DATAPIPELINE_URL = "https://justice40-data.s3.amazonaws.com/data-pipeline"
+DATASOURCE_RETRIEVAL_FROM_AWS = true
 [development]
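
For context, the settings object imported throughout these files is presumably constructed with dynaconf (bumped to 3.1.11 in poetry.lock above). A minimal sketch of what data_pipeline/config.py would need for the [default]/[development] layering and the new flag to work, assuming dynaconf 3.x's standard setup (this wiring is not shown in the diff):

from dynaconf import Dynaconf

# Assumed wiring: load settings.toml with layered environments so [default]
# supplies DATASOURCE_RETRIEVAL_FROM_AWS = true and [development] can
# override it.
settings = Dynaconf(
    envvar_prefix="DYNACONF",
    settings_files=["settings.toml"],
    environments=True,
)

Assuming that envvar_prefix, the flag can also be flipped per run without editing the file by exporting DYNACONF_DATASOURCE_RETRIEVAL_FROM_AWS=false before invoking the pipeline, since dynaconf applies prefixed environment variables on top of the TOML layers.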