Mirror of https://github.com/DOI-DO/j40-cejst-2.git
Synced 2025-02-22 17:44:20 -08:00
Data source location (#2015)
* data source location
* toml
* cdc_places
* cdc_svi_index
* url updates
* child oppy and dot travel
* up to hud_recap
* completed ticket
* cache bust
* hud_recap
* us_army_fuds
This commit is contained in:
parent 841a26d566
commit d975118388
13 changed files with 126 additions and 28 deletions
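Every Python hunk below applies the same change: the hard-coded data source URL in each ETL class becomes a branch on the new DATASOURCE_RETRIEVAL_FROM_AWS setting, preferring a mirrored copy under AWS_JUSTICE40_DATASOURCES_URL and falling back to the original upstream endpoint. A minimal runnable sketch of that pattern follows; the stand-in settings object, the ExampleETL class, and the example.gov/example_dataset URLs are illustrative, not from the commit.

# Minimal sketch of the pattern this commit applies to each ETL class.
# `settings` stands in for data_pipeline.config.settings (dynaconf-backed);
# ExampleETL, example.gov, and example_dataset are hypothetical.
from types import SimpleNamespace

settings = SimpleNamespace(
    DATASOURCE_RETRIEVAL_FROM_AWS=True,
    AWS_JUSTICE40_DATASOURCES_URL="https://justice40-data.s3.amazonaws.com/data-sources",
)


class ExampleETL:
    # Prefer the copy mirrored in the Justice40 S3 bucket; otherwise fall
    # back to the original upstream endpoint.
    if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
        SOURCE_URL = (
            f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
            "example_dataset/data.zip"
        )
    else:
        SOURCE_URL = "https://example.gov/files/data.zip"


print(ExampleETL.SOURCE_URL)  # the branch is evaluated once, at class definition

Note that the branch runs at import time (class body) or at construction (__init__), so flipping the flag takes effect on the next pipeline run.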
@@ -10,6 +10,8 @@ from data_pipeline.etl.score.etl_utils import (
 from data_pipeline.score import field_names
 from data_pipeline.utils import download_file_from_url
 from data_pipeline.utils import get_module_logger
+from data_pipeline.config import settings
+

 logger = get_module_logger(__name__)
@@ -20,7 +22,11 @@ class CDCLifeExpectancy(ExtractTransformLoad):

     NAME = "cdc_life_expectancy"

-    USA_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/US_A.CSV"
+    if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
+        USA_FILE_URL = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/US_A.CSV"
+    else:
+        USA_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/US_A.CSV"
+
     LOAD_YAML_CONFIG: bool = False
     LIFE_EXPECTANCY_FIELD_NAME = "Life expectancy (years)"
     INPUT_GEOID_TRACT_FIELD_NAME = "Tract ID"
@@ -29,8 +35,12 @@ class CDCLifeExpectancy(ExtractTransformLoad):

     # For some reason, LEEP does not include Maine or Wisconsin in its "All of
     # USA" file. Load these separately.
-    WISCONSIN_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/WI_A.CSV"
-    MAINE_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/ME_A.CSV"
+    if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
+        WISCONSIN_FILE_URL: str = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/WI_A.CSV"
+        MAINE_FILE_URL: str = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/ME_A.CSV"
+    else:
+        WISCONSIN_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/WI_A.CSV"
+        MAINE_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/ME_A.CSV"

     TRACT_INPUT_COLUMN_NAME = "Tract ID"
     STATE_INPUT_COLUMN_NAME = "STATE2KX"
@@ -6,6 +6,7 @@ from data_pipeline.etl.base import ValidGeoLevel
 from data_pipeline.score import field_names
 from data_pipeline.utils import download_file_from_url
 from data_pipeline.utils import get_module_logger
+from data_pipeline.config import settings

 logger = get_module_logger(__name__)
@@ -22,7 +23,14 @@ class CDCPlacesETL(ExtractTransformLoad):
     def __init__(self):
         self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "cdc_places"

-        self.CDC_PLACES_URL = "https://chronicdata.cdc.gov/api/views/cwsq-ngmh/rows.csv?accessType=DOWNLOAD"
+        if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
+            self.CDC_PLACES_URL = (
+                f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
+                "cdc_places/PLACES__Local_Data_for_Better_Health__Census_Tract_Data_2021_release.csv"
+            )
+        else:
+            self.CDC_PLACES_URL = "https://chronicdata.cdc.gov/api/views/cwsq-ngmh/rows.csv?accessType=DOWNLOAD"
+
         self.COLUMNS_TO_KEEP: typing.List[str] = [
             self.GEOID_TRACT_FIELD_NAME,
             field_names.DIABETES_FIELD,
@@ -3,6 +3,7 @@ import pandas as pd
 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.score import field_names
 from data_pipeline.utils import get_module_logger
+from data_pipeline.config import settings

 logger = get_module_logger(__name__)
@@ -16,7 +17,13 @@ class CDCSVIIndex(ExtractTransformLoad):
     def __init__(self):
         self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "cdc_svi_index"

-        self.CDC_SVI_INDEX_URL = "https://svi.cdc.gov/Documents/Data/2018_SVI_Data/CSV/SVI2018_US.csv"
+        if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
+            self.CDC_SVI_INDEX_URL = (
+                f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
+                "cdc_svi_index/SVI2018_US.csv"
+            )
+        else:
+            self.CDC_SVI_INDEX_URL = "https://svi.cdc.gov/Documents/Data/2018_SVI_Data/CSV/SVI2018_US.csv"

         self.CDC_RPL_THEMES_THRESHOLD = 0.90
@@ -4,6 +4,7 @@ import pandas as pd
 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.etl.base import ValidGeoLevel
 from data_pipeline.utils import get_module_logger
+from data_pipeline.config import settings

 logger = get_module_logger(__name__)
@@ -37,10 +38,16 @@ class ChildOpportunityIndex(ExtractTransformLoad):
     PUERTO_RICO_EXPECTED_IN_DATA = False

     def __init__(self):
-        self.SOURCE_URL = (
-            "https://data.diversitydatakids.org/datastore/zip/f16fff12-b1e5-4f60-85d3-"
-            "3a0ededa30a0?format=csv"
-        )
+        if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
+            self.SOURCE_URL = (
+                f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
+                "child_opportunity_index/raw.zip"
+            )
+        else:
+            self.SOURCE_URL = (
+                "https://data.diversitydatakids.org/datastore/zip/f16fff12-b1e5-4f60-85d3-"
+                "3a0ededa30a0?format=csv"
+            )

         # TODO: Decide about nixing this
         self.TRACT_INPUT_COLUMN_NAME = self.INPUT_GEOID_TRACT_FIELD_NAME
@@ -5,6 +5,7 @@ import pandas as pd
 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.etl.base import ValidGeoLevel
 from data_pipeline.utils import get_module_logger
+from data_pipeline.config import settings

 logger = get_module_logger(__name__)
@@ -13,10 +14,15 @@ class TravelCompositeETL(ExtractTransformLoad):
     """ETL class for the DOT Travel Disadvantage Dataset"""

     NAME = "travel_composite"
-    # Commenting below temporarily to get staging going. In the next PR we'll have env vars to control
-    # data source endpoints to be either "source" or "aws"
-    # SOURCE_URL = "https://www.transportation.gov/sites/dot.gov/files/Shapefile_and_Metadata.zip"
-    SOURCE_URL = "https://justice40-data.s3.amazonaws.com/data-sources/raw-data-sources/dot_travel_composite/Shapefile_and_Metadata.zip"
+
+    if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
+        SOURCE_URL = (
+            f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
+            "dot_travel_composite/Shapefile_and_Metadata.zip"
+        )
+    else:
+        SOURCE_URL = "https://www.transportation.gov/sites/dot.gov/files/Shapefile_and_Metadata.zip"
+
     GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
     PUERTO_RICO_EXPECTED_IN_DATA = False
     LOAD_YAML_CONFIG: bool = True
@@ -5,6 +5,7 @@ from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.score import field_names
 from data_pipeline.utils import get_module_logger
 from data_pipeline.utils import unzip_file_from_url
+from data_pipeline.config import settings

 logger = get_module_logger(__name__)
@@ -21,7 +22,17 @@ class EPARiskScreeningEnvironmentalIndicatorsETL(ExtractTransformLoad):
     """

     def __init__(self):
-        self.AGGREGATED_RSEI_SCORE_FILE_URL = "http://abt-rsei.s3.amazonaws.com/microdata2019/census_agg/CensusMicroTracts2019_2019_aggregated.zip"
+
+        if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
+            self.AGGREGATED_RSEI_SCORE_FILE_URL = (
+                f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
+                "epa_rsei/CensusMicroTracts2019_2019_aggregated.zip"
+            )
+        else:
+            self.AGGREGATED_RSEI_SCORE_FILE_URL = (
+                "http://abt-rsei.s3.amazonaws.com/microdata2019/"
+                "census_agg/CensusMicroTracts2019_2019_aggregated.zip"
+            )

         self.OUTPUT_PATH: Path = self.DATA_PATH / "dataset" / "epa_rsei"
         self.EPA_RSEI_SCORE_THRESHOLD_CUTOFF = 0.75
@@ -2,6 +2,7 @@ import pandas as pd
 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.etl.base import ValidGeoLevel
 from data_pipeline.utils import get_module_logger
+from data_pipeline.config import settings

 logger = get_module_logger(__name__)
@@ -12,7 +13,15 @@ class HudHousingETL(ExtractTransformLoad):

     def __init__(self):
         self.GEOID_TRACT_FIELD_NAME = "GEOID10_TRACT"
-        self.HOUSING_FTP_URL = "https://www.huduser.gov/portal/datasets/cp/2014thru2018-140-csv.zip"
+
+        if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
+            self.HOUSING_FTP_URL = (
+                f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
+                "hud_housing/2014thru2018-140-csv.zip"
+            )
+        else:
+            self.HOUSING_FTP_URL = "https://www.huduser.gov/portal/datasets/cp/2014thru2018-140-csv.zip"
+
         self.HOUSING_ZIP_FILE_DIR = self.get_tmp_path()

         # We measure households earning less than 80% of HUD Area Median Family Income by county
@@ -4,13 +4,24 @@ from data_pipeline.config import settings
 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.utils import get_module_logger


 logger = get_module_logger(__name__)


 class HudRecapETL(ExtractTransformLoad):
     def __init__(self):
-        # pylint: disable=line-too-long
-        self.HUD_RECAP_CSV_URL = "https://opendata.arcgis.com/api/v3/datasets/56de4edea8264fe5a344da9811ef5d6e_0/downloads/data?format=csv&spatialRefId=4326"  # noqa: E501
+
+        if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
+            self.HUD_RECAP_CSV_URL = (
+                f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
+                "hud_recap/Racially_or_Ethnically_Concentrated_Areas_of_Poverty__R_ECAPs_.csv"
+            )
+        else:
+            self.HUD_RECAP_CSV_URL = (
+                "https://opendata.arcgis.com/api/v3/datasets/"
+                "56de4edea8264fe5a344da9811ef5d6e_0/downloads/data?format=csv&spatialRefId=4326"
+            )

         self.HUD_RECAP_CSV = (
             self.get_tmp_path()
             / "Racially_or_Ethnically_Concentrated_Areas_of_Poverty__R_ECAPs_.csv"
@@ -6,6 +6,7 @@ from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.score import field_names
 from data_pipeline.utils import download_file_from_url
 from data_pipeline.utils import get_module_logger
+from data_pipeline.config import settings

 logger = get_module_logger(__name__)
@@ -22,10 +23,16 @@ class MappingInequalityETL(ExtractTransformLoad):
     """

     def __init__(self):
-        self.MAPPING_INEQUALITY_CSV_URL = (
-            "https://raw.githubusercontent.com/americanpanorama/Census_HOLC_Research/"
-            "main/2010_Census_Tracts/holc_tract_lookup.csv"
-        )
+        if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
+            self.MAPPING_INEQUALITY_CSV_URL = (
+                f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
+                "mapping_inequality/holc_tract_lookup.csv"
+            )
+        else:
+            self.MAPPING_INEQUALITY_CSV_URL = (
+                "https://raw.githubusercontent.com/americanpanorama/Census_HOLC_Research/"
+                "main/2010_Census_Tracts/holc_tract_lookup.csv"
+            )
         self.MAPPING_INEQUALITY_CSV = (
             self.get_tmp_path() / "holc_tract_lookup.csv"
         )
@@ -6,6 +6,7 @@ import pandas as pd
 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.etl.base import ValidGeoLevel
 from data_pipeline.utils import get_module_logger
+from data_pipeline.config import settings

 logger = get_module_logger(__name__)
@@ -14,7 +15,18 @@ class NationalRiskIndexETL(ExtractTransformLoad):
     """ETL class for the FEMA National Risk Index dataset"""

     NAME = "national_risk_index"
-    SOURCE_URL = "https://hazards.fema.gov/nri/Content/StaticDocuments/DataDownload//NRI_Table_CensusTracts/NRI_Table_CensusTracts.zip"
+
+    if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
+        SOURCE_URL = (
+            f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
+            "national_risk_index/NRI_Table_CensusTracts.zip"
+        )
+    else:
+        SOURCE_URL = (
+            "https://hazards.fema.gov/nri/Content/StaticDocuments/DataDownload/"
+            "NRI_Table_CensusTracts/NRI_Table_CensusTracts.zip"
+        )
+
     GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
     PUERTO_RICO_EXPECTED_IN_DATA = False
     LOAD_YAML_CONFIG: bool = True
@@ -8,6 +8,7 @@ from data_pipeline.etl.base import ValidGeoLevel
 from data_pipeline.etl.sources.geo_utils import add_tracts_for_geometries
 from data_pipeline.utils import download_file_from_url
 from data_pipeline.utils import get_module_logger
+from data_pipeline.config import settings

 logger = get_module_logger(__name__)
@@ -26,11 +27,19 @@ class USArmyFUDS(ExtractTransformLoad):
     ISLAND_AREAS_EXPECTED_IN_DATA = True

     def __init__(self):
-        self.FILE_URL: str = (
-            "https://opendata.arcgis.com/api/v3/datasets/"
-            "3f8354667d5b4b1b8ad7a6e00c3cf3b1_1/downloads/"
-            "data?format=geojson&spatialRefId=4326&where=1%3D1"
-        )
+        if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
+            self.FILE_URL = (
+                f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
+                "us_army_fuds/Formerly_Used_Defense_Sites_(FUDS)_"
+                "all_data_reported_to_Congress_in_FY2020.geojson"
+            )
+        else:
+            self.FILE_URL: str = (
+                "https://opendata.arcgis.com/api/v3/datasets/"
+                "3f8354667d5b4b1b8ad7a6e00c3cf3b1_1/downloads/"
+                "data?format=geojson&spatialRefId=4326&where=1%3D1"
+            )

         self.OUTPUT_PATH: Path = self.DATA_PATH / "dataset" / "us_army_fuds"
data/data-pipeline/poetry.lock (generated, 2 changes)
@@ -346,7 +346,7 @@ conda = ["pyyaml"]

 [[package]]
 name = "dynaconf"
-version = "3.1.9"
+version = "3.1.11"
 description = "The dynamic configurator for your Python Project"
 category = "main"
 optional = false
@@ -1,6 +1,7 @@
 [default]
 AWS_JUSTICE40_DATASOURCES_URL = "https://justice40-data.s3.amazonaws.com/data-sources"
 AWS_JUSTICE40_DATAPIPELINE_URL = "https://justice40-data.s3.amazonaws.com/data-pipeline"
+DATASOURCE_RETRIEVAL_FROM_AWS = true

 [development]
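The new flag lands in the [default] section, so every dynaconf environment inherits true unless a more specific section overrides it. A hedged sketch of how the flag resolves follows; it assumes a plain dynaconf setup with environment layering, and the project's actual loader in data_pipeline.config may construct this differently.

# Hedged sketch: how a dynaconf settings object resolves the new flag.
# Assumes environments=True layering; the project's real configuration
# may differ.
from dynaconf import Dynaconf

settings = Dynaconf(
    settings_files=["settings.toml"],
    environments=True,  # enable [default] / [development] layering
)

# [default] sets DATASOURCE_RETRIEVAL_FROM_AWS = true; a section such as
# [development] could override it to pull from the original sources.
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
    print("Fetching raw data from the Justice40 S3 mirror")
else:
    print("Fetching raw data from each dataset's original source")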