Score F, testing methodology (#510)

* fixing dependency issue

* fixing more dependencies

* including fraction of state AMI

* wip

* nitpick whitespace

* etl working now

* wip on scoring

* fix rename error

* reducing metrics

* fixing score f

* fixing readme

* adding dependency

* passing tests

* linting/black

* removing unnecessary sample

* fixing error

* adding verify flag on etl/base

Co-authored-by: Jorge Escobar <jorge.e.escobar@omb.eop.gov>
Lucas Merrill Brown 2021-08-24 15:40:54 -05:00 committed by GitHub
commit 65ceb7900f
23 changed files with 557 additions and 153 deletions


@@ -1,4 +1,5 @@
from pathlib import Path
from typing import Optional
from data_pipeline.config import settings
from data_pipeline.utils import unzip_file_from_url, remove_all_from_dir
@@ -33,14 +34,21 @@ class ExtractTransformLoad:
pass
def extract(self, source_url: str = None, extract_path: Path = None) -> None:
def extract(
self,
source_url: str = None,
extract_path: Path = None,
verify: Optional[bool] = True,
) -> None:
"""Extract the data from
a remote source. By default it provides code to get the file from a source url,
unzips it and stores it on an extract_path."""
# this can be accessed via super().extract()
if source_url and extract_path:
unzip_file_from_url(source_url, self.TMP_PATH, extract_path)
unzip_file_from_url(
source_url, self.TMP_PATH, extract_path, verify=verify
)
def transform(self) -> None:
"""Transform the data extracted into a format that can be consumed by the


@@ -34,6 +34,11 @@ DATASET_LIST = [
"module_dir": "hud_recap",
"class_name": "HudRecapETL",
},
{
"name": "cdc_places",
"module_dir": "cdc_places",
"class_name": "CDCPlacesETL",
},
]
CENSUS_INFO = {
"name": "census",


@@ -50,6 +50,7 @@ class ScoreETL(ExtractTransformLoad):
self.census_df: pd.DataFrame
self.housing_and_transportation_df: pd.DataFrame
self.hud_housing_df: pd.DataFrame
self.cdc_places_df: pd.DataFrame
def data_sets(self) -> list:
# Define a named tuple that will be used for each data set input.
@@ -81,6 +82,36 @@ class ScoreETL(ExtractTransformLoad):
renamed_field=self.MEDIAN_INCOME_AS_PERCENT_OF_STATE_FIELD_NAME,
bucket=None,
),
DataSet(
input_field="Current asthma among adults aged >=18 years",
renamed_field="Current asthma among adults aged >=18 years",
bucket=None,
),
DataSet(
input_field="Coronary heart disease among adults aged >=18 years",
renamed_field="Coronary heart disease among adults aged >=18 years",
bucket=None,
),
DataSet(
input_field="Cancer (excluding skin cancer) among adults aged >=18 years",
renamed_field="Cancer (excluding skin cancer) among adults aged >=18 years",
bucket=None,
),
DataSet(
input_field="Current lack of health insurance among adults aged 18-64 years",
renamed_field="Current lack of health insurance among adults aged 18-64 years",
bucket=None,
),
DataSet(
input_field="Diagnosed diabetes among adults aged >=18 years",
renamed_field="Diagnosed diabetes among adults aged >=18 years",
bucket=None,
),
DataSet(
input_field="Physical health not good for >=14 days among adults aged >=18 years",
renamed_field="Physical health not good for >=14 days among adults aged >=18 years",
bucket=None,
),
# The following data sets have buckets, because they're used in Score C
DataSet(
input_field="CANCER",
@@ -218,6 +249,14 @@ class ScoreETL(ExtractTransformLoad):
low_memory=False,
)
# Load CDC Places data
cdc_places_csv = self.DATA_PATH / "dataset" / "cdc_places" / "usa.csv"
self.cdc_places_df = pd.read_csv(
cdc_places_csv,
dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
low_memory=False,
)
def transform(self) -> None:
## IMPORTANT: THIS METHOD IS CLOSE TO THE LIMIT OF STATEMENTS
@@ -247,8 +286,28 @@
)
# Join all the data sources that use census tracts
# TODO: when there's more than one data source using census tract, reduce/merge them here.
census_tract_df = self.hud_housing_df
census_tract_dfs = [
self.hud_housing_df,
self.cdc_places_df,
]
census_tract_df = functools.reduce(
lambda left, right: pd.merge(
left=left,
right=right,
on=self.GEOID_TRACT_FIELD_NAME,
how="outer",
),
census_tract_dfs,
)
# Sanity check the join.
if (
len(census_tract_df[self.GEOID_TRACT_FIELD_NAME].str.len().unique())
!= 1
):
raise ValueError(
f"One of the input CSVs uses {self.GEOID_TRACT_FIELD_NAME} with a different length."
)
# Calculate the tract for the CBG data.
census_block_group_df[
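
The functools.reduce call generalizes the old single-source assignment to an outer merge over any number of tract-level dataframes, and the length check guards against a source whose tract IDs use a different scheme (for example the CHAS data's 14000US-prefixed IDs mentioned in the HUD housing ETL). A toy illustration of the pattern, with made-up values in real column names:

    import functools

    import pandas as pd

    GEOID = "GEOID10_TRACT"  # stand-in for self.GEOID_TRACT_FIELD_NAME
    hud = pd.DataFrame(
        {GEOID: ["01001020100"], "Housing burden (percent)": [0.4]}
    )
    cdc = pd.DataFrame(
        {
            GEOID: ["01001020100"],
            "Diagnosed diabetes among adults aged >=18 years": [11.2],
        }
    )

    merged = functools.reduce(
        lambda left, right: pd.merge(
            left=left, right=right, on=GEOID, how="outer"
        ),
        [hud, cdc],
    )
    # All tract IDs are 11 characters, so the sanity check passes.
    assert merged[GEOID].str.len().unique().tolist() == [11]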
@@ -437,12 +496,56 @@
)
self.df[meets_burden_field_name] = (
self.df["Particulate matter (PM2.5)"] > 10
) | (self.df["Respiratory hazard " "index"] > 0.75)
(self.df["Particulate matter (PM2.5) (percentile)"] > 0.9)
| (self.df["Respiratory hazard index (percentile)"] > 0.9)
| (self.df["Traffic proximity and volume (percentile)"] > 0.9)
| (
self.df[
"Percent pre-1960s housing (lead paint indicator) (percentile)"
]
> 0.9
)
| (self.df["Proximity to RMP sites (percentile)"] > 0.9)
| (
self.df[
"Current asthma among adults aged >=18 years (percentile)"
]
> 0.9
)
| (
self.df[
"Coronary heart disease among adults aged >=18 years (percentile)"
]
> 0.9
)
| (
self.df[
"Cancer (excluding skin cancer) among adults aged >=18 years (percentile)"
]
> 0.9
)
# | (
# self.df[
# "Current lack of health insurance among adults aged 18-64 years (percentile)"
# ]
# > 0.9
# )
| (
self.df[
"Diagnosed diabetes among adults aged >=18 years (percentile)"
]
> 0.9
)
# | (
# self.df[
# "Physical health not good for >=14 days among adults aged >=18 years (percentile)"
# ]
# > 0.9
# )
)
self.df["Score F (communities)"] = (
self.df[ami_and_high_school_field_name]
& self.df[meets_burden_field_name]
self.df[meets_socio_field_name] & self.df[meets_burden_field_name]
)
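
Net effect: a row (census block group) gets Score F when it meets the socioeconomic criteria and crosses the 90th percentile on at least one enumerated burden. A toy illustration of the boolean composition, using just two of the burden columns:

    import pandas as pd

    df = pd.DataFrame(
        {
            "Particulate matter (PM2.5) (percentile)": [0.95, 0.50, 0.92],
            "Respiratory hazard index (percentile)": [0.40, 0.30, 0.91],
            "Meets socioeconomic criteria": [True, True, False],
        }
    )
    df["Meets burden criteria"] = (
        df["Particulate matter (PM2.5) (percentile)"] > 0.9
    ) | (df["Respiratory hazard index (percentile)"] > 0.9)
    df["Score F (communities)"] = (
        df["Meets socioeconomic criteria"] & df["Meets burden criteria"]
    )
    # Row 0 qualifies (socio + PM2.5); row 1 fails every burden test;
    # row 2 exceeds two burdens but misses the socioeconomic criteria.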
def load(self) -> None:
@@ -450,10 +553,4 @@ class ScoreETL(ExtractTransformLoad):
# write nationwide csv
self.SCORE_CSV_PATH.mkdir(parents=True, exist_ok=True)
# TODO: drop
self.df[0:10000].to_csv(
self.SCORE_CSV_PATH / "usa-10000.csv", index=False
)
self.df.to_csv(self.SCORE_CSV_PATH / "usa.csv", index=False)


@@ -10,14 +10,19 @@ logger = get_module_logger(__name__)
class CalEnviroScreenETL(ExtractTransformLoad):
def __init__(self):
self.CALENVIROSCREEN_FTP_URL = (
settings.AWS_JUSTICE40_DATASOURCES_URL + "/CalEnviroScreen_4.0_2021.zip"
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/CalEnviroScreen_4.0_2021.zip"
)
self.CALENVIROSCREEN_CSV = (
self.TMP_PATH / "CalEnviroScreen_4.0_2021.csv"
)
self.CALENVIROSCREEN_CSV = self.TMP_PATH / "CalEnviroScreen_4.0_2021.csv"
self.CSV_PATH = self.DATA_PATH / "dataset" / "calenviroscreen4"
# Defining some variable names
self.CALENVIROSCREEN_SCORE_FIELD_NAME = "calenviroscreen_score"
self.CALENVIROSCREEN_PERCENTILE_FIELD_NAME = "calenviroscreen_percentile"
self.CALENVIROSCREEN_PERCENTILE_FIELD_NAME = (
"calenviroscreen_percentile"
)
self.CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD_NAME = (
"calenviroscreen_priority_community"
)


@@ -0,0 +1,66 @@
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.utils import get_module_logger, download_file_from_url
logger = get_module_logger(__name__)
class CDCPlacesETL(ExtractTransformLoad):
def __init__(self):
self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "cdc_places"
self.CDC_PLACES_URL = "https://chronicdata.cdc.gov/api/views/cwsq-ngmh/rows.csv?accessType=DOWNLOAD"
self.CDC_GEOID_FIELD_NAME = "LocationID"
self.CDC_VALUE_FIELD_NAME = "Data_Value"
self.CDC_MEASURE_FIELD_NAME = "Measure"
self.df: pd.DataFrame
def extract(self) -> None:
logger.info("Starting to download 520MB CDC Places file.")
file_path = download_file_from_url(
file_url=self.CDC_PLACES_URL,
download_file_name=self.TMP_PATH
/ "cdc_places"
/ "census_tract.csv",
)
self.df = pd.read_csv(
filepath_or_buffer=file_path,
dtype={self.CDC_GEOID_FIELD_NAME: "string"},
low_memory=False,
)
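
Parsing LocationID with a string dtype matters: tract GEOIDs carry leading zeros (Alabama's FIPS prefix is 01), and pandas' default numeric inference would silently strip them. A quick demonstration of the failure mode:

    import io

    import pandas as pd

    raw = "LocationID,Data_Value\n01001020100,11.2\n"
    bad = pd.read_csv(io.StringIO(raw))
    assert bad["LocationID"][0] == 1001020100  # leading zero lost
    good = pd.read_csv(io.StringIO(raw), dtype={"LocationID": "string"})
    assert good["LocationID"][0] == "01001020100"  # preserved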
def transform(self) -> None:
logger.info("Starting CDC Places transform")
# Rename GEOID field
self.df.rename(
columns={self.CDC_GEOID_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME},
inplace=True,
errors="raise",
)
# Note: Puerto Rico not included.
self.df = self.df.pivot(
index=self.GEOID_TRACT_FIELD_NAME,
columns=self.CDC_MEASURE_FIELD_NAME,
values=self.CDC_VALUE_FIELD_NAME,
)
# Make the index (the census tract ID) a column, not the index.
self.df.reset_index(inplace=True)
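
The pivot reshapes CDC's long format (one row per tract-and-measure pair) into the wide, one-row-per-tract frame the score join expects. A toy version with two of the measures:

    import pandas as pd

    long_df = pd.DataFrame(
        {
            "GEOID10_TRACT": ["01001020100", "01001020100"],
            "Measure": [
                "Current asthma among adults aged >=18 years",
                "Diagnosed diabetes among adults aged >=18 years",
            ],
            "Data_Value": [9.9, 11.2],
        }
    )
    wide_df = long_df.pivot(
        index="GEOID10_TRACT", columns="Measure", values="Data_Value"
    )
    wide_df.reset_index(inplace=True)
    # wide_df: one row, with the tract ID plus one column per measure.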
def load(self) -> None:
logger.info("Saving CDC Places Data")
# Create the output directory if it doesn't exist.
self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
self.df.to_csv(path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False)
def validate(self) -> None:
logger.info("Validating Census ACS Data")
pass


@@ -33,7 +33,9 @@ class CensusETL(ExtractTransformLoad):
self.NATIONAL_CBG_CSV_PATH = self.CSV_BASE_PATH / "us.csv"
self.NATIONAL_CBG_JSON_PATH = self.GEOJSON_BASE_PATH / "us.json"
def _path_for_fips_file(self, fips_code: str, file_type: GeoFileType) -> Path:
def _path_for_fips_file(
self, fips_code: str, file_type: GeoFileType
) -> Path:
"""Get paths for associated geospatial files for the provided FIPS code
Args:
@@ -93,7 +95,9 @@ class CensusETL(ExtractTransformLoad):
None
"""
shp_file_path = self._path_for_fips_file(fips_code, GeoFileType.SHP)
geojson_file_path = self._path_for_fips_file(fips_code, GeoFileType.GEOJSON)
geojson_file_path = self._path_for_fips_file(
fips_code, GeoFileType.GEOJSON
)
logger.info(f"Checking if {fips_code} geoJSON file exists ")
if not geojson_file_path.is_file():
logger.info(
@@ -176,7 +180,9 @@ class CensusETL(ExtractTransformLoad):
if not self.NATIONAL_CBG_CSV_PATH.is_file():
logger.info(f"Creating {self.NATIONAL_CBG_CSV_PATH}")
with open(self.NATIONAL_CBG_CSV_PATH, mode="w", newline="") as cbg_csv_file:
with open(
self.NATIONAL_CBG_CSV_PATH, mode="w", newline=""
) as cbg_csv_file:
cbg_csv_file_writer = csv.writer(
cbg_csv_file,
delimiter=",",
@@ -205,7 +211,9 @@ class CensusETL(ExtractTransformLoad):
state_gdf = gpd.read_file(file_name)
usa_df = usa_df.append(state_gdf)
usa_df = usa_df.to_crs("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs")
usa_df = usa_df.to_crs(
"+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"
)
logger.info("Writing national geojson file")
usa_df.to_file(self.NATIONAL_CBG_JSON_PATH, driver="GeoJSON")


@@ -41,10 +41,10 @@ class CensusACSETL(ExtractTransformLoad):
self.STATE_MEDIAN_INCOME_FTP_URL = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/2014_to_2019_state_median_income.zip"
+ "/2015_to_2019_state_median_income.zip"
)
self.STATE_MEDIAN_INCOME_FILE_PATH = (
self.TMP_PATH / "2014_to_2019_state_median_income.csv"
self.TMP_PATH / "2015_to_2019_state_median_income.csv"
)
def _fips_from_censusdata_censusgeo(


@@ -8,9 +8,7 @@ logger = get_module_logger(__name__)
class EJScreenETL(ExtractTransformLoad):
def __init__(self):
self.EJSCREEN_FTP_URL = (
"https://gaftp.epa.gov/EJSCREEN/2019/EJSCREEN_2019_StatePctile.csv.zip"
)
self.EJSCREEN_FTP_URL = "https://gaftp.epa.gov/EJSCREEN/2019/EJSCREEN_2019_StatePctile.csv.zip"
self.EJSCREEN_CSV = self.TMP_PATH / "EJSCREEN_2019_StatePctiles.csv"
self.CSV_PATH = self.DATA_PATH / "dataset" / "ejscreen_2019"
self.df: pd.DataFrame
@@ -20,6 +18,7 @@ class EJScreenETL(ExtractTransformLoad):
super().extract(
self.EJSCREEN_FTP_URL,
self.TMP_PATH,
verify=False, # EPA EJScreen end point has certificate issues often
)
def transform(self) -> None:


@@ -35,7 +35,9 @@ class HousingTransportationETL(ExtractTransformLoad):
)
# New file name:
tmp_csv_file_path = zip_file_dir / f"htaindex_data_blkgrps_{fips}.csv"
tmp_csv_file_path = (
zip_file_dir / f"htaindex_data_blkgrps_{fips}.csv"
)
tmp_df = pd.read_csv(filepath_or_buffer=tmp_csv_file_path)
dfs.append(tmp_df)
@@ -47,9 +49,9 @@ class HousingTransportationETL(ExtractTransformLoad):
# Rename and reformat block group ID
self.df.rename(columns={"blkgrp": self.GEOID_FIELD_NAME}, inplace=True)
self.df[self.GEOID_FIELD_NAME] = self.df[self.GEOID_FIELD_NAME].str.replace(
'"', ""
)
self.df[self.GEOID_FIELD_NAME] = self.df[
self.GEOID_FIELD_NAME
].str.replace('"', "")
def load(self) -> None:
logger.info("Saving Housing and Transportation Data")


@@ -9,16 +9,16 @@ class HudHousingETL(ExtractTransformLoad):
def __init__(self):
self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "hud_housing"
self.GEOID_TRACT_FIELD_NAME = "GEOID10_TRACT"
self.HOUSING_FTP_URL = (
"https://www.huduser.gov/portal/datasets/cp/2012thru2016-140-csv.zip"
)
self.HOUSING_FTP_URL = "https://www.huduser.gov/portal/datasets/cp/2012thru2016-140-csv.zip"
self.HOUSING_ZIP_FILE_DIR = self.TMP_PATH / "hud_housing"
# We measure households earning less than 80% of HUD Area Median Family Income by county
# and paying greater than 30% of their income to housing costs.
self.HOUSING_BURDEN_FIELD_NAME = "Housing burden (percent)"
self.HOUSING_BURDEN_NUMERATOR_FIELD_NAME = "HOUSING_BURDEN_NUMERATOR"
self.HOUSING_BURDEN_DENOMINATOR_FIELD_NAME = "HOUSING_BURDEN_DENOMINATOR"
self.HOUSING_BURDEN_DENOMINATOR_FIELD_NAME = (
"HOUSING_BURDEN_DENOMINATOR"
)
# Note: some variable definitions.
# HUD-adjusted median family income (HAMFI).
@@ -55,7 +55,9 @@ class HudHousingETL(ExtractTransformLoad):
)
# Rename and reformat block group ID
self.df.rename(columns={"geoid": self.GEOID_TRACT_FIELD_NAME}, inplace=True)
self.df.rename(
columns={"geoid": self.GEOID_TRACT_FIELD_NAME}, inplace=True
)
# The CHAS data has census tract ids such as `14000US01001020100`
# Whereas the rest of our data uses, for the same tract, `01001020100`.
@@ -273,7 +275,9 @@ class HudHousingETL(ExtractTransformLoad):
# TODO: add small sample size checks
self.df[self.HOUSING_BURDEN_FIELD_NAME] = self.df[
self.HOUSING_BURDEN_NUMERATOR_FIELD_NAME
].astype(float) / self.df[self.HOUSING_BURDEN_DENOMINATOR_FIELD_NAME].astype(
].astype(float) / self.df[
self.HOUSING_BURDEN_DENOMINATOR_FIELD_NAME
].astype(
float
)
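
Per the comment at the top of this ETL, the ratio captures the share of households earning under 80% of HAMFI that pay more than 30% of income toward housing. With hypothetical counts for a single tract (the real numerator and denominator are sums over several CHAS table cells):

    import pandas as pd

    df = pd.DataFrame(
        {
            "HOUSING_BURDEN_NUMERATOR": [250],
            "HOUSING_BURDEN_DENOMINATOR": [1000],
        }
    )
    df["Housing burden (percent)"] = df["HOUSING_BURDEN_NUMERATOR"].astype(
        float
    ) / df["HOUSING_BURDEN_DENOMINATOR"].astype(float)
    # -> 0.25 for this hypothetical tract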


@@ -18,7 +18,9 @@ class HudRecapETL(ExtractTransformLoad):
self.CSV_PATH = self.DATA_PATH / "dataset" / "hud_recap"
# Defining some variable names
self.HUD_RECAP_PRIORITY_COMMUNITY_FIELD_NAME = "hud_recap_priority_community"
self.HUD_RECAP_PRIORITY_COMMUNITY_FIELD_NAME = (
"hud_recap_priority_community"
)
self.df: pd.DataFrame


@@ -8,9 +8,7 @@ logger = get_module_logger(__name__)
class TreeEquityScoreETL(ExtractTransformLoad):
def __init__(self):
self.TES_URL = (
"https://national-tes-data-share.s3.amazonaws.com/national_tes_share/"
)
self.TES_URL = "https://national-tes-data-share.s3.amazonaws.com/national_tes_share/"
self.TES_CSV = self.TMP_PATH / "tes_2021_data.csv"
self.CSV_PATH = self.DATA_PATH / "dataset" / "tree_equity_score"
self.df: gpd.GeoDataFrame
@@ -78,8 +76,12 @@ class TreeEquityScoreETL(ExtractTransformLoad):
logger.info("Transforming Tree Equity Score Data")
tes_state_dfs = []
for state in self.states:
tes_state_dfs.append(gpd.read_file(f"{self.TMP_PATH}/{state}/{state}.shp"))
self.df = gpd.GeoDataFrame(pd.concat(tes_state_dfs), crs=tes_state_dfs[0].crs)
tes_state_dfs.append(
gpd.read_file(f"{self.TMP_PATH}/{state}/{state}.shp")
)
self.df = gpd.GeoDataFrame(
pd.concat(tes_state_dfs), crs=tes_state_dfs[0].crs
)
def load(self) -> None:
logger.info("Saving Tree Equity Score GeoJSON")


@@ -3,9 +3,6 @@
{
"cell_type": "code",
"execution_count": null,
"id": "0491828b",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import censusdata\n",
@@ -32,30 +29,26 @@
"# Some display settings to make pandas outputs more readable.\n",
"pd.set_option(\"display.expand_frame_repr\", False)\n",
"pd.set_option(\"display.precision\", 2)"
]
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"id": "654f25a1",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Following the tutorial at https://jtleider.github.io/censusdata/example1.html.\n",
"# Full list of fields is at https://www2.census.gov/programs-surveys/acs/summary_file/2019/documentation/user_tools/ACS2019_Table_Shells.xlsx\n",
"censusdata.printtable(censusdata.censustable(src=\"acs5\", year=ACS_YEAR, table=\"B19013\"))"
]
],
"outputs": [],
"metadata": {
"scrolled": true
}
},
{
"cell_type": "code",
"execution_count": null,
"id": "8999cea4",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"def fips_from_censusdata_censusgeo(censusgeo: censusdata.censusgeo) -> str:\n",
" \"\"\"Create a FIPS code from the proprietary censusgeo index.\"\"\"\n",
@@ -85,31 +78,33 @@
"df[GEOID_FIELD_NAME] = df.index.to_series().apply(func=fips_from_censusdata_censusgeo)\n",
"\n",
"df.head()"
]
],
"outputs": [],
"metadata": {
"scrolled": true
}
},
{
"cell_type": "code",
"execution_count": null,
"id": "2a269bb1",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"columns_to_include = [\"GEOID2\", \"Median household income (State)\"]\n",
"\n",
"df.rename(columns={\"GEOID10\": \"GEOID2\", \"B19013_001E\": \"Median household income (State)\"}, inplace=True)\n",
"\n",
"df[columns_to_include].to_csv(path_or_buf= \"/Users/lucas/Documents/usds/repos/justice40-tool/data/data-pipeline/data_pipeline/data/needs_to_be_moved_to_s3/2014_to_2019_state_median_income.csv\", index=False)"
]
"# df[columns_to_include].to_csv(path_or_buf= \"/Users/lucas/Documents/usds/repos/justice40-tool/data/data-pipeline/data_pipeline/data/needs_to_be_moved_to_s3/2014_to_2019_state_median_income.csv\", index=False)"
],
"outputs": [],
"metadata": {
"scrolled": true
}
},
{
"cell_type": "code",
"execution_count": null,
"id": "91932af5",
"metadata": {},
"source": [],
"outputs": [],
"source": []
"metadata": {}
}
],
"metadata": {
@@ -133,4 +128,4 @@
},
"nbformat": 4,
"nbformat_minor": 5
}
}


@@ -28,7 +28,7 @@
"from datetime import datetime\n",
"from tqdm.notebook import tqdm_notebook\n",
"\n",
"module_path = os.path.abspath(os.path.join(\"..\"))\n",
"module_path = os.path.abspath(os.path.join(\"../..\"))\n",
"if module_path not in sys.path:\n",
" sys.path.append(module_path)\n",
"\n",
@@ -215,7 +215,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "8b795fb4",
"id": "274f6bc6",
"metadata": {},
"outputs": [],
"source": [
@@ -234,6 +234,21 @@
"# (`census_tract_indices`).\n",
"census_block_group_indices = [\n",
" Index(\n",
" method_name=\"Score F\",\n",
" priority_communities_field=\"Score F (communities)\",\n",
" other_census_tract_fields_to_keep=[],\n",
" ),\n",
" Index(\n",
" method_name=\"Score F (socioeconomic only)\",\n",
" priority_communities_field=\"Meets socioeconomic criteria\",\n",
" other_census_tract_fields_to_keep=[],\n",
" ),\n",
" Index(\n",
" method_name=\"Score F (burden only)\",\n",
" priority_communities_field=\"Meets burden criteria\",\n",
" other_census_tract_fields_to_keep=[],\n",
" ),\n",
" Index(\n",
" method_name=\"Score A\",\n",
" priority_communities_field=\"Score A (top 25th percentile)\",\n",
" other_census_tract_fields_to_keep=[],\n",
@@ -253,21 +268,21 @@
" priority_communities_field=\"Score D (top 25th percentile)\",\n",
" other_census_tract_fields_to_keep=[],\n",
" ),\n",
" Index(\n",
" method_name=\"Score D (30th percentile)\",\n",
" priority_communities_field=\"Score D (top 30th percentile)\",\n",
" other_census_tract_fields_to_keep=[],\n",
" ),\n",
" Index(\n",
" method_name=\"Score D (35th percentile)\",\n",
" priority_communities_field=\"Score D (top 35th percentile)\",\n",
" other_census_tract_fields_to_keep=[],\n",
" ),\n",
" Index(\n",
" method_name=\"Score D (40th percentile)\",\n",
" priority_communities_field=\"Score D (top 40th percentile)\",\n",
" other_census_tract_fields_to_keep=[],\n",
" ),\n",
"# Index(\n",
"# method_name=\"Score D (30th percentile)\",\n",
"# priority_communities_field=\"Score D (top 30th percentile)\",\n",
"# other_census_tract_fields_to_keep=[],\n",
"# ),\n",
"# Index(\n",
"# method_name=\"Score D (35th percentile)\",\n",
"# priority_communities_field=\"Score D (top 35th percentile)\",\n",
"# other_census_tract_fields_to_keep=[],\n",
"# ),\n",
"# Index(\n",
"# method_name=\"Score D (40th percentile)\",\n",
"# priority_communities_field=\"Score D (top 40th percentile)\",\n",
"# other_census_tract_fields_to_keep=[],\n",
"# ),\n",
" Index(\n",
" method_name=\"Poverty\",\n",
" priority_communities_field=\"Poverty (Less than 200% of federal poverty line) (top 25th percentile)\",\n",
@@ -534,7 +549,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "d7acf80d",
"id": "eeb9699d",
"metadata": {},
"outputs": [],
"source": [
@@ -682,7 +697,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "777a4623",
"id": "4f44426c",
"metadata": {},
"outputs": [],
"source": [
@@ -1140,14 +1155,6 @@
"\n",
"print(file_paths)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e679502a",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {


@@ -98,11 +98,50 @@ def remove_all_dirs_from_dir(dir_path: Path) -> None:
logging.info(f"Removing directory {file_path}")
def download_file_from_url(
file_url: str,
download_file_name: Path,
verify: bool = True,
) -> str:
"""Downloads a file from a remote URL location and returns the file location.
Args:
file_url (str): URL where the zip file is located
download_file_name (pathlib.Path): file path where the file will be downloaded (called downloaded.zip by default)
verify (bool): A flag to check if the certificate is valid. If truthy, an invalid certificate will throw an error (optional, default to False)
Returns:
None
"""
# disable https warning
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
if not os.path.isdir(download_file_name.parent):
os.mkdir(download_file_name.parent)
logger.info(f"Downloading {file_url}")
response = requests.get(file_url, verify=verify)
if response.status_code == 200:
file_contents = response.content
else:
sys.exit(
f"HTTP response {response.status_code} from url {file_url}. Info: {response.content}"
)
# Write the contents to disk.
with open(download_file_name, "wb") as file:
file.write(file_contents)
return download_file_name
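
A short usage sketch of the new helper (the URL and destination path are placeholders):

    from pathlib import Path

    from data_pipeline.utils import download_file_from_url

    # Download a CSV, skipping certificate verification for a host
    # with known TLS issues.
    saved_path = download_file_from_url(
        file_url="https://example.com/data.csv",  # placeholder
        download_file_name=Path("/tmp/data_pipeline/data.csv"),
        verify=False,
    )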
def unzip_file_from_url(
file_url: str,
download_path: Path,
unzipped_file_path: Path,
verify: bool = False,
verify: bool = True,
) -> None:
"""Downloads a zip file from a remote URL location and unzips it in a specific directory, removing the temporary file after
@@ -116,23 +155,11 @@ def unzip_file_from_url(
None
"""
# disable https warning
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
logger.info(f"Downloading {file_url}")
response = requests.get(file_url, verify=verify)
if response.status_code == 200:
file_contents = response.content
else:
sys.exit(
f"HTTP response {response.status_code} from url {file_url}. Info: {response.content}"
)
zip_file_path = download_path / "downloaded.zip"
zip_file = open(zip_file_path, "wb")
zip_file.write(file_contents)
zip_file.close()
zip_file_path = download_file_from_url(
file_url=file_url,
download_file_name=download_path / "downloaded.zip",
verify=verify,
)
logger.info(f"Extracting {zip_file_path}")
with zipfile.ZipFile(zip_file_path, "r") as zip_ref: