Merge pull request #1 from usds/main

Update 1 based on #885 changes
This commit is contained in:
Saran Ahluwalia 2021-11-16 12:08:38 -05:00 committed by GitHub
commit 1e03e75ac4
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 49 additions and 57 deletions

View file

@ -27,7 +27,7 @@ We also recognize capacity building as a key part of involving a diverse open so
Principles and guidelines for participating in our open source community are available [here](COMMUNITY_GUIDELINES.md). Please read them before joining or starting a conversation in this repo or one of the channels listed below.
### Community Chats
We host open source community chats every two weeks on Monday at 5-6pm ET. You can find information about the agenda and how to participate in our [Google Group](https://groups.google.com/u/4/g/justice40-open-source).
We host open source community chats every third Monday of the month at 5-6pm ET. You can find information about the agenda and how to participate in our [Google Group](https://groups.google.com/u/4/g/justice40-open-source).
Community members are welcome to share updates or propose topics for discussion in community chats. Please do so in the Google Group.

View file

@ -4814,12 +4814,12 @@
"dev": true
},
"axios": {
"version": "0.21.1",
"resolved": "https://registry.npmjs.org/axios/-/axios-0.21.1.tgz",
"integrity": "sha512-dKQiRHxGD9PPRIUNIWvZhPTPpl1rf/OxTYKsqKUDjBwYylTvV7SjSHJb9ratfyzM6wCdLCOYLzs73qpg5c4iGA==",
"version": "0.21.4",
"resolved": "https://registry.npmjs.org/axios/-/axios-0.21.4.tgz",
"integrity": "sha512-ut5vewkiu8jjGBdqpM44XxjuCjq9LAKeHVmoVfHVzy8eHgxxq8SbAVQNovDA8mVi05kP0Ea/n/UzcSHcTJQfNg==",
"dev": true,
"requires": {
"follow-redirects": "^1.10.0"
"follow-redirects": "^1.14.0"
}
},
"axobject-query": {
@ -16733,9 +16733,9 @@
}
},
"nth-check": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/nth-check/-/nth-check-2.0.0.tgz",
"integrity": "sha512-i4sc/Kj8htBrAiH1viZ0TgU8Y5XqCaV/FziYK6TBczxmeKm3AEFWqqF3195yKudrarqy7Zu80Ra5dobFjn9X/Q==",
"version": "2.0.1",
"resolved": "https://registry.npmjs.org/nth-check/-/nth-check-2.0.1.tgz",
"integrity": "sha512-it1vE95zF6dTT9lBsYbxvqh0Soy4SPowchj0UBGj/V6cTPnXXtQOPUbhZ6CmGzAD/rW22LQK6E96pcdJXk4A4w==",
"dev": true,
"requires": {
"boolbase": "^1.0.0"
@ -21780,9 +21780,9 @@
}
},
"tmpl": {
"version": "1.0.4",
"resolved": "https://registry.npmjs.org/tmpl/-/tmpl-1.0.4.tgz",
"integrity": "sha1-I2QN17QtAEM5ERQIIOXPRA5SHdE=",
"version": "1.0.5",
"resolved": "https://registry.npmjs.org/tmpl/-/tmpl-1.0.5.tgz",
"integrity": "sha512-3f0uOEAQwIqGuWW2MVzYg8fV/QNnc/IpuJNG837rLuczAaLVHslWHZQj4IGiEl5Hs3kkbhwL9Ab7Hrsmuj+Smw==",
"dev": true
},
"to-arraybuffer": {
@ -22526,9 +22526,9 @@
}
},
"url-parse": {
"version": "1.5.1",
"resolved": "https://registry.npmjs.org/url-parse/-/url-parse-1.5.1.tgz",
"integrity": "sha512-HOfCOUJt7iSYzEx/UqgtwKRMC6EU91NFhsCHMv9oM03VJcVo2Qrp8T8kI9D7amFf1cu+/3CEhgb3rF9zL7k85Q==",
"version": "1.5.3",
"resolved": "https://registry.npmjs.org/url-parse/-/url-parse-1.5.3.tgz",
"integrity": "sha512-IIORyIQD9rvj0A4CLWsHkBBJuNqWpFQe224b6j9t/ABmquIS0qDU2pY6kl6AuOrL5OkCXHMCFNe1jBcuAggjvQ==",
"dev": true,
"requires": {
"querystringify": "^2.1.1",

View file

@ -33,7 +33,7 @@ class ExtractTransformLoad:
GEOID_FIELD_NAME: str = "GEOID10"
GEOID_TRACT_FIELD_NAME: str = "GEOID10_TRACT"
# TODO: investigate. Census says there are only 217,740 CBGs in the US. This might be from CBGs at different time periods.
EXPECTED_MAX_CENSUS_BLOCK_GROUPS: int = 220405
EXPECTED_MAX_CENSUS_BLOCK_GROUPS: int = 250000
EXPECTED_MAX_CENSUS_TRACTS: int = 73076
def __init__(self, config_path: Path) -> None:

View file

@ -291,7 +291,6 @@ class ScoreETL(ExtractTransformLoad):
field_names.LIFE_EXPECTANCY_FIELD,
field_names.ENERGY_BURDEN_FIELD,
field_names.FEMA_RISK_FIELD,
field_names.FEMA_EXPECTED_ANNUAL_LOSS_RATE_FIELD,
field_names.URBAN_HERUISTIC_FIELD,
field_names.AIR_TOXICS_CANCER_RISK_FIELD,
field_names.RESPITORY_HAZARD_FIELD,

View file

@ -16,13 +16,17 @@ from data_pipeline.utils import (
logger = get_module_logger(__name__)
def reset_data_directories(data_path: Path) -> None:
def reset_data_directories(
data_path: Path,
) -> None:
"""Empties all census folders"""
census_data_path = data_path / "census"
# csv
csv_path = census_data_path / "csv"
remove_files_from_dir(csv_path, ".csv")
remove_files_from_dir(
csv_path, ".csv", exception_list=["fips_states_2010.csv"]
)
# geojson
geojson_path = census_data_path / "geojson"

View file

@ -72,8 +72,8 @@ class CensusACSETL(ExtractTransformLoad):
f"Downloading data for state/territory with FIPS code {fips}"
)
dfs.append(
censusdata.download(
try:
response = censusdata.download(
src="acs5",
year=self.ACS_YEAR,
geo=censusdata.censusgeo(
@ -91,8 +91,13 @@ class CensusACSETL(ExtractTransformLoad):
+ self.LINGUISTIC_ISOLATION_FIELDS
+ self.POVERTY_FIELDS,
)
except ValueError:
logger.error(
f"Could not download data for state/territory with FIPS code {fips}"
)
dfs.append(response)
self.df = pd.concat(dfs)
self.df[self.GEOID_FIELD_NAME] = self.df.index.to_series().apply(

View file

@ -1,4 +1,5 @@
import pandas as pd
from pandas.errors import EmptyDataError
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
@ -26,10 +27,6 @@ class HousingTransportationETL(ExtractTransformLoad):
f"Downloading housing data for state/territory with FIPS code {fips}"
)
# Puerto Rico has no data, so skip
if fips == "72":
continue
unzip_file_from_url(
f"{self.HOUSING_FTP_URL}{fips}", self.TMP_PATH, zip_file_dir
)
@ -38,7 +35,13 @@ class HousingTransportationETL(ExtractTransformLoad):
tmp_csv_file_path = (
zip_file_dir / f"htaindex_data_blkgrps_{fips}.csv"
)
try:
tmp_df = pd.read_csv(filepath_or_buffer=tmp_csv_file_path)
except EmptyDataError:
logger.error(
f"Could not read Housing and Transportation data for state/territory with FIPS code {fips}"
)
dfs.append(tmp_df)

View file

@ -320,28 +320,6 @@
"# )"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4b74b0bf",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Create a FEMA risk index score\n",
"# Note: this can be deleted at a later date.\n",
"FEMA_EXPECTED_ANNUAL_LOSS_RATE_FIELD = (\n",
" \"FEMA Risk Index Expected Annual Loss Rate\"\n",
")\n",
"FEMA_COMMUNITIES = \"FEMA Risk Index (top 30th percentile)\"\n",
"merged_df[FEMA_COMMUNITIES] = (\n",
" merged_df[f\"{FEMA_EXPECTED_ANNUAL_LOSS_RATE_FIELD} (percentile)\"] > 0.70\n",
")\n",
"\n",
"merged_df[FEMA_COMMUNITIES].describe()"
]
},
{
"cell_type": "code",
"execution_count": null,

View file

@ -57,9 +57,6 @@ AMI_FIELD = "Area Median Income (State or metropolitan)"
# Climate
FEMA_RISK_FIELD = "FEMA Risk Index Expected Annual Loss Score"
FEMA_EXPECTED_ANNUAL_LOSS_RATE_FIELD = (
"FEMA Risk Index Expected Annual Loss Rate"
)
EXPECTED_BUILDING_LOSS_RATE_FIELD_NAME = (
"Expected building loss rate (Natural Hazards Risk Index)"
)

View file

@ -46,26 +46,32 @@ def get_module_logger(module_name: str) -> logging.Logger:
logger = get_module_logger(__name__)
def remove_files_from_dir(files_path: Path, extension: str = None) -> None:
def remove_files_from_dir(
files_path: Path, extension: str = None, exception_list: list = None
) -> None:
"""Removes all files from a specific directory, always keeping __init__.py
files; when an extension is given, only files matching that extension are deleted
Args:
files_path (pathlib.Path): Name of the directory where the files will be deleted
extension (str): Extension of the file pattern to delete, example "json" (optional)
exception_list (list): List of files to not remove (optional)
Returns:
None
"""
for file in os.listdir(files_path):
if extension:
if not file.endswith(extension):
continue
else:
# don't remove __init__ files as they preserve dir structure
if file == "__init__.py":
continue
if exception_list:
if file in exception_list:
continue
elif extension:
if not file.endswith(extension):
continue
os.remove(files_path / file)
logger.info(f"Removing {file}")