mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-08-04 07:24:19 -07:00
Data folder restructuring in preparation for 361 (#376)
* initial checkin * gitignore and docker-compose update * readme update and error on hud * encoding issue * one more small README change * data roadmap re-structure * pyproject sort * small update to score output folders * checkpoint * couple of last fixes
This commit is contained in:
parent
3032a8305d
commit
543d147e61
66 changed files with 130 additions and 108 deletions
55
data/data-pipeline/etl/sources/census/etl_utils.py
Normal file
55
data/data-pipeline/etl/sources/census/etl_utils.py
Normal file
|
@ -0,0 +1,55 @@
|
|||
from pathlib import Path
|
||||
import csv
|
||||
import os
|
||||
from config import settings
|
||||
|
||||
from utils import (
|
||||
remove_files_from_dir,
|
||||
remove_all_dirs_from_dir,
|
||||
unzip_file_from_url,
|
||||
get_module_logger,
|
||||
)
|
||||
|
||||
# Module-level logger, obtained from the project's get_module_logger helper
# and keyed by this module's name.
logger = get_module_logger(__name__)
|
||||
|
||||
|
||||
def reset_data_directories(data_path: Path) -> None:
    """Reset the census working directories found under ``data_path``.

    Three sub-directories of ``<data_path>/census`` are handed to the
    cleanup helpers imported from ``utils``, in this order:

    * ``csv``     -> ``remove_files_from_dir(..., ".csv")``
    * ``geojson`` -> ``remove_files_from_dir(..., ".json")``
    * ``shp``     -> ``remove_all_dirs_from_dir(...)``

    NOTE(review): presumably the helpers delete files with the given
    extension / every sub-directory respectively -- confirm in ``utils``.

    Args:
        data_path: Root data directory that contains the ``census`` folder.
    """
    census_root = data_path / "census"

    # Flat-file outputs: (sub-directory, extension passed to the helper).
    for subdir, extension in (("csv", ".csv"), ("geojson", ".json")):
        remove_files_from_dir(census_root / subdir, extension)

    # The shp folder is cleaned by directory rather than by file extension.
    remove_all_dirs_from_dir(census_root / "shp")
|
||||
|
||||
|
||||
def get_state_fips_codes(data_path: Path) -> list:
    """Return the state FIPS codes listed in ``fips_states_2010.csv``.

    The CSV is read from ``<data_path>/census/csv/fips_states_2010.csv``.
    If that file does not exist, ``unzip_file_from_url`` is first called
    with the ``/Census/fips_states_2010.zip`` URL under
    ``settings.AWS_JUSTICE40_DATA_URL``, ``<data_path>/tmp`` and
    ``<data_path>/census/csv`` (presumably download location and
    extraction target -- confirm against ``utils``).

    Args:
        data_path: Root data directory that contains the ``census`` folder.

    Returns:
        A list of strings: the whitespace-stripped first column of every
        row after the header. Completely blank rows are skipped.
    """
    csv_dir = data_path / "census" / "csv"
    fips_csv_path = csv_dir / "fips_states_2010.csv"

    # Fetch the file only when it is not already present locally.
    if not os.path.isfile(fips_csv_path):
        logger.info("Downloading fips from S3 repository")
        unzip_file_from_url(
            settings.AWS_JUSTICE40_DATA_URL + "/Census/fips_states_2010.zip",
            data_path / "tmp",
            csv_dir,
        )

    # newline="" is what the csv module expects from file objects; the
    # explicit encoding keeps reads independent of the platform default.
    with open(fips_csv_path, encoding="utf-8", newline="") as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=",")

        # Discard the header row; the default avoids StopIteration on an
        # empty file.
        next(csv_reader, None)

        # csv.reader yields [] for blank lines, which would make row[0]
        # raise IndexError -- skip those rows.
        return [row[0].strip() for row in csv_reader if row]
|
Loading…
Add table
Add a link
Reference in a new issue