If the census download task is run more than once,
us.csv doubles in size and all data is removed from the dataframe.
This commit is contained in:
Nat Hillard 2021-08-05 00:41:30 -04:00
parent f75d40dcb0
commit a2c7b8e1c5

View file

@ -2,6 +2,7 @@ import csv
import json import json
import os import os
from pathlib import Path from pathlib import Path
import subprocess
import geopandas as gpd import geopandas as gpd
from data_pipeline.utils import get_module_logger, unzip_file_from_url from data_pipeline.utils import get_module_logger, unzip_file_from_url
@ -30,11 +31,17 @@ def download_census_csvs(data_path: Path) -> None:
for fips in state_fips_codes: for fips in state_fips_codes:
# check if file exists # check if file exists
shp_file_path = data_path / "census" / "shp" / fips / f"tl_2010_{fips}_bg10.shp" shp_file_path: Path = (
data_path / "census" / "shp" / fips / f"tl_2010_{fips}_bg10.shp"
)
geojson_file_path = data_path / "census" / "geojson" / f"{fips}.json"
logger.info(f"Checking if {fips} shp file exists")
if not shp_file_path.is_file():
logger.info(
f"{fips} shp file does not exist. Downloading and extracting shape file"
)
logger.info(f"Checking if {fips} file exists")
if not os.path.isfile(shp_file_path):
logger.info(f"Downloading and extracting {fips} shape file")
# 2020 tiger data is here: https://www2.census.gov/geo/tiger/TIGER2020/BG/ # 2020 tiger data is here: https://www2.census.gov/geo/tiger/TIGER2020/BG/
# But using 2010 for now # But using 2010 for now
cbg_state_url = f"https://www2.census.gov/geo/tiger/TIGER2010/BG/2010/tl_2010_{fips}_bg10.zip" cbg_state_url = f"https://www2.census.gov/geo/tiger/TIGER2010/BG/2010/tl_2010_{fips}_bg10.zip"
@ -43,17 +50,19 @@ def download_census_csvs(data_path: Path) -> None:
data_path / "tmp", data_path / "tmp",
data_path / "census" / "shp" / fips, data_path / "census" / "shp" / fips,
) )
logger.info(f"Checking if {fips} geoJSON file exists ")
cmd = ( if not geojson_file_path.is_file():
"ogr2ogr -f GeoJSON data/census/geojson/" logger.info(
+ fips f"GeoJSON file {fips} does not exist. Converting shp to geoJSON"
+ ".json data/census/shp/"
+ fips
+ "/tl_2010_"
+ fips
+ "_bg10.shp"
) )
os.system(cmd) cmd = [
"ogr2ogr",
"-f",
"GeoJSON",
str(geojson_file_path),
str(shp_file_path),
]
subprocess.run(cmd, check=True)
# generate CBG CSV table for pandas # generate CBG CSV table for pandas
## load in memory ## load in memory
@ -94,6 +103,7 @@ def download_census_csvs(data_path: Path) -> None:
) )
## write US csv ## write US csv
logger.info("Writing national us.csv file")
with open(csv_dir_path / "us.csv", mode="w", newline="") as cbg_csv_file: with open(csv_dir_path / "us.csv", mode="w", newline="") as cbg_csv_file:
cbg_csv_file_writer = csv.writer( cbg_csv_file_writer = csv.writer(
cbg_csv_file, cbg_csv_file,