refactoring tribal (#1960)

This commit is contained in:
Lucas Merrill Brown 2022-09-30 12:04:12 -04:00 committed by GitHub
parent 247db4acdc
commit f4adf172e3
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 52 additions and 28 deletions

View file

@ -160,9 +160,7 @@ class CensusDecennialETL(ExtractTransformLoad):
# our other demographic data # our other demographic data
self.NON_HISPANIC_WHITE_FIELD = "PCT086005" # Total!!White self.NON_HISPANIC_WHITE_FIELD = "PCT086005" # Total!!White
self.HISPANIC_FIELD = "PCT086006" # Total!!Hispanic or Latino self.HISPANIC_FIELD = "PCT086006" # Total!!Hispanic or Latino
self.OTHER_RACE_FIELD = ( self.OTHER_RACE_FIELD = "PCT086007" # Total!!Other Ethnic Origin or Ra
"PCT086007" # Total!!Other Ethnic Origin or Ra
)
self.TOTAL_RACE_POPULATION_VI_FIELD = "P003001" # Total self.TOTAL_RACE_POPULATION_VI_FIELD = "P003001" # Total
self.BLACK_VI_FIELD = ( self.BLACK_VI_FIELD = (

View file

@ -2,6 +2,7 @@ from pathlib import Path
import geopandas as gpd import geopandas as gpd
import pandas as pd import pandas as pd
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.score import field_names from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger, unzip_file_from_url from data_pipeline.utils import get_module_logger, unzip_file_from_url
@ -11,9 +12,13 @@ logger = get_module_logger(__name__)
class TribalETL(ExtractTransformLoad): class TribalETL(ExtractTransformLoad):
def __init__(self): def __init__(self):
self.GEOJSON_BASE_PATH = self.DATA_PATH / "tribal" / "geojson" self.GEOGRAPHIC_BASE_PATH = (
self.DATA_PATH / "tribal" / "geographic_data"
)
self.CSV_BASE_PATH = self.DATA_PATH / "tribal" / "csv" self.CSV_BASE_PATH = self.DATA_PATH / "tribal" / "csv"
self.NATIONAL_TRIBAL_GEOJSON_PATH = self.GEOJSON_BASE_PATH / "usa.json" self.NATIONAL_TRIBAL_GEOJSON_PATH = (
self.GEOGRAPHIC_BASE_PATH / "usa.json"
)
self.USA_TRIBAL_DF_LIST = [] self.USA_TRIBAL_DF_LIST = []
def extract(self) -> None: def extract(self) -> None:
@ -24,37 +29,57 @@ class TribalETL(ExtractTransformLoad):
""" """
logger.info("Downloading Tribal Data") logger.info("Downloading Tribal Data")
bia_geojson_url = "https://justice40-data.s3.amazonaws.com/data-sources/BIA_National_LAR_json.zip" bia_shapefile_zip_url = (
alaska_geojson_url = "https://justice40-data.s3.amazonaws.com/data-sources/Alaska_Native_Villages_json.zip" settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/BIA_National_LAR_updated_20220929.zip"
)
tsa_and_aian_geojson_zip_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/BIA_TSA_and_AIAN_json.zip"
)
alaska_geojson_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/Alaska_Native_Villages_json.zip"
)
unzip_file_from_url( unzip_file_from_url(
bia_geojson_url, bia_shapefile_zip_url,
self.TMP_PATH, self.TMP_PATH,
self.DATA_PATH / "tribal" / "geojson" / "bia_national_lar", self.GEOGRAPHIC_BASE_PATH / "bia_national_lar",
)
unzip_file_from_url(
tsa_and_aian_geojson_zip_url,
self.TMP_PATH,
self.GEOGRAPHIC_BASE_PATH / "tsa_and_aian",
) )
unzip_file_from_url( unzip_file_from_url(
alaska_geojson_url, alaska_geojson_url,
self.TMP_PATH, self.TMP_PATH,
self.DATA_PATH / "tribal" / "geojson" / "alaska_native_villages", self.GEOGRAPHIC_BASE_PATH / "alaska_native_villages",
) )
pass
def _transform_bia_national_lar(self, tribal_geojson_path: Path) -> None: def _transform_bia_national_lar(self, path: Path) -> None:
"""Transform the Tribal BIA National Lar Geodataframe and appends it to the """Transform the Tribal BIA National Lar Geodataframe and appends it to the
national Tribal Dataframe List national Tribal Dataframe List
Args: Args:
tribal_geojson_path (Path): the Path to the Tribal Geojson path (Path): the Path to the BIA National Lar
Returns: Returns:
None None
""" """
bia_national_lar_df = gpd.read_file(tribal_geojson_path) bia_national_lar_df = gpd.read_file(path)
# DELETE
logger.info(f"Columns: {bia_national_lar_df.columns}\n")
bia_national_lar_df.drop( bia_national_lar_df.drop(
["OBJECTID", "GISAcres", "Shape_Length", "Shape_Area"], ["GISAcres"],
axis=1, axis=1,
inplace=True, inplace=True,
) )
@ -162,29 +187,30 @@ class TribalETL(ExtractTransformLoad):
""" """
logger.info("Transforming Tribal Data") logger.info("Transforming Tribal Data")
# load the geojsons # Set the filepaths:
bia_national_lar_geojson = ( bia_national_lar_shapefile = (
self.GEOJSON_BASE_PATH self.GEOGRAPHIC_BASE_PATH / "bia_national_lar"
/ "bia_national_lar"
/ "BIA_National_LAR.json"
) )
bia_aian_supplemental_geojson = ( bia_aian_supplemental_geojson = (
self.GEOJSON_BASE_PATH self.GEOGRAPHIC_BASE_PATH
/ "bia_national_lar" / "tsa_and_aian"
/ "BIA_AIAN_Supplemental.json" / "BIA_AIAN_Supplemental.json"
) )
bia_tsa_geojson_geojson = (
self.GEOJSON_BASE_PATH / "bia_national_lar" / "BIA_TSA.json" bia_tsa_geojson = (
self.GEOGRAPHIC_BASE_PATH / "tsa_and_aian" / "BIA_TSA.json"
) )
alaska_native_villages_geojson = ( alaska_native_villages_geojson = (
self.GEOJSON_BASE_PATH self.GEOGRAPHIC_BASE_PATH
/ "alaska_native_villages" / "alaska_native_villages"
/ "AlaskaNativeVillages.gdb.geojson" / "AlaskaNativeVillages.gdb.geojson"
) )
self._transform_bia_national_lar(bia_national_lar_geojson) self._transform_bia_national_lar(bia_national_lar_shapefile)
self._transform_bia_aian_supplemental(bia_aian_supplemental_geojson) self._transform_bia_aian_supplemental(bia_aian_supplemental_geojson)
self._transform_bia_tsa(bia_tsa_geojson_geojson) self._transform_bia_tsa(bia_tsa_geojson)
self._transform_alaska_native_villages(alaska_native_villages_geojson) self._transform_alaska_native_villages(alaska_native_villages_geojson)
def load(self) -> None: def load(self) -> None:
@ -194,13 +220,13 @@ class TribalETL(ExtractTransformLoad):
None None
""" """
logger.info("Saving Tribal GeoJson and CSV") logger.info("Saving Tribal GeoJson and CSV")
usa_tribal_df = gpd.GeoDataFrame( usa_tribal_df = gpd.GeoDataFrame(
pd.concat(self.USA_TRIBAL_DF_LIST, ignore_index=True) pd.concat(self.USA_TRIBAL_DF_LIST, ignore_index=True)
) )
usa_tribal_df = usa_tribal_df.to_crs( usa_tribal_df = usa_tribal_df.to_crs(
"+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs" "+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"
) )
logger.info("Writing national geojson file") logger.info("Writing national geojson file")
usa_tribal_df.to_file( usa_tribal_df.to_file(
self.NATIONAL_TRIBAL_GEOJSON_PATH, driver="GeoJSON" self.NATIONAL_TRIBAL_GEOJSON_PATH, driver="GeoJSON"