Mirror of https://github.com/DOI-DO/j40-cejst-2.git (synced 2025-07-25 08:20:16 -07:00)
User Story 2152 – Clean up logging (#2155)
Update logging messages and message consistency. This change adjusts the level of many log messages: rather than logging everything at the info level, it differentiates between debug, info, warning, and error messages. It also changes the default log level to info to cut much of the noise previously in the logs, removes many redundant log messages, and adds additional decorators at the beginning of each pipeline run.
parent 7cfb56476e
commit 03a6d3c660
63 changed files with 307 additions and 339 deletions
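For context, the pattern the diff below applies across the ETL classes is roughly this: a module-level logger whose default level is info, with routine per-step messages demoted to debug and genuine problems raised to warning or error. The following is a minimal sketch only, assuming that setup; the helper name get_module_logger matches the usage visible in the diff, but its body here is an illustration, not the repository's actual implementation.

import logging

def get_module_logger(module_name: str) -> logging.Logger:
    """Return a logger that prints to stderr and defaults to the info level (sketch)."""
    logger = logging.getLogger(module_name)
    if not logger.handlers:
        handler = logging.StreamHandler()
        handler.setFormatter(
            logging.Formatter("%(asctime)s [%(name)s] %(levelname)s %(message)s")
        )
        logger.addHandler(handler)
    logger.setLevel(logging.INFO)  # default level is info, so debug messages stay quiet
    return logger

logger = get_module_logger(__name__)
logger.info("Running ETL pipeline")            # high-level progress stays at info
logger.debug("Downloading data for Maine")     # per-step detail drops to debug
logger.warning("Local census data not found")  # real problems use warning or error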
@@ -33,15 +33,12 @@ class CalEnviroScreenETL(ExtractTransformLoad):
self.df: pd.DataFrame

def extract(self) -> None:
logger.info("Downloading CalEnviroScreen Data")
super().extract(
self.CALENVIROSCREEN_FTP_URL,
self.get_tmp_path(),
)

def transform(self) -> None:
logger.info("Transforming CalEnviroScreen Data")

# Data from https://calenviroscreen-oehha.hub.arcgis.com/#Data, specifically:
# https://oehha.ca.gov/media/downloads/calenviroscreen/document/calenviroscreen40resultsdatadictionaryd12021.zip
# Load comparison index (CalEnviroScreen 4)

@@ -70,7 +67,6 @@ class CalEnviroScreenETL(ExtractTransformLoad):
)

def load(self) -> None:
logger.info("Saving CalEnviroScreen CSV")
# write nationwide csv
self.CSV_PATH.mkdir(parents=True, exist_ok=True)
self.df.to_csv(self.CSV_PATH / "data06.csv", index=False)
@@ -81,7 +81,6 @@ class CDCLifeExpectancy(ExtractTransformLoad):
return df

def extract(self) -> None:
logger.info("Starting data download.")

all_usa_raw_df = self._download_and_prep_data(
file_url=self.USA_FILE_URL,

@@ -102,13 +101,13 @@ class CDCLifeExpectancy(ExtractTransformLoad):
additional_fips_codes_not_expected=self.STATES_MISSING_FROM_USA_FILE,
)

-logger.info("Downloading data for Maine")
+logger.debug("Downloading data for Maine")
maine_raw_df = self._download_and_prep_data(
file_url=self.MAINE_FILE_URL,
download_file_name=self.get_tmp_path() / "maine.csv",
)

-logger.info("Downloading data for Wisconsin")
+logger.debug("Downloading data for Wisconsin")
wisconsin_raw_df = self._download_and_prep_data(
file_url=self.WISCONSIN_FILE_URL,
download_file_name=self.get_tmp_path() / "wisconsin.csv",

@@ -138,7 +137,6 @@ class CDCLifeExpectancy(ExtractTransformLoad):
self.raw_df = combined_df

def transform(self) -> None:
logger.info("Starting CDC life expectancy transform.")

self.output_df = self.raw_df.rename(
columns={

@@ -148,7 +146,6 @@ class CDCLifeExpectancy(ExtractTransformLoad):
)

def load(self) -> None:
logger.info("Saving CDC Life Expectancy CSV")

self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
self.output_df[self.COLUMNS_TO_KEEP].to_csv(
@@ -44,7 +44,6 @@ class CDCPlacesETL(ExtractTransformLoad):
self.df: pd.DataFrame

def extract(self) -> None:
logger.info("Starting to download 520MB CDC Places file.")
file_path = download_file_from_url(
file_url=self.CDC_PLACES_URL,
download_file_name=self.get_tmp_path() / "census_tract.csv",

@@ -57,8 +56,6 @@ class CDCPlacesETL(ExtractTransformLoad):
)

def transform(self) -> None:
logger.info("Starting CDC Places transform")

# Rename GEOID field
self.df.rename(
columns={self.CDC_GEOID_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME},
@@ -48,7 +48,6 @@ class CDCSVIIndex(ExtractTransformLoad):
self.df: pd.DataFrame

def extract(self) -> None:
logger.info("Downloading 43 MB CDC SVI INDEX")
self.df = pd.read_csv(
filepath_or_buffer=self.CDC_SVI_INDEX_URL,
dtype={self.CDC_SVI_INDEX_TRACTS_FIPS_CODE: "string"},

@@ -56,7 +55,6 @@ class CDCSVIIndex(ExtractTransformLoad):
)

def transform(self) -> None:
logger.info("Starting CDC SVI INDEX transform")
# Note: In this dataset all US census tracts are ranked against one another.
# Puerto Rico is not included in this dataset
self.df.rename(

@@ -109,8 +107,6 @@ class CDCSVIIndex(ExtractTransformLoad):
)

def load(self) -> None:
logger.info("Saving CDC SVI Index Data")

self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

self.df[self.COLUMNS_TO_KEEP].to_csv(
@@ -70,14 +70,9 @@ class CensusETL(ExtractTransformLoad):
None
"""
shp_file_path = self._path_for_fips_file(fips_code, GeoFileType.SHP)
logger.info(f"Checking if {fips_code} shp file exists")

# check if file exists
if not shp_file_path.is_file():
logger.info(
f"{fips_code} shp file does not exist. Downloading and extracting shape file"
)

tract_state_url = f"https://www2.census.gov/geo/tiger/TIGER2010/TRACT/2010/tl_2010_{fips_code}_tract10.zip"
unzip_file_from_url(
tract_state_url,

@@ -86,8 +81,11 @@ class CensusETL(ExtractTransformLoad):
)

def extract(self) -> None:
-logger.info("Downloading Census Data")
-for fips_code in self.STATE_FIPS_CODES:
+logger.debug("Extracting census data")
+for index, fips_code in enumerate(self.STATE_FIPS_CODES):
+logger.debug(
+f"Extracting shape for FIPS {fips_code} – {index+1} of {len(self.STATE_FIPS_CODES)}"
+)
self._extract_shp(fips_code)

def _transform_to_geojson(self, fips_code: str) -> None:

@@ -100,11 +98,8 @@ class CensusETL(ExtractTransformLoad):
geojson_file_path = self._path_for_fips_file(
fips_code, GeoFileType.GEOJSON
)
logger.info(f"Checking if {fips_code} geoJSON file exists ")

if not geojson_file_path.is_file():
logger.info(
f"GeoJSON file {fips_code} does not exist. Converting shp to geoJSON"
)
cmd = [
"ogr2ogr",
"-f",
@@ -120,9 +115,11 @@ class CensusETL(ExtractTransformLoad):
Returns:
None
"""
logger.debug("Transforming tracts")

for file in self.GEOJSON_BASE_PATH.iterdir():
if file.suffix == ".json":
-logger.info(f"Ingesting geoid10 for file {file}")
+logger.debug(f"Adding GEOID10 for file {file.name}")
with open(self.GEOJSON_BASE_PATH / file, encoding="utf-8") as f:
geojson = json.load(f)
for feature in geojson["features"]:

@@ -142,13 +139,19 @@ class CensusETL(ExtractTransformLoad):
Returns:
None
"""
-logger.info("Transforming Census Data")
-for fips_code in self.STATE_FIPS_CODES:
+logger.debug("Transforming census data")

+logger.debug("Transforming SHP files to GeoJSON")
+for index, fips_code in enumerate(self.STATE_FIPS_CODES):
+logger.debug(
+f"Transforming FIPS {fips_code} to GeoJSON – {index+1} of {len(self.STATE_FIPS_CODES)}"
+)
self._transform_to_geojson(fips_code)

self._generate_tract_table()

def _load_into_state_csvs(self, fips_code: str) -> None:
-"""Load state CSVS into individual CSV files
+"""Load state CSVs into individual CSV files

Args:
fips_code (str): the FIPS code for the region of interest
@@ -182,10 +185,9 @@ class CensusETL(ExtractTransformLoad):
Returns:
None
"""
-logger.info("Writing national us.csv file")
+logger.debug("Loading national US.csv")

if not self.NATIONAL_TRACT_CSV_PATH.is_file():
logger.info(f"Creating {self.NATIONAL_TRACT_CSV_PATH}")
with open(
self.NATIONAL_TRACT_CSV_PATH,
mode="w",

@@ -211,22 +213,21 @@ class CensusETL(ExtractTransformLoad):
Returns:
None
"""
-logger.info("Generating national geojson file")
+logger.debug("Loading National GeoJson")

usa_df = gpd.GeoDataFrame()

for file_name in self.GEOJSON_BASE_PATH.rglob("*.json"):
-logger.info(f"Ingesting {file_name}")
+logger.debug(f"Adding national GeoJSON file {file_name.name}")
state_gdf = gpd.read_file(file_name)
usa_df = usa_df.append(state_gdf)

usa_df = usa_df.to_crs(
"+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"
)
-logger.info("Writing national geojson file")
-usa_df.to_file(self.NATIONAL_TRACT_JSON_PATH, driver="GeoJSON")

-logger.info("Census tract downloading complete")
+logger.debug("Saving national GeoJSON file")
+usa_df.to_file(self.NATIONAL_TRACT_JSON_PATH, driver="GeoJSON")

def load(self) -> None:
"""Create state CSVs, National CSV, and National GeoJSON

@@ -234,8 +235,13 @@ class CensusETL(ExtractTransformLoad):
Returns:
None
"""
-logger.info("Saving Census CSV")
+logger.debug("Loading census data")

logger.debug("Loading individual state csv files")
for fips_code in self.TRACT_PER_STATE:
self._load_into_state_csvs(fips_code)

self._load_national_csv()
self._load_national_geojson()

logger.debug("Census data complete")
@@ -39,7 +39,6 @@ def get_state_fips_codes(data_path: Path) -> list:
"""Returns a list with state data"""
fips_csv_path = data_path / "census" / "csv" / "fips_states_2010.csv"

logger.info("Downloading fips from S3 repository")
unzip_file_from_url(
settings.AWS_JUSTICE40_DATASOURCES_URL + "/fips_states_2010.zip",
data_path / "tmp",

@@ -97,7 +96,6 @@ def check_census_data_source(

# download from s3 if census_data_source is aws
if census_data_source == "aws":
logger.info("Fetching Census data from AWS S3")
unzip_file_from_url(
CENSUS_DATA_S3_URL,
DATA_PATH / "tmp",

@@ -106,14 +104,13 @@ def check_census_data_source(
else:
# check if census data is found locally
if not os.path.isfile(census_data_path / "geojson" / "us.json"):
-logger.info(
+logger.error(
"No local census data found. Please use '-s aws` to fetch from AWS"
)
sys.exit()


def zip_census_data():
logger.info("Compressing census files to data/tmp folder")

CENSUS_DATA_PATH = settings.APP_ROOT / "data" / "census"
TMP_PATH = settings.APP_ROOT / "data" / "tmp"
@@ -363,18 +363,16 @@ class CensusACSETL(ExtractTransformLoad):
)

def transform(self) -> None:
logger.info("Starting Census ACS Transform")

df = self.df

# Here we join the geometry of the US to the dataframe so that we can impute
# The income of neighbors. first this looks locally; if there's no local
# geojson file for all of the US, this will read it off of S3
-logger.info("Reading in geojson for the country")
+logger.debug("Reading in geojson for the country")
if not os.path.exists(
self.DATA_PATH / "census" / "geojson" / "us.json"
):
-logger.info("Fetching Census data from AWS S3")
+logger.debug("Fetching Census data from AWS S3")
unzip_file_from_url(
CENSUS_DATA_S3_URL,
self.DATA_PATH / "tmp",

@@ -406,7 +404,7 @@ class CensusACSETL(ExtractTransformLoad):
self.MEDIAN_HOUSE_VALUE_FIELD_NAME,
]:
missing_value_count = sum(df[field] == -666666666)
-logger.info(
+logger.debug(
f"There are {missing_value_count} ({int(100*missing_value_count/df[field].count())}%) values of "
+ f"`{field}` being marked as null values."
)

@@ -591,7 +589,7 @@ class CensusACSETL(ExtractTransformLoad):

# we impute income for both income measures
## TODO: Convert to pydantic for clarity
-logger.info("Imputing income information")
+logger.debug("Imputing income information")
ImputeVariables = namedtuple(
"ImputeVariables", ["raw_field_name", "imputed_field_name"]
)

@@ -612,7 +610,7 @@ class CensusACSETL(ExtractTransformLoad):
minimum_population_required_for_imputation=self.MINIMUM_POPULATION_REQUIRED_FOR_IMPUTATION,
)

-logger.info("Calculating with imputed values")
+logger.debug("Calculating with imputed values")

df[
self.ADJUSTED_AND_IMPUTED_POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME

@@ -644,7 +642,7 @@ class CensusACSETL(ExtractTransformLoad):
== 0
), "Error: not all values were filled..."

-logger.info("Renaming columns...")
+logger.debug("Renaming columns...")
df = df.rename(
columns={
self.ADJUSTED_AND_IMPUTED_POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME: field_names.POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD,
@@ -88,7 +88,7 @@ def _prepare_dataframe_for_imputation(
][geoid_field].unique()

# Check that imputation is a valid choice for this set of fields
-logger.info(f"Imputing values for {len(tract_list)} unique tracts.")
+logger.debug(f"Imputing values for {len(tract_list)} unique tracts.")
assert len(tract_list) > 0, "Error: No missing values to impute"

return tract_list, geo_df

@@ -156,7 +156,7 @@ def calculate_income_measures(
mask_to_use
][impute_var_pair.raw_field_name].mean()

-logger.info("Casting geodataframe as a typical dataframe")
+logger.debug("Casting geodataframe as a typical dataframe")
# get rid of the geometry column and cast as a typical df
df = pd.DataFrame(
geo_df[[col for col in geo_df.columns if col != "geometry"]]
@@ -30,14 +30,14 @@ def retrieve_census_acs_data(
dfs = []
for fips in get_state_fips_codes(data_path_for_fips_codes):
if fips in CENSUS_ACS_FIPS_CODES_TO_SKIP:
-logger.info(
+logger.debug(
f"Skipping download for state/territory with FIPS code {fips}"
)
else:
census_api_key = ""
if os.environ.get("CENSUS_API_KEY"):
census_api_key = "with API key"
-logger.info(
+logger.debug(
f"Downloading data for state/territory with FIPS code {fips} {census_api_key}"
)

@@ -55,7 +55,7 @@ def retrieve_census_acs_data(

except ValueError as e:
logger.error(
-f"Could not download data for state/territory with FIPS code {fips}"
+f"Could not download data for state/territory with FIPS code {fips} because {e}"
)
raise e
@@ -100,7 +100,6 @@ class CensusACS2010ETL(ExtractTransformLoad):
self.df: pd.DataFrame

def extract(self) -> None:
logger.info("Starting Census 2010 ACS Transform")
# Define the variables to retrieve
variables = (
self.UNEMPLOYED_FIELDS

@@ -118,8 +117,6 @@ class CensusACS2010ETL(ExtractTransformLoad):
)

def transform(self) -> None:
logger.info("Starting Census 2010 ACS Transform")

df = self.df

# Calculate percent unemployment.

@@ -184,8 +181,6 @@ class CensusACS2010ETL(ExtractTransformLoad):
self.df = output_df

def load(self) -> None:
logger.info("Saving Census 2010 ACS Data")

# mkdir census
self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
@@ -224,7 +224,6 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
return state_median_incomes_df

def extract(self) -> None:
logger.info("Starting four separate downloads.")
# Load and clean GEOCORR data
# Note: this data is generated by https://mcdc.missouri.edu/applications/geocorr2014.html, at the advice of the Census.
# The specific query used is the following, which takes a couple of minutes to run:

@@ -239,7 +238,7 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
# and with the "target geographies" selected being:
# - Core based statistical area (CBSA)
# - CBSA Type (Metro or Micro)
-logger.info("Starting download of 1.5MB Geocorr information.")
+logger.debug("Starting download of 1.5MB Geocorr information.")

unzip_file_from_url(
file_url=settings.AWS_JUSTICE40_DATASOURCES_URL

@@ -265,7 +264,7 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
low_memory=False,
)

-logger.info("Pulling PR tract list down.")
+logger.debug("Pulling PR tract list down.")
# This step is necessary because PR is not in geocorr at the level that gets joined
pr_file = self.get_tmp_path() / "pr_tracts" / "pr_tracts.csv"
download_file_from_url(

@@ -282,7 +281,7 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
self.pr_tracts["State Abbreviation"] = "PR"

# Download MSA median incomes
-logger.info("Starting download of MSA median incomes.")
+logger.debug("Starting download of MSA median incomes.")
download = requests.get(
self.MSA_MEDIAN_INCOME_URL,
verify=None,

@@ -291,7 +290,7 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
self.msa_median_incomes = json.loads(download.content)

# Download state median incomes
-logger.info("Starting download of state median incomes.")
+logger.debug("Starting download of state median incomes.")
download_state = requests.get(
self.STATE_MEDIAN_INCOME_URL,
verify=None,

@@ -301,8 +300,6 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
## NOTE we already have PR's MI here

def transform(self) -> None:
logger.info("Starting transforms.")

# Run transforms:
geocorr_df = self._transform_geocorr()
msa_median_incomes_df = self._transform_msa_median_incomes()

@@ -352,8 +349,6 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
self.output_df = merged_with_state_income_df

def load(self) -> None:
logger.info("Saving Census ACS Median Income CSV")

self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
self.output_df[self.COLUMNS_TO_KEEP].to_csv(
path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False
@@ -352,7 +352,7 @@ class CensusDecennialETL(ExtractTransformLoad):
dfs = []
dfs_vi = []
for island in self.ISLAND_TERRITORIES:
-logger.info(
+logger.debug(
f"Downloading data for state/territory {island['state_abbreviation']}"
)
for county in island["county_fips"]:

@@ -369,7 +369,13 @@ class CensusDecennialETL(ExtractTransformLoad):
timeout=settings.REQUESTS_DEFAULT_TIMOUT,
)

-df = json.loads(download.content)
+try:
+df = json.loads(download.content)
+except ValueError as e:
+logger.error(
+f"Could not load content in census decennial ETL because {e}. Content is {download.content}."
+)

# First row is the header
df = pd.DataFrame(df[1:], columns=df[0])

@@ -393,8 +399,6 @@ class CensusDecennialETL(ExtractTransformLoad):
self.df_vi = pd.concat(dfs_vi)

def transform(self) -> None:
logger.info("Starting Census Decennial Transform")

# Rename All Fields
self.df.rename(columns=self.FIELD_NAME_XWALK, inplace=True)
self.df_vi.rename(columns=self.FIELD_NAME_XWALK, inplace=True)

@@ -489,13 +493,11 @@ class CensusDecennialETL(ExtractTransformLoad):
# Reporting Missing Values
for col in self.df_all.columns:
missing_value_count = self.df_all[col].isnull().sum()
-logger.info(
+logger.debug(
f"There are {missing_value_count} missing values in the field {col} out of a total of {self.df_all.shape[0]} rows"
)

def load(self) -> None:
logger.info("Saving Census Decennial Data")

# mkdir census
self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
@@ -65,14 +65,12 @@ class ChildOpportunityIndex(ExtractTransformLoad):
self.output_df: pd.DataFrame

def extract(self) -> None:
logger.info("Starting 51MB data download.")
super().extract(
source_url=self.SOURCE_URL,
extract_path=self.get_tmp_path(),
)

def transform(self) -> None:
logger.info("Starting transforms.")
raw_df = pd.read_csv(
filepath_or_buffer=self.get_tmp_path() / "raw.csv",
# The following need to remain as strings for all of their digits, not get
@@ -30,7 +30,6 @@ class DOEEnergyBurden(ExtractTransformLoad):
self.output_df: pd.DataFrame

def transform(self) -> None:
logger.info("Starting DOE Energy Burden transforms.")
raw_df: pd.DataFrame = pd.read_csv(
filepath_or_buffer=self.get_tmp_path()
/ "DOE_LEAD_AMI_TRACT_2018_ALL.csv",

@@ -41,7 +40,7 @@ class DOEEnergyBurden(ExtractTransformLoad):
low_memory=False,
)

-logger.info("Renaming columns and ensuring output format is correct")
+logger.debug("Renaming columns and ensuring output format is correct")
output_df = raw_df.rename(
columns={
self.INPUT_ENERGY_BURDEN_FIELD_NAME: self.REVISED_ENERGY_BURDEN_FIELD_NAME,
@@ -53,7 +53,6 @@ class TravelCompositeETL(ExtractTransformLoad):
- Renames the Census Tract column to match the other datasets
- Converts to CSV
"""
logger.info("Transforming DOT Travel Disadvantage Data")

# read in the unzipped shapefile from data source
# reformat it to be standard df, remove unassigned rows, and
@@ -60,7 +60,6 @@ class AbandonedMineETL(ExtractTransformLoad):
self.output_df: pd.DataFrame

def transform(self) -> None:
logger.info("Starting eAMLIS transforms.")
df = pd.read_csv(
self.get_tmp_path() / "eAMLIS export of all data.tsv",
sep="\t",
@@ -44,7 +44,6 @@ class EJSCREENETL(ExtractTransformLoad):
]

def extract(self) -> None:
logger.info("Downloading EJScreen Data")
super().extract(
self.EJSCREEN_FTP_URL,
self.get_tmp_path(),

@@ -52,7 +51,6 @@ class EJSCREENETL(ExtractTransformLoad):
)

def transform(self) -> None:
logger.info("Transforming EJScreen Data")
self.df = pd.read_csv(
self.EJSCREEN_CSV,
dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str},
@@ -39,7 +39,7 @@ class EJSCREENAreasOfConcernETL(ExtractTransformLoad):

def extract(self) -> None:
if self.ejscreen_areas_of_concern_data_exists():
-logger.info("Loading EJSCREEN Areas of Concern Data Locally")
+logger.debug("Loading EJSCREEN Areas of Concern Data Locally")
self.df = pd.read_csv(
filepath_or_buffer=self.EJSCREEN_AREAS_OF_CONCERN_SOURCE_DATA,
dtype={

@@ -48,24 +48,24 @@ class EJSCREENAreasOfConcernETL(ExtractTransformLoad):
low_memory=False,
)
else:
-logger.info(
+logger.warning(
"EJSCREEN areas of concern data does not exist locally. Not loading the data."
)

def transform(self) -> None:
-logger.info("Transforming EJSCREEN Areas of Concern Data")
+logger.debug("Transforming EJSCREEN Areas of Concern Data")

# TO DO: As a one off we did all the processing in a separate Notebook
# Can add here later for a future PR

def load(self) -> None:
if self.ejscreen_areas_of_concern_data_exists():
-logger.info("Saving EJSCREEN Areas of Concern Data")
+logger.debug("Saving EJSCREEN Areas of Concern Data")
# write nationwide csv
self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
self.df.to_csv(self.OUTPUT_PATH / "usa.csv", index=False)

else:
-logger.info(
+logger.warning(
"EJSCREEN areas of concern data does not exist locally. Not saving the data."
)
@@ -49,8 +49,6 @@ class EnergyDefinitionAlternativeDraft(ExtractTransformLoad):
self.df: pd.DataFrame

def extract(self) -> None:
logger.info("Starting data download.")

unzip_file_from_url(
file_url=self.DEFINITION_ALTERNATIVE_FILE_URL,
download_path=self.get_tmp_path(),

@@ -70,8 +68,6 @@ class EnergyDefinitionAlternativeDraft(ExtractTransformLoad):
)

def transform(self) -> None:
logger.info("Starting transforms.")

self.df = self.df.rename(
columns={
self.TRACT_INPUT_COLUMN_NAME: self.GEOID_TRACT_FIELD_NAME,

@@ -105,8 +101,6 @@ class EnergyDefinitionAlternativeDraft(ExtractTransformLoad):
)

def load(self) -> None:
logger.info("Saving CSV")

self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
self.df[self.COLUMNS_TO_KEEP].to_csv(
path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False
@@ -65,8 +65,6 @@ class EPARiskScreeningEnvironmentalIndicatorsETL(ExtractTransformLoad):
self.df: pd.DataFrame

def extract(self) -> None:
logger.info("Starting 2.5 MB data download.")

# the column headers from the above dataset are actually a census tract's data at this point
# We will use this data structure later to specify the column names
input_columns = [

@@ -98,8 +96,6 @@ class EPARiskScreeningEnvironmentalIndicatorsETL(ExtractTransformLoad):
)

def transform(self) -> None:
logger.info("Starting transforms.")

score_columns = [x for x in self.df.columns if "SCORE" in x]

# coerce dataframe type to perform correct next steps

@@ -157,8 +153,6 @@ class EPARiskScreeningEnvironmentalIndicatorsETL(ExtractTransformLoad):
)

def load(self) -> None:
logger.info("Saving CSV")

self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
self.df[self.COLUMNS_TO_KEEP].to_csv(
path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False
@@ -48,7 +48,6 @@ class FloodRiskETL(ExtractTransformLoad):
- Renames the Census Tract column to match the other datasets
- Calculates share of properties at risk, left-clipping number of properties at 250
"""
logger.info("Transforming National Risk Index Data")

# read in the unzipped csv data source then rename the
# Census Tract column for merging
@@ -48,7 +48,6 @@ class WildfireRiskETL(ExtractTransformLoad):
- Renames the Census Tract column to match the other datasets
- Calculates share of properties at risk, left-clipping number of properties at 250
"""
logger.info("Transforming National Risk Index Data")
# read in the unzipped csv data source then rename the
# Census Tract column for merging
df_fsf_fire: pd.DataFrame = pd.read_csv(
@@ -16,7 +16,7 @@ logger = get_module_logger(__name__)
def get_tract_geojson(
_tract_data_path: Optional[Path] = None,
) -> gpd.GeoDataFrame:
-logger.info("Loading tract geometry data from census ETL")
+logger.debug("Loading tract geometry data from census ETL")
GEOJSON_PATH = _tract_data_path
if GEOJSON_PATH is None:
GEOJSON_PATH = CensusETL.NATIONAL_TRACT_JSON_PATH

@@ -40,7 +40,7 @@ def get_tract_geojson(
def get_tribal_geojson(
_tribal_data_path: Optional[Path] = None,
) -> gpd.GeoDataFrame:
-logger.info("Loading Tribal geometry data from Tribal ETL")
+logger.debug("Loading Tribal geometry data from Tribal ETL")
GEOJSON_PATH = _tribal_data_path
if GEOJSON_PATH is None:
GEOJSON_PATH = TribalETL().NATIONAL_TRIBAL_GEOJSON_PATH
@@ -34,9 +34,6 @@ class GeoCorrETL(ExtractTransformLoad):
self.df: pd.DataFrame

def extract(self) -> None:
logger.info(
"Starting to download 2MB GeoCorr Urban Rural Census Tract Map file."
)
unzip_file_from_url(
file_url=settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/geocorr_urban_rural.csv.zip",

@@ -53,7 +50,6 @@ class GeoCorrETL(ExtractTransformLoad):
)

def transform(self) -> None:
logger.info("Starting GeoCorr Urban Rural Map transform")
# Put in logic from Jupyter Notebook transform when we switch in the hyperlink to Geocorr

self.output_df = self.df.rename(
@@ -43,7 +43,6 @@ class HistoricRedliningETL(ExtractTransformLoad):
self.df: pd.DataFrame

def transform(self) -> None:
logger.info("Transforming Historic Redlining Data")
# this is obviously temporary
historic_redlining_data = pd.read_excel(
self.HISTORIC_REDLINING_FILE_PATH

@@ -55,7 +54,7 @@ class HistoricRedliningETL(ExtractTransformLoad):
columns={"HRS2010": self.REDLINING_SCALAR}
)

-logger.info(f"{historic_redlining_data.columns}")
+logger.debug(f"{historic_redlining_data.columns}")

# Calculate lots of different score thresholds for convenience
for threshold in [3.25, 3.5, 3.75]:
@@ -23,7 +23,7 @@ class HousingTransportationETL(ExtractTransformLoad):
dfs = []
zip_file_dir = self.get_tmp_path() / "housing_and_transportation_index"
for fips in get_state_fips_codes(self.DATA_PATH):
-logger.info(
+logger.debug(
f"Downloading housing data for state/territory with FIPS code {fips}"
)

@@ -50,8 +50,6 @@ class HousingTransportationETL(ExtractTransformLoad):
self.df = pd.concat(dfs)

def transform(self) -> None:
logger.info("Transforming Housing and Transportation Data")

# Rename and reformat tract ID
self.df.rename(
columns={"tract": self.GEOID_TRACT_FIELD_NAME}, inplace=True

@@ -61,7 +59,5 @@ class HousingTransportationETL(ExtractTransformLoad):
].str.replace('"', "")

def load(self) -> None:
logger.info("Saving Housing and Transportation Data")

self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
self.df.to_csv(path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False)
@@ -56,7 +56,6 @@ class HudHousingETL(ExtractTransformLoad):
self.df: pd.DataFrame

def extract(self) -> None:
logger.info("Extracting 1.09 GB HUD Housing Data")
super().extract(
self.HOUSING_FTP_URL,
self.HOUSING_ZIP_FILE_DIR,

@@ -80,8 +79,6 @@ class HudHousingETL(ExtractTransformLoad):
return tmp_df

def transform(self) -> None:
logger.info("Transforming HUD Housing Data")

table_8 = self._read_chas_table("Table8.csv")
table_3 = self._read_chas_table("Table3.csv")
@@ -36,7 +36,6 @@ class HudRecapETL(ExtractTransformLoad):
self.df: pd.DataFrame

def extract(self) -> None:
logger.info("Downloading HUD Recap Data")
download = requests.get(
self.HUD_RECAP_CSV_URL,
verify=None,

@@ -48,8 +47,6 @@ class HudRecapETL(ExtractTransformLoad):
csv_file.close()

def transform(self) -> None:
logger.info("Transforming HUD Recap Data")

# Load comparison index (CalEnviroScreen 4)
self.df = pd.read_csv(self.HUD_RECAP_CSV, dtype={"GEOID": "string"})

@@ -75,7 +72,6 @@ class HudRecapETL(ExtractTransformLoad):
self.df.sort_values(by=self.GEOID_TRACT_FIELD_NAME, inplace=True)

def load(self) -> None:
logger.info("Saving HUD Recap CSV")
# write nationwide csv
self.CSV_PATH.mkdir(parents=True, exist_ok=True)
self.df.to_csv(self.CSV_PATH / "usa.csv", index=False)
@@ -39,7 +39,6 @@ class MappingForEJETL(ExtractTransformLoad):
self.df: pd.DataFrame

def extract(self) -> None:
logger.info("Downloading Mapping for EJ Data")
super().extract(
self.MAPPING_FOR_EJ_VA_URL,
self.get_tmp_path(),

@@ -50,8 +49,6 @@ class MappingForEJETL(ExtractTransformLoad):
)

def transform(self) -> None:
logger.info("Transforming Mapping for EJ Data")

# Join (here, it's just concatenating) the two dataframes from
# CO and VA
self.df = pd.concat(

@@ -86,7 +83,6 @@ class MappingForEJETL(ExtractTransformLoad):
)

def load(self) -> None:
logger.info("Saving Mapping for EJ CSV")
# write selected states csv
self.CSV_PATH.mkdir(parents=True, exist_ok=True)
self.df[self.COLUMNS_TO_KEEP].to_csv(

@@ -94,4 +90,4 @@ class MappingForEJETL(ExtractTransformLoad):
)

def validate(self) -> None:
-logger.info("Validating Mapping For EJ Data")
+logger.debug("Skipping validation for MappingForEJETL")
@@ -75,14 +75,12 @@ class MappingInequalityETL(ExtractTransformLoad):
self.df: pd.DataFrame

def extract(self) -> None:
logger.info("Downloading Mapping Inequality Data")
download_file_from_url(
file_url=self.MAPPING_INEQUALITY_CSV_URL,
download_file_name=self.MAPPING_INEQUALITY_CSV,
)

def transform(self) -> None:
logger.info("Transforming Mapping Inequality Data")
df: pd.DataFrame = pd.read_csv(
self.MAPPING_INEQUALITY_CSV,
dtype={self.TRACT_INPUT_FIELD: "string"},

@@ -207,7 +205,6 @@ class MappingInequalityETL(ExtractTransformLoad):
self.df = grouped_df

def load(self) -> None:
logger.info("Saving Mapping Inequality CSV")
# write nationwide csv
self.CSV_PATH.mkdir(parents=True, exist_ok=True)
self.df[self.COLUMNS_TO_KEEP].to_csv(
@@ -33,15 +33,13 @@ class MarylandEJScreenETL(ExtractTransformLoad):
self.df: pd.DataFrame

def extract(self) -> None:
-logger.info("Downloading 207MB Maryland EJSCREEN Data")
+logger.debug("Downloading 207MB Maryland EJSCREEN Data")
super().extract(
self.MARYLAND_EJSCREEN_URL,
self.get_tmp_path(),
)

def transform(self) -> None:
logger.info("Transforming Maryland EJSCREEN Data")

list_of_files = list(glob(str(self.SHAPE_FILES_PATH) + "/*.shp"))

# Ignore counties becauses this is not the level of measurement

@@ -105,7 +103,6 @@ class MarylandEJScreenETL(ExtractTransformLoad):
)

def load(self) -> None:
logger.info("Saving Maryland EJSCREEN CSV")
# write maryland tracts to csv
self.OUTPUT_CSV_PATH.mkdir(parents=True, exist_ok=True)
self.df[self.COLUMNS_TO_KEEP].to_csv(
@@ -33,7 +33,6 @@ class MichiganEnviroScreenETL(ExtractTransformLoad):
self.df: pd.DataFrame

def extract(self) -> None:
logger.info("Downloading Michigan EJSCREEN Data")
self.df = pd.read_csv(
filepath_or_buffer=self.MICHIGAN_EJSCREEN_S3_URL,
dtype={"GEO_ID": "string"},

@@ -41,8 +40,6 @@ class MichiganEnviroScreenETL(ExtractTransformLoad):
)

def transform(self) -> None:
logger.info("Transforming Michigan EJSCREEN Data")

self.df.rename(
columns={
"GEO_ID": self.GEOID_TRACT_FIELD_NAME,

@@ -60,7 +57,6 @@ class MichiganEnviroScreenETL(ExtractTransformLoad):
)

def load(self) -> None:
logger.info("Saving Michigan Environmental Screening Tool to CSV")
# write nationwide csv
self.CSV_PATH.mkdir(parents=True, exist_ok=True)
self.df[self.COLUMNS_TO_KEEP].to_csv(
@@ -69,7 +69,6 @@ class NationalRiskIndexETL(ExtractTransformLoad):
"""Unzips NRI dataset from the FEMA data source and writes the files
to the temporary data folder for use in the transform() method
"""
logger.info("Downloading 405MB National Risk Index Data")

super().extract(
source_url=self.SOURCE_URL,

@@ -84,7 +83,6 @@ class NationalRiskIndexETL(ExtractTransformLoad):
- Applies the NRI score for each Census Tract to the Census Block
Groups inside of that Tract
"""
logger.info("Transforming National Risk Index Data")

# read in the unzipped csv from NRI data source then rename the
# Census Tract column for merging
@@ -53,7 +53,6 @@ class NatureDeprivedETL(ExtractTransformLoad):

- Renames columns as needed
"""
logger.info("Transforming NLCD Data")

df_ncld: pd.DataFrame = pd.read_csv(
self.INPUT_CSV,
@@ -76,8 +76,6 @@ class PersistentPovertyETL(ExtractTransformLoad):
return df

def extract(self) -> None:
logger.info("Starting to download 86MB persistent poverty file.")

unzipped_file_path = self.get_tmp_path()

unzip_file_from_url(

@@ -124,7 +122,6 @@ class PersistentPovertyETL(ExtractTransformLoad):
self.df = self._join_input_dfs(temporary_input_dfs)

def transform(self) -> None:
logger.info("Starting persistent poverty transform")
transformed_df = self.df

# Note: the fields are defined as following.
@@ -77,7 +77,6 @@ class TreeEquityScoreETL(ExtractTransformLoad):
]

def extract(self) -> None:
logger.info("Downloading Tree Equity Score Data")
for state in self.states:
super().extract(
f"{self.TES_URL}{state}.zip.zip",

@@ -85,7 +84,6 @@ class TreeEquityScoreETL(ExtractTransformLoad):
)

def transform(self) -> None:
logger.info("Transforming Tree Equity Score Data")
tes_state_dfs = []
for state in self.states:
tes_state_dfs.append(

@@ -103,7 +101,6 @@ class TreeEquityScoreETL(ExtractTransformLoad):
)

def load(self) -> None:
logger.info("Saving Tree Equity Score CSV")
# write nationwide csv
self.CSV_PATH.mkdir(parents=True, exist_ok=True)
self.df = self.df[
@@ -28,7 +28,6 @@ class TribalETL(ExtractTransformLoad):
Returns:
None
"""
logger.info("Downloading Tribal Data")

bia_shapefile_zip_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL

@@ -77,7 +76,7 @@ class TribalETL(ExtractTransformLoad):
bia_national_lar_df = gpd.read_file(path)

# DELETE
-logger.info(f"Columns: {bia_national_lar_df.columns}\n")
+logger.debug(f"Columns: {bia_national_lar_df.columns}\n")

bia_national_lar_df.drop(
["GISAcres"],

@@ -186,8 +185,6 @@ class TribalETL(ExtractTransformLoad):
Returns:
None
"""
logger.info("Transforming Tribal Data")

# Set the filepaths:
bia_national_lar_shapefile = (
self.GEOGRAPHIC_BASE_PATH / "bia_national_lar"

@@ -220,7 +217,7 @@ class TribalETL(ExtractTransformLoad):
Returns:
None
"""
-logger.info("Saving Tribal GeoJson and CSV")
+logger.debug("Saving Tribal GeoJson and CSV")
usa_tribal_df = gpd.GeoDataFrame(
pd.concat(self.USA_TRIBAL_DF_LIST, ignore_index=True)
)

@@ -228,7 +225,7 @@ class TribalETL(ExtractTransformLoad):
"+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"
)

-logger.info("Writing national geojson file")
+logger.debug("Writing national geojson file")
usa_tribal_df.to_file(
self.NATIONAL_TRIBAL_GEOJSON_PATH, driver="GeoJSON"
)
@@ -94,8 +94,6 @@ class TribalOverlapETL(ExtractTransformLoad):
self.tribal_gdf = get_tribal_geojson()

def transform(self) -> None:
logger.info("Starting tribal overlap transforms.")

# First, calculate whether tracts include any areas from the Tribal areas,
# for both the points in AK and the polygons in the continental US (CONUS).
tribal_overlap_with_tracts = add_tracts_for_geometries(
@@ -56,8 +56,6 @@ class USArmyFUDS(ExtractTransformLoad):
self.output_df: pd.DataFrame

def extract(self) -> None:
logger.info("Starting FUDS data download.")

download_file_from_url(
file_url=self.FILE_URL,
download_file_name=self.DOWNLOAD_FILE_NAME,

@@ -65,11 +63,10 @@ class USArmyFUDS(ExtractTransformLoad):
)

def transform(self) -> None:
logger.info("Starting FUDS transform.")
# before we try to do any transformation, get the tract data
# so it's loaded and the census ETL is out of scope

-logger.info("Loading FUDS data as GeoDataFrame for transform")
+logger.debug("Loading FUDS data as GeoDataFrame for transform")
raw_df = gpd.read_file(
filename=self.DOWNLOAD_FILE_NAME,
low_memory=False,