mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-08-08 05:04:18 -07:00
Analysis by region (#385)
* Adding regional comparisons * Small ETL fixes
This commit is contained in:
parent
81290ce672
commit
67b39475f7
7 changed files with 440 additions and 158 deletions
|
@ -8,7 +8,7 @@ logger = get_module_logger(__name__)
|
|||
|
||||
class CalEnviroScreenETL(ExtractTransformLoad):
|
||||
def __init__(self):
|
||||
self.CALENVIROSCREEN_FTP_URL = "https://justice40-data.s3.amazonaws.com/CalEnviroScreen/CalEnviroScreen_4.0_2021.zip"
|
||||
self.CALENVIROSCREEN_FTP_URL = "https://justice40-data.s3.amazonaws.com/data-sources/CalEnviroScreen_4.0_2021.zip"
|
||||
self.CALENVIROSCREEN_CSV = self.TMP_PATH / "CalEnviroScreen_4.0_2021.csv"
|
||||
self.CSV_PATH = self.DATA_PATH / "dataset" / "calenviroscreen4"
|
||||
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
from pathlib import Path
|
||||
import csv
|
||||
import pandas as pd
|
||||
import os
|
||||
from config import settings
|
||||
|
||||
|
@ -53,3 +54,18 @@ def get_state_fips_codes(data_path: Path) -> list:
|
|||
fips = row[0].strip()
|
||||
fips_state_list.append(fips)
|
||||
return fips_state_list
|
||||
|
||||
|
||||
def get_state_information(data_path: Path) -> pd.DataFrame:
    """Read the 2010 state FIPS file and return it as a dataframe.

    Every column of the file is kept, which makes the result handy when
    the per-state regional information is needed.

    Args:
        data_path: Directory that contains ``census/csv/fips_states_2010.csv``.

    Returns:
        The file's contents, with the ``fips`` column converted to strings
        that are left-padded with zeros to at least two characters.
    """
    state_df = pd.read_csv(
        data_path / "census" / "csv" / "fips_states_2010.csv"
    )

    # Restore the leading zero on FIPS codes shorter than two characters.
    fips_as_text = state_df["fips"].astype(str)
    state_df["fips"] = fips_as_text.map(lambda code: code.zfill(2))

    return state_df
|
||||
|
|
|
@ -44,8 +44,6 @@ class HousingTransportationETL(ExtractTransformLoad):
|
|||
|
||||
self.df = pd.concat(dfs)
|
||||
|
||||
self.df.head()
|
||||
|
||||
def transform(self) -> None:
|
||||
logger.info(f"Transforming Housing and Transportation Data")
|
||||
|
||||
|
|
|
@ -33,7 +33,7 @@ class HudRecapETL(ExtractTransformLoad):
|
|||
logger.info(f"Transforming HUD Recap Data")
|
||||
|
||||
# Load comparison index (CalEnviroScreen 4)
|
||||
self.df = pd.read_csv(self.HUD_RECAP_CSV, dtype={"Census Tract": "string"})
|
||||
self.df = pd.read_csv(self.HUD_RECAP_CSV, dtype={"GEOID": "string"})
|
||||
|
||||
self.df.rename(
|
||||
columns={
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue