mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-07-31 09:11:19 -07:00
Analysis by region (#385)
* Adding regional comparisons * Small ETL fixes
This commit is contained in:
parent
81290ce672
commit
67b39475f7
7 changed files with 440 additions and 158 deletions
|
@ -394,13 +394,15 @@ class ScoreETL(ExtractTransformLoad):
|
|||
"Score C",
|
||||
"Score D",
|
||||
"Score E",
|
||||
"Poverty (Less than 200% of federal poverty line)",
|
||||
]:
|
||||
self.df[f"{score_field}{self.PERCENTILE_FIELD_SUFFIX}"] = self.df[
|
||||
score_field
|
||||
].rank(pct=True)
|
||||
self.df[f"{score_field} (top 25th percentile)"] = (
|
||||
self.df[f"{score_field}{self.PERCENTILE_FIELD_SUFFIX}"] >= 0.75
|
||||
)
|
||||
self.df[f"{score_field}{self.PERCENTILE_FIELD_SUFFIX}"] = self.df[score_field].rank(pct=True)
|
||||
|
||||
for threshold in [0.25, 0.3, 0.35, 0.4]:
|
||||
fraction_converted_to_percent = int(100 * threshold)
|
||||
self.df[f"{score_field} (top {fraction_converted_to_percent}th percentile)"] = (
|
||||
self.df[f"{score_field}{self.PERCENTILE_FIELD_SUFFIX}"] >= 1 - threshold
|
||||
)
|
||||
|
||||
def load(self) -> None:
|
||||
logger.info(f"Saving Score CSV")
|
||||
|
|
|
@ -8,7 +8,7 @@ logger = get_module_logger(__name__)
|
|||
|
||||
class CalEnviroScreenETL(ExtractTransformLoad):
|
||||
def __init__(self):
|
||||
self.CALENVIROSCREEN_FTP_URL = "https://justice40-data.s3.amazonaws.com/CalEnviroScreen/CalEnviroScreen_4.0_2021.zip"
|
||||
self.CALENVIROSCREEN_FTP_URL = "https://justice40-data.s3.amazonaws.com/data-sources/CalEnviroScreen_4.0_2021.zip"
|
||||
self.CALENVIROSCREEN_CSV = self.TMP_PATH / "CalEnviroScreen_4.0_2021.csv"
|
||||
self.CSV_PATH = self.DATA_PATH / "dataset" / "calenviroscreen4"
|
||||
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
from pathlib import Path
|
||||
import csv
|
||||
import pandas as pd
|
||||
import os
|
||||
from config import settings
|
||||
|
||||
|
@ -53,3 +54,18 @@ def get_state_fips_codes(data_path: Path) -> list:
|
|||
fips = row[0].strip()
|
||||
fips_state_list.append(fips)
|
||||
return fips_state_list
|
||||
|
||||
|
||||
def get_state_information(data_path: Path) -> pd.DataFrame:
    """Load the full state file as a dataframe.

    Useful because of the state regional information.

    Args:
        data_path: Base data directory. The file is read from
            ``census/csv/fips_states_2010.csv`` beneath it.

    Returns:
        The CSV contents, with the ``fips`` column converted to
        zero-padded two-character strings. Missing codes stay missing
        instead of becoming the literal string ``"nan"``.
    """
    fips_csv_path = data_path / "census" / "csv" / "fips_states_2010.csv"

    # Read the codes as text. If pandas infers the dtype, a single blank
    # code turns the column into floats ("1.0"), which breaks the padding.
    df = pd.read_csv(fips_csv_path, dtype={"fips": str})

    # Left pad the FIPS codes with 0s (after trimming stray whitespace)
    df["fips"] = df["fips"].str.strip().str.zfill(2)

    return df
|
||||
|
|
|
@ -44,8 +44,6 @@ class HousingTransportationETL(ExtractTransformLoad):
|
|||
|
||||
self.df = pd.concat(dfs)
|
||||
|
||||
self.df.head()
|
||||
|
||||
def transform(self) -> None:
|
||||
logger.info(f"Transforming Housing and Transportation Data")
|
||||
|
||||
|
|
|
@ -33,7 +33,7 @@ class HudRecapETL(ExtractTransformLoad):
|
|||
logger.info(f"Transforming HUD Recap Data")
|
||||
|
||||
# Load the HUD Recap data
|
||||
self.df = pd.read_csv(self.HUD_RECAP_CSV, dtype={"Census Tract": "string"})
|
||||
self.df = pd.read_csv(self.HUD_RECAP_CSV, dtype={"GEOID": "string"})
|
||||
|
||||
self.df.rename(
|
||||
columns={
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue