Analysis by region (#385)

* Adding regional comparisons

* Small ETL fixes
This commit is contained in:
Lucas Merrill Brown 2021-07-26 08:02:25 -07:00 committed by GitHub
commit 67b39475f7
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 440 additions and 158 deletions

View file

@ -394,13 +394,15 @@ class ScoreETL(ExtractTransformLoad):
"Score C",
"Score D",
"Score E",
"Poverty (Less than 200% of federal poverty line)",
]:
self.df[f"{score_field}{self.PERCENTILE_FIELD_SUFFIX}"] = self.df[
score_field
].rank(pct=True)
self.df[f"{score_field} (top 25th percentile)"] = (
self.df[f"{score_field}{self.PERCENTILE_FIELD_SUFFIX}"] >= 0.75
)
self.df[f"{score_field}{self.PERCENTILE_FIELD_SUFFIX}"] = self.df[score_field].rank(pct=True)
for threshold in [0.25, 0.3, 0.35, 0.4]:
fraction_converted_to_percent = int(100 * threshold)
self.df[f"{score_field} (top {fraction_converted_to_percent}th percentile)"] = (
self.df[f"{score_field}{self.PERCENTILE_FIELD_SUFFIX}"] >= 1 - threshold
)
def load(self) -> None:
logger.info(f"Saving Score CSV")

View file

@ -8,7 +8,7 @@ logger = get_module_logger(__name__)
class CalEnviroScreenETL(ExtractTransformLoad):
def __init__(self):
self.CALENVIROSCREEN_FTP_URL = "https://justice40-data.s3.amazonaws.com/CalEnviroScreen/CalEnviroScreen_4.0_2021.zip"
self.CALENVIROSCREEN_FTP_URL = "https://justice40-data.s3.amazonaws.com/data-sources/CalEnviroScreen_4.0_2021.zip"
self.CALENVIROSCREEN_CSV = self.TMP_PATH / "CalEnviroScreen_4.0_2021.csv"
self.CSV_PATH = self.DATA_PATH / "dataset" / "calenviroscreen4"

View file

@ -1,5 +1,6 @@
from pathlib import Path
import csv
import pandas as pd
import os
from config import settings
@ -53,3 +54,18 @@ def get_state_fips_codes(data_path: Path) -> list:
fips = row[0].strip()
fips_state_list.append(fips)
return fips_state_list
def get_state_information(data_path: Path) -> pd.DataFrame:
    """Read the complete state FIPS CSV into a dataframe.

    Handy when the per-state regional information is needed.

    Args:
        data_path: Root data directory; the file is read from
            ``census/csv/fips_states_2010.csv`` beneath it.

    Returns:
        The CSV contents, with the ``fips`` column converted to strings
        that are left-padded with zeros to at least two characters.
    """
    state_csv = data_path / "census" / "csv" / "fips_states_2010.csv"
    states = pd.read_csv(state_csv)

    # Zero-pad the FIPS codes on the left (e.g. "1" -> "01").
    fips_as_text = states["fips"].astype(str)
    states["fips"] = fips_as_text.map(lambda code: code.zfill(2))
    return states

View file

@ -44,8 +44,6 @@ class HousingTransportationETL(ExtractTransformLoad):
self.df = pd.concat(dfs)
self.df.head()
def transform(self) -> None:
logger.info(f"Transforming Housing and Transportation Data")

View file

@ -33,7 +33,7 @@ class HudRecapETL(ExtractTransformLoad):
logger.info(f"Transforming HUD Recap Data")
# Load the HUD RECAP data
self.df = pd.read_csv(self.HUD_RECAP_CSV, dtype={"Census Tract": "string"})
self.df = pd.read_csv(self.HUD_RECAP_CSV, dtype={"GEOID": "string"})
self.df.rename(
columns={