j40-cejst-2/score/etl/sources/ejscreen/etl.py
Jorge Escobar 842312f69f
ETL Classes for Data Sets (#260)
* first commit

* checkpoint

* checkpoint

* first extract module 🎉

* completed census acs etl class

* completed ejscreen etl

* completed etl

* score generation ready

* improving census load and separation

* score generation working 🎉

* completed etls

* new score generation

* PR reviews

* run specific etl; starting docstrings

* docstrings work

* more docstrings

* completed docstrings

* adding pyenv version

* more reasonable poetry req for python

* PR comments
2021-07-12 15:50:44 -04:00

39 lines
1.2 KiB
Python

import pandas as pd
from etl.base import ExtractTransformLoad
from utils import get_module_logger
# Module-level logger, obtained from the project's get_module_logger helper
# and keyed by this module's name.
logger = get_module_logger(__name__)
class EJScreenETL(ExtractTransformLoad):
    """ETL steps for the EPA EJSCREEN 2020 state-percentile CSV.

    The three steps are meant to be run in order:

    1. ``extract`` hands the source zip URL and the temp directory to the
       base class ``extract``.
    2. ``transform`` reads the CSV from the temp directory into ``self.df``.
    3. ``load`` writes ``self.df`` to ``<CSV_PATH>/usa.csv``.

    NOTE(review): ``TMP_PATH`` and ``DATA_PATH`` are not defined in this
    class; they are presumably supplied by ``ExtractTransformLoad`` --
    confirm against the base class.
    """

    def __init__(self):
        """Set the source URL and the input/output paths used by the steps."""
        # Zip archive containing the state-percentile CSV.
        self.EJSCREEN_FTP_URL = (
            "https://gaftp.epa.gov/EJSCREEN/2020/EJSCREEN_2020_StatePctile.csv.zip"
        )
        # Where transform() expects to find the CSV after extract() has run.
        self.EJSCREEN_CSV = self.TMP_PATH / "EJSCREEN_2020_StatePctile.csv"
        # Output directory for the CSV written by load().
        self.CSV_PATH = self.DATA_PATH / "dataset" / "ejscreen_2020"
        # Annotation only: the attribute does not exist until transform()
        # assigns it.
        self.df: pd.DataFrame

    def extract(self) -> None:
        """Pass the EJSCREEN zip URL and the temp path to the base ``extract``.

        NOTE(review): the base implementation presumably downloads and
        unzips the archive into ``TMP_PATH`` -- confirm in the base class.
        """
        logger.info("Downloading EJScreen Data")
        super().extract(
            self.EJSCREEN_FTP_URL,
            self.TMP_PATH,
        )

    def transform(self) -> None:
        """Read the EJSCREEN CSV into ``self.df``.

        The ``ID`` column is read with the pandas ``"string"`` dtype, and
        the literal text ``None`` is treated as a missing value.
        """
        logger.info("Transforming EJScreen Data")
        self.df = pd.read_csv(
            self.EJSCREEN_CSV,
            dtype={"ID": "string"},
            # EJSCREEN writes the word "None" for NA data.
            na_values=["None"],
            low_memory=False,
        )

    def load(self) -> None:
        """Write ``self.df`` to ``<CSV_PATH>/usa.csv`` without the index.

        ``transform`` must have run first: ``self.df`` is only assigned
        there, so calling ``load`` earlier raises ``AttributeError``.
        """
        logger.info("Saving EJScreen CSV")
        # write nationwide csv
        self.CSV_PATH.mkdir(parents=True, exist_ok=True)
        self.df.to_csv(self.CSV_PATH / "usa.csv", index=False)