Adding EAMLIS and FUDS data to legacy pollution in score (#1832)

Update to add EAMLIS and FUDS data to score
This commit is contained in:
Emma Nechamkin 2022-08-18 13:32:29 -04:00 committed by GitHub
commit cb4866b93f
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
14 changed files with 93 additions and 24 deletions

View file

@ -93,21 +93,23 @@ def etl_runner(dataset_to_run: str = None) -> None:
dataset for dataset in dataset_list if dataset["is_memory_intensive"]
]
logger.info("Running concurrent jobs")
with concurrent.futures.ThreadPoolExecutor() as executor:
futures = {
executor.submit(_run_one_dataset, dataset=dataset)
for dataset in concurrent_datasets
}
if concurrent_datasets:
logger.info("Running concurrent jobs")
with concurrent.futures.ThreadPoolExecutor() as executor:
futures = {
executor.submit(_run_one_dataset, dataset=dataset)
for dataset in concurrent_datasets
}
for fut in concurrent.futures.as_completed(futures):
# Calling result will raise an exception if one occurred.
# Otherwise, the exceptions are silently ignored.
fut.result()
for fut in concurrent.futures.as_completed(futures):
# Calling result will raise an exception if one occurred.
# Otherwise, the exceptions are silently ignored.
fut.result()
logger.info("Running high-memory jobs")
for dataset in high_memory_datasets:
_run_one_dataset(dataset=dataset)
if high_memory_datasets:
logger.info("Running high-memory jobs")
for dataset in high_memory_datasets:
_run_one_dataset(dataset=dataset)
def score_generate() -> None:

View file

@ -312,6 +312,8 @@ TILES_SCORE_COLUMNS = {
field_names.TRACT_PERCENT_NON_NATURAL_FIELD_NAME
+ field_names.PERCENTILE_FIELD_SUFFIX: "IS_PFS",
field_names.NON_NATURAL_LOW_INCOME_FIELD_NAME: "IS_ET",
field_names.AML_BOOLEAN: "AML_ET",
field_names.ELIGIBLE_FUDS_BINARY_FIELD_NAME: "FUDS_ET"
## FPL 200 and low higher ed for all others should no longer be M_EBSI, but rather
## FPL_200 (there is no higher ed in narwhal)
}

View file

@ -14,6 +14,8 @@ from data_pipeline.etl.sources.dot_travel_composite.etl import (
from data_pipeline.etl.sources.fsf_flood_risk.etl import (
FloodRiskETL,
)
from data_pipeline.etl.sources.eamlis.etl import AbandonedMineETL
from data_pipeline.etl.sources.us_army_fuds.etl import USArmyFUDS
from data_pipeline.etl.sources.nlcd_nature_deprived.etl import NatureDeprivedETL
from data_pipeline.etl.sources.fsf_wildfire_risk.etl import WildfireRiskETL
from data_pipeline.score.score_runner import ScoreRunner
@ -49,6 +51,8 @@ class ScoreETL(ExtractTransformLoad):
self.fsf_flood_df: pd.DataFrame
self.fsf_fire_df: pd.DataFrame
self.nature_deprived_df: pd.DataFrame
self.eamlis_df: pd.DataFrame
self.fuds_df: pd.DataFrame
def extract(self) -> None:
logger.info("Loading data sets from disk.")
@ -139,6 +143,12 @@ class ScoreETL(ExtractTransformLoad):
# Load NLCD Nature-Deprived Communities data
self.nature_deprived_df = NatureDeprivedETL.get_data_frame()
# Load eAMLIS dataset
self.eamlis_df = AbandonedMineETL.get_data_frame()
# Load FUDS dataset
self.fuds_df = USArmyFUDS.get_data_frame()
# Load GeoCorr Urban Rural Map
geocorr_urban_rural_csv = (
constants.DATA_PATH / "dataset" / "geocorr" / "usa.csv"
@ -362,6 +372,8 @@ class ScoreETL(ExtractTransformLoad):
self.fsf_flood_df,
self.fsf_fire_df,
self.nature_deprived_df,
self.eamlis_df,
self.fuds_df,
]
# Sanity check each data frame before merging.
@ -457,6 +469,8 @@ class ScoreETL(ExtractTransformLoad):
field_names.HISTORIC_REDLINING_SCORE_EXCEEDED,
field_names.TRACT_ELIGIBLE_FOR_NONNATURAL_THRESHOLD,
field_names.AGRICULTURAL_VALUE_BOOL_FIELD,
field_names.ELIGIBLE_FUDS_BINARY_FIELD_NAME,
field_names.AML_BOOLEAN,
]
# For some columns, high values are "good", so we want to reverse the percentile

File diff suppressed because one or more lines are too long

View file

@ -55,7 +55,7 @@ class USArmyFUDS(ExtractTransformLoad):
# before we try to do any transformation, get the tract data
# so it's loaded and the census ETL is out of scope
logger.info("Loading FUDs data as GeoDataFrame for transform")
logger.info("Loading FUDS data as GeoDataFrame for transform")
raw_df = gpd.read_file(
filename=self.DOWNLOAD_FILE_NAME,
low_memory=False,
@ -88,7 +88,7 @@ class USArmyFUDS(ExtractTransformLoad):
.size()
)
self.output_df = (
self.output_df.fillna(0).astype("int64").sort_index().reset_index()
self.output_df.fillna(0).astype(np.int64).sort_index().reset_index()
)
self.output_df[self.ELIGIBLE_FUDS_BINARY_FIELD_NAME] = np.where(