Mirror of https://github.com/DOI-DO/j40-cejst-2.git (synced 2025-02-22 09:41:26 -08:00)
Data Pipeline performance improvements for Census GeoJSON and Score file
parent d5d055864f
commit c32bd1f363
37 changed files with 1305 additions and 1413 deletions
.github/workflows/deploy_backend_main.yml (vendored, 7 changes)

@@ -59,12 +59,6 @@ jobs:
         with:
           path: data/data-pipeline/data_pipeline/data/census
           key: data-census
-      - name: Install GDAL/ogr2ogr
-        if: steps.cache-census.outputs.cache-hit != 'true'
-        run: |
-          sudo apt-get update
-          sudo apt-get -y install gdal-bin
-          ogrinfo --version
       - name: Get Census Data
         if: steps.cache-census.outputs.cache-hit != 'true'
         run: |
@@ -72,7 +66,6 @@ jobs:
       - name: Run ETL
         run: |
           poetry run python3 -m data_pipeline.application etl-run
-          poetry run python3 -m data_pipeline.application etl-run --dataset tribal
       - name: Generate Score
         run: |
           poetry run python3 -m data_pipeline.application score-run
.github/workflows/pr_backend.yml (vendored, 6 changes)

@@ -98,11 +98,6 @@ jobs:
       - name: Install dependencies
         run: poetry add s4cmd && poetry install
         if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
-      - name: Install GDAL/ogr2ogr
-        run: |
-          sudo apt-get update
-          sudo apt-get -y install gdal-bin
-          ogrinfo --version
       - name: Load cached ETL data
         id: cached-etl-data
         uses: actions/cache@v4
@@ -119,7 +114,6 @@ jobs:
         if: steps.cached-etl-data.outputs.cache-hit != 'true'
         run: |
           poetry run python3 -m data_pipeline.application etl-run
-          poetry run python3 -m data_pipeline.application etl-run --dataset tribal
       - name: Generate Score
         run: |
           poetry run python3 -m data_pipeline.application score-run
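With the tribal dataset folded into `DATASET_LIST` (see the constants change further down), a single `etl-run` pass appears to cover tribal data as well, which is why both workflows drop the separate `--dataset tribal` invocation. A minimal sketch of the consolidated call, assuming it is run from `data/data-pipeline` with the Poetry environment installed:

```python
# Minimal sketch, assuming the Poetry environment for data/data-pipeline is installed.
# One etl-run pass now includes the tribal dataset, so no second invocation is needed.
import subprocess

subprocess.run(
    ["poetry", "run", "python3", "-m", "data_pipeline.application", "etl-run"],
    check=True,
)
```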
data/data-pipeline/.vscode/launch.json (vendored, 157 changes)

@@ -4,27 +4,9 @@
     // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
     "version": "0.2.0",
     "configurations": [
-        {
-            "name": "Score Run",
-            "type": "python",
-            "request": "launch",
-            "module": "data_pipeline.application",
-            "args": [
-                "score-run"
-            ]
-        },
-        {
-            "name": "Generate Score Post",
-            "type": "python",
-            "request": "launch",
-            "module": "data_pipeline.application",
-            "args": [
-                "generate-score-post"
-            ]
-        },
         {
             "name": "Data Cleanup",
-            "type": "python",
+            "type": "debugpy",
             "request": "launch",
             "module": "data_pipeline.application",
             "args": [
@@ -33,7 +15,7 @@
         },
         {
             "name": "Census Cleanup",
-            "type": "python",
+            "type": "debugpy",
             "request": "launch",
             "module": "data_pipeline.application",
             "args": [
@@ -42,73 +24,25 @@
         },
         {
             "name": "Download Census",
-            "type": "python",
+            "type": "debugpy",
             "request": "launch",
             "module": "data_pipeline.application",
             "args": [
-                "census-data-download"
+                "census-data-download", "-u"
             ]
         },
-        {
-            "name": "Score Full Run",
-            "type": "python",
-            "request": "launch",
-            "module": "data_pipeline.application",
-            "args": [
-                "score-full-run"
-            ]
-        },
-        {
-            "name": "Combine Score and GeoJSON",
-            "type": "python",
-            "request": "launch",
-            "module": "data_pipeline.application",
-            "args": [
-                "geo-score"
-            ]
-        },
-        {
-            "name": "Generate Score Tiles",
-            "type": "python",
-            "request": "launch",
-            "module": "data_pipeline.application",
-            "args": [
-                "generate-map-tiles"
-            ]
-        },
-        {
-            "name": "Generate Tribal Tiles",
-            "type": "python",
-            "request": "launch",
-            "module": "data_pipeline.application",
-            "args": [
-                "generate-map-tiles",
-                "-t"
-            ]
-        },
         {
             "name": "ETL Run",
-            "type": "python",
+            "type": "debugpy",
             "request": "launch",
             "module": "data_pipeline.application",
             "args": [
                 "etl-run"
             ]
         },
-        {
-            "name": "ETL Run NRI",
-            "type": "python",
-            "request": "launch",
-            "module": "data_pipeline.application",
-            "args": [
-                "etl-run",
-                "--dataset",
-                "national_risk_index"
-            ]
-        },
         {
             "name": "ETL Run Tribal",
-            "type": "python",
+            "type": "debugpy",
             "request": "launch",
             "module": "data_pipeline.application",
             "args": [
@@ -117,18 +51,91 @@
                 "tribal"
             ]
         },
+        {
+            "name": "Score Run",
+            "type": "debugpy",
+            "request": "launch",
+            "module": "data_pipeline.application",
+            "args": [
+                "score-run"
+            ]
+        },
+        {
+            "name": "Combine Score and GeoJSON",
+            "type": "debugpy",
+            "request": "launch",
+            "module": "data_pipeline.application",
+            "args": [
+                "geo-score"
+            ]
+        },
+        {
+            "name": "Generate Score Post",
+            "type": "debugpy",
+            "request": "launch",
+            "module": "data_pipeline.application",
+            "args": [
+                "generate-score-post"
+            ]
+        },
+        {
+            "name": "Generate Score Tiles",
+            "type": "debugpy",
+            "request": "launch",
+            "module": "data_pipeline.application",
+            "args": [
+                "generate-map-tiles"
+            ]
+        },
+        {
+            "name": "Generate Tribal Tiles",
+            "type": "debugpy",
+            "request": "launch",
+            "module": "data_pipeline.application",
+            "args": [
+                "generate-map-tiles",
+                "-t"
+            ]
+        },
+        {
+            "name": "Score Full Run",
+            "type": "debugpy",
+            "request": "launch",
+            "module": "data_pipeline.application",
+            "args": [
+                "score-full-run"
+            ]
+        },
         {
             "name": "Data Full Run",
-            "type": "python",
+            "type": "debugpy",
             "request": "launch",
             "module": "data_pipeline.application",
             "args": [
                 "data-full-run",
             ]
         },
+        {
+            "name": "Comparator",
+            "type": "debugpy",
+            "request": "launch",
+            "module": "data_pipeline.comparator",
+            "args": [
+                "compare-score",
+            ]
+        },
+        {
+            "name": "Convert score to CSV",
+            "type": "debugpy",
+            "request": "launch",
+            "module": "data_pipeline.application",
+            "args": [
+                "convert-score",
+            ]
+        },
         {
             "name": "poetry install",
-            "type": "python",
+            "type": "debugpy",
             "request": "launch",
             "module": "poetry",
             "args": [
@@ -137,7 +144,7 @@
         },
         {
             "name": "poetry update",
-            "type": "python",
+            "type": "debugpy",
             "request": "launch",
             "module": "poetry",
             "args": [
@@ -58,7 +58,6 @@ The application requires the installation of three 3rd party tools.

 | Tool | Purpose | Link |
 | --------------- | -------------------- | --------------------------------------------------------- |
 | GDAL | Generate census data | [GDAL library](https://github.com/OSGeo/gdal) |
 | libspatialindex | Score generation | [libspatialindex](https://libspatialindex.org/en/latest/) |
 | tippecanoe | Generate map tiles | [Mapbox tippecanoe](https://github.com/mapbox/tippecanoe) |

@@ -66,7 +65,6 @@ The application requires the installation of three 3rd party tools.

 Use Homebrew to install the three tools.

 - GDAL: `brew install gdal`
 - libspatialindex: `brew install spatialindex`
 - tippecanoe: `brew install tippecanoe`

@@ -117,7 +117,7 @@ Begin the process of running the application in your local environment by downlo

 To download census data, run the command `poetry run python3 data_pipeline/application.py census-data-download`.

-If you have a high speed internet connection and don't want to generate the census data or install `GDAL` locally, you can download [a zip version of the Census file](https://justice40-data.s3.amazonaws.com/data-sources/census.zip). Unzip and move the contents inside the `data/data-pipeline/data_pipeline/data/census` folder.
+If you have a high speed internet connection and don't want to generate the census data locally, you can download [a zip version of the Census file](https://justice40-data.s3.amazonaws.com/data-sources/census.zip). Unzip and move the contents inside the `data/data-pipeline/data_pipeline/data/census` folder.

 #### Run the Application
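For readers following the README change above, here is a small sketch of the download-and-unzip path it describes. The URL and destination folder are the ones quoted in the README; the `requests` package is an assumption and not part of the documented steps.

```python
# Sketch of the "download the prebuilt census data" path described above.
# Assumes the `requests` package is available; URL and destination come from the README.
import io
import zipfile

import requests

CENSUS_ZIP_URL = "https://justice40-data.s3.amazonaws.com/data-sources/census.zip"
DESTINATION = "data/data-pipeline/data_pipeline/data/census"

response = requests.get(CENSUS_ZIP_URL, timeout=600)
response.raise_for_status()
zipfile.ZipFile(io.BytesIO(response.content)).extractall(DESTINATION)
```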
@@ -1,4 +1,7 @@
 import sys
+import os
+import pandas as pd
+from pathlib import Path
 from subprocess import call
 
 import click
@@ -19,6 +22,7 @@ from data_pipeline.etl.sources.tribal.etl_utils import (
     reset_data_directories as tribal_reset,
 )
 from data_pipeline.tile.generate import generate_tiles
+from data_pipeline.etl.score import constants
 from data_pipeline.utils import check_first_run
 from data_pipeline.utils import data_folder_cleanup
 from data_pipeline.utils import downloadable_cleanup
@@ -330,25 +334,11 @@ def data_full_run(check: bool, data_source: str, use_cache: bool):
     temp_folder_cleanup()
     tribal_reset(data_path)
 
-    if data_source == "local":
-        log_info("Downloading census data")
-        etl_runner("census", use_cache)
-
-        log_info("Running all ETLs")
-        etl_runner(use_cache=True)
-
-        log_info("Running tribal ETL")
-        etl_runner("tribal", use_cache)
-
-    else:
-        log_info("Downloading census data")
-        etl_runner("census", use_cache=False)
-
-        log_info("Running all ETLs")
-        etl_runner(use_cache=False)
-
-        log_info("Running tribal ETL")
-        etl_runner("tribal", use_cache=False)
+    log_info("Downloading census data")
+    etl_runner("census", use_cache)
+
+    log_info("Running all ETLs")
+    etl_runner(use_cache)
 
     log_info("Generating score")
     score_generate()
@@ -467,10 +457,41 @@ def full_run(ctx, use_cache):
     ctx.invoke(data_cleanup)
     ctx.invoke(census_data_download, zip_compress=False, use_cache=use_cache)
     ctx.invoke(etl_run, dataset=None, use_cache=use_cache)
     ctx.invoke(etl_run, dataset="tribal", use_cache=use_cache)
     ctx.invoke(full_post_etl)
 
 
+@cli.command(
+    help="Convert a Pickle or Parquet file to GeoJSON or CSV depending on the contents of the file.",
+)
+@click.option(
+    "--source",
+    "-s",
+    type=click.Path(),
+    # We don't require this option, otherwise the tool will not run when there is no score
+    default=constants.DATA_SCORE_CSV_FULL_FILE_PATH,
+    help="Path to the input file. Defaults to the default location of the local score file.",
+)
+@click.option(
+    "--destination",
+    "-d",
+    type=click.Path(writable=True),
+    default=Path(
+        os.path.splitext(constants.DATA_SCORE_CSV_FULL_FILE_PATH)[0] + ".csv"
+    ),
+    help="Path to the input file. Defaults to the source file with CSV extension.",
+)
+def convert_score(source: Path, destination: Path):
+    """Converts the score file to CSV."""
+    if source.exists():
+        score_df = pd.read_parquet(source)
+        logger.info(f"Saving score as CSV to {destination}")
+        score_df.to_csv(destination, index=False)
+        logger.info("Done.")
+    else:
+        logger.error(f"Error: Unable to read {source}")
+        sys.exit(1)
+
+
 def log_title(title: str, subtitle: str = None):
     """Logs a title in our fancy title format"""
     logger.info("-" * LOG_LINE_WIDTH)
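The new `convert-score` command above is a thin wrapper around a Parquet-to-CSV round trip. A minimal equivalent, with illustrative paths (the command itself defaults to `constants.DATA_SCORE_CSV_FULL_FILE_PATH`):

```python
# Minimal equivalent of the convert-score command above; paths are illustrative,
# the command defaults to constants.DATA_SCORE_CSV_FULL_FILE_PATH.
import pandas as pd

score_df = pd.read_parquet("data_pipeline/data/score/csv/full/usa_score.parquet")
score_df.to_csv("data_pipeline/data/score/csv/full/usa_score.csv", index=False)
```

From the command line this is `poetry run python3 -m data_pipeline.application convert-score`, with `-s`/`-d` to override the source and destination paths.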
@@ -51,12 +51,19 @@ def _read_from_file(file_path: Path):
             "Please generate the score and try again."
         )
         sys.exit(1)
-    return pd.read_csv(
-        file_path,
-        index_col="GEOID10_TRACT",
-        dtype={"GEOID10_TRACT": str},
-        low_memory=False,
-    ).sort_index()
+    df = pd.DataFrame()
+    if file_path.suffix == ".parquet":
+        df = pd.read_parquet(file_path)
+        df.set_index("GEOID10_TRACT", inplace=True)
+    else:
+        df = pd.read_csv(
+            file_path,
+            index_col="GEOID10_TRACT",
+            dtype={"GEOID10_TRACT": str},
+            low_memory=False,
+        )
+
+    return df.sort_index()
 
 
 def _add_tract_list(tract_list: list[str]):
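The hunk above makes the comparator accept either score format. A self-contained sketch of the same logic (the `GEOID10_TRACT` field name is the one used in the diff; the function name here is illustrative):

```python
# Self-contained sketch of the comparator's dual-format score reader shown above:
# Parquet files are read natively, anything else falls through to the CSV path.
from pathlib import Path

import pandas as pd


def read_score(file_path: Path) -> pd.DataFrame:
    if file_path.suffix == ".parquet":
        df = pd.read_parquet(file_path)
        df.set_index("GEOID10_TRACT", inplace=True)
    else:
        df = pd.read_csv(
            file_path,
            index_col="GEOID10_TRACT",
            dtype={"GEOID10_TRACT": str},
            low_memory=False,
        )
    return df.sort_index()
```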
@@ -67,7 +74,7 @@ def _add_tract_list(tract_list: list[str]):
         tract_list (list[str]): a list of tracts
     """
     if len(tract_list) > 0:
-        _add_text("Those tracts are:\n")
+        _add_text(" Those tracts are:\n")
         # First extract the Census states/territories
         states_by_tract = []
         for tract in tract_list:
@@ -125,7 +132,7 @@ def _compare_score_results(prod_df: pd.DataFrame, local_df: pd.DataFrame):
         local_df (pd.DataFrame): the local score
     """
     log_info("Comparing dataframe contents (production vs local)")
-    _add_text("\n\n## Scores\n")
+    _add_text("\n## Scores\n")
 
     production_row_count = len(prod_df.index)
     local_row_count = len(local_df.index)
@@ -189,10 +196,10 @@ def _compare_score_results(prod_df: pd.DataFrame, local_df: pd.DataFrame):
         f" in the locally generated score representing {local_pct_of_population_represented:.1%} of the total population."
     )
     _add_text(
-        " The number of tracts match!\n "
+        " The number of tracts match!\n"
         if len(production_disadvantaged_tracts_set)
         == len(local_disadvantaged_tracts_set)
-        else f" The difference is {abs(len(production_disadvantaged_tracts_set) - len(local_disadvantaged_tracts_set))} tract(s).\n "
+        else f" The difference is {abs(len(production_disadvantaged_tracts_set) - len(local_disadvantaged_tracts_set))} tract(s).\n"
     )
 
     removed_tracts = production_disadvantaged_tracts_set.difference(
@@ -213,17 +220,44 @@ def _compare_score_results(prod_df: pd.DataFrame, local_df: pd.DataFrame):
     )
     _add_tract_list(added_tracts)
 
-    # Grandfathered tracts from v1.0
-    log_info("Checking for grandfathered tracks")
-    grandfathered_tracts = local_df.loc[
-        local_df[field_names.GRANDFATHERED_N_COMMUNITIES_V1_0]
-    ].index
-    if len(grandfathered_tracts) > 0:
-        _add_text(
-            f"* This includes {len(grandfathered_tracts)} grandfathered tract(s) from v1.0 scoring."
-        )
-        _add_tract_list(grandfathered_tracts)
-    else:
-        _add_text("* There are NO grandfathered tracts from v1.0 scoring.\n")
+
+def _check_grandfathered_tracts(
+    prod_df: pd.DataFrame, local_df: pd.DataFrame, compare_to_version: str
+):
+    """
+    Find grandfathered tracts for v1.0 comparisons.
+
+    Args:
+        prod_df (pd.DataFrame): the production score
+        local_df (pd.DataFrame): the local score
+        compare_to_version (str): the compare to version
+    """
+
+    # Set the field we will check for grandfathering.
+    # This allows us to add other fields for other versions.
+    grandfathered_field = (
+        field_names.GRANDFATHERED_N_COMMUNITIES_V1_0
+        if compare_to_version.startswith("1")
+        else None
+    )
+
+    # If there is a grandfathered field then check for those tracts
+    if grandfathered_field:
+        log_info("Checking for grandfathered tracks")
+        grandfathered_tracts = local_df.loc[
+            local_df[field_names.GRANDFATHERED_N_COMMUNITIES_V1_0]
+        ].index
+        if len(grandfathered_tracts) > 0:
+            _add_text(
+                f"\n* This includes {len(grandfathered_tracts)} grandfathered tract(s) from v1.0 scoring."
+            )
+            _add_tract_list(grandfathered_tracts)
+        else:
+            _add_text(
+                "* There are NO grandfathered tracts from v1.0 scoring.\n"
+            )
+    else:
+        _add_text("\n* There is no grandfathered tract list for this version.")
 
 
 def _generate_delta(prod_df: pd.DataFrame, local_df: pd.DataFrame):
@@ -234,7 +268,7 @@ def _generate_delta(prod_df: pd.DataFrame, local_df: pd.DataFrame):
         prod_df (pd.DataFrame): the production score
         local_df (pd.DataFrame): the local score
     """
-    _add_text("\n## Delta\n")
+    _add_text("\n\n## Delta\n")
     # First we make the columns on two dataframes to be the same to be able to compare
     local_score_df_columns = local_df.columns.array.tolist()
     production_score_df_columns = prod_df.columns.array.tolist()
@@ -287,7 +321,7 @@ def cli():
 @click.option(
     "-v",
     "--compare-to-version",
-    default="1.0",
+    default="2.0",
     required=False,
     type=str,
     help="Set the production score version to compare to",
@@ -359,8 +393,10 @@ def compare_score(
 
     _compare_score_columns(production_score_df, local_score_df)
     _compare_score_results(production_score_df, local_score_df)
+    _check_grandfathered_tracts(
+        production_score_df, local_score_df, compare_to_version
+    )
     _generate_delta(production_score_df, local_score_df)
 
     result_doc = _get_result_doc()
     print(result_doc)
@@ -155,7 +155,13 @@ DATASET_LIST = [
         "class_name": "HistoricRedliningETL",
         "is_memory_intensive": False,
     },
-    # This has to come after us.json exists
+    {
+        "name": "tribal",
+        "module_dir": "tribal",
+        "class_name": "TribalETL",
+        "is_memory_intensive": False,
+    },
+    # This has to come after us_geo.parquet exists
     {
         "name": "census_acs",
         "module_dir": "census_acs",
@@ -196,10 +202,3 @@ CENSUS_INFO = {
     "class_name": "CensusETL",
     "is_memory_intensive": False,
 }
-
-TRIBAL_INFO = {
-    "name": "tribal",
-    "module_dir": "tribal",
-    "class_name": "TribalETL",
-    "is_memory_intensive": False,
-}
@@ -1,5 +1,6 @@
 import concurrent.futures
 import importlib
+import time
 import typing
 import os
 
@@ -27,9 +28,7 @@ def _get_datasets_to_run(dataset_to_run: str) -> typing.List[dict]:
         None
     """
     dataset_list = constants.DATASET_LIST
-    etls_to_search = (
-        dataset_list + [constants.CENSUS_INFO] + [constants.TRIBAL_INFO]
-    )
+    etls_to_search = dataset_list + [constants.CENSUS_INFO]
 
     if dataset_to_run:
         dataset_element = next(
@@ -59,6 +58,8 @@ def _get_dataset(dataset: dict) -> ExtractTransformLoad:
 def _run_one_dataset(dataset: dict, use_cache: bool = False) -> None:
     """Runs one etl process."""
 
+    start_time = time.time()
+
     logger.info(f"Running ETL for {dataset['name']}")
     etl_instance = _get_dataset(dataset)
 
@@ -83,6 +84,9 @@ def _run_one_dataset(dataset: dict, use_cache: bool = False) -> None:
     etl_instance.cleanup()
 
     logger.info(f"Finished ETL for dataset {dataset['name']}")
+    logger.debug(
+        f"Execution time for ETL for dataset {dataset['name']} was {time.time() - start_time}s"
+    )
 
 
 def etl_runner(
@@ -197,10 +201,14 @@ def score_generate() -> None:
     """
 
     # Score Gen
+    start_time = time.time()
     score_gen = ScoreETL()
     score_gen.extract()
     score_gen.transform()
     score_gen.load()
+    logger.debug(
+        f"Execution time for Score Generation was {time.time() - start_time}s"
+    )
 
 
 def score_post(data_source: str = "local") -> None:
@@ -216,11 +224,15 @@ def score_post(data_source: str = "local") -> None:
         None
     """
     # Post Score Processing
+    start_time = time.time()
     score_post = PostScoreETL(data_source=data_source)
     score_post.extract()
     score_post.transform()
     score_post.load()
     score_post.cleanup()
+    logger.debug(
+        f"Execution time for Score Post was {time.time() - start_time}s"
+    )
 
 
 def score_geo(data_source: str = "local") -> None:
@@ -237,10 +249,14 @@ def score_geo(data_source: str = "local") -> None:
     """
 
     # Score Geo
+    start_time = time.time()
     score_geo = GeoScoreETL(data_source=data_source)
     score_geo.extract()
     score_geo.transform()
     score_geo.load()
+    logger.debug(
+        f"Execution time for Score Geo was {time.time() - start_time}s"
+    )
 
 
 def _find_dataset_index(dataset_list, key, value):
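The runner changes above add per-stage timing at debug level, so normal runs stay quiet while performance work can still be measured. The pattern is the same for the ETL, score generation, post-processing and geo stages; a minimal standalone sketch:

```python
# Minimal sketch of the timing pattern the runner now wraps around each stage.
import logging
import time

logger = logging.getLogger(__name__)

start_time = time.time()
# ... run the stage here, e.g. extract() / transform() / load() ...
logger.debug(
    f"Execution time for Score Generation was {time.time() - start_time}s"
)
```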
@@ -24,7 +24,7 @@ DATA_CENSUS_DIR = DATA_PATH / "census"
 DATA_CENSUS_CSV_DIR = DATA_CENSUS_DIR / "csv"
 DATA_CENSUS_CSV_FILE_PATH = DATA_CENSUS_CSV_DIR / "us.csv"
 DATA_CENSUS_CSV_STATE_FILE_PATH = DATA_CENSUS_CSV_DIR / "fips_states_2010.csv"
-DATA_CENSUS_GEOJSON_FILE_PATH = DATA_CENSUS_DIR / "geojson" / "us.json"
+DATA_CENSUS_GEOJSON_FILE_PATH = DATA_CENSUS_DIR / "geojson" / "us_geo.parquet"
 
 # Score paths
 DATA_SCORE_DIR = DATA_PATH / "score"
@@ -32,7 +32,7 @@ DATA_SCORE_DIR = DATA_PATH / "score"
 ## Score CSV Paths
 DATA_SCORE_CSV_DIR = DATA_SCORE_DIR / "csv"
 DATA_SCORE_CSV_FULL_DIR = DATA_SCORE_CSV_DIR / "full"
-DATA_SCORE_CSV_FULL_FILE_PATH = DATA_SCORE_CSV_FULL_DIR / "usa.csv"
+DATA_SCORE_CSV_FULL_FILE_PATH = DATA_SCORE_CSV_FULL_DIR / "usa_score.parquet"
 FULL_SCORE_CSV_FULL_PLUS_COUNTIES_FILE_PATH = (
     DATA_SCORE_CSV_FULL_DIR / "usa_counties.csv"
 )
@@ -727,4 +727,4 @@ class ScoreETL(ExtractTransformLoad):
     def load(self) -> None:
         constants.DATA_SCORE_CSV_FULL_DIR.mkdir(parents=True, exist_ok=True)
 
-        self.df.to_csv(constants.DATA_SCORE_CSV_FULL_FILE_PATH, index=False)
+        self.df.to_parquet(constants.DATA_SCORE_CSV_FULL_FILE_PATH, index=False)
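The full score now lands on disk as Parquet rather than CSV. One practical upside, and presumably why readers elsewhere in this commit drop their `dtype=` workarounds, is that column dtypes survive the round trip; a quick sketch with made-up data:

```python
# Quick sketch of the Parquet round trip the score file now relies on.
# Unlike CSV, the string dtype of the tract IDs survives, so readers no longer
# need dtype={...: "string"} workarounds. pandas uses pyarrow as its engine.
import pandas as pd

df = pd.DataFrame({"GEOID10_TRACT": ["01001020100"], "Total population": [1500]})
df.to_parquet("usa_score.parquet", index=False)

roundtrip = pd.read_parquet("usa_score.parquet")
print(roundtrip.dtypes)  # GEOID10_TRACT comes back as a string column, not int64
```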
@@ -37,9 +37,7 @@ class GeoScoreETL(ExtractTransformLoad):
         self.SCORE_CSV_PATH = self.DATA_PATH / "score" / "csv"
         self.TILE_SCORE_CSV = self.SCORE_CSV_PATH / "tiles" / "usa.csv"
 
-        self.CENSUS_USA_GEOJSON = (
-            self.DATA_PATH / "census" / "geojson" / "us.json"
-        )
+        self.CENSUS_USA_GEOJSON = constants.DATA_CENSUS_GEOJSON_FILE_PATH
 
         # Import the shortened name for Score N to be used on tiles.
         # We should no longer be using PFS
@@ -87,16 +85,14 @@ class GeoScoreETL(ExtractTransformLoad):
             score_data_source=self.DATA_SOURCE,
         )
 
-        logger.info("Reading US GeoJSON (~6 minutes)")
-        full_geojson_usa_df = gpd.read_file(
+        logger.info("Reading US GeoJSON")
+        full_geojson_usa_df = gpd.read_parquet(
             self.CENSUS_USA_GEOJSON,
-            dtype={self.GEOID_FIELD_NAME: "string"},
-            usecols=[
+            columns=[
                 self.GEOID_FIELD_NAME,
                 self.GEOMETRY_FIELD_NAME,
                 self.LAND_FIELD_NAME,
             ],
-            low_memory=False,
         )
 
         # We only want to keep tracts to visualize that have non-0 land
@@ -104,7 +100,7 @@ class GeoScoreETL(ExtractTransformLoad):
             full_geojson_usa_df[self.LAND_FIELD_NAME] > 0
         ]
 
-        logger.info("Reading score CSV")
+        logger.info("Reading tile score CSV")
         self.score_usa_df = pd.read_csv(
             self.TILE_SCORE_CSV,
             dtype={
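Reading only the needed columns from the GeoParquet file is what replaces the old "~6 minutes" GeoJSON read in the hunk above. A sketch with illustrative column names (the ETL passes its own GEOID/geometry/land field constants, not these literals):

```python
# Sketch of the pruned GeoParquet read that replaces the GeoJSON load above.
# Column names here are illustrative; GeoScoreETL passes its own field constants.
import geopandas as gpd

usa_gdf = gpd.read_parquet(
    "data_pipeline/data/census/geojson/us_geo.parquet",
    columns=["GEOID10", "geometry", "ALAND10"],
)
```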
@@ -94,12 +94,8 @@ class PostScoreETL(ExtractTransformLoad):
         )
 
     def _extract_score(self, score_path: Path) -> pd.DataFrame:
-        logger.debug("Reading Score CSV")
-        df = pd.read_csv(
-            score_path,
-            dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
-            low_memory=False,
-        )
+        logger.debug("Reading Score")
+        df = pd.read_parquet(score_path)
 
         # Convert total population to an int
         df["Total population"] = df["Total population"].astype(
@@ -116,8 +112,7 @@ class PostScoreETL(ExtractTransformLoad):
             gpd.GeoDataFrame: the census geo json data
         """
         logger.debug("Reading Census GeoJSON")
-        with open(geo_path, "r", encoding="utf-8") as file:
-            data = gpd.read_file(file)
+        data = gpd.read_parquet(geo_path)
         return data
 
     def extract(self, use_cached_data_sources: bool = False) -> None:
@@ -70,7 +70,7 @@ def state_data_initial(sample_data_dir):
 
 @pytest.fixture()
 def score_data_initial(sample_data_dir):
-    return sample_data_dir / "score_data_initial.csv"
+    return sample_data_dir / "score_data_initial.parquet"
 
 
 @pytest.fixture()
@@ -104,8 +104,8 @@ def states_transformed_expected():
 
 @pytest.fixture()
 def score_transformed_expected():
-    return pd.read_pickle(
-        pytest.SNAPSHOT_DIR / "score_transformed_expected.pkl"
+    return pd.read_parquet(
+        pytest.SNAPSHOT_DIR / "score_transformed_expected.parquet"
     )
 
 
@@ -122,7 +122,7 @@ def national_tract_df():
 
 @pytest.fixture()
 def score_data_expected():
-    return pd.read_pickle(pytest.SNAPSHOT_DIR / "score_data_expected.pkl")
+    return pd.read_parquet(pytest.SNAPSHOT_DIR / "score_data_expected.parquet")
 
 
 @pytest.fixture()
@@ -144,8 +144,8 @@ def create_tile_data_expected():
 
 @pytest.fixture()
 def downloadable_data_expected():
-    return pd.read_pickle(
-        pytest.SNAPSHOT_DIR / "downloadable_data_expected.pkl"
+    return pd.read_parquet(
+        pytest.SNAPSHOT_DIR / "downloadable_data_expected.parquet"
    )
File diff suppressed because one or more lines are too long
Binary file not shown (7 files)
@@ -33,8 +33,7 @@ def test_extract_states(etl, state_data_initial):
 
 def test_extract_score(etl, score_data_initial):
     extracted = etl._extract_score(score_data_initial)
-    string_cols = ["GEOID10_TRACT"]
-    assert all(ptypes.is_string_dtype(extracted[col]) for col in string_cols)
     assert len(extracted) > 0
 
 
 # Transform Tests
@@ -107,6 +106,7 @@ def test_create_downloadable_data(
     pdt.assert_frame_equal(
         output_downloadable_df_actual,
         downloadable_data_expected,
+        check_dtype=False,
     )
@@ -1,10 +1,9 @@
 import csv
-import json
-import subprocess
 from enum import Enum
 from pathlib import Path
 
 import geopandas as gpd
+import pandas as pd
 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
 from data_pipeline.utils import get_module_logger
@@ -26,8 +25,8 @@ class CensusETL(ExtractTransformLoad):
     CSV_BASE_PATH = ExtractTransformLoad.DATA_PATH / "census" / "csv"
     GEOJSON_PATH = ExtractTransformLoad.DATA_PATH / "census" / "geojson"
     NATIONAL_TRACT_CSV_PATH = CSV_BASE_PATH / "us.csv"
-    NATIONAL_TRACT_JSON_PATH = GEOJSON_BASE_PATH / "us.json"
-    GEOID_TRACT_FIELD_NAME: str = "GEOID10_TRACT"
+    NATIONAL_TRACT_JSON_PATH = GEOJSON_BASE_PATH / "us_geo.parquet"
+    GEOID_TRACT_FIELD_NAME: str = "GEOID10"
 
     def __init__(self):
@@ -59,7 +58,7 @@ class CensusETL(ExtractTransformLoad):
                 / f"tl_2010_{fips_code}_tract10.shp"
             )
         elif file_type == GeoFileType.GEOJSON:
-            file_path = Path(self.GEOJSON_BASE_PATH / f"{fips_code}.json")
+            file_path = Path(self.GEOJSON_BASE_PATH / f"{fips_code}.parquet")
         elif file_type == GeoFileType.CSV:
             file_path = Path(self.CSV_BASE_PATH / f"{fips_code}.csv")
         return file_path
@@ -93,14 +92,8 @@ class CensusETL(ExtractTransformLoad):
         )
 
         if not geojson_file_path.is_file():
-            cmd = [
-                "ogr2ogr",
-                "-f",
-                "GeoJSON",
-                str(geojson_file_path),
-                str(shp_file_path),
-            ]
-            subprocess.run(cmd, check=True)
+            gdf = gpd.read_file(shp_file_path)
+            gdf.to_parquet(geojson_file_path)
 
     def _generate_tract_table(self) -> None:
         """Generate Tract CSV table for pandas, load in memory
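The hunk above is the core of the GDAL removal: instead of shelling out to `ogr2ogr` to produce per-state GeoJSON, each state shapefile is read with GeoPandas and written as (Geo)Parquet. A standalone sketch with an illustrative input path:

```python
# Standalone sketch of the per-state conversion above (paths are illustrative).
# GeoPandas reads the Census tract shapefile directly, so the ogr2ogr binary
# (and the gdal-bin apt package in CI) is no longer required.
import geopandas as gpd

gdf = gpd.read_file("tl_2010_01_tract10.shp")
gdf.to_parquet("01.parquet")
```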
@@ -110,20 +103,15 @@ class CensusETL(ExtractTransformLoad):
         """
         logger.debug("Transforming tracts")
 
-        for file in self.GEOJSON_BASE_PATH.iterdir():
-            if file.suffix == ".json":
-                logger.debug(f"Adding GEOID10 for file {file.name}")
-                with open(self.GEOJSON_BASE_PATH / file, encoding="utf-8") as f:
-                    geojson = json.load(f)
-                    for feature in geojson["features"]:
-                        tractid10 = feature["properties"]["GEOID10"]
-                        self.TRACT_NATIONAL.append(str(tractid10))
-                        tractid10_state_id = tractid10[:2]
-                        if not self.TRACT_PER_STATE.get(tractid10_state_id):
-                            self.TRACT_PER_STATE[tractid10_state_id] = []
-                        self.TRACT_PER_STATE[tractid10_state_id].append(
-                            tractid10
-                        )
+        files = list(self.GEOJSON_BASE_PATH.glob("[0-9]*.parquet"))
+        files.sort()
+        for file in files:
+            logger.debug(f"Adding GEOID10 for file {file.name}")
+            state_df = gpd.read_parquet(file)
+            tract_list = state_df["GEOID10"].to_list()
+            self.TRACT_NATIONAL.extend(tract_list)
+            tractid10_state_id = state_df["STATEFP10"][0]
+            self.TRACT_PER_STATE[tractid10_state_id] = tract_list
 
     def transform(self) -> None:
         """Download all census shape files from the Census FTP and extract the geojson
@@ -210,18 +198,24 @@ class CensusETL(ExtractTransformLoad):
         usa_df = gpd.GeoDataFrame()
 
-        for file_name in self.GEOJSON_BASE_PATH.rglob("*.json"):
+        # Read state only files and append them into a MEGA US GPD
+        files = list(self.GEOJSON_BASE_PATH.glob("[0-9]*.parquet"))
+        files.sort()
+        for file_name in files:
             logger.debug(f"Adding national GeoJSON file {file_name.name}")
-            state_gdf = gpd.read_file(file_name)
-            usa_df = usa_df.append(state_gdf)
+            state_gdf = gpd.read_parquet(file_name)
+            usa_df = pd.concat([usa_df, state_gdf], ignore_index=True)
 
         assert len(usa_df.columns) > 0
         logger.debug("Converting to CRS")
-        usa_df = usa_df.to_crs(
-            "+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"
-        )
+        usa_df = usa_df.to_crs("EPSG:4326")
 
         logger.debug("Saving national GeoJSON file")
-        usa_df.to_file(self.NATIONAL_TRACT_JSON_PATH, driver="GeoJSON")
+        # Convert tract ID to a string
+        usa_df[self.GEOID_TRACT_FIELD_NAME] = usa_df[
+            self.GEOID_TRACT_FIELD_NAME
+        ].astype(str, errors="ignore")
+        usa_df.to_parquet(self.NATIONAL_TRACT_JSON_PATH)
 
     def load(self) -> None:
         """Create state CSVs, National CSV, and National GeoJSON
@@ -104,7 +104,7 @@ def check_census_data_source(
         )
     else:
         # check if census data is found locally
-        if not os.path.isfile(census_data_path / "geojson" / "us.json"):
+        if not os.path.isfile(census_data_path / "geojson" / "us_geo.parquet"):
             logger.error(
                 "No local census data found. Please use '-s aws` to fetch from AWS"
             )
@@ -507,7 +507,7 @@ class CensusACSETL(ExtractTransformLoad):
         # geojson file for all of the US, this will read it off of S3
         logger.debug("Reading in geojson for the country")
         if not os.path.exists(
-            self.DATA_PATH / "census" / "geojson" / "us.json"
+            self.DATA_PATH / "census" / "geojson" / "us_geo.parquet"
         ):
             logger.debug("Fetching Census data from AWS S3")
             unzip_file_from_url(
@@ -515,9 +515,8 @@ class CensusACSETL(ExtractTransformLoad):
                 self.DATA_PATH / "tmp",
                 self.DATA_PATH,
             )
 
-        self.geo_df = gpd.read_file(
-            self.DATA_PATH / "census" / "geojson" / "us.json",
+        self.geo_df = gpd.read_parquet(
+            self.DATA_PATH / "census" / "geojson" / "us_geo.parquet",
         )
 
     def transform(self) -> None:
@@ -33,7 +33,7 @@ class CensusDecennialETL(ExtractTransformLoad):
         / f"census_decennial_{DECENNIAL_YEAR}"
     )
     CENSUS_GEOJSON_PATH = (
-        ExtractTransformLoad.DATA_PATH / "census" / "geojson" / "us.json"
+        ExtractTransformLoad.DATA_PATH / "census" / "geojson" / "us_geo.parquet"
     )
 
     def __get_api_url(
@@ -148,7 +148,7 @@ class CensusDecennialETL(ExtractTransformLoad):
         """Impute income for both income measures."""
         # Merges Census geojson to imput values from.
         logger.debug(f"Reading GeoJSON from {geojson_path}")
-        geo_df = gpd.read_file(geojson_path)
+        geo_df = gpd.read_parquet(geojson_path)
         self.df_all = CensusACSETL.merge_geojson(
             df=self.df_all,
             usa_geo_df=geo_df,
@@ -26,10 +26,7 @@ def get_tract_geojson(
         census_etl.extract()
         census_etl.transform()
         census_etl.load()
-    tract_data = gpd.read_file(
-        GEOJSON_PATH,
-        include_fields=["GEOID10"],
-    )
+    tract_data = gpd.read_parquet(GEOJSON_PATH)
     tract_data = tract_data.rename(
         columns={"GEOID10": "GEOID10_TRACT"}, errors="raise"
     )
@@ -7,10 +7,13 @@ from data_pipeline.score.field_names import GEOID_TRACT_FIELD
 
 @pytest.fixture(scope="session")
 def final_score_df():
-    return pd.read_csv(
-        settings.APP_ROOT / "data" / "score" / "csv" / "full" / "usa.csv",
-        dtype={GEOID_TRACT_FIELD: str},
-        low_memory=False,
+    return pd.read_parquet(
+        settings.APP_ROOT
+        / "data"
+        / "score"
+        / "csv"
+        / "full"
+        / "usa_score.parquet",
     )
 
 
@@ -173,7 +176,7 @@ def geocorr_urban_rural_df():
 @pytest.fixture()
 def census_decennial_df():
     census_decennial_csv = (
-        constants.DATA_PATH / "dataset" / "census_decennial_2010" / "usa.csv"
+        constants.DATA_PATH / "dataset" / "census_decennial_2020" / "usa.csv"
     )
     return pd.read_csv(
         census_decennial_csv,
File diff suppressed because one or more lines are too long
Binary file not shown.
@@ -17,7 +17,7 @@ from data_pipeline.score.utils import (
 @contextmanager
 def patch_calculate_tract_adjacency_scores():
     # Use fixtures for tract data.
-    tract_data_path = Path(__file__).parent / "data" / "us.geojson"
+    tract_data_path = Path(__file__).parent / "data" / "us_geo.parquet"
 
     get_tract_geojson_mock = partial(
         get_tract_geojson, _tract_data_path=tract_data_path
File diff suppressed because one or more lines are too long
Binary file not shown.
@@ -68,7 +68,7 @@ def transformed_data_fixture(
     """Load the test data and call the ETL transform"""
     dec = CensusDecennialETL()
     dec.df_all = extracted_data_fixture
-    dec.transform(imputed_path_fixture / "census-us-territory-geojson.json")
+    dec.transform(imputed_path_fixture / "census-us-territory-geojson.parquet")
     return dec.df_all
data/data-pipeline/poetry.lock (generated, 2063 changes)
File diff suppressed because it is too large.
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "justice40-data-pipeline"
-version = "1.0.1"
+version = "2.0"
 description = "ETL, Score and Map Generation of Justice 40 Tool"
 authors = ["Justice40 Engineering <j40-engineering@lists.usds.gov>"]
 keywords = ["justice40", "environmental_justice", "python", "etl"]
@@ -42,6 +42,7 @@ pydantic = "^1.9.0"
 Rtree = "^1.0.0"
 fiona = "~1.8.21"
 tenacity = ">=5.0.2"
+pyarrow = "^18.1.0"
 
 [tool.poetry.group.dev.dependencies]
 black = "^21"
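`pyarrow` is the Parquet engine behind the `read_parquet`/`to_parquet` calls introduced throughout this commit. A quick environment sanity check, assuming `poetry install` has been run:

```python
# Quick sanity check that the Parquet engine is in place (assumes `poetry install` ran).
import pandas as pd
import pyarrow

print(pyarrow.__version__)  # pinned at ^18.1.0 in pyproject.toml
pd.DataFrame({"ok": [True]}).to_parquet("engine_check.parquet")
```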