Data Pipeline performance improvements for Census GeoJSON and Score file

Authored by Carlos Felix on 2025-01-13 09:28:14 -05:00; committed by Carlos Felix
parent d5d055864f
commit c32bd1f363
37 changed files with 1305 additions and 1413 deletions


@@ -59,12 +59,6 @@ jobs:
         with:
           path: data/data-pipeline/data_pipeline/data/census
           key: data-census
-      - name: Install GDAL/ogr2ogr
-        if: steps.cache-census.outputs.cache-hit != 'true'
-        run: |
-          sudo apt-get update
-          sudo apt-get -y install gdal-bin
-          ogrinfo --version
       - name: Get Census Data
         if: steps.cache-census.outputs.cache-hit != 'true'
         run: |
@@ -72,7 +66,6 @@ jobs:
       - name: Run ETL
         run: |
           poetry run python3 -m data_pipeline.application etl-run
-          poetry run python3 -m data_pipeline.application etl-run --dataset tribal
       - name: Generate Score
         run: |
           poetry run python3 -m data_pipeline.application score-run


@@ -98,11 +98,6 @@ jobs:
       - name: Install dependencies
        run: poetry add s4cmd && poetry install
        if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
-      - name: Install GDAL/ogr2ogr
-        run: |
-          sudo apt-get update
-          sudo apt-get -y install gdal-bin
-          ogrinfo --version
       - name: Load cached ETL data
        id: cached-etl-data
        uses: actions/cache@v4
@@ -119,7 +114,6 @@ jobs:
        if: steps.cached-etl-data.outputs.cache-hit != 'true'
        run: |
          poetry run python3 -m data_pipeline.application etl-run
-          poetry run python3 -m data_pipeline.application etl-run --dataset tribal
       - name: Generate Score
        run: |
          poetry run python3 -m data_pipeline.application score-run


@@ -4,27 +4,9 @@
     // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
     "version": "0.2.0",
     "configurations": [
-        {
-            "name": "Score Run",
-            "type": "python",
-            "request": "launch",
-            "module": "data_pipeline.application",
-            "args": [
-                "score-run"
-            ]
-        },
-        {
-            "name": "Generate Score Post",
-            "type": "python",
-            "request": "launch",
-            "module": "data_pipeline.application",
-            "args": [
-                "generate-score-post"
-            ]
-        },
         {
             "name": "Data Cleanup",
-            "type": "python",
+            "type": "debugpy",
             "request": "launch",
             "module": "data_pipeline.application",
             "args": [
@@ -33,7 +15,7 @@
         },
         {
             "name": "Census Cleanup",
-            "type": "python",
+            "type": "debugpy",
             "request": "launch",
             "module": "data_pipeline.application",
             "args": [
@@ -42,73 +24,25 @@
         },
         {
             "name": "Download Census",
-            "type": "python",
+            "type": "debugpy",
             "request": "launch",
             "module": "data_pipeline.application",
             "args": [
-                "census-data-download"
-            ]
-        },
-        {
-            "name": "Score Full Run",
-            "type": "python",
-            "request": "launch",
-            "module": "data_pipeline.application",
-            "args": [
-                "score-full-run"
-            ]
-        },
-        {
-            "name": "Combine Score and GeoJSON",
-            "type": "python",
-            "request": "launch",
-            "module": "data_pipeline.application",
-            "args": [
-                "geo-score"
-            ]
-        },
-        {
-            "name": "Generate Score Tiles",
-            "type": "python",
-            "request": "launch",
-            "module": "data_pipeline.application",
-            "args": [
-                "generate-map-tiles"
-            ]
-        },
-        {
-            "name": "Generate Tribal Tiles",
-            "type": "python",
-            "request": "launch",
-            "module": "data_pipeline.application",
-            "args": [
-                "generate-map-tiles",
-                "-t"
+                "census-data-download", "-u"
             ]
         },
         {
             "name": "ETL Run",
-            "type": "python",
+            "type": "debugpy",
             "request": "launch",
             "module": "data_pipeline.application",
             "args": [
                 "etl-run"
             ]
         },
-        {
-            "name": "ETL Run NRI",
-            "type": "python",
-            "request": "launch",
-            "module": "data_pipeline.application",
-            "args": [
-                "etl-run",
-                "--dataset",
-                "national_risk_index"
-            ]
-        },
         {
             "name": "ETL Run Tribal",
-            "type": "python",
+            "type": "debugpy",
             "request": "launch",
             "module": "data_pipeline.application",
             "args": [
@@ -117,18 +51,91 @@
                 "tribal"
             ]
         },
+        {
+            "name": "Score Run",
+            "type": "debugpy",
+            "request": "launch",
+            "module": "data_pipeline.application",
+            "args": [
+                "score-run"
+            ]
+        },
+        {
+            "name": "Combine Score and GeoJSON",
+            "type": "debugpy",
+            "request": "launch",
+            "module": "data_pipeline.application",
+            "args": [
+                "geo-score"
+            ]
+        },
+        {
+            "name": "Generate Score Post",
+            "type": "debugpy",
+            "request": "launch",
+            "module": "data_pipeline.application",
+            "args": [
+                "generate-score-post"
+            ]
+        },
+        {
+            "name": "Generate Score Tiles",
+            "type": "debugpy",
+            "request": "launch",
+            "module": "data_pipeline.application",
+            "args": [
+                "generate-map-tiles"
+            ]
+        },
+        {
+            "name": "Generate Tribal Tiles",
+            "type": "debugpy",
+            "request": "launch",
+            "module": "data_pipeline.application",
+            "args": [
+                "generate-map-tiles",
+                "-t"
+            ]
+        },
+        {
+            "name": "Score Full Run",
+            "type": "debugpy",
+            "request": "launch",
+            "module": "data_pipeline.application",
+            "args": [
+                "score-full-run"
+            ]
+        },
         {
             "name": "Data Full Run",
-            "type": "python",
+            "type": "debugpy",
             "request": "launch",
             "module": "data_pipeline.application",
             "args": [
                 "data-full-run",
             ]
         },
+        {
+            "name": "Comparator",
+            "type": "debugpy",
+            "request": "launch",
+            "module": "data_pipeline.comparator",
+            "args": [
+                "compare-score",
+            ]
+        },
+        {
+            "name": "Convert score to CSV",
+            "type": "debugpy",
+            "request": "launch",
+            "module": "data_pipeline.application",
+            "args": [
+                "convert-score",
+            ]
+        },
         {
             "name": "poetry install",
-            "type": "python",
+            "type": "debugpy",
             "request": "launch",
             "module": "poetry",
             "args": [
@@ -137,7 +144,7 @@
         },
         {
             "name": "poetry update",
-            "type": "python",
+            "type": "debugpy",
             "request": "launch",
             "module": "poetry",
             "args": [


@@ -58,7 +58,6 @@ The application requires the installation of three 3rd party tools.
 
 | Tool            | Purpose              | Link                                                      |
 | --------------- | -------------------- | --------------------------------------------------------- |
-| GDAL            | Generate census data | [GDAL library](https://github.com/OSGeo/gdal)             |
 | libspatialindex | Score generation     | [libspatialindex](https://libspatialindex.org/en/latest/) |
 | tippecanoe      | Generate map tiles   | [Mapbox tippecanoe](https://github.com/mapbox/tippecanoe) |
@@ -66,7 +65,6 @@ The application requires the installation of three 3rd party tools.
 
 Use Homebrew to install the three tools.
 
-- GDAL: `brew install gdal`
 - libspatialindex: `brew install spatialindex`
 - tippecanoe: `brew install tippecanoe`


@@ -117,7 +117,7 @@ Begin the process of running the application in your local environment by downlo
 To download census data, run the command `poetry run python3 data_pipeline/application.py census-data-download`.
 
-If you have a high speed internet connection and don't want to generate the census data or install `GDAL` locally, you can download [a zip version of the Census file](https://justice40-data.s3.amazonaws.com/data-sources/census.zip). Unzip and move the contents inside the `data/data-pipeline/data_pipeline/data/census` folder.
+If you have a high speed internet connection and don't want to generate the census data locally, you can download [a zip version of the Census file](https://justice40-data.s3.amazonaws.com/data-sources/census.zip). Unzip and move the contents inside the `data/data-pipeline/data_pipeline/data/census` folder.
 
 #### Run the Application
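For the manual route mentioned above, a minimal sketch using only the Python standard library (URL and destination folder come from the README text; it assumes the archive's contents unpack directly into that folder):

```python
import urllib.request
import zipfile

# Fetch the pre-generated census archive instead of running census-data-download.
url = "https://justice40-data.s3.amazonaws.com/data-sources/census.zip"
urllib.request.urlretrieve(url, "census.zip")

# Unpack the contents into the pipeline's census data folder.
with zipfile.ZipFile("census.zip") as archive:
    archive.extractall("data/data-pipeline/data_pipeline/data/census")
```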


@@ -1,4 +1,7 @@
 import sys
+import os
+import pandas as pd
+from pathlib import Path
 
 from subprocess import call
 import click
@@ -19,6 +22,7 @@ from data_pipeline.etl.sources.tribal.etl_utils import (
     reset_data_directories as tribal_reset,
 )
 from data_pipeline.tile.generate import generate_tiles
+from data_pipeline.etl.score import constants
 from data_pipeline.utils import check_first_run
 from data_pipeline.utils import data_folder_cleanup
 from data_pipeline.utils import downloadable_cleanup
@@ -330,25 +334,11 @@ def data_full_run(check: bool, data_source: str, use_cache: bool):
         temp_folder_cleanup()
         tribal_reset(data_path)
 
-    if data_source == "local":
-        log_info("Downloading census data")
-        etl_runner("census", use_cache)
-
-        log_info("Running all ETLs")
-        etl_runner(use_cache=True)
-
-        log_info("Running tribal ETL")
-        etl_runner("tribal", use_cache)
-
-    else:
-        log_info("Downloading census data")
-        etl_runner("census", use_cache=False)
-
-        log_info("Running all ETLs")
-        etl_runner(use_cache=False)
-
-        log_info("Running tribal ETL")
-        etl_runner("tribal", use_cache=False)
+    log_info("Downloading census data")
+    etl_runner("census", use_cache)
+
+    log_info("Running all ETLs")
+    etl_runner(use_cache)
 
     log_info("Generating score")
     score_generate()
@@ -467,10 +457,41 @@ def full_run(ctx, use_cache):
     ctx.invoke(data_cleanup)
     ctx.invoke(census_data_download, zip_compress=False, use_cache=use_cache)
     ctx.invoke(etl_run, dataset=None, use_cache=use_cache)
-    ctx.invoke(etl_run, dataset="tribal", use_cache=use_cache)
     ctx.invoke(full_post_etl)
 
 
+@cli.command(
+    help="Convert a Pickle or Parquet file to GeoJSON or CSV depending on the contents of the file.",
+)
+@click.option(
+    "--source",
+    "-s",
+    type=click.Path(),
+    # We don't require this option, otherwise the tool will not run when there is no score
+    default=constants.DATA_SCORE_CSV_FULL_FILE_PATH,
+    help="Path to the input file. Defaults to the default location of the local score file.",
+)
+@click.option(
+    "--destination",
+    "-d",
+    type=click.Path(writable=True),
+    default=Path(
+        os.path.splitext(constants.DATA_SCORE_CSV_FULL_FILE_PATH)[0] + ".csv"
+    ),
+    help="Path to the output file. Defaults to the source file with a CSV extension.",
+)
+def convert_score(source: Path, destination: Path):
+    """Converts the score file to CSV."""
+    if source.exists():
+        score_df = pd.read_parquet(source)
+        logger.info(f"Saving score as CSV to {destination}")
+        score_df.to_csv(destination, index=False)
+        logger.info("Done.")
+    else:
+        logger.error(f"Error: Unable to read {source}")
+        sys.exit(1)
+
+
 def log_title(title: str, subtitle: str = None):
     """Logs a title in our fancy title format"""
     logger.info("-" * LOG_LINE_WIDTH)


@@ -51,12 +51,19 @@ def _read_from_file(file_path: Path):
             "Please generate the score and try again."
         )
         sys.exit(1)
-    return pd.read_csv(
-        file_path,
-        index_col="GEOID10_TRACT",
-        dtype={"GEOID10_TRACT": str},
-        low_memory=False,
-    ).sort_index()
+
+    df = pd.DataFrame()
+    if file_path.suffix == ".parquet":
+        df = pd.read_parquet(file_path)
+        df.set_index("GEOID10_TRACT", inplace=True)
+    else:
+        df = pd.read_csv(
+            file_path,
+            index_col="GEOID10_TRACT",
+            dtype={"GEOID10_TRACT": str},
+            low_memory=False,
+        )
+    return df.sort_index()
 
 
 def _add_tract_list(tract_list: list[str]):
@@ -125,7 +132,7 @@ def _compare_score_results(prod_df: pd.DataFrame, local_df: pd.DataFrame):
         local_df (pd.DataFrame): the local score
     """
     log_info("Comparing dataframe contents (production vs local)")
-    _add_text("\n\n## Scores\n")
+    _add_text("\n## Scores\n")
 
     production_row_count = len(prod_df.index)
     local_row_count = len(local_df.index)
@@ -213,17 +220,44 @@ def _compare_score_results(prod_df: pd.DataFrame, local_df: pd.DataFrame):
     )
     _add_tract_list(added_tracts)
 
-    # Grandfathered tracts from v1.0
-    grandfathered_tracts = local_df.loc[
-        local_df[field_names.GRANDFATHERED_N_COMMUNITIES_V1_0]
-    ].index
-    if len(grandfathered_tracts) > 0:
-        _add_text(
-            f"* This includes {len(grandfathered_tracts)} grandfathered tract(s) from v1.0 scoring."
-        )
-        _add_tract_list(grandfathered_tracts)
-    else:
-        _add_text("* There are NO grandfathered tracts from v1.0 scoring.\n")
+
+def _check_grandfathered_tracts(
+    prod_df: pd.DataFrame, local_df: pd.DataFrame, compare_to_version: str
+):
+    """
+    Find grandfathered tracts for v1.0 comparisons.
+
+    Args:
+        prod_df (pd.DataFrame): the production score
+        local_df (pd.DataFrame): the local score
+        compare_to_version (str): the compare to version
+    """
+    # Set the field we will check for grandfathering.
+    # This allows us to add other fields for other versions.
+    grandfathered_field = (
+        field_names.GRANDFATHERED_N_COMMUNITIES_V1_0
+        if compare_to_version.startswith("1")
+        else None
+    )
+
+    # If there is a grandfathered field then check for those tracts
+    if grandfathered_field:
+        log_info("Checking for grandfathered tracts")
+        grandfathered_tracts = local_df.loc[
+            local_df[field_names.GRANDFATHERED_N_COMMUNITIES_V1_0]
+        ].index
+        if len(grandfathered_tracts) > 0:
+            _add_text(
+                f"\n* This includes {len(grandfathered_tracts)} grandfathered tract(s) from v1.0 scoring."
+            )
+            _add_tract_list(grandfathered_tracts)
+        else:
+            _add_text(
+                "* There are NO grandfathered tracts from v1.0 scoring.\n"
+            )
+    else:
+        _add_text("\n* There is no grandfathered tract list for this version.")
 
 
 def _generate_delta(prod_df: pd.DataFrame, local_df: pd.DataFrame):
@@ -234,7 +268,7 @@ def _generate_delta(prod_df: pd.DataFrame, local_df: pd.DataFrame):
         prod_df (pd.DataFrame): the production score
         local_df (pd.DataFrame): the local score
     """
-    _add_text("\n## Delta\n")
+    _add_text("\n\n## Delta\n")
     # First we make the columns on two dataframes to be the same to be able to compare
     local_score_df_columns = local_df.columns.array.tolist()
     production_score_df_columns = prod_df.columns.array.tolist()
@@ -287,7 +321,7 @@ def cli():
 @click.option(
     "-v",
     "--compare-to-version",
-    default="1.0",
+    default="2.0",
     required=False,
     type=str,
     help="Set the production score version to compare to",
@@ -359,8 +393,10 @@ def compare_score(
 
     _compare_score_columns(production_score_df, local_score_df)
     _compare_score_results(production_score_df, local_score_df)
+    _check_grandfathered_tracts(
+        production_score_df, local_score_df, compare_to_version
+    )
     _generate_delta(production_score_df, local_score_df)
-
     result_doc = _get_result_doc()
     print(result_doc)
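With the default `--compare-to-version` now `2.0`, the comparator can also be driven from Python; a minimal sketch using Click's test runner (the `cli` group and the `compare-score` command come from this file, everything else is illustrative):

```python
from click.testing import CliRunner

from data_pipeline.comparator import cli

# Equivalent to: poetry run python3 -m data_pipeline.comparator compare-score -v 2.0
result = CliRunner().invoke(cli, ["compare-score", "-v", "2.0"])
print(result.output)
```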


@@ -155,7 +155,13 @@ DATASET_LIST = [
         "class_name": "HistoricRedliningETL",
         "is_memory_intensive": False,
     },
-    # This has to come after us.json exists
+    {
+        "name": "tribal",
+        "module_dir": "tribal",
+        "class_name": "TribalETL",
+        "is_memory_intensive": False,
+    },
+    # This has to come after us_geo.parquet exists
     {
         "name": "census_acs",
         "module_dir": "census_acs",
@@ -196,10 +202,3 @@ CENSUS_INFO = {
     "class_name": "CensusETL",
     "is_memory_intensive": False,
 }
-
-TRIBAL_INFO = {
-    "name": "tribal",
-    "module_dir": "tribal",
-    "class_name": "TribalETL",
-    "is_memory_intensive": False,
-}


@@ -1,5 +1,6 @@
 import concurrent.futures
 import importlib
+import time
 import typing
 import os
@@ -27,9 +28,7 @@ def _get_datasets_to_run(dataset_to_run: str) -> typing.List[dict]:
         None
     """
     dataset_list = constants.DATASET_LIST
-    etls_to_search = (
-        dataset_list + [constants.CENSUS_INFO] + [constants.TRIBAL_INFO]
-    )
+    etls_to_search = dataset_list + [constants.CENSUS_INFO]
 
     if dataset_to_run:
         dataset_element = next(
@@ -59,6 +58,8 @@ def _get_dataset(dataset: dict) -> ExtractTransformLoad:
 
 def _run_one_dataset(dataset: dict, use_cache: bool = False) -> None:
     """Runs one etl process."""
+
+    start_time = time.time()
     logger.info(f"Running ETL for {dataset['name']}")
 
     etl_instance = _get_dataset(dataset)
@@ -83,6 +84,9 @@ def _run_one_dataset(dataset: dict, use_cache: bool = False) -> None:
     etl_instance.cleanup()
 
     logger.info(f"Finished ETL for dataset {dataset['name']}")
+    logger.debug(
+        f"Execution time for ETL for dataset {dataset['name']} was {time.time() - start_time}s"
+    )
 
 
 def etl_runner(
@@ -197,10 +201,14 @@ def score_generate() -> None:
     """
 
     # Score Gen
+    start_time = time.time()
     score_gen = ScoreETL()
     score_gen.extract()
     score_gen.transform()
     score_gen.load()
+    logger.debug(
+        f"Execution time for Score Generation was {time.time() - start_time}s"
+    )
 
 
 def score_post(data_source: str = "local") -> None:
@@ -216,11 +224,15 @@ def score_post(data_source: str = "local") -> None:
         None
     """
     # Post Score Processing
+    start_time = time.time()
     score_post = PostScoreETL(data_source=data_source)
     score_post.extract()
     score_post.transform()
     score_post.load()
     score_post.cleanup()
+    logger.debug(
+        f"Execution time for Score Post was {time.time() - start_time}s"
+    )
 
 
 def score_geo(data_source: str = "local") -> None:
@@ -237,10 +249,14 @@ def score_geo(data_source: str = "local") -> None:
     """
 
     # Score Geo
+    start_time = time.time()
     score_geo = GeoScoreETL(data_source=data_source)
     score_geo.extract()
     score_geo.transform()
     score_geo.load()
+    logger.debug(
+        f"Execution time for Score Geo was {time.time() - start_time}s"
+    )
 
 
 def _find_dataset_index(dataset_list, key, value):


@@ -24,7 +24,7 @@ DATA_CENSUS_DIR = DATA_PATH / "census"
 DATA_CENSUS_CSV_DIR = DATA_CENSUS_DIR / "csv"
 DATA_CENSUS_CSV_FILE_PATH = DATA_CENSUS_CSV_DIR / "us.csv"
 DATA_CENSUS_CSV_STATE_FILE_PATH = DATA_CENSUS_CSV_DIR / "fips_states_2010.csv"
-DATA_CENSUS_GEOJSON_FILE_PATH = DATA_CENSUS_DIR / "geojson" / "us.json"
+DATA_CENSUS_GEOJSON_FILE_PATH = DATA_CENSUS_DIR / "geojson" / "us_geo.parquet"
 
 # Score paths
 DATA_SCORE_DIR = DATA_PATH / "score"
@@ -32,7 +32,7 @@ DATA_SCORE_DIR = DATA_PATH / "score"
 
 ## Score CSV Paths
 DATA_SCORE_CSV_DIR = DATA_SCORE_DIR / "csv"
 DATA_SCORE_CSV_FULL_DIR = DATA_SCORE_CSV_DIR / "full"
-DATA_SCORE_CSV_FULL_FILE_PATH = DATA_SCORE_CSV_FULL_DIR / "usa.csv"
+DATA_SCORE_CSV_FULL_FILE_PATH = DATA_SCORE_CSV_FULL_DIR / "usa_score.parquet"
 FULL_SCORE_CSV_FULL_PLUS_COUNTIES_FILE_PATH = (
     DATA_SCORE_CSV_FULL_DIR / "usa_counties.csv"
 )


@@ -727,4 +727,4 @@ class ScoreETL(ExtractTransformLoad):
 
     def load(self) -> None:
         constants.DATA_SCORE_CSV_FULL_DIR.mkdir(parents=True, exist_ok=True)
-        self.df.to_csv(constants.DATA_SCORE_CSV_FULL_FILE_PATH, index=False)
+        self.df.to_parquet(constants.DATA_SCORE_CSV_FULL_FILE_PATH, index=False)
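Because the full score is now persisted as Parquet, column dtypes travel with the file; a small sketch of the round trip (column names as used in the score, file name illustrative):

```python
import pandas as pd

df = pd.DataFrame(
    {"GEOID10_TRACT": ["01001020100"], "Total population": [1500]}
)
df.to_parquet("usa_score.parquet", index=False)

# Parquet keeps the tract ID as a string (leading zeros intact), so readers
# no longer need the dtype={"GEOID10_TRACT": "string"} / low_memory=False
# workarounds the CSV path required.
print(pd.read_parquet("usa_score.parquet").dtypes)
```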


@@ -37,9 +37,7 @@ class GeoScoreETL(ExtractTransformLoad):
         self.SCORE_CSV_PATH = self.DATA_PATH / "score" / "csv"
         self.TILE_SCORE_CSV = self.SCORE_CSV_PATH / "tiles" / "usa.csv"
 
-        self.CENSUS_USA_GEOJSON = (
-            self.DATA_PATH / "census" / "geojson" / "us.json"
-        )
+        self.CENSUS_USA_GEOJSON = constants.DATA_CENSUS_GEOJSON_FILE_PATH
 
         # Import the shortened name for Score N to be used on tiles.
         # We should no longer be using PFS
@@ -87,16 +85,14 @@ class GeoScoreETL(ExtractTransformLoad):
             score_data_source=self.DATA_SOURCE,
         )
 
-        logger.info("Reading US GeoJSON (~6 minutes)")
-        full_geojson_usa_df = gpd.read_file(
+        logger.info("Reading US GeoJSON")
+        full_geojson_usa_df = gpd.read_parquet(
             self.CENSUS_USA_GEOJSON,
-            dtype={self.GEOID_FIELD_NAME: "string"},
-            usecols=[
+            columns=[
                 self.GEOID_FIELD_NAME,
                 self.GEOMETRY_FIELD_NAME,
                 self.LAND_FIELD_NAME,
             ],
-            low_memory=False,
         )
 
         # We only want to keep tracts to visualize that have non-0 land
@@ -104,7 +100,7 @@ class GeoScoreETL(ExtractTransformLoad):
             full_geojson_usa_df[self.LAND_FIELD_NAME] > 0
         ]
 
-        logger.info("Reading score CSV")
+        logger.info("Reading tile score CSV")
         self.score_usa_df = pd.read_csv(
             self.TILE_SCORE_CSV,
             dtype={


@@ -94,12 +94,8 @@ class PostScoreETL(ExtractTransformLoad):
         )
 
     def _extract_score(self, score_path: Path) -> pd.DataFrame:
-        logger.debug("Reading Score CSV")
-        df = pd.read_csv(
-            score_path,
-            dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
-            low_memory=False,
-        )
+        logger.debug("Reading Score")
+        df = pd.read_parquet(score_path)
 
         # Convert total population to an int
         df["Total population"] = df["Total population"].astype(
@@ -116,8 +112,7 @@ class PostScoreETL(ExtractTransformLoad):
             gpd.GeoDataFrame: the census geo json data
         """
         logger.debug("Reading Census GeoJSON")
-        with open(geo_path, "r", encoding="utf-8") as file:
-            data = gpd.read_file(file)
+        data = gpd.read_parquet(geo_path)
         return data
 
     def extract(self, use_cached_data_sources: bool = False) -> None:


@@ -70,7 +70,7 @@ def state_data_initial(sample_data_dir):
 
 @pytest.fixture()
 def score_data_initial(sample_data_dir):
-    return sample_data_dir / "score_data_initial.csv"
+    return sample_data_dir / "score_data_initial.parquet"
 
 
 @pytest.fixture()
@@ -104,8 +104,8 @@ def states_transformed_expected():
 
 @pytest.fixture()
 def score_transformed_expected():
-    return pd.read_pickle(
-        pytest.SNAPSHOT_DIR / "score_transformed_expected.pkl"
+    return pd.read_parquet(
+        pytest.SNAPSHOT_DIR / "score_transformed_expected.parquet"
     )
@@ -122,7 +122,7 @@ def national_tract_df():
 
 @pytest.fixture()
 def score_data_expected():
-    return pd.read_pickle(pytest.SNAPSHOT_DIR / "score_data_expected.pkl")
+    return pd.read_parquet(pytest.SNAPSHOT_DIR / "score_data_expected.parquet")
 
 
 @pytest.fixture()
@@ -144,8 +144,8 @@ def create_tile_data_expected():
 
 @pytest.fixture()
 def downloadable_data_expected():
-    return pd.read_pickle(
-        pytest.SNAPSHOT_DIR / "downloadable_data_expected.pkl"
+    return pd.read_parquet(
+        pytest.SNAPSHOT_DIR / "downloadable_data_expected.parquet"
     )

File diff suppressed because one or more lines are too long


@@ -33,8 +33,7 @@ def test_extract_states(etl, state_data_initial):
 
 def test_extract_score(etl, score_data_initial):
     extracted = etl._extract_score(score_data_initial)
-    string_cols = ["GEOID10_TRACT"]
-    assert all(ptypes.is_string_dtype(extracted[col]) for col in string_cols)
+    assert len(extracted) > 0
 
 
 # Transform Tests
@@ -107,6 +106,7 @@ def test_create_downloadable_data(
     pdt.assert_frame_equal(
         output_downloadable_df_actual,
         downloadable_data_expected,
+        check_dtype=False,
     )


@@ -1,10 +1,9 @@
 import csv
-import json
-import subprocess
 from enum import Enum
 from pathlib import Path
 
 import geopandas as gpd
+import pandas as pd
 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
 from data_pipeline.utils import get_module_logger
@@ -26,8 +25,8 @@ class CensusETL(ExtractTransformLoad):
     CSV_BASE_PATH = ExtractTransformLoad.DATA_PATH / "census" / "csv"
     GEOJSON_PATH = ExtractTransformLoad.DATA_PATH / "census" / "geojson"
     NATIONAL_TRACT_CSV_PATH = CSV_BASE_PATH / "us.csv"
-    NATIONAL_TRACT_JSON_PATH = GEOJSON_BASE_PATH / "us.json"
-    GEOID_TRACT_FIELD_NAME: str = "GEOID10_TRACT"
+    NATIONAL_TRACT_JSON_PATH = GEOJSON_BASE_PATH / "us_geo.parquet"
+    GEOID_TRACT_FIELD_NAME: str = "GEOID10"
 
     def __init__(self):
@@ -59,7 +58,7 @@ class CensusETL(ExtractTransformLoad):
                 / f"tl_2010_{fips_code}_tract10.shp"
             )
         elif file_type == GeoFileType.GEOJSON:
-            file_path = Path(self.GEOJSON_BASE_PATH / f"{fips_code}.json")
+            file_path = Path(self.GEOJSON_BASE_PATH / f"{fips_code}.parquet")
         elif file_type == GeoFileType.CSV:
             file_path = Path(self.CSV_BASE_PATH / f"{fips_code}.csv")
         return file_path
@@ -93,14 +92,8 @@ class CensusETL(ExtractTransformLoad):
         )
 
         if not geojson_file_path.is_file():
-            cmd = [
-                "ogr2ogr",
-                "-f",
-                "GeoJSON",
-                str(geojson_file_path),
-                str(shp_file_path),
-            ]
-            subprocess.run(cmd, check=True)
+            gdf = gpd.read_file(shp_file_path)
+            gdf.to_parquet(geojson_file_path)
 
     def _generate_tract_table(self) -> None:
         """Generate Tract CSV table for pandas, load in memory
@@ -110,20 +103,15 @@
         """
         logger.debug("Transforming tracts")
 
-        for file in self.GEOJSON_BASE_PATH.iterdir():
-            if file.suffix == ".json":
-                logger.debug(f"Adding GEOID10 for file {file.name}")
-                with open(self.GEOJSON_BASE_PATH / file, encoding="utf-8") as f:
-                    geojson = json.load(f)
-                    for feature in geojson["features"]:
-                        tractid10 = feature["properties"]["GEOID10"]
-                        self.TRACT_NATIONAL.append(str(tractid10))
-                        tractid10_state_id = tractid10[:2]
-                        if not self.TRACT_PER_STATE.get(tractid10_state_id):
-                            self.TRACT_PER_STATE[tractid10_state_id] = []
-                        self.TRACT_PER_STATE[tractid10_state_id].append(
-                            tractid10
-                        )
+        files = list(self.GEOJSON_BASE_PATH.glob("[0-9]*.parquet"))
+        files.sort()
+        for file in files:
+            logger.debug(f"Adding GEOID10 for file {file.name}")
+            state_df = gpd.read_parquet(file)
+            tract_list = state_df["GEOID10"].to_list()
+            self.TRACT_NATIONAL.extend(tract_list)
+            tractid10_state_id = state_df["STATEFP10"][0]
+            self.TRACT_PER_STATE[tractid10_state_id] = tract_list
 
     def transform(self) -> None:
         """Download all census shape files from the Census FTP and extract the geojson
@@ -210,18 +198,24 @@
         usa_df = gpd.GeoDataFrame()
 
-        for file_name in self.GEOJSON_BASE_PATH.rglob("*.json"):
+        # Read state only files and append them into a MEGA US GPD
+        files = list(self.GEOJSON_BASE_PATH.glob("[0-9]*.parquet"))
+        files.sort()
+        for file_name in files:
             logger.debug(f"Adding national GeoJSON file {file_name.name}")
-            state_gdf = gpd.read_file(file_name)
-            usa_df = usa_df.append(state_gdf)
+            state_gdf = gpd.read_parquet(file_name)
+            usa_df = pd.concat([usa_df, state_gdf], ignore_index=True)
 
+        assert len(usa_df.columns) > 0
         logger.debug("Converting to CRS")
-        usa_df = usa_df.to_crs(
-            "+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"
-        )
+        usa_df = usa_df.to_crs("EPSG:4326")
 
         logger.debug("Saving national GeoJSON file")
-        usa_df.to_file(self.NATIONAL_TRACT_JSON_PATH, driver="GeoJSON")
+        # Convert tract ID to a string
+        usa_df[self.GEOID_TRACT_FIELD_NAME] = usa_df[
+            self.GEOID_TRACT_FIELD_NAME
+        ].astype(str, errors="ignore")
+        usa_df.to_parquet(self.NATIONAL_TRACT_JSON_PATH)
 
     def load(self) -> None:
         """Create state CSVs, National CSV, and National GeoJSON


@@ -104,7 +104,7 @@ def check_census_data_source(
         )
     else:
         # check if census data is found locally
-        if not os.path.isfile(census_data_path / "geojson" / "us.json"):
+        if not os.path.isfile(census_data_path / "geojson" / "us_geo.parquet"):
             logger.error(
                 "No local census data found. Please use '-s aws` to fetch from AWS"
             )


@@ -507,7 +507,7 @@ class CensusACSETL(ExtractTransformLoad):
         # geojson file for all of the US, this will read it off of S3
         logger.debug("Reading in geojson for the country")
         if not os.path.exists(
-            self.DATA_PATH / "census" / "geojson" / "us.json"
+            self.DATA_PATH / "census" / "geojson" / "us_geo.parquet"
         ):
             logger.debug("Fetching Census data from AWS S3")
             unzip_file_from_url(
@@ -515,9 +515,8 @@ class CensusACSETL(ExtractTransformLoad):
                 self.DATA_PATH / "tmp",
                 self.DATA_PATH,
             )
-
-        self.geo_df = gpd.read_file(
-            self.DATA_PATH / "census" / "geojson" / "us.json",
+        self.geo_df = gpd.read_parquet(
+            self.DATA_PATH / "census" / "geojson" / "us_geo.parquet",
         )
 
     def transform(self) -> None:


@@ -33,7 +33,7 @@ class CensusDecennialETL(ExtractTransformLoad):
         / f"census_decennial_{DECENNIAL_YEAR}"
     )
     CENSUS_GEOJSON_PATH = (
-        ExtractTransformLoad.DATA_PATH / "census" / "geojson" / "us.json"
+        ExtractTransformLoad.DATA_PATH / "census" / "geojson" / "us_geo.parquet"
     )
 
     def __get_api_url(
@@ -148,7 +148,7 @@ class CensusDecennialETL(ExtractTransformLoad):
         """Impute income for both income measures."""
 
         # Merges Census geojson to imput values from.
         logger.debug(f"Reading GeoJSON from {geojson_path}")
-        geo_df = gpd.read_file(geojson_path)
+        geo_df = gpd.read_parquet(geojson_path)
         self.df_all = CensusACSETL.merge_geojson(
             df=self.df_all,
             usa_geo_df=geo_df,


@@ -26,10 +26,7 @@ def get_tract_geojson(
         census_etl.extract()
         census_etl.transform()
         census_etl.load()
-    tract_data = gpd.read_file(
-        GEOJSON_PATH,
-        include_fields=["GEOID10"],
-    )
+    tract_data = gpd.read_parquet(GEOJSON_PATH)
     tract_data = tract_data.rename(
         columns={"GEOID10": "GEOID10_TRACT"}, errors="raise"
     )


@@ -7,10 +7,13 @@ from data_pipeline.score.field_names import GEOID_TRACT_FIELD
 
 @pytest.fixture(scope="session")
 def final_score_df():
-    return pd.read_csv(
-        settings.APP_ROOT / "data" / "score" / "csv" / "full" / "usa.csv",
-        dtype={GEOID_TRACT_FIELD: str},
-        low_memory=False,
+    return pd.read_parquet(
+        settings.APP_ROOT
+        / "data"
+        / "score"
+        / "csv"
+        / "full"
+        / "usa_score.parquet",
     )
@@ -173,7 +176,7 @@ def geocorr_urban_rural_df():
 
 @pytest.fixture()
 def census_decennial_df():
     census_decennial_csv = (
-        constants.DATA_PATH / "dataset" / "census_decennial_2010" / "usa.csv"
+        constants.DATA_PATH / "dataset" / "census_decennial_2020" / "usa.csv"
     )
     return pd.read_csv(
         census_decennial_csv,

File diff suppressed because one or more lines are too long


@@ -17,7 +17,7 @@ from data_pipeline.score.utils import (
 
 @contextmanager
 def patch_calculate_tract_adjacency_scores():
     # Use fixtures for tract data.
-    tract_data_path = Path(__file__).parent / "data" / "us.geojson"
+    tract_data_path = Path(__file__).parent / "data" / "us_geo.parquet"
 
     get_tract_geojson_mock = partial(
         get_tract_geojson, _tract_data_path=tract_data_path


@@ -68,7 +68,7 @@ def transformed_data_fixture(
     """Load the test data and call the ETL transform"""
     dec = CensusDecennialETL()
     dec.df_all = extracted_data_fixture
-    dec.transform(imputed_path_fixture / "census-us-territory-geojson.json")
+    dec.transform(imputed_path_fixture / "census-us-territory-geojson.parquet")
     return dec.df_all

File diff suppressed because it is too large


@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "justice40-data-pipeline"
-version = "1.0.1"
+version = "2.0"
 description = "ETL, Score and Map Generation of Justice 40 Tool"
 authors = ["Justice40 Engineering <j40-engineering@lists.usds.gov>"]
 keywords = ["justice40", "environmental_justice", "python", "etl"]
@@ -42,6 +42,7 @@ pydantic = "^1.9.0"
 Rtree = "^1.0.0"
 fiona = "~1.8.21"
 tenacity = ">=5.0.2"
+pyarrow = "^18.1.0"
 
 [tool.poetry.group.dev.dependencies]
 black = "^21"