Data directory should adopt standard Poetry-suggested python package structure (#457)

* Fixes #456 - Our data directory should adopt standard python package structure
* a few missed references
* updating readme
* updating requirements
* Running Black
* Fixes for flake8
* updating pylint
Nat Hillard 2021-08-05 15:35:54 -04:00 committed by GitHub
commit c1568e87c0
61 changed files with 1273 additions and 1256 deletions

@@ -0,0 +1,124 @@
import click
from .config import settings
from .etl.runner import etl_runner, score_generate, score_geo
from .etl.sources.census.etl import download_census_csvs
from .etl.sources.census.etl_utils import reset_data_directories as census_reset
from .tile.generate import generate_tiles
from .utils import (
data_folder_cleanup,
get_module_logger,
score_folder_cleanup,
temp_folder_cleanup,
)
logger = get_module_logger(__name__)
@click.group()
def cli():
"""Defines a click group for the commands below"""
pass
@cli.command(help="Clean up all census data folders")
def census_cleanup():
"""CLI command to clean up the census data folder"""
data_path = settings.APP_ROOT / "data"
# census directories
logger.info("Initializing all census data")
census_reset(data_path)
logger.info("Cleaned up all census data files")
@cli.command(help="Clean up all data folders")
def data_cleanup():
"""CLI command to clean up the all the data folders"""
data_folder_cleanup()
score_folder_cleanup()
temp_folder_cleanup()
logger.info("Cleaned up all data folders")
@cli.command(
help="Census data download",
)
def census_data_download():
"""CLI command to download all census shape files from the Census FTP and extract the geojson
to generate national and by state Census Block Group CSVs"""
data_path = settings.APP_ROOT / "data"
logger.info("Initializing all census data")
census_reset(data_path)
logger.info("Downloading census data")
download_census_csvs(data_path)
logger.info("Completed downloading census data")
@cli.command(
help="Run all ETL processes or a specific one",
)
@click.option("-d", "--dataset", required=False, type=str)
def etl_run(dataset: str):
"""Run a specific or all ETL processes
Args:
dataset (str): Name of the ETL module to be run (optional)
Returns:
None
"""
etl_runner(dataset)
@cli.command(
help="Generate Score",
)
def score_run():
"""CLI command to generate the score"""
score_generate()
@cli.command(
help="Run ETL + Score Generation",
)
def score_full_run():
"""CLI command to run ETL and generate the score in one command"""
data_folder_cleanup()
score_folder_cleanup()
temp_folder_cleanup()
etl_runner()
score_generate()
@cli.command(help="Generate Geojson files with scores baked in")
def geo_score():
"""CLI command to generate the score"""
score_geo()
@cli.command(
help="Generate map tiles",
)
def generate_map_tiles():
"""CLI command to generate the map tiles"""
data_path = settings.APP_ROOT / "data"
generate_tiles(data_path)
if __name__ == "__main__":
cli()
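As an aside, these commands can be exercised in-process with click's test runner instead of a shell. A minimal sketch, assuming this module is importable as data_pipeline.application (the import path is an assumption):

```python
from click.testing import CliRunner

from data_pipeline.application import cli  # assumed import path for this file

runner = CliRunner()
# click derives the command name `etl-run` from the function name `etl_run`.
result = runner.invoke(cli, ["etl-run", "--dataset", "ejscreen"])
print(result.exit_code, result.output)
```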

@@ -0,0 +1,18 @@
import pathlib
from dynaconf import Dynaconf
import data_pipeline
settings = Dynaconf(
envvar_prefix="DYNACONF",
settings_files=["settings.toml", ".secrets.toml"],
environments=True,
)
# set root dir
settings.APP_ROOT = pathlib.Path(data_pipeline.__file__).resolve().parent
# To set an environment use:
# Linux/OSX: export ENV_FOR_DYNACONF=staging
# Windows: set ENV_FOR_DYNACONF=staging
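For reference, the rest of the package consumes these settings by importing the shared object; a small sketch (assuming AWS_JUSTICE40_DATASOURCES_URL is defined in settings.toml, as the ETL modules below imply):

```python
from data_pipeline.config import settings

print(settings.APP_ROOT)  # the package root, set above as a pathlib.Path
# Values such as AWS_JUSTICE40_DATASOURCES_URL come from settings.toml.
print(settings.AWS_JUSTICE40_DATASOURCES_URL)
```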

@@ -0,0 +1,60 @@
from pathlib import Path
from data_pipeline.config import settings
from data_pipeline.utils import unzip_file_from_url, remove_all_from_dir
class ExtractTransformLoad:
"""
A class used to instantiate an ETL object to retrieve and process data from
datasets.
Attributes:
DATA_PATH (pathlib.Path): Local path where all data will be stored
TMP_PATH (pathlib.Path): Local path where temporary data will be stored
GEOID_FIELD_NAME (str): The common column name for a Census Block Group identifier
GEOID_TRACT_FIELD_NAME (str): The common column name for a Census Tract identifier
"""
DATA_PATH: Path = settings.APP_ROOT / "data"
TMP_PATH: Path = DATA_PATH / "tmp"
GEOID_FIELD_NAME: str = "GEOID10"
GEOID_TRACT_FIELD_NAME: str = "GEOID10_TRACT"
def get_yaml_config(self) -> None:
"""Reads the YAML configuration file for the dataset and stores
        the properties in the instance (upcoming feature)"""
pass
def check_ttl(self) -> None:
"""Checks if the ETL process can be run based on a the TLL value on the
YAML config (upcoming feature)"""
pass
def extract(self, source_url: str = None, extract_path: Path = None) -> None:
"""Extract the data from
a remote source. By default it provides code to get the file from a source url,
unzips it and stores it on an extract_path."""
# this can be accessed via super().extract()
if source_url and extract_path:
unzip_file_from_url(source_url, self.TMP_PATH, extract_path)
def transform(self) -> None:
"""Transform the data extracted into a format that can be consumed by the
score generator"""
raise NotImplementedError
def load(self) -> None:
"""Saves the transformed data in the specified local data folder or remote AWS S3
bucket"""
raise NotImplementedError
def cleanup(self) -> None:
"""Clears out any files stored in the TMP folder"""
remove_all_from_dir(self.TMP_PATH)
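To illustrate the contract: subclasses override transform and load and usually delegate downloading to the base extract. A minimal sketch against a hypothetical dataset (the URL, file names, and paths are illustrative, not from this commit):

```python
import pandas as pd

from data_pipeline.etl.base import ExtractTransformLoad


class ExampleETL(ExtractTransformLoad):
    def __init__(self):
        # Hypothetical source; real subclasses define their own URLs and paths.
        self.EXAMPLE_URL = "https://example.com/example.zip"
        self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "example"
        self.df: pd.DataFrame

    def extract(self) -> None:
        # Reuse the base class download-and-unzip helper.
        super().extract(self.EXAMPLE_URL, self.TMP_PATH)

    def transform(self) -> None:
        self.df = pd.read_csv(self.TMP_PATH / "example.csv")

    def load(self) -> None:
        self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
        self.df.to_csv(self.OUTPUT_PATH / "usa.csv", index=False)
```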

@@ -0,0 +1,137 @@
import importlib
from data_pipeline.etl.score.etl_score import ScoreETL
from data_pipeline.etl.score.etl_score_geo import GeoScoreETL
from data_pipeline.etl.score.etl_score_post import PostScoreETL
def etl_runner(dataset_to_run: str = None) -> None:
"""Runs all etl processes or a specific one
Args:
dataset_to_run (str): Run a specific ETL process. If missing, runs all processes (optional)
Returns:
None
"""
# this list comes from YAMLs
dataset_list = [
{
"name": "tree_equity_score",
"module_dir": "tree_equity_score",
"class_name": "TreeEquityScoreETL",
},
{
"name": "census_acs",
"module_dir": "census_acs",
"class_name": "CensusACSETL",
},
{
"name": "ejscreen",
"module_dir": "ejscreen",
"class_name": "EJScreenETL",
},
{
"name": "housing_and_transportation",
"module_dir": "housing_and_transportation",
"class_name": "HousingTransportationETL",
},
{
"name": "hud_housing",
"module_dir": "hud_housing",
"class_name": "HudHousingETL",
},
{
"name": "calenviroscreen",
"module_dir": "calenviroscreen",
"class_name": "CalEnviroScreenETL",
},
{
"name": "hud_recap",
"module_dir": "hud_recap",
"class_name": "HudRecapETL",
},
]
if dataset_to_run:
dataset_element = next(
(item for item in dataset_list if item["name"] == dataset_to_run),
None,
)
        if not dataset_element:
raise ValueError("Invalid dataset name")
else:
# reset the list to just the dataset
dataset_list = [dataset_element]
# Run the ETLs for the dataset_list
for dataset in dataset_list:
etl_module = importlib.import_module(
f"data_pipeline.etl.sources.{dataset['module_dir']}.etl"
)
etl_class = getattr(etl_module, dataset["class_name"])
etl_instance = etl_class()
# run extract
etl_instance.extract()
# run transform
etl_instance.transform()
# run load
etl_instance.load()
# cleanup
etl_instance.cleanup()
# update the front end JSON/CSV of list of data sources
pass
def score_generate() -> None:
"""Generates the score and saves it on the local data directory
Args:
None
Returns:
None
"""
# Score Gen
score_gen = ScoreETL()
score_gen.extract()
score_gen.transform()
score_gen.load()
# Post Score Processing
score_post = PostScoreETL()
score_post.extract()
score_post.transform()
score_post.load()
score_post.cleanup()
def score_geo() -> None:
"""Generates the geojson files with score data baked in
Args:
None
Returns:
None
"""
# Score Geo
score_geo = GeoScoreETL()
score_geo.extract()
score_geo.transform()
score_geo.load()
def _find_dataset_index(dataset_list, key, value):
for i, element in enumerate(dataset_list):
if element[key] == value:
return i
return -1
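For reference, the same flow the CLI drives can be invoked directly from Python; a short sketch:

```python
from data_pipeline.etl.runner import etl_runner, score_generate, score_geo

etl_runner("ejscreen")  # one dataset; etl_runner() with no argument runs all
score_generate()        # build the score CSVs from the ETL outputs
score_geo()             # bake the score into GeoJSON
```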

@@ -0,0 +1,401 @@
import collections
import functools
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
class ScoreETL(ExtractTransformLoad):
def __init__(self):
# Define some global parameters
self.BUCKET_SOCIOECONOMIC = "Socioeconomic Factors"
self.BUCKET_SENSITIVE = "Sensitive populations"
self.BUCKET_ENVIRONMENTAL = "Environmental effects"
self.BUCKET_EXPOSURES = "Exposures"
self.BUCKETS = [
self.BUCKET_SOCIOECONOMIC,
self.BUCKET_SENSITIVE,
self.BUCKET_ENVIRONMENTAL,
self.BUCKET_EXPOSURES,
]
# A few specific field names
# TODO: clean this up, I name some fields but not others.
self.UNEMPLOYED_FIELD_NAME = "Unemployed civilians (percent)"
self.LINGUISTIC_ISOLATION_FIELD_NAME = "Linguistic isolation (percent)"
self.HOUSING_BURDEN_FIELD_NAME = "Housing burden (percent)"
self.POVERTY_FIELD_NAME = "Poverty (Less than 200% of federal poverty line)"
self.HIGH_SCHOOL_FIELD_NAME = (
"Percent individuals age 25 or over with less than high school degree"
)
# There's another aggregation level (a second level of "buckets").
self.AGGREGATION_POLLUTION = "Pollution Burden"
self.AGGREGATION_POPULATION = "Population Characteristics"
self.PERCENTILE_FIELD_SUFFIX = " (percentile)"
self.MIN_MAX_FIELD_SUFFIX = " (min-max normalized)"
self.SCORE_CSV_PATH = self.DATA_PATH / "score" / "csv" / "full"
# dataframes
self.df: pd.DataFrame
self.ejscreen_df: pd.DataFrame
self.census_df: pd.DataFrame
self.housing_and_transportation_df: pd.DataFrame
self.hud_housing_df: pd.DataFrame
def extract(self) -> None:
# EJSCreen csv Load
ejscreen_csv = self.DATA_PATH / "dataset" / "ejscreen_2019" / "usa.csv"
self.ejscreen_df = pd.read_csv(
ejscreen_csv, dtype={"ID": "string"}, low_memory=False
)
self.ejscreen_df.rename(columns={"ID": self.GEOID_FIELD_NAME}, inplace=True)
# Load census data
census_csv = self.DATA_PATH / "dataset" / "census_acs_2019" / "usa.csv"
self.census_df = pd.read_csv(
census_csv, dtype={self.GEOID_FIELD_NAME: "string"}, low_memory=False,
)
# Load housing and transportation data
housing_and_transportation_index_csv = (
self.DATA_PATH / "dataset" / "housing_and_transportation_index" / "usa.csv"
)
self.housing_and_transportation_df = pd.read_csv(
housing_and_transportation_index_csv,
dtype={self.GEOID_FIELD_NAME: "string"},
low_memory=False,
)
# Load HUD housing data
hud_housing_csv = self.DATA_PATH / "dataset" / "hud_housing" / "usa.csv"
self.hud_housing_df = pd.read_csv(
hud_housing_csv,
dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
low_memory=False,
)
def transform(self) -> None:
logger.info("Transforming Score Data")
# Join all the data sources that use census block groups
census_block_group_dfs = [
self.ejscreen_df,
self.census_df,
self.housing_and_transportation_df,
]
census_block_group_df = functools.reduce(
lambda left, right: pd.merge(
left=left, right=right, on=self.GEOID_FIELD_NAME, how="outer"
),
census_block_group_dfs,
)
# Sanity check the join.
if len(census_block_group_df[self.GEOID_FIELD_NAME].str.len().unique()) != 1:
raise ValueError(
f"One of the input CSVs uses {self.GEOID_FIELD_NAME} with a different length."
)
# Join all the data sources that use census tracts
# TODO: when there's more than one data source using census tract, reduce/merge them here.
census_tract_df = self.hud_housing_df
# Calculate the tract for the CBG data.
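        # (The first 11 digits of the 12-digit block group GEOID are its tract GEOID.)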
census_block_group_df[self.GEOID_TRACT_FIELD_NAME] = census_block_group_df[
self.GEOID_FIELD_NAME
].str[0:11]
self.df = census_block_group_df.merge(
census_tract_df, on=self.GEOID_TRACT_FIELD_NAME
)
if len(census_block_group_df) > 220333:
raise ValueError("Too many rows in the join.")
# Define a named tuple that will be used for each data set input.
DataSet = collections.namedtuple(
typename="DataSet", field_names=["input_field", "renamed_field", "bucket"],
)
data_sets = [
# The following data sets have `bucket=None`, because it's not used in the bucket based score ("Score C").
DataSet(
input_field=self.GEOID_FIELD_NAME,
# Use the name `GEOID10` to enable geoplatform.gov's workflow.
renamed_field=self.GEOID_FIELD_NAME,
bucket=None,
),
DataSet(
input_field=self.HOUSING_BURDEN_FIELD_NAME,
renamed_field=self.HOUSING_BURDEN_FIELD_NAME,
bucket=None,
),
DataSet(
input_field="ACSTOTPOP", renamed_field="Total population", bucket=None,
),
# The following data sets have buckets, because they're used in the score
DataSet(
input_field="CANCER",
renamed_field="Air toxics cancer risk",
bucket=self.BUCKET_EXPOSURES,
),
DataSet(
input_field="RESP",
renamed_field="Respiratory hazard index",
bucket=self.BUCKET_EXPOSURES,
),
DataSet(
input_field="DSLPM",
renamed_field="Diesel particulate matter",
bucket=self.BUCKET_EXPOSURES,
),
DataSet(
input_field="PM25",
renamed_field="Particulate matter (PM2.5)",
bucket=self.BUCKET_EXPOSURES,
),
DataSet(
input_field="OZONE",
renamed_field="Ozone",
bucket=self.BUCKET_EXPOSURES,
),
DataSet(
input_field="PTRAF",
renamed_field="Traffic proximity and volume",
bucket=self.BUCKET_EXPOSURES,
),
DataSet(
input_field="PRMP",
renamed_field="Proximity to RMP sites",
bucket=self.BUCKET_ENVIRONMENTAL,
),
DataSet(
input_field="PTSDF",
renamed_field="Proximity to TSDF sites",
bucket=self.BUCKET_ENVIRONMENTAL,
),
DataSet(
input_field="PNPL",
renamed_field="Proximity to NPL sites",
bucket=self.BUCKET_ENVIRONMENTAL,
),
DataSet(
input_field="PWDIS",
renamed_field="Wastewater discharge",
bucket=self.BUCKET_ENVIRONMENTAL,
),
DataSet(
input_field="PRE1960PCT",
renamed_field="Percent pre-1960s housing (lead paint indicator)",
bucket=self.BUCKET_ENVIRONMENTAL,
),
DataSet(
input_field="UNDER5PCT",
renamed_field="Individuals under 5 years old",
bucket=self.BUCKET_SENSITIVE,
),
DataSet(
input_field="OVER64PCT",
renamed_field="Individuals over 64 years old",
bucket=self.BUCKET_SENSITIVE,
),
DataSet(
input_field=self.LINGUISTIC_ISOLATION_FIELD_NAME,
renamed_field=self.LINGUISTIC_ISOLATION_FIELD_NAME,
bucket=self.BUCKET_SENSITIVE,
),
DataSet(
input_field="LINGISOPCT",
renamed_field="Percent of households in linguistic isolation",
bucket=self.BUCKET_SOCIOECONOMIC,
),
DataSet(
input_field="LOWINCPCT",
renamed_field=self.POVERTY_FIELD_NAME,
bucket=self.BUCKET_SOCIOECONOMIC,
),
DataSet(
input_field="LESSHSPCT",
renamed_field=self.HIGH_SCHOOL_FIELD_NAME,
bucket=self.BUCKET_SOCIOECONOMIC,
),
DataSet(
input_field=self.UNEMPLOYED_FIELD_NAME,
renamed_field=self.UNEMPLOYED_FIELD_NAME,
bucket=self.BUCKET_SOCIOECONOMIC,
),
DataSet(
input_field="ht_ami",
renamed_field="Housing + Transportation Costs % Income for the Regional Typical Household",
bucket=self.BUCKET_SOCIOECONOMIC,
),
]
# Rename columns:
renaming_dict = {
data_set.input_field: data_set.renamed_field for data_set in data_sets
}
self.df.rename(
columns=renaming_dict, inplace=True, errors="raise",
)
columns_to_keep = [data_set.renamed_field for data_set in data_sets]
self.df = self.df[columns_to_keep]
# Convert all columns to numeric.
for data_set in data_sets:
# Skip GEOID_FIELD_NAME, because it's a string.
if data_set.renamed_field == self.GEOID_FIELD_NAME:
continue
self.df[f"{data_set.renamed_field}"] = pd.to_numeric(
self.df[data_set.renamed_field]
)
# calculate percentiles
for data_set in data_sets:
self.df[
f"{data_set.renamed_field}{self.PERCENTILE_FIELD_SUFFIX}"
] = self.df[data_set.renamed_field].rank(pct=True)
# Math:
# (
# Observed value
# - minimum of all values
# )
# divided by
# (
# Maximum of all values
# - minimum of all values
# )
for data_set in data_sets:
# Skip GEOID_FIELD_NAME, because it's a string.
if data_set.renamed_field == self.GEOID_FIELD_NAME:
continue
min_value = self.df[data_set.renamed_field].min(skipna=True)
max_value = self.df[data_set.renamed_field].max(skipna=True)
logger.info(
f"For data set {data_set.renamed_field}, the min value is {min_value} and the max value is {max_value}."
)
self.df[f"{data_set.renamed_field}{self.MIN_MAX_FIELD_SUFFIX}"] = (
self.df[data_set.renamed_field] - min_value
) / (max_value - min_value)
# Graph distributions and correlations.
min_max_fields = [ # noqa: F841
f"{data_set.renamed_field}{self.MIN_MAX_FIELD_SUFFIX}"
for data_set in data_sets
if data_set.renamed_field != self.GEOID_FIELD_NAME
]
# Calculate score "A" and score "B"
self.df["Score A"] = self.df[
[
"Poverty (Less than 200% of federal poverty line) (percentile)",
"Percent individuals age 25 or over with less than high school degree (percentile)",
]
].mean(axis=1)
self.df["Score B"] = (
self.df["Poverty (Less than 200% of federal poverty line) (percentile)"]
* self.df[
"Percent individuals age 25 or over with less than high school degree (percentile)"
]
)
# Calculate "CalEnviroScreen for the US" score
# Average all the percentile values in each bucket into a single score for each of the four buckets.
for bucket in self.BUCKETS:
fields_in_bucket = [
f"{data_set.renamed_field}{self.PERCENTILE_FIELD_SUFFIX}"
for data_set in data_sets
if data_set.bucket == bucket
]
self.df[f"{bucket}"] = self.df[fields_in_bucket].mean(axis=1)
# Combine the score from the two Exposures and Environmental Effects buckets
# into a single score called "Pollution Burden".
# The math for this score is:
# (1.0 * Exposures Score + 0.5 * Environment Effects score) / 1.5.
self.df[self.AGGREGATION_POLLUTION] = (
1.0 * self.df[f"{self.BUCKET_EXPOSURES}"]
+ 0.5 * self.df[f"{self.BUCKET_ENVIRONMENTAL}"]
) / 1.5
# Average the score from the two Sensitive populations and
# Socioeconomic factors buckets into a single score called
# "Population Characteristics".
self.df[self.AGGREGATION_POPULATION] = self.df[
[f"{self.BUCKET_SENSITIVE}", f"{self.BUCKET_SOCIOECONOMIC}"]
].mean(axis=1)
# Multiply the "Pollution Burden" score and the "Population Characteristics"
# together to produce the cumulative impact score.
self.df["Score C"] = (
self.df[self.AGGREGATION_POLLUTION] * self.df[self.AGGREGATION_POPULATION]
)
if len(census_block_group_df) > 220333:
raise ValueError("Too many rows in the join.")
fields_to_use_in_score = [
self.UNEMPLOYED_FIELD_NAME,
self.LINGUISTIC_ISOLATION_FIELD_NAME,
self.HOUSING_BURDEN_FIELD_NAME,
self.POVERTY_FIELD_NAME,
self.HIGH_SCHOOL_FIELD_NAME,
]
fields_min_max = [
f"{field}{self.MIN_MAX_FIELD_SUFFIX}" for field in fields_to_use_in_score
]
fields_percentile = [
f"{field}{self.PERCENTILE_FIELD_SUFFIX}" for field in fields_to_use_in_score
]
# Calculate "Score D", which uses min-max normalization
# and calculate "Score E", which uses percentile normalization for the same fields
self.df["Score D"] = self.df[fields_min_max].mean(axis=1)
self.df["Score E"] = self.df[fields_percentile].mean(axis=1)
# Calculate correlations
self.df[fields_min_max].corr()
# Create percentiles for the scores
for score_field in [
"Score A",
"Score B",
"Score C",
"Score D",
"Score E",
"Poverty (Less than 200% of federal poverty line)",
]:
self.df[f"{score_field}{self.PERCENTILE_FIELD_SUFFIX}"] = self.df[
score_field
].rank(pct=True)
for threshold in [0.25, 0.3, 0.35, 0.4]:
fraction_converted_to_percent = int(100 * threshold)
self.df[
f"{score_field} (top {fraction_converted_to_percent}th percentile)"
] = (
self.df[f"{score_field}{self.PERCENTILE_FIELD_SUFFIX}"]
>= 1 - threshold
)
def load(self) -> None:
logger.info("Saving Score CSV")
# write nationwide csv
self.SCORE_CSV_PATH.mkdir(parents=True, exist_ok=True)
self.df.to_csv(self.SCORE_CSV_PATH / "usa.csv", index=False)

@@ -0,0 +1,156 @@
import math
import pandas as pd
import geopandas as gpd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
class GeoScoreETL(ExtractTransformLoad):
"""
A class used to generate per state and national GeoJson files with the score baked in
"""
def __init__(self):
self.SCORE_GEOJSON_PATH = self.DATA_PATH / "score" / "geojson"
self.SCORE_LOW_GEOJSON = self.SCORE_GEOJSON_PATH / "usa-low.json"
self.SCORE_HIGH_GEOJSON = self.SCORE_GEOJSON_PATH / "usa-high.json"
self.SCORE_CSV_PATH = self.DATA_PATH / "score" / "csv"
self.TILE_SCORE_CSV = self.SCORE_CSV_PATH / "tiles" / "usa.csv"
self.CENSUS_USA_GEOJSON = self.DATA_PATH / "census" / "geojson" / "us.json"
self.TARGET_SCORE_NAME = "Score E (percentile)"
self.TARGET_SCORE_RENAME_TO = "E_SCORE"
self.NUMBER_OF_BUCKETS = 10
self.geojson_usa_df: gpd.GeoDataFrame
self.score_usa_df: pd.DataFrame
self.geojson_score_usa_high: gpd.GeoDataFrame
self.geojson_score_usa_low: gpd.GeoDataFrame
def extract(self) -> None:
logger.info("Reading US GeoJSON (~6 minutes)")
self.geojson_usa_df = gpd.read_file(
self.CENSUS_USA_GEOJSON,
dtype={"GEOID10": "string"},
usecols=["GEOID10", "geometry"],
low_memory=False,
)
self.geojson_usa_df.head()
logger.info("Reading score CSV")
self.score_usa_df = pd.read_csv(
self.TILE_SCORE_CSV, dtype={"GEOID10": "string"}, low_memory=False,
)
def transform(self) -> None:
logger.info("Pruning Census GeoJSON")
fields = ["GEOID10", "geometry"]
self.geojson_usa_df = self.geojson_usa_df[fields]
logger.info("Merging and compressing score CSV with USA GeoJSON")
self.geojson_score_usa_high = self.score_usa_df.merge(
self.geojson_usa_df, on="GEOID10", how="left"
)
self.geojson_score_usa_high = gpd.GeoDataFrame(
self.geojson_score_usa_high, crs="EPSG:4326"
)
usa_simplified = self.geojson_score_usa_high[
["GEOID10", self.TARGET_SCORE_NAME, "geometry"]
].reset_index(drop=True)
usa_simplified.rename(
columns={self.TARGET_SCORE_NAME: self.TARGET_SCORE_RENAME_TO}, inplace=True,
)
logger.info("Aggregating into tracts (~5 minutes)")
usa_tracts = self._aggregate_to_tracts(usa_simplified)
usa_tracts = gpd.GeoDataFrame(
usa_tracts,
columns=[self.TARGET_SCORE_RENAME_TO, "geometry"],
crs="EPSG:4326",
)
logger.info("Creating buckets from tracts")
usa_bucketed = self._create_buckets_from_tracts(
usa_tracts, self.NUMBER_OF_BUCKETS
)
logger.info("Aggregating buckets")
usa_aggregated = self._aggregate_buckets(usa_bucketed, agg_func="mean")
compressed = self._breakup_multipolygons(usa_aggregated, self.NUMBER_OF_BUCKETS)
self.geojson_score_usa_low = gpd.GeoDataFrame(
compressed,
columns=[self.TARGET_SCORE_RENAME_TO, "geometry"],
crs="EPSG:4326",
)
def _aggregate_to_tracts(
self, block_group_df: gpd.GeoDataFrame
) -> gpd.GeoDataFrame:
# The tract identifier is the first 11 digits of the GEOID
block_group_df["tract"] = block_group_df.apply(
lambda row: row["GEOID10"][0:11], axis=1
)
state_tracts = block_group_df.dissolve(by="tract", aggfunc="mean")
return state_tracts
def _create_buckets_from_tracts(
self, state_tracts: gpd.GeoDataFrame, num_buckets: int
) -> gpd.GeoDataFrame:
        # assign tracts to buckets by E_SCORE (the renamed target score)
state_tracts.sort_values(self.TARGET_SCORE_RENAME_TO, inplace=True)
SCORE_bucket = []
        bucket_size = math.ceil(len(state_tracts.index) / num_buckets)
for i in range(len(state_tracts.index)):
SCORE_bucket.extend([math.floor(i / bucket_size)])
state_tracts[f"{self.TARGET_SCORE_RENAME_TO}_bucket"] = SCORE_bucket
return state_tracts
def _aggregate_buckets(self, state_tracts: gpd.GeoDataFrame, agg_func: str):
# dissolve tracts by bucket
state_attr = state_tracts[
[
self.TARGET_SCORE_RENAME_TO,
f"{self.TARGET_SCORE_RENAME_TO}_bucket",
"geometry",
]
].reset_index(drop=True)
state_dissolve = state_attr.dissolve(
by=f"{self.TARGET_SCORE_RENAME_TO}_bucket", aggfunc=agg_func
)
return state_dissolve
def _breakup_multipolygons(
self, state_bucketed_df: gpd.GeoDataFrame, num_buckets: int
) -> gpd.GeoDataFrame:
compressed = []
for i in range(num_buckets):
for j in range(len(state_bucketed_df["geometry"][i].geoms)):
compressed.append(
[
state_bucketed_df[self.TARGET_SCORE_RENAME_TO][i],
state_bucketed_df["geometry"][i].geoms[j],
]
)
return compressed
def load(self) -> None:
logger.info("Writing usa-high (~9 minutes)")
self.geojson_score_usa_high.to_file(self.SCORE_HIGH_GEOJSON, driver="GeoJSON")
logger.info("Completed writing usa-high")
logger.info("Writing usa-low (~9 minutes)")
self.geojson_score_usa_low.to_file(self.SCORE_LOW_GEOJSON, driver="GeoJSON")
logger.info("Completed writing usa-low")

@@ -0,0 +1,135 @@
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
class PostScoreETL(ExtractTransformLoad):
"""
A class used to instantiate an ETL object to retrieve and process data from
datasets.
"""
def __init__(self):
self.CENSUS_COUNTIES_ZIP_URL = "https://www2.census.gov/geo/docs/maps-data/data/gazetteer/Gaz_counties_national.zip"
self.CENSUS_COUNTIES_TXT = self.TMP_PATH / "Gaz_counties_national.txt"
self.CENSUS_COUNTIES_COLS = ["USPS", "GEOID", "NAME"]
self.CENSUS_USA_CSV = self.DATA_PATH / "census" / "csv" / "us.csv"
self.SCORE_CSV_PATH = self.DATA_PATH / "score" / "csv"
self.STATE_CSV = self.DATA_PATH / "census" / "csv" / "fips_states_2010.csv"
self.FULL_SCORE_CSV = self.SCORE_CSV_PATH / "full" / "usa.csv"
        self.TILE_SCORE_CSV = self.SCORE_CSV_PATH / "tile" / "usa.csv"
self.TILES_SCORE_COLUMNS = [
"GEOID10",
"Score E (percentile)",
"Score E (top 25th percentile)",
"GEOID",
"State Abbreviation",
"County Name",
]
self.TILES_SCORE_CSV_PATH = self.SCORE_CSV_PATH / "tiles"
self.TILES_SCORE_CSV = self.TILES_SCORE_CSV_PATH / "usa.csv"
self.counties_df: pd.DataFrame
self.states_df: pd.DataFrame
self.score_df: pd.DataFrame
self.score_county_state_merged: pd.DataFrame
self.score_for_tiles: pd.DataFrame
def extract(self) -> None:
super().extract(
self.CENSUS_COUNTIES_ZIP_URL, self.TMP_PATH,
)
logger.info("Reading Counties CSV")
self.counties_df = pd.read_csv(
self.CENSUS_COUNTIES_TXT,
sep="\t",
dtype={"GEOID": "string", "USPS": "string"},
low_memory=False,
encoding="latin-1",
)
logger.info("Reading States CSV")
self.states_df = pd.read_csv(
self.STATE_CSV, dtype={"fips": "string", "state_code": "string"}
)
self.score_df = pd.read_csv(self.FULL_SCORE_CSV, dtype={"GEOID10": "string"})
def transform(self) -> None:
logger.info("Transforming data sources for Score + County CSV")
# rename some of the columns to prepare for merge
self.counties_df = self.counties_df[["USPS", "GEOID", "NAME"]]
self.counties_df.rename(
columns={"USPS": "State Abbreviation", "NAME": "County Name"}, inplace=True,
)
# remove unnecessary columns
self.states_df.rename(
columns={
"fips": "State Code",
"state_name": "State Name",
"state_abbreviation": "State Abbreviation",
},
inplace=True,
)
self.states_df.drop(["region", "division"], axis=1, inplace=True)
        # add the county GEOID column (the first 5 digits of the block group GEOID)
self.score_df["GEOID"] = self.score_df.GEOID10.str[:5]
# merge state with counties
county_state_merged = self.counties_df.merge(
self.states_df, on="State Abbreviation", how="left"
)
# merge state + county with score
self.score_county_state_merged = self.score_df.merge(
county_state_merged, on="GEOID", how="left"
)
# check if there are census cbgs without score
logger.info("Removing CBG rows without score")
## load cbgs
cbg_usa_df = pd.read_csv(
self.CENSUS_USA_CSV,
names=["GEOID10"],
dtype={"GEOID10": "string"},
low_memory=False,
header=None,
)
# merge census cbgs with score
merged_df = cbg_usa_df.merge(
self.score_county_state_merged, on="GEOID10", how="left"
)
# list the null score cbgs
null_cbg_df = merged_df[merged_df["Score E (percentile)"].isnull()]
        # Subtract the null-score rows: appending two extra copies makes each of
        # them a duplicate, so drop_duplicates(keep=False) removes every copy.
removed_df = pd.concat([merged_df, null_cbg_df, null_cbg_df]).drop_duplicates(
keep=False
)
# set the score to the new df
self.score_county_state_merged = removed_df
def load(self) -> None:
logger.info("Saving Full Score CSV with County Information")
self.SCORE_CSV_PATH.mkdir(parents=True, exist_ok=True)
self.score_county_state_merged.to_csv(self.FULL_SCORE_CSV, index=False)
logger.info("Saving Tile Score CSV")
# TODO: check which are the columns we'll use
# Related to: https://github.com/usds/justice40-tool/issues/302
score_tiles = self.score_county_state_merged[self.TILES_SCORE_COLUMNS]
self.TILES_SCORE_CSV_PATH.mkdir(parents=True, exist_ok=True)
score_tiles.to_csv(self.TILES_SCORE_CSV, index=False)

@@ -0,0 +1,72 @@
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.utils import get_module_logger
from data_pipeline.config import settings
logger = get_module_logger(__name__)
class CalEnviroScreenETL(ExtractTransformLoad):
def __init__(self):
self.CALENVIROSCREEN_FTP_URL = (
settings.AWS_JUSTICE40_DATASOURCES_URL + "/CalEnviroScreen_4.0_2021.zip"
)
self.CALENVIROSCREEN_CSV = self.TMP_PATH / "CalEnviroScreen_4.0_2021.csv"
self.CSV_PATH = self.DATA_PATH / "dataset" / "calenviroscreen4"
        # Defining some variable names
self.CALENVIROSCREEN_SCORE_FIELD_NAME = "calenviroscreen_score"
self.CALENVIROSCREEN_PERCENTILE_FIELD_NAME = "calenviroscreen_percentile"
self.CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD_NAME = (
"calenviroscreen_priority_community"
)
# Choosing constants.
# None of these numbers are final, but just for the purposes of comparison.
self.CALENVIROSCREEN_PRIORITY_COMMUNITY_THRESHOLD = 75
self.df: pd.DataFrame
def extract(self) -> None:
logger.info("Downloading CalEnviroScreen Data")
super().extract(
self.CALENVIROSCREEN_FTP_URL,
self.TMP_PATH,
)
def transform(self) -> None:
logger.info("Transforming CalEnviroScreen Data")
# Data from https://calenviroscreen-oehha.hub.arcgis.com/#Data, specifically:
# https://oehha.ca.gov/media/downloads/calenviroscreen/document/calenviroscreen40resultsdatadictionaryd12021.zip
# Load comparison index (CalEnviroScreen 4)
self.df = pd.read_csv(
self.CALENVIROSCREEN_CSV, dtype={"Census Tract": "string"}
)
self.df.rename(
columns={
"Census Tract": self.GEOID_TRACT_FIELD_NAME,
"DRAFT CES 4.0 Score": self.CALENVIROSCREEN_SCORE_FIELD_NAME,
"DRAFT CES 4.0 Percentile": self.CALENVIROSCREEN_PERCENTILE_FIELD_NAME,
},
inplace=True,
)
# Add a leading "0" to the Census Tract to match our format in other data frames.
self.df[self.GEOID_TRACT_FIELD_NAME] = (
"0" + self.df[self.GEOID_TRACT_FIELD_NAME]
)
# Calculate the top K% of prioritized communities
self.df[self.CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD_NAME] = (
self.df[self.CALENVIROSCREEN_PERCENTILE_FIELD_NAME]
>= self.CALENVIROSCREEN_PRIORITY_COMMUNITY_THRESHOLD
)
def load(self) -> None:
logger.info("Saving CalEnviroScreen CSV")
# write nationwide csv
self.CSV_PATH.mkdir(parents=True, exist_ok=True)
self.df.to_csv(self.CSV_PATH / "data06.csv", index=False)

@@ -0,0 +1,124 @@
import csv
import json
import os
from pathlib import Path
import geopandas as gpd
import pandas as pd
from data_pipeline.utils import get_module_logger, unzip_file_from_url
from .etl_utils import get_state_fips_codes
logger = get_module_logger(__name__)
def download_census_csvs(data_path: Path) -> None:
"""Download all census shape files from the Census FTP and extract the geojson
to generate national and by state Census Block Group CSVs and GeoJSONs
Args:
data_path (pathlib.Path): Name of the directory where the files and directories will
be created
Returns:
None
"""
# the fips_states_2010.csv is generated from data here
# https://www.census.gov/geographies/reference-files/time-series/geo/tallies.html
state_fips_codes = get_state_fips_codes(data_path)
geojson_dir_path = data_path / "census" / "geojson"
for fips in state_fips_codes:
# check if file exists
shp_file_path = data_path / "census" / "shp" / fips / f"tl_2010_{fips}_bg10.shp"
logger.info(f"Checking if {fips} file exists")
if not os.path.isfile(shp_file_path):
logger.info(f"Downloading and extracting {fips} shape file")
# 2020 tiger data is here: https://www2.census.gov/geo/tiger/TIGER2020/BG/
# But using 2010 for now
cbg_state_url = f"https://www2.census.gov/geo/tiger/TIGER2010/BG/2010/tl_2010_{fips}_bg10.zip"
unzip_file_from_url(
cbg_state_url,
data_path / "tmp",
data_path / "census" / "shp" / fips,
)
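            # Convert the shapefile to GeoJSON with GDAL's ogr2ogr CLI
            # (assumed to be installed and on the PATH).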
            cmd = (
                f"ogr2ogr -f GeoJSON data/census/geojson/{fips}.json "
                f"data/census/shp/{fips}/tl_2010_{fips}_bg10.shp"
            )
os.system(cmd)
# generate CBG CSV table for pandas
## load in memory
cbg_national = [] # in-memory global list
cbg_per_state: dict = {} # in-memory dict per state
for file in os.listdir(geojson_dir_path):
if file.endswith(".json"):
logger.info(f"Ingesting geoid10 for file {file}")
with open(geojson_dir_path / file) as f:
geojson = json.load(f)
for feature in geojson["features"]:
geoid10 = feature["properties"]["GEOID10"]
cbg_national.append(str(geoid10))
geoid10_state_id = geoid10[:2]
if not cbg_per_state.get(geoid10_state_id):
cbg_per_state[geoid10_state_id] = []
cbg_per_state[geoid10_state_id].append(geoid10)
csv_dir_path = data_path / "census" / "csv"
## write to individual state csv
for state_id in cbg_per_state:
geoid10_list = cbg_per_state[state_id]
with open(
csv_dir_path / f"{state_id}.csv", mode="w", newline=""
) as cbg_csv_file:
cbg_csv_file_writer = csv.writer(
cbg_csv_file,
delimiter=",",
quotechar='"',
quoting=csv.QUOTE_MINIMAL,
)
for geoid10 in geoid10_list:
cbg_csv_file_writer.writerow(
[
geoid10,
]
)
## write US csv
with open(csv_dir_path / "us.csv", mode="w", newline="") as cbg_csv_file:
cbg_csv_file_writer = csv.writer(
cbg_csv_file,
delimiter=",",
quotechar='"',
quoting=csv.QUOTE_MINIMAL,
)
for geoid10 in cbg_national:
cbg_csv_file_writer.writerow(
[
geoid10,
]
)
## create national geojson
logger.info("Generating national geojson file")
    state_gdfs = []
    for file_name in geojson_dir_path.rglob("*.json"):
        logger.info(f"Ingesting {file_name}")
        state_gdfs.append(gpd.read_file(file_name))
    # DataFrame.append is deprecated in newer pandas, so concatenate instead.
    usa_df = pd.concat(state_gdfs, ignore_index=True)
usa_df = usa_df.to_crs("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs")
logger.info("Writing national geojson file")
usa_df.to_file(geojson_dir_path / "us.json", driver="GeoJSON")
logger.info("Census block groups downloading complete")

@@ -0,0 +1,71 @@
import csv
import os
from pathlib import Path
import pandas as pd
from data_pipeline.config import settings
from data_pipeline.utils import (
get_module_logger,
remove_all_dirs_from_dir,
remove_files_from_dir,
unzip_file_from_url,
)
logger = get_module_logger(__name__)
def reset_data_directories(data_path: Path) -> None:
census_data_path = data_path / "census"
# csv
csv_path = census_data_path / "csv"
remove_files_from_dir(csv_path, ".csv")
# geojson
geojson_path = census_data_path / "geojson"
remove_files_from_dir(geojson_path, ".json")
# shp
shp_path = census_data_path / "shp"
remove_all_dirs_from_dir(shp_path)
def get_state_fips_codes(data_path: Path) -> list:
fips_csv_path = data_path / "census" / "csv" / "fips_states_2010.csv"
# check if file exists
if not os.path.isfile(fips_csv_path):
logger.info("Downloading fips from S3 repository")
unzip_file_from_url(
settings.AWS_JUSTICE40_DATASOURCES_URL + "/fips_states_2010.zip",
data_path / "tmp",
data_path / "census" / "csv",
)
    fips_state_list = []
    with open(fips_csv_path) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=",")
        next(csv_reader)  # skip the header row
        for row in csv_reader:
            fips_state_list.append(row[0].strip())
    return fips_state_list
def get_state_information(data_path: Path) -> pd.DataFrame:
"""Load the full state file as a dataframe.
Useful because of the state regional information.
"""
fips_csv_path = data_path / "census" / "csv" / "fips_states_2010.csv"
df = pd.read_csv(fips_csv_path)
# Left pad the FIPS codes with 0s
df["fips"] = df["fips"].astype(str).apply(lambda x: x.zfill(2))
return df

@@ -0,0 +1,103 @@
import pandas as pd
import censusdata
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
class CensusACSETL(ExtractTransformLoad):
def __init__(self):
self.ACS_YEAR = 2019
self.OUTPUT_PATH = self.DATA_PATH / "dataset" / f"census_acs_{self.ACS_YEAR}"
self.UNEMPLOYED_FIELD_NAME = "Unemployed civilians (percent)"
self.LINGUISTIC_ISOLATION_FIELD_NAME = "Linguistic isolation (percent)"
self.LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME = "Linguistic isolation (total)"
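        # ACS table C16002 ("Household Language by Household Limited English
        # Speaking Status"): _001E counts all households; _004E, _007E, _010E,
        # and _013E count limited-English-speaking households by language group.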
self.LINGUISTIC_ISOLATION_FIELDS = [
"C16002_001E",
"C16002_004E",
"C16002_007E",
"C16002_010E",
"C16002_013E",
]
self.df: pd.DataFrame
def _fips_from_censusdata_censusgeo(self, censusgeo: censusdata.censusgeo) -> str:
"""Create a FIPS code from the proprietary censusgeo index."""
fips = "".join([value for (key, value) in censusgeo.params()])
return fips
def extract(self) -> None:
dfs = []
for fips in get_state_fips_codes(self.DATA_PATH):
logger.info(f"Downloading data for state/territory with FIPS code {fips}")
dfs.append(
censusdata.download(
src="acs5",
year=self.ACS_YEAR,
geo=censusdata.censusgeo(
[("state", fips), ("county", "*"), ("block group", "*")]
),
var=[
                        # Employment fields
"B23025_005E",
"B23025_003E",
]
+ self.LINGUISTIC_ISOLATION_FIELDS,
)
)
self.df = pd.concat(dfs)
self.df[self.GEOID_FIELD_NAME] = self.df.index.to_series().apply(
func=self._fips_from_censusdata_censusgeo
)
def transform(self) -> None:
logger.info("Starting Census ACS Transform")
# Calculate percent unemployment.
# TODO: remove small-sample data that should be `None` instead of a high-variance fraction.
self.df[self.UNEMPLOYED_FIELD_NAME] = self.df.B23025_005E / self.df.B23025_003E
# Calculate linguistic isolation.
individual_limited_english_fields = [
"C16002_004E",
"C16002_007E",
"C16002_010E",
"C16002_013E",
]
self.df[self.LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME] = self.df[
individual_limited_english_fields
].sum(axis=1, skipna=True)
self.df[self.LINGUISTIC_ISOLATION_FIELD_NAME] = (
self.df[self.LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME].astype(float)
/ self.df["C16002_001E"]
)
self.df[self.LINGUISTIC_ISOLATION_FIELD_NAME].describe()
def load(self) -> None:
logger.info("Saving Census ACS Data")
# mkdir census
self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
columns_to_include = [
self.GEOID_FIELD_NAME,
self.UNEMPLOYED_FIELD_NAME,
self.LINGUISTIC_ISOLATION_FIELD_NAME,
]
self.df[columns_to_include].to_csv(
path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False
)
def validate(self) -> None:
logger.info("Validating Census ACS Data")
pass

@@ -0,0 +1,38 @@
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
class EJScreenETL(ExtractTransformLoad):
def __init__(self):
self.EJSCREEN_FTP_URL = (
"https://gaftp.epa.gov/EJSCREEN/2019/EJSCREEN_2019_StatePctile.csv.zip"
)
self.EJSCREEN_CSV = self.TMP_PATH / "EJSCREEN_2019_StatePctiles.csv"
self.CSV_PATH = self.DATA_PATH / "dataset" / "ejscreen_2019"
self.df: pd.DataFrame
def extract(self) -> None:
logger.info("Downloading EJScreen Data")
super().extract(
self.EJSCREEN_FTP_URL, self.TMP_PATH,
)
def transform(self) -> None:
logger.info("Transforming EJScreen Data")
self.df = pd.read_csv(
self.EJSCREEN_CSV,
dtype={"ID": "string"},
# EJSCREEN writes the word "None" for NA data.
na_values=["None"],
low_memory=False,
)
def load(self) -> None:
logger.info("Saving EJScreen CSV")
# write nationwide csv
self.CSV_PATH.mkdir(parents=True, exist_ok=True)
self.df.to_csv(self.CSV_PATH / "usa.csv", index=False)

@@ -0,0 +1,58 @@
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
from data_pipeline.utils import get_module_logger, unzip_file_from_url
logger = get_module_logger(__name__)
class HousingTransportationETL(ExtractTransformLoad):
def __init__(self):
self.HOUSING_FTP_URL = (
"https://htaindex.cnt.org/download/download.php?focus=blkgrp&geoid="
)
self.OUTPUT_PATH = (
self.DATA_PATH / "dataset" / "housing_and_transportation_index"
)
self.df: pd.DataFrame
def extract(self) -> None:
# Download each state / territory individually
dfs = []
zip_file_dir = self.TMP_PATH / "housing_and_transportation_index"
for fips in get_state_fips_codes(self.DATA_PATH):
logger.info(
f"Downloading housing data for state/territory with FIPS code {fips}"
)
# Puerto Rico has no data, so skip
if fips == "72":
continue
unzip_file_from_url(
f"{self.HOUSING_FTP_URL}{fips}", self.TMP_PATH, zip_file_dir
)
# New file name:
tmp_csv_file_path = zip_file_dir / f"htaindex_data_blkgrps_{fips}.csv"
tmp_df = pd.read_csv(filepath_or_buffer=tmp_csv_file_path)
dfs.append(tmp_df)
self.df = pd.concat(dfs)
def transform(self) -> None:
logger.info("Transforming Housing and Transportation Data")
# Rename and reformat block group ID
self.df.rename(columns={"blkgrp": self.GEOID_FIELD_NAME}, inplace=True)
self.df[self.GEOID_FIELD_NAME] = self.df[self.GEOID_FIELD_NAME].str.replace(
'"', ""
)
def load(self) -> None:
logger.info("Saving Housing and Transportation Data")
self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
self.df.to_csv(path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False)

@@ -0,0 +1,289 @@
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
class HudHousingETL(ExtractTransformLoad):
def __init__(self):
self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "hud_housing"
self.GEOID_TRACT_FIELD_NAME = "GEOID10_TRACT"
self.HOUSING_FTP_URL = (
"https://www.huduser.gov/portal/datasets/cp/2012thru2016-140-csv.zip"
)
self.HOUSING_ZIP_FILE_DIR = self.TMP_PATH / "hud_housing"
# We measure households earning less than 80% of HUD Area Median Family Income by county
# and paying greater than 30% of their income to housing costs.
self.HOUSING_BURDEN_FIELD_NAME = "Housing burden (percent)"
self.HOUSING_BURDEN_NUMERATOR_FIELD_NAME = "HOUSING_BURDEN_NUMERATOR"
self.HOUSING_BURDEN_DENOMINATOR_FIELD_NAME = "HOUSING_BURDEN_DENOMINATOR"
# Note: some variable definitions.
# HUD-adjusted median family income (HAMFI).
# The four housing problems are:
# - incomplete kitchen facilities,
# - incomplete plumbing facilities,
# - more than 1 person per room,
# - cost burden greater than 30%.
# Table 8 is the desired table.
self.df: pd.DataFrame
def extract(self) -> None:
logger.info("Extracting HUD Housing Data")
super().extract(
self.HOUSING_FTP_URL, self.HOUSING_ZIP_FILE_DIR,
)
def transform(self) -> None:
logger.info("Transforming HUD Housing Data")
# New file name:
tmp_csv_file_path = (
self.HOUSING_ZIP_FILE_DIR
/ "2012thru2016-140-csv"
/ "2012thru2016-140-csv"
/ "140"
/ "Table8.csv"
)
self.df = pd.read_csv(filepath_or_buffer=tmp_csv_file_path, encoding="latin-1",)
# Rename and reformat block group ID
self.df.rename(columns={"geoid": self.GEOID_TRACT_FIELD_NAME}, inplace=True)
# The CHAS data has census tract ids such as `14000US01001020100`
# Whereas the rest of our data uses, for the same tract, `01001020100`.
        # So, strip everything up to and including `US`:
self.df[self.GEOID_TRACT_FIELD_NAME] = self.df[
self.GEOID_TRACT_FIELD_NAME
].str.replace(r"^.*?US", "", regex=True)
# Calculate housing burden
# This is quite a number of steps. It does not appear to be accessible nationally in a simpler format, though.
# See "CHAS data dictionary 12-16.xlsx"
# Owner occupied numerator fields
OWNER_OCCUPIED_NUMERATOR_FIELDS = [
# Column Name
# Line_Type
# Tenure
# Household income
# Cost burden
# Facilities
"T8_est7",
# Subtotal
# Owner occupied
# less than or equal to 30% of HAMFI
# greater than 30% but less than or equal to 50%
# All
"T8_est10",
# Subtotal
# Owner occupied
# less than or equal to 30% of HAMFI
# greater than 50%
# All
"T8_est20",
# Subtotal
# Owner occupied
# greater than 30% but less than or equal to 50% of HAMFI
# greater than 30% but less than or equal to 50%
# All
"T8_est23",
# Subtotal
# Owner occupied
# greater than 30% but less than or equal to 50% of HAMFI
# greater than 50%
# All
"T8_est33",
# Subtotal
# Owner occupied
# greater than 50% but less than or equal to 80% of HAMFI
# greater than 30% but less than or equal to 50%
# All
"T8_est36",
# Subtotal
# Owner occupied
# greater than 50% but less than or equal to 80% of HAMFI
# greater than 50%
# All
]
# These rows have the values where HAMFI was not computed, b/c of no or negative income.
OWNER_OCCUPIED_NOT_COMPUTED_FIELDS = [
# Column Name
# Line_Type
# Tenure
# Household income
# Cost burden
# Facilities
"T8_est13",
# Subtotal
# Owner occupied
# less than or equal to 30% of HAMFI
# not computed (no/negative income)
# All
"T8_est26",
# Subtotal
# Owner occupied
# greater than 30% but less than or equal to 50% of HAMFI
# not computed (no/negative income)
# All
"T8_est39",
# Subtotal
# Owner occupied
# greater than 50% but less than or equal to 80% of HAMFI
# not computed (no/negative income)
# All
"T8_est52",
# Subtotal
# Owner occupied
# greater than 80% but less than or equal to 100% of HAMFI
# not computed (no/negative income)
# All
"T8_est65",
# Subtotal
# Owner occupied
# greater than 100% of HAMFI
# not computed (no/negative income)
# All
]
OWNER_OCCUPIED_POPULATION_FIELD = "T8_est2"
# Subtotal
# Owner occupied
# All
# All
# All
# Renter occupied numerator fields
RENTER_OCCUPIED_NUMERATOR_FIELDS = [
# Column Name
# Line_Type
# Tenure
# Household income
# Cost burden
# Facilities
"T8_est73",
# Subtotal
# Renter occupied
# less than or equal to 30% of HAMFI
# greater than 30% but less than or equal to 50%
# All
"T8_est76",
# Subtotal
# Renter occupied
# less than or equal to 30% of HAMFI
# greater than 50%
# All
"T8_est86",
# Subtotal
# Renter occupied
# greater than 30% but less than or equal to 50% of HAMFI
# greater than 30% but less than or equal to 50%
# All
"T8_est89",
# Subtotal
# Renter occupied
# greater than 30% but less than or equal to 50% of HAMFI
# greater than 50%
# All
"T8_est99",
# Subtotal
            # Renter occupied
            # greater than 50% but less than or equal to 80% of HAMFI
# greater than 30% but less than or equal to 50%
# All
"T8_est102",
# Subtotal
# Renter occupied
# greater than 50% but less than or equal to 80% of HAMFI
# greater than 50%
# All
]
# These rows have the values where HAMFI was not computed, b/c of no or negative income.
RENTER_OCCUPIED_NOT_COMPUTED_FIELDS = [
# Column Name
# Line_Type
# Tenure
# Household income
# Cost burden
# Facilities
"T8_est79",
# Subtotal
            # Renter occupied
            # less than or equal to 30% of HAMFI
# not computed (no/negative income)
# All
"T8_est92",
# Subtotal
            # Renter occupied
            # greater than 30% but less than or equal to 50% of HAMFI
# not computed (no/negative income)
# All
"T8_est105",
# Subtotal
# Renter occupied
# greater than 50% but less than or equal to 80% of HAMFI
# not computed (no/negative income)
# All
"T8_est118",
# Subtotal
            # Renter occupied
            # greater than 80% but less than or equal to 100% of HAMFI
# not computed (no/negative income)
# All
"T8_est131",
# Subtotal
# Renter occupied
# greater than 100% of HAMFI
# not computed (no/negative income)
# All
]
# T8_est68 Subtotal Renter occupied All All All
RENTER_OCCUPIED_POPULATION_FIELD = "T8_est68"
# Math:
# (
# # of Owner Occupied Units Meeting Criteria
# + # of Renter Occupied Units Meeting Criteria
# )
# divided by
# (
# Total # of Owner Occupied Units
# + Total # of Renter Occupied Units
# - # of Owner Occupied Units with HAMFI Not Computed
# - # of Renter Occupied Units with HAMFI Not Computed
# )
self.df[self.HOUSING_BURDEN_NUMERATOR_FIELD_NAME] = self.df[
OWNER_OCCUPIED_NUMERATOR_FIELDS
].sum(axis=1) + self.df[RENTER_OCCUPIED_NUMERATOR_FIELDS].sum(axis=1)
self.df[self.HOUSING_BURDEN_DENOMINATOR_FIELD_NAME] = (
self.df[OWNER_OCCUPIED_POPULATION_FIELD]
+ self.df[RENTER_OCCUPIED_POPULATION_FIELD]
- self.df[OWNER_OCCUPIED_NOT_COMPUTED_FIELDS].sum(axis=1)
- self.df[RENTER_OCCUPIED_NOT_COMPUTED_FIELDS].sum(axis=1)
)
# TODO: add small sample size checks
self.df[self.HOUSING_BURDEN_FIELD_NAME] = self.df[
self.HOUSING_BURDEN_NUMERATOR_FIELD_NAME
].astype(float) / self.df[self.HOUSING_BURDEN_DENOMINATOR_FIELD_NAME].astype(
float
)
def load(self) -> None:
logger.info("Saving HUD Housing Data")
self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
# Drop unnecessary fields
self.df[
[
self.GEOID_TRACT_FIELD_NAME,
self.HOUSING_BURDEN_NUMERATOR_FIELD_NAME,
self.HOUSING_BURDEN_DENOMINATOR_FIELD_NAME,
self.HOUSING_BURDEN_FIELD_NAME,
]
].to_csv(path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False)

@@ -0,0 +1,64 @@
import pandas as pd
import requests
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
class HudRecapETL(ExtractTransformLoad):
def __init__(self):
# pylint: disable=line-too-long
self.HUD_RECAP_CSV_URL = "https://opendata.arcgis.com/api/v3/datasets/56de4edea8264fe5a344da9811ef5d6e_0/downloads/data?format=csv&spatialRefId=4326" # noqa: E501
self.HUD_RECAP_CSV = (
self.TMP_PATH
/ "Racially_or_Ethnically_Concentrated_Areas_of_Poverty__R_ECAPs_.csv"
)
self.CSV_PATH = self.DATA_PATH / "dataset" / "hud_recap"
        # Defining some variable names
self.HUD_RECAP_PRIORITY_COMMUNITY_FIELD_NAME = "hud_recap_priority_community"
self.df: pd.DataFrame
def extract(self) -> None:
logger.info("Downloading HUD Recap Data")
download = requests.get(self.HUD_RECAP_CSV_URL, verify=None)
file_contents = download.content
        with open(self.HUD_RECAP_CSV, "wb") as csv_file:
            csv_file.write(file_contents)
def transform(self) -> None:
logger.info("Transforming HUD Recap Data")
# Load comparison index (CalEnviroScreen 4)
self.df = pd.read_csv(self.HUD_RECAP_CSV, dtype={"GEOID": "string"})
self.df.rename(
columns={
"GEOID": self.GEOID_TRACT_FIELD_NAME,
# Interestingly, there's no data dictionary for the RECAP data that I could find.
# However, this site (http://www.schousing.com/library/Tax%20Credit/2020/QAP%20Instructions%20(2).pdf)
# suggests:
# "If RCAP_Current for the tract in which the site is located is 1, the tract is an R/ECAP. If RCAP_Current is 0, it is not."
"RCAP_Current": self.HUD_RECAP_PRIORITY_COMMUNITY_FIELD_NAME,
},
inplace=True,
)
# Convert to boolean
self.df[self.HUD_RECAP_PRIORITY_COMMUNITY_FIELD_NAME] = self.df[
self.HUD_RECAP_PRIORITY_COMMUNITY_FIELD_NAME
].astype("bool")
self.df[self.HUD_RECAP_PRIORITY_COMMUNITY_FIELD_NAME].value_counts()
self.df.sort_values(by=self.GEOID_TRACT_FIELD_NAME, inplace=True)
def load(self) -> None:
logger.info("Saving HUD Recap CSV")
# write nationwide csv
self.CSV_PATH.mkdir(parents=True, exist_ok=True)
self.df.to_csv(self.CSV_PATH / "usa.csv", index=False)

@@ -0,0 +1,3 @@
# Tree Equity Score
The Tree Equity Score was built by American Forests to assess how equitably trees are planted across a city. For more information, check out [https://treeequityscore.org](https://treeequityscore.org).

@@ -0,0 +1,87 @@
import geopandas as gpd
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
class TreeEquityScoreETL(ExtractTransformLoad):
def __init__(self):
self.TES_URL = (
"https://national-tes-data-share.s3.amazonaws.com/national_tes_share/"
)
self.TES_CSV = self.TMP_PATH / "tes_2021_data.csv"
self.CSV_PATH = self.DATA_PATH / "dataset" / "tree_equity_score"
self.df: gpd.GeoDataFrame
self.states = [
"al",
"az",
"ar",
"ca",
"co",
"ct",
"de",
"dc",
"fl",
"ga",
"id",
"il",
"in",
"ia",
"ks",
"ky",
"la",
"me",
"md",
"ma",
"mi",
"mn",
"ms",
"mo",
"mt",
"ne",
"nv",
"nh",
"nj",
"nm",
"ny",
"nc",
"nd",
"oh",
"ok",
"or",
"pa",
"ri",
"sc",
"sd",
"tn",
"tx",
"ut",
"vt",
"va",
"wa",
"wv",
"wi",
"wy",
]
def extract(self) -> None:
logger.info("Downloading Tree Equity Score Data")
for state in self.states:
super().extract(
f"{self.TES_URL}{state}.zip.zip", f"{self.TMP_PATH}/{state}",
)
def transform(self) -> None:
logger.info("Transforming Tree Equity Score Data")
tes_state_dfs = []
for state in self.states:
tes_state_dfs.append(gpd.read_file(f"{self.TMP_PATH}/{state}/{state}.shp"))
self.df = gpd.GeoDataFrame(pd.concat(tes_state_dfs), crs=tes_state_dfs[0].crs)
def load(self) -> None:
logger.info("Saving Tree Equity Score GeoJSON")
# write nationwide csv
self.CSV_PATH.mkdir(parents=True, exist_ok=True)
self.df.to_file(self.CSV_PATH / "tes_conus.geojson", driver="GeoJSON")

@@ -0,0 +1,567 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "43c5dbee",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import csv\n",
"from pathlib import Path\n",
"import os\n",
"import sys"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "f97c95f6",
"metadata": {},
"outputs": [],
"source": [
"module_path = os.path.abspath(os.path.join(\"..\"))\n",
"if module_path not in sys.path:\n",
" sys.path.append(module_path)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "b8a2b53e",
"metadata": {},
"outputs": [],
"source": [
"DATA_PATH = Path.cwd().parent / \"data\"\n",
"TMP_PATH: Path = DATA_PATH / \"tmp\"\n",
"ACS_YEAR = \"2019\"\n",
"OUTPUT_PATH = (\n",
" DATA_PATH / \"dataset\" / f\"census_acs_{ACS_YEAR}\"\n",
" )\n",
"CENSUS_USA_CSV = (\n",
" DATA_PATH / \"census\" / \"csv\" / \"us.csv\"\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "0d33e8db",
"metadata": {},
"outputs": [],
"source": [
"cbg_usa_df = pd.read_csv(\n",
" CENSUS_USA_CSV,\n",
" names=['GEOID10'],\n",
" dtype={\"GEOID10\": \"string\"},\n",
" low_memory=False,\n",
" header=None\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "01e6dbe3",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>GEOID10</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>100010414002</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>100010415002</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>100010417011</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>100010417012</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>100010422011</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" GEOID10\n",
"0 100010414002\n",
"1 100010415002\n",
"2 100010417011\n",
"3 100010417012\n",
"4 100010422011"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cbg_usa_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "341dbcb6",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"GEOID10 string\n",
"dtype: object"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cbg_usa_df.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "eb25d4bf",
"metadata": {},
"outputs": [],
"source": [
"acs_df = pd.read_csv(\n",
" OUTPUT_PATH / \"usa.csv\",\n",
" dtype={\"GEOID10\": \"string\"},\n",
" low_memory=False,\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "d4c9d010",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>GEOID10</th>\n",
" <th>Unemployed civilians (percent)</th>\n",
" <th>Linguistic isolation (percent)</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>010399620002</td>\n",
" <td>0.077108</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>010399618002</td>\n",
" <td>0.126214</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>010399616004</td>\n",
" <td>0.133172</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>010399616002</td>\n",
" <td>0.028249</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>010399616001</td>\n",
" <td>0.063037</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" GEOID10 Unemployed civilians (percent) \\\n",
"0 010399620002 0.077108 \n",
"1 010399618002 0.126214 \n",
"2 010399616004 0.133172 \n",
"3 010399616002 0.028249 \n",
"4 010399616001 0.063037 \n",
"\n",
" Linguistic isolation (percent) \n",
"0 0.0 \n",
"1 0.0 \n",
"2 0.0 \n",
"3 0.0 \n",
"4 0.0 "
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"acs_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "dd390179",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"GEOID10 string\n",
"Unemployed civilians (percent) float64\n",
"Linguistic isolation (percent) float64\n",
"dtype: object"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"acs_df.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "236eb093",
"metadata": {},
"outputs": [],
"source": [
"merged_df = cbg_usa_df.merge(\n",
" acs_df, on=\"GEOID10\", how=\"left\"\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 45,
"id": "4fff1845",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>GEOID10</th>\n",
" <th>Unemployed civilians (percent)</th>\n",
" <th>Linguistic isolation (percent)</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>100010414002</td>\n",
" <td>0.030612</td>\n",
" <td>0.065963</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>100010415002</td>\n",
" <td>0.118056</td>\n",
" <td>0.010283</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>100010417011</td>\n",
" <td>0.042373</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>100010417012</td>\n",
" <td>0.042473</td>\n",
" <td>0.010435</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>100010422011</td>\n",
" <td>0.054358</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" GEOID10 Unemployed civilians (percent) \\\n",
"0 100010414002 0.030612 \n",
"1 100010415002 0.118056 \n",
"2 100010417011 0.042373 \n",
"3 100010417012 0.042473 \n",
"4 100010422011 0.054358 \n",
"\n",
" Linguistic isolation (percent) \n",
"0 0.065963 \n",
"1 0.010283 \n",
"2 0.000000 \n",
"3 0.010435 \n",
"4 0.000000 "
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"merged_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 64,
"id": "f8903557",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>GEOID10</th>\n",
" <th>Unemployed civilians (percent)</th>\n",
" <th>Linguistic isolation (percent)</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>34</th>\n",
" <td>100019900000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>377</th>\n",
" <td>100030169041</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>392</th>\n",
" <td>100059900000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>400</th>\n",
" <td>100039901000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>416</th>\n",
" <td>100039801001</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>219505</th>\n",
" <td>340057048013</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>219508</th>\n",
" <td>340057048024</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>219758</th>\n",
" <td>340258047001</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>219807</th>\n",
" <td>340259900000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>220134</th>\n",
" <td>340076113001</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1462 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" GEOID10 Unemployed civilians (percent) \\\n",
"34 100019900000 NaN \n",
"377 100030169041 NaN \n",
"392 100059900000 NaN \n",
"400 100039901000 NaN \n",
"416 100039801001 NaN \n",
"... ... ... \n",
"219505 340057048013 NaN \n",
"219508 340057048024 NaN \n",
"219758 340258047001 NaN \n",
"219807 340259900000 NaN \n",
"220134 340076113001 NaN \n",
"\n",
" Linguistic isolation (percent) \n",
"34 NaN \n",
"377 NaN \n",
"392 NaN \n",
"400 NaN \n",
"416 NaN \n",
"... ... \n",
"219505 NaN \n",
"219508 NaN \n",
"219758 NaN \n",
"219807 NaN \n",
"220134 0.0 \n",
"\n",
"[1462 rows x 3 columns]"
]
},
"execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"merged_df[merged_df[\"Unemployed civilians (percent)\"].isnull()]"
]
},
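  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0f3a9c2e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Added sketch (not part of the original run): count the unmatched CBGs\n",
    "# explicitly instead of eyeballing the frame above.\n",
    "missing = merged_df[\"Unemployed civilians (percent)\"].isnull().sum()\n",
    "print(f\"{missing} of {len(merged_df)} CBGs have no ACS data\")"
   ]
  },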
{
"cell_type": "code",
"execution_count": null,
"id": "b870a21f",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

File diff suppressed because it is too large

View file

@ -0,0 +1,777 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "3ab8f7c1",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import csv\n",
"from pathlib import Path\n",
"import os\n",
"import sys"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "8c22494f",
"metadata": {},
"outputs": [],
"source": [
"module_path = os.path.abspath(os.path.join(\"..\"))\n",
"if module_path not in sys.path:\n",
" sys.path.append(module_path)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "eb31e9a1",
"metadata": {},
"outputs": [],
"source": [
"DATA_PATH = Path.cwd().parent / \"data\"\n",
"TMP_PATH: Path = DATA_PATH / \"tmp\"\n",
"OUTPUT_PATH = (\n",
" DATA_PATH / \"score\" / \"csv\" / \"tiles\"\n",
" )\n",
"CENSUS_USA_CSV = (\n",
" DATA_PATH / \"census\" / \"csv\" / \"us.csv\"\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "95a5f8d8",
"metadata": {},
"outputs": [],
"source": [
"cbg_usa_df = pd.read_csv(\n",
" CENSUS_USA_CSV,\n",
" names=['GEOID10'],\n",
" dtype={\"GEOID10\": \"string\"},\n",
" low_memory=False,\n",
" header=None\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "bdd9ab60",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>GEOID10</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>100010414002</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>100010415002</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>100010417011</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>100010417012</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>100010422011</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" GEOID10\n",
"0 100010414002\n",
"1 100010415002\n",
"2 100010417011\n",
"3 100010417012\n",
"4 100010422011"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cbg_usa_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "05a40080",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"GEOID10 string\n",
"dtype: object"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cbg_usa_df.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "114af777",
"metadata": {},
"outputs": [],
"source": [
"score_df = pd.read_csv(\n",
" OUTPUT_PATH / \"usa.csv\",\n",
" dtype={\"GEOID10\": \"string\"},\n",
" low_memory=False,\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "d5f3ebd4",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>GEOID10</th>\n",
" <th>Score E (percentile)</th>\n",
" <th>Score E (top 25th percentile)</th>\n",
" <th>GEOID</th>\n",
" <th>State Abbreviation</th>\n",
" <th>County Name</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>100010414002</td>\n",
" <td>0.808889</td>\n",
" <td>True</td>\n",
" <td>10001</td>\n",
" <td>DE</td>\n",
" <td>Kent County</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>100010415002</td>\n",
" <td>0.555160</td>\n",
" <td>False</td>\n",
" <td>10001</td>\n",
" <td>DE</td>\n",
" <td>Kent County</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>100010417011</td>\n",
" <td>0.272392</td>\n",
" <td>False</td>\n",
" <td>10001</td>\n",
" <td>DE</td>\n",
" <td>Kent County</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>100010417012</td>\n",
" <td>0.345686</td>\n",
" <td>False</td>\n",
" <td>10001</td>\n",
" <td>DE</td>\n",
" <td>Kent County</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>100010422011</td>\n",
" <td>0.472567</td>\n",
" <td>False</td>\n",
" <td>10001</td>\n",
" <td>DE</td>\n",
" <td>Kent County</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>220256</th>\n",
" <td>340076020004</td>\n",
" <td>0.921941</td>\n",
" <td>True</td>\n",
" <td>34007</td>\n",
" <td>NJ</td>\n",
" <td>Camden County</td>\n",
" </tr>\n",
" <tr>\n",
" <th>220257</th>\n",
" <td>340076017002</td>\n",
" <td>0.934490</td>\n",
" <td>True</td>\n",
" <td>34007</td>\n",
" <td>NJ</td>\n",
" <td>Camden County</td>\n",
" </tr>\n",
" <tr>\n",
" <th>220258</th>\n",
" <td>340076015005</td>\n",
" <td>0.889613</td>\n",
" <td>True</td>\n",
" <td>34007</td>\n",
" <td>NJ</td>\n",
" <td>Camden County</td>\n",
" </tr>\n",
" <tr>\n",
" <th>220259</th>\n",
" <td>340076091032</td>\n",
" <td>0.627822</td>\n",
" <td>False</td>\n",
" <td>34007</td>\n",
" <td>NJ</td>\n",
" <td>Camden County</td>\n",
" </tr>\n",
" <tr>\n",
" <th>220260</th>\n",
" <td>340076053002</td>\n",
" <td>0.762237</td>\n",
" <td>True</td>\n",
" <td>34007</td>\n",
" <td>NJ</td>\n",
" <td>Camden County</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>220261 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" GEOID10 Score E (percentile) Score E (top 25th percentile) \\\n",
"0 100010414002 0.808889 True \n",
"1 100010415002 0.555160 False \n",
"2 100010417011 0.272392 False \n",
"3 100010417012 0.345686 False \n",
"4 100010422011 0.472567 False \n",
"... ... ... ... \n",
"220256 340076020004 0.921941 True \n",
"220257 340076017002 0.934490 True \n",
"220258 340076015005 0.889613 True \n",
"220259 340076091032 0.627822 False \n",
"220260 340076053002 0.762237 True \n",
"\n",
" GEOID State Abbreviation County Name \n",
"0 10001 DE Kent County \n",
"1 10001 DE Kent County \n",
"2 10001 DE Kent County \n",
"3 10001 DE Kent County \n",
"4 10001 DE Kent County \n",
"... ... ... ... \n",
"220256 34007 NJ Camden County \n",
"220257 34007 NJ Camden County \n",
"220258 34007 NJ Camden County \n",
"220259 34007 NJ Camden County \n",
"220260 34007 NJ Camden County \n",
"\n",
"[220261 rows x 6 columns]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"score_df"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "f84f9e1d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"GEOID10 string\n",
"Score E (percentile) float64\n",
"Score E (top 25th percentile) bool\n",
"GEOID int64\n",
"State Abbreviation object\n",
"County Name object\n",
"dtype: object"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"score_df.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "8d61e29e",
"metadata": {},
"outputs": [],
"source": [
"merged_df = cbg_usa_df.merge(\n",
" score_df, on=\"GEOID10\", how=\"left\"\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "7e8c2f2a",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>GEOID10</th>\n",
" <th>Score E (percentile)</th>\n",
" <th>Score E (top 25th percentile)</th>\n",
" <th>GEOID</th>\n",
" <th>State Abbreviation</th>\n",
" <th>County Name</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>100010414002</td>\n",
" <td>0.808889</td>\n",
" <td>True</td>\n",
" <td>10001.0</td>\n",
" <td>DE</td>\n",
" <td>Kent County</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>100010415002</td>\n",
" <td>0.555160</td>\n",
" <td>False</td>\n",
" <td>10001.0</td>\n",
" <td>DE</td>\n",
" <td>Kent County</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>100010417011</td>\n",
" <td>0.272392</td>\n",
" <td>False</td>\n",
" <td>10001.0</td>\n",
" <td>DE</td>\n",
" <td>Kent County</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>100010417012</td>\n",
" <td>0.345686</td>\n",
" <td>False</td>\n",
" <td>10001.0</td>\n",
" <td>DE</td>\n",
" <td>Kent County</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>100010422011</td>\n",
" <td>0.472567</td>\n",
" <td>False</td>\n",
" <td>10001.0</td>\n",
" <td>DE</td>\n",
" <td>Kent County</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>220329</th>\n",
" <td>340076020004</td>\n",
" <td>0.921941</td>\n",
" <td>True</td>\n",
" <td>34007.0</td>\n",
" <td>NJ</td>\n",
" <td>Camden County</td>\n",
" </tr>\n",
" <tr>\n",
" <th>220330</th>\n",
" <td>340076017002</td>\n",
" <td>0.934490</td>\n",
" <td>True</td>\n",
" <td>34007.0</td>\n",
" <td>NJ</td>\n",
" <td>Camden County</td>\n",
" </tr>\n",
" <tr>\n",
" <th>220331</th>\n",
" <td>340076015005</td>\n",
" <td>0.889613</td>\n",
" <td>True</td>\n",
" <td>34007.0</td>\n",
" <td>NJ</td>\n",
" <td>Camden County</td>\n",
" </tr>\n",
" <tr>\n",
" <th>220332</th>\n",
" <td>340076091032</td>\n",
" <td>0.627822</td>\n",
" <td>False</td>\n",
" <td>34007.0</td>\n",
" <td>NJ</td>\n",
" <td>Camden County</td>\n",
" </tr>\n",
" <tr>\n",
" <th>220333</th>\n",
" <td>340076053002</td>\n",
" <td>0.762237</td>\n",
" <td>True</td>\n",
" <td>34007.0</td>\n",
" <td>NJ</td>\n",
" <td>Camden County</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>220334 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" GEOID10 Score E (percentile) Score E (top 25th percentile) \\\n",
"0 100010414002 0.808889 True \n",
"1 100010415002 0.555160 False \n",
"2 100010417011 0.272392 False \n",
"3 100010417012 0.345686 False \n",
"4 100010422011 0.472567 False \n",
"... ... ... ... \n",
"220329 340076020004 0.921941 True \n",
"220330 340076017002 0.934490 True \n",
"220331 340076015005 0.889613 True \n",
"220332 340076091032 0.627822 False \n",
"220333 340076053002 0.762237 True \n",
"\n",
" GEOID State Abbreviation County Name \n",
"0 10001.0 DE Kent County \n",
"1 10001.0 DE Kent County \n",
"2 10001.0 DE Kent County \n",
"3 10001.0 DE Kent County \n",
"4 10001.0 DE Kent County \n",
"... ... ... ... \n",
"220329 34007.0 NJ Camden County \n",
"220330 34007.0 NJ Camden County \n",
"220331 34007.0 NJ Camden County \n",
"220332 34007.0 NJ Camden County \n",
"220333 34007.0 NJ Camden County \n",
"\n",
"[220334 rows x 6 columns]"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"merged_df"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "e81b1321",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>GEOID10</th>\n",
" <th>Score E (percentile)</th>\n",
" <th>Score E (top 25th percentile)</th>\n",
" <th>GEOID</th>\n",
" <th>State Abbreviation</th>\n",
" <th>County Name</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>10614</th>\n",
" <td>515150501002</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10615</th>\n",
" <td>515150501003</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10627</th>\n",
" <td>515150501001</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10628</th>\n",
" <td>515150501005</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10629</th>\n",
" <td>515150501004</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>174140</th>\n",
" <td>040190029031</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>174143</th>\n",
" <td>040190027012</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>174184</th>\n",
" <td>040190027011</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>174242</th>\n",
" <td>040194105021</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>174243</th>\n",
" <td>040194105011</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>73 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" GEOID10 Score E (percentile) Score E (top 25th percentile) \\\n",
"10614 515150501002 NaN NaN \n",
"10615 515150501003 NaN NaN \n",
"10627 515150501001 NaN NaN \n",
"10628 515150501005 NaN NaN \n",
"10629 515150501004 NaN NaN \n",
"... ... ... ... \n",
"174140 040190029031 NaN NaN \n",
"174143 040190027012 NaN NaN \n",
"174184 040190027011 NaN NaN \n",
"174242 040194105021 NaN NaN \n",
"174243 040194105011 NaN NaN \n",
"\n",
" GEOID State Abbreviation County Name \n",
"10614 NaN NaN NaN \n",
"10615 NaN NaN NaN \n",
"10627 NaN NaN NaN \n",
"10628 NaN NaN NaN \n",
"10629 NaN NaN NaN \n",
"... ... ... ... \n",
"174140 NaN NaN NaN \n",
"174143 NaN NaN NaN \n",
"174184 NaN NaN NaN \n",
"174242 NaN NaN NaN \n",
"174243 NaN NaN NaN \n",
"\n",
"[73 rows x 6 columns]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"merged_df[merged_df[\"Score E (percentile)\"].isnull()]"
]
},
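  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7c4e91aa",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Added sketch (not part of the original run): quantify the gap between\n",
    "# the census CBG list and the score output.\n",
    "missing = merged_df[\"Score E (percentile)\"].isnull().sum()\n",
    "print(f\"{missing} of {len(merged_df)} CBGs have no score\")"
   ]
  },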
{
"cell_type": "code",
"execution_count": null,
"id": "d1a7b71d",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View file

@ -0,0 +1,949 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import geopandas as gpd\n",
"import math\n",
"import pathlib\n",
"import os\n",
"import sys"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"module_path = os.path.abspath(os.path.join(\"..\"))\n",
"if module_path not in sys.path:\n",
" sys.path.append(module_path)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def merge_and_simplify_file(file_name: str, usa_df: pd.DataFrame):\n",
" state_gdf = gpd.read_file(file_name)\n",
" state_repr = state_gdf.to_crs(\"+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs\")\n",
" state_merged = state_repr.merge(usa_df, on=\"GEOID10\", how=\"left\")\n",
" state_merged_simplified = state_merged[\n",
" [\"GEOID10\", \"Score D (percentile)\", \"geometry\"]\n",
" ].reset_index(drop=True)\n",
" state_merged_simplified.rename(\n",
" columns={\"Score D (percentile)\": \"D_SCORE\"}, inplace=True\n",
" )\n",
" return state_merged_simplified\n",
"\n",
"\n",
"def aggregate_to_tracts(block_group_df: pd.DataFrame):\n",
" # The tract identifier is the first 11 digits of the GEOID\n",
" block_group_df[\"tract\"] = block_group_df.apply(\n",
" lambda row: row[\"GEOID10\"][0:11], axis=1\n",
" )\n",
" state_tracts = block_group_df.dissolve(by=\"tract\", aggfunc=\"mean\")\n",
" return state_tracts\n",
"\n",
"\n",
"def create_buckets_from_tracts(state_tracts: pd.DataFrame, num_buckets: int):\n",
" # assign tracts to buckets by D_SCORE\n",
" state_tracts.sort_values(\"D_SCORE\", inplace=True)\n",
" D_SCORE_bucket = []\n",
" num_buckets = num_buckets\n",
" bucket_size = math.ceil(len(state_tracts.index) / num_buckets)\n",
" for i in range(len(state_tracts.index)):\n",
" D_SCORE_bucket.extend([math.floor(i / bucket_size)])\n",
" state_tracts[\"D_SCORE_bucket\"] = D_SCORE_bucket\n",
" return state_tracts\n",
"\n",
"\n",
"def aggregate_buckets(state_tracts: pd.DataFrame, agg_func: str):\n",
" # dissolve tracts by bucket\n",
" state_attr = state_tracts[[\"D_SCORE\", \"D_SCORE_bucket\", \"geometry\"]].reset_index(\n",
" drop=True\n",
" )\n",
" state_dissolve = state_attr.dissolve(by=\"D_SCORE_bucket\", aggfunc=agg_func)\n",
" return state_dissolve\n",
"\n",
"\n",
"def breakup_multipolygons(state_bucketed_df: pd.DataFrame, num_buckets: int):\n",
" compressed = []\n",
" for i in range(num_buckets):\n",
" for j in range(len(state_bucketed_df[\"geometry\"][i].geoms)):\n",
" compressed.append(\n",
" [\n",
" state_bucketed_df[\"D_SCORE\"][i],\n",
" state_bucketed_df[\"geometry\"][i].geoms[j],\n",
" ]\n",
" )\n",
" return compressed\n",
"\n",
"\n",
"def write_to_file(compressed: pd.DataFrame, file_name: str):\n",
" gdf_compressed = gpd.GeoDataFrame(\n",
" compressed, columns=[\"D_SCORE\", \"geometry\"], crs=\"EPSG:4326\"\n",
" )\n",
" gdf_compressed.to_file(CENSUS_GEOJSON_DIR / f\"{file_name}_low.geojson\", driver=\"GeoJSON\")\n",
"\n",
"\n",
"def process_file(file_name: str, usa_df: pd.DataFrame, num_buckets:int):\n",
" print(f\"Processing file {file_name}...\")\n",
" state_merged_simplified = merge_and_simplify_file(file_name, usa_df)\n",
" state_tracts = aggregate_to_tracts(state_merged_simplified)\n",
" state_tracts = create_buckets_from_tracts(state_tracts, num_buckets)\n",
" state_bucketed_df = aggregate_buckets(state_tracts, \"mean\")\n",
" compressed = breakup_multipolygons(state_bucketed_df, num_buckets)\n",
" write_to_file(compressed, file_name)"
]
},
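  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hedged usage sketch (not executed here): the helpers above compose into a\n",
    "# per-state pipeline; CENSUS_GEOJSON_DIR and usa_df are defined further below.\n",
    "# for file_name in CENSUS_GEOJSON_DIR.rglob(\"*.json\"):\n",
    "#     process_file(str(file_name), usa_df, num_buckets=10)"
   ]
  },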
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"id": "Ia5bqxS2LJqe"
},
"outputs": [],
"source": [
"DATA_DIR = pathlib.Path.cwd().parent / \"data\"\n",
"CENSUS_GEOJSON_DIR = DATA_DIR / \"census\" / \"geojson\"\n",
"CEJST_DATA_PATH = DATA_DIR / \"score\" / \"csv\" / \"tiles\" / \"usa.csv\"\n",
"score_df = pd.read_csv(CEJST_DATA_PATH, dtype={\"GEOID10\": \"object\"}, low_memory=False)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"id": "Dtf5qD50JvCw"
},
"outputs": [],
"source": [
"master_df = gpd.GeoDataFrame()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"Empty GeoDataFrame\n",
"Columns: []\n",
"Index: []"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"master_df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "PNdw8bERJyKk"
},
"outputs": [],
"source": [
"for file_name in CENSUS_GEOJSON_DIR.rglob('*.json'):\n",
" state_gdf = gpd.read_file(file_name)\n",
" master_df = master_df.append(state_gdf)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "B5SS9y2pLwks"
},
"outputs": [],
"source": [
"master_df = master_df.to_crs(\"+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "_C6vaR9HQeLa",
"outputId": "fab3bc7f-e716-431e-bc76-bd26289ea4a4"
},
"outputs": [],
"source": [
"master_df.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "oMoubjqCQiw5",
"outputId": "6195ffbc-6275-40c6-bb6a-e0a6bd1e71f0"
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>STATEFP10</th>\n",
" <th>COUNTYFP10</th>\n",
" <th>TRACTCE10</th>\n",
" <th>BLKGRPCE10</th>\n",
" <th>GEOID10</th>\n",
" <th>NAMELSAD10</th>\n",
" <th>MTFCC10</th>\n",
" <th>FUNCSTAT10</th>\n",
" <th>ALAND10</th>\n",
" <th>AWATER10</th>\n",
" <th>INTPTLAT10</th>\n",
" <th>INTPTLON10</th>\n",
" <th>geometry</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>01</td>\n",
" <td>005</td>\n",
" <td>950500</td>\n",
" <td>2</td>\n",
" <td>010059505002</td>\n",
" <td>Block Group 2</td>\n",
" <td>G5030</td>\n",
" <td>S</td>\n",
" <td>191306077</td>\n",
" <td>605058</td>\n",
" <td>+31.7728221</td>\n",
" <td>-085.3325011</td>\n",
" <td>POLYGON ((-85.17240 31.82508, -85.17334 31.824...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>01</td>\n",
" <td>005</td>\n",
" <td>950500</td>\n",
" <td>1</td>\n",
" <td>010059505001</td>\n",
" <td>Block Group 1</td>\n",
" <td>G5030</td>\n",
" <td>S</td>\n",
" <td>44574612</td>\n",
" <td>8952734</td>\n",
" <td>+31.7523221</td>\n",
" <td>-085.2009470</td>\n",
" <td>POLYGON ((-85.16283 31.81051, -85.16284 31.813...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" STATEFP10 ... geometry\n",
"0 01 ... POLYGON ((-85.17240 31.82508, -85.17334 31.824...\n",
"1 01 ... POLYGON ((-85.16283 31.81051, -85.16284 31.813...\n",
"\n",
"[2 rows x 13 columns]"
]
},
"execution_count": 69,
"metadata": {
"tags": []
},
"output_type": "execute_result"
}
],
"source": [
"master_df.head(2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "bAMmGSgzVml0"
},
"outputs": [],
"source": [
"usa_df = pd.read_csv(CEJST_DATA_PATH, dtype={\"GEOID10\": \"object\"})"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "U7M7dExdV2Vh"
},
"outputs": [],
"source": [
"usa_merged = master_df.merge(usa_df, on=\"GEOID10\", how=\"left\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Sr25DUkxWVhg",
"outputId": "1e804075-0f7d-4174-82d7-e21b8519c8bf"
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>STATEFP10</th>\n",
" <th>COUNTYFP10</th>\n",
" <th>TRACTCE10</th>\n",
" <th>BLKGRPCE10</th>\n",
" <th>GEOID10</th>\n",
" <th>NAMELSAD10</th>\n",
" <th>MTFCC10</th>\n",
" <th>FUNCSTAT10</th>\n",
" <th>ALAND10</th>\n",
" <th>AWATER10</th>\n",
" <th>INTPTLAT10</th>\n",
" <th>INTPTLON10</th>\n",
" <th>geometry</th>\n",
" <th>Housing burden (percent)</th>\n",
" <th>Total population</th>\n",
" <th>Air toxics cancer risk</th>\n",
" <th>Respiratory hazard index</th>\n",
" <th>Diesel particulate matter</th>\n",
" <th>Particulate matter (PM2.5)</th>\n",
" <th>Ozone</th>\n",
" <th>Traffic proximity and volume</th>\n",
" <th>Proximity to RMP sites</th>\n",
" <th>Proximity to TSDF sites</th>\n",
" <th>Proximity to NPL sites</th>\n",
" <th>Wastewater discharge</th>\n",
" <th>Percent pre-1960s housing (lead paint indicator)</th>\n",
" <th>Individuals under 5 years old</th>\n",
" <th>Individuals over 64 years old</th>\n",
" <th>Linguistic isolation (percent)</th>\n",
" <th>Percent of households in linguistic isolation</th>\n",
" <th>Poverty (Less than 200% of federal poverty line)</th>\n",
" <th>Percent individuals age 25 or over with less than high school degree</th>\n",
" <th>Unemployed civilians (percent)</th>\n",
" <th>Housing + Transportation Costs % Income for the Regional Typical Household</th>\n",
" <th>GEOID10 (percentile)</th>\n",
" <th>Housing burden (percent) (percentile)</th>\n",
" <th>Total population (percentile)</th>\n",
" <th>Air toxics cancer risk (percentile)</th>\n",
" <th>Respiratory hazard index (percentile)</th>\n",
" <th>Diesel particulate matter (percentile)</th>\n",
" <th>...</th>\n",
" <th>Air toxics cancer risk (min-max normalized)</th>\n",
" <th>Respiratory hazard index (min-max normalized)</th>\n",
" <th>Diesel particulate matter (min-max normalized)</th>\n",
" <th>Particulate matter (PM2.5) (min-max normalized)</th>\n",
" <th>Ozone (min-max normalized)</th>\n",
" <th>Traffic proximity and volume (min-max normalized)</th>\n",
" <th>Proximity to RMP sites (min-max normalized)</th>\n",
" <th>Proximity to TSDF sites (min-max normalized)</th>\n",
" <th>Proximity to NPL sites (min-max normalized)</th>\n",
" <th>Wastewater discharge (min-max normalized)</th>\n",
" <th>Percent pre-1960s housing (lead paint indicator) (min-max normalized)</th>\n",
" <th>Individuals under 5 years old (min-max normalized)</th>\n",
" <th>Individuals over 64 years old (min-max normalized)</th>\n",
" <th>Linguistic isolation (percent) (min-max normalized)</th>\n",
" <th>Percent of households in linguistic isolation (min-max normalized)</th>\n",
" <th>Poverty (Less than 200% of federal poverty line) (min-max normalized)</th>\n",
" <th>Percent individuals age 25 or over with less than high school degree (min-max normalized)</th>\n",
" <th>Unemployed civilians (percent) (min-max normalized)</th>\n",
" <th>Housing + Transportation Costs % Income for the Regional Typical Household (min-max normalized)</th>\n",
" <th>Score A</th>\n",
" <th>Score B</th>\n",
" <th>Socioeconomic Factors</th>\n",
" <th>Sensitive populations</th>\n",
" <th>Environmental effects</th>\n",
" <th>Exposures</th>\n",
" <th>Pollution Burden</th>\n",
" <th>Population Characteristics</th>\n",
" <th>Score C</th>\n",
" <th>Score D</th>\n",
" <th>Score E</th>\n",
" <th>Score A (percentile)</th>\n",
" <th>Score A (top 25th percentile)</th>\n",
" <th>Score B (percentile)</th>\n",
" <th>Score B (top 25th percentile)</th>\n",
" <th>Score C (percentile)</th>\n",
" <th>Score C (top 25th percentile)</th>\n",
" <th>Score D (percentile)</th>\n",
" <th>Score D (top 25th percentile)</th>\n",
" <th>Score E (percentile)</th>\n",
" <th>Score E (top 25th percentile)</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>01</td>\n",
" <td>005</td>\n",
" <td>950500</td>\n",
" <td>2</td>\n",
" <td>010059505002</td>\n",
" <td>Block Group 2</td>\n",
" <td>G5030</td>\n",
" <td>S</td>\n",
" <td>191306077</td>\n",
" <td>605058</td>\n",
" <td>+31.7728221</td>\n",
" <td>-085.3325011</td>\n",
" <td>POLYGON ((-85.17240 31.82508, -85.17334 31.824...</td>\n",
" <td>0.176565</td>\n",
" <td>923.0</td>\n",
" <td>44.636463</td>\n",
" <td>0.784089</td>\n",
" <td>0.121767</td>\n",
" <td>9.536056</td>\n",
" <td>34.660008</td>\n",
" <td>0.880242</td>\n",
" <td>0.295180</td>\n",
" <td>0.023752</td>\n",
" <td>0.019262</td>\n",
" <td>0.050677</td>\n",
" <td>0.20177</td>\n",
" <td>0.047671</td>\n",
" <td>0.286024</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.276273</td>\n",
" <td>0.181102</td>\n",
" <td>0.159836</td>\n",
" <td>64.0</td>\n",
" <td>0.000631</td>\n",
" <td>0.25485</td>\n",
" <td>0.272930</td>\n",
" <td>0.944257</td>\n",
" <td>0.982043</td>\n",
" <td>0.082062</td>\n",
" <td>...</td>\n",
" <td>0.025691</td>\n",
" <td>0.181789</td>\n",
" <td>0.020039</td>\n",
" <td>0.444097</td>\n",
" <td>0.190363</td>\n",
" <td>0.000023</td>\n",
" <td>0.016043</td>\n",
" <td>0.000054</td>\n",
" <td>0.002143</td>\n",
" <td>1.179715e-07</td>\n",
" <td>0.20177</td>\n",
" <td>0.090801</td>\n",
" <td>0.286024</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.276273</td>\n",
" <td>0.181102</td>\n",
" <td>0.159836</td>\n",
" <td>0.322034</td>\n",
" <td>0.597295</td>\n",
" <td>0.335222</td>\n",
" <td>0.638895</td>\n",
" <td>0.535636</td>\n",
" <td>0.381877</td>\n",
" <td>0.494252</td>\n",
" <td>0.456794</td>\n",
" <td>0.587265</td>\n",
" <td>0.268259</td>\n",
" <td>0.149124</td>\n",
" <td>0.529853</td>\n",
" <td>0.617238</td>\n",
" <td>False</td>\n",
" <td>0.61452</td>\n",
" <td>False</td>\n",
" <td>0.615988</td>\n",
" <td>False</td>\n",
" <td>0.565349</td>\n",
" <td>False</td>\n",
" <td>0.576986</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>01</td>\n",
" <td>005</td>\n",
" <td>950500</td>\n",
" <td>1</td>\n",
" <td>010059505001</td>\n",
" <td>Block Group 1</td>\n",
" <td>G5030</td>\n",
" <td>S</td>\n",
" <td>44574612</td>\n",
" <td>8952734</td>\n",
" <td>+31.7523221</td>\n",
" <td>-085.2009470</td>\n",
" <td>POLYGON ((-85.16283 31.81051, -85.16284 31.813...</td>\n",
" <td>0.176565</td>\n",
" <td>818.0</td>\n",
" <td>44.636463</td>\n",
" <td>0.784089</td>\n",
" <td>0.121767</td>\n",
" <td>9.536056</td>\n",
" <td>34.660008</td>\n",
" <td>60.055410</td>\n",
" <td>0.232153</td>\n",
" <td>0.027767</td>\n",
" <td>0.018079</td>\n",
" <td>0.007115</td>\n",
" <td>0.00000</td>\n",
" <td>0.007335</td>\n",
" <td>0.264059</td>\n",
" <td>0.039261</td>\n",
" <td>0.038369</td>\n",
" <td>0.391198</td>\n",
" <td>0.186147</td>\n",
" <td>0.053125</td>\n",
" <td>80.0</td>\n",
" <td>0.000626</td>\n",
" <td>0.25485</td>\n",
" <td>0.200764</td>\n",
" <td>0.944257</td>\n",
" <td>0.982043</td>\n",
" <td>0.082062</td>\n",
" <td>...</td>\n",
" <td>0.025691</td>\n",
" <td>0.181789</td>\n",
" <td>0.020039</td>\n",
" <td>0.444097</td>\n",
" <td>0.190363</td>\n",
" <td>0.001598</td>\n",
" <td>0.012618</td>\n",
" <td>0.000063</td>\n",
" <td>0.002011</td>\n",
" <td>1.656256e-08</td>\n",
" <td>0.00000</td>\n",
" <td>0.013971</td>\n",
" <td>0.264059</td>\n",
" <td>0.039261</td>\n",
" <td>0.038369</td>\n",
" <td>0.391198</td>\n",
" <td>0.186147</td>\n",
" <td>0.053125</td>\n",
" <td>0.412429</td>\n",
" <td>0.693861</td>\n",
" <td>0.477826</td>\n",
" <td>0.728309</td>\n",
" <td>0.557538</td>\n",
" <td>0.264424</td>\n",
" <td>0.530404</td>\n",
" <td>0.441744</td>\n",
" <td>0.642924</td>\n",
" <td>0.284008</td>\n",
" <td>0.159628</td>\n",
" <td>0.589397</td>\n",
" <td>0.723269</td>\n",
" <td>False</td>\n",
" <td>0.73044</td>\n",
" <td>False</td>\n",
" <td>0.661758</td>\n",
" <td>False</td>\n",
" <td>0.608434</td>\n",
" <td>False</td>\n",
" <td>0.670349</td>\n",
" <td>False</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>2 rows × 98 columns</p>\n",
"</div>"
],
"text/plain": [
" STATEFP10 COUNTYFP10 ... Score E (percentile) Score E (top 25th percentile)\n",
"0 01 005 ... 0.576986 False\n",
"1 01 005 ... 0.670349 False\n",
"\n",
"[2 rows x 98 columns]"
]
},
"execution_count": 72,
"metadata": {
"tags": []
},
"output_type": "execute_result"
}
],
"source": [
"usa_merged.head(2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ANMlAB8Qmtu8",
"outputId": "44934741-90a9-4664-fab5-2c39b348d2be"
},
"outputs": [],
"source": [
"usa_merged_compressed = gpd.GeoDataFrame(usa_merged, crs=\"EPSG:4326\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "PBPD9LQctvPJ"
},
"outputs": [],
"source": [
"usa_merged_compressed.to_file(CENSUS_GEOJSON_DIR / \"usa_merged.geojson\", driver=\"GeoJSON\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "qAAEr1z-WZAT"
},
"outputs": [],
"source": [
"usa_simplified = usa_merged[\n",
" [\"GEOID10\", \"Score D (percentile)\", \"geometry\"]\n",
" ].reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "SCNUjEbzWg-o"
},
"outputs": [],
"source": [
"usa_simplified.rename(\n",
" columns={\"Score D (percentile)\": \"D_SCORE\"}, inplace=True\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Ej70uX0AmW0J",
"outputId": "88908f5e-b62d-494f-f0ea-649089b6652a"
},
"outputs": [],
"source": [
"usa_cbg_compressed = gpd.GeoDataFrame(\n",
" usa_simplified, columns=[\"D_SCORE\", \"geometry\"], crs=\"EPSG:4326\"\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "UE12dWmame3I"
},
"outputs": [],
"source": [
"usa_cbg_compressed.to_file(CENSUS_GEOJSON_DIR / \"usa_cbg_scoreD.geojson\", driver=\"GeoJSON\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "wWFBduQQXGtM"
},
"outputs": [],
"source": [
"usa_tracts = aggregate_to_tracts(usa_simplified)"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {
"id": "L-PTnEWOpDtX"
},
"outputs": [],
"source": [
"num_buckets = 10"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "kTJafXcqXC01",
"outputId": "bd197952-76b7-4f99-edef-983f20d7acfb"
},
"outputs": [],
"source": [
"tracts_compressed = gpd.GeoDataFrame(\n",
" usa_tracts, columns=[\"D_SCORE\", \"geometry\"], crs=\"EPSG:4326\"\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "E2Nh97IlYhCF"
},
"outputs": [],
"source": [
"tracts_compressed.to_file(CENSUS_GEOJSON_DIR / \"usa_tracts_score.geojson\", driver=\"GeoJSON\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "557zPMWFZC8R"
},
"outputs": [],
"source": [
"usa_bucketed = create_buckets_from_tracts(usa_tracts)"
]
},
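  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Added sanity check (not in the original run): buckets should come out\n",
    "# roughly equal-sized, apart from the final partial bucket.\n",
    "usa_bucketed[\"D_SCORE_bucket\"].value_counts().sort_index()"
   ]
  },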
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "k6RRdKlsaO0a"
},
"outputs": [],
"source": [
"usa_aggregated = aggregate_buckets(usa_bucketed, agg_func=\"mean\")"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "-cm5eET2pA1Z",
"outputId": "8d5d2e80-ad62-41d5-f1b0-922345f92d62"
},
"outputs": [
{
"data": {
"text/plain": [
"(10, 2)"
]
},
"execution_count": 80,
"metadata": {
"tags": []
},
"output_type": "execute_result"
}
],
"source": [
"usa_aggregated.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "4ZvJra-RaZ4v"
},
"outputs": [],
"source": [
"compressed = breakup_multipolygons(usa_aggregated, num_buckets)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "RDS7Q2WAb4Rx",
"outputId": "dcd28a31-083d-482e-b000-b4cd1046d4c2"
},
"outputs": [
{
"data": {
"text/plain": [
"36836"
]
},
"execution_count": 82,
"metadata": {
"tags": []
},
"output_type": "execute_result"
}
],
"source": [
"len(compressed)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "VXTv8UuXb-qU"
},
"outputs": [],
"source": [
"gdf_compressed = gpd.GeoDataFrame(\n",
" compressed, columns=[\"D_SCORE\", \"geometry\"], crs=\"EPSG:4326\"\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "5v7TyB_rcRgT",
"outputId": "997625cc-c57a-4335-9b27-a08e4f8ad117"
},
"outputs": [
{
"data": {
"text/plain": [
"(36836, 2)"
]
},
"execution_count": 84,
"metadata": {
"tags": []
},
"output_type": "execute_result"
}
],
"source": [
"gdf_compressed.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "5eAnPL8McJpn"
},
"outputs": [],
"source": [
"gdf_compressed.to_file(CENSUS_GEOJSON_DIR / f\"usa_low.geojson\", driver=\"GeoJSON\")"
]
}
],
"metadata": {
"colab": {
"name": "Score_Dissolve_Script",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View file

@ -0,0 +1,150 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"source": [
"import pandas as pd\n",
"import csv\n",
"from pathlib import Path\n",
"import os\n",
"import sys"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"module_path = os.path.abspath(os.path.join(\"..\"))\n",
"if module_path not in sys.path:\n",
" sys.path.append(module_path)\n",
" \n",
"from data_pipeline.utils import unzip_file_from_url\n",
"from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"DATA_PATH = Path.cwd().parent / \"data\"\n",
"TMP_PATH: Path = DATA_PATH / \"tmp\"\n",
"STATE_CSV = DATA_PATH / \"census\" / \"csv\" / \"fips_states_2010.csv\"\n",
"SCORE_CSV = DATA_PATH / \"score\" / \"csv\" / \"usa.csv\"\n",
"COUNTY_SCORE_CSV = DATA_PATH / \"score\" / \"csv\" / \"usa-county.csv\"\n",
"CENSUS_COUNTIES_ZIP_URL = \"https://www2.census.gov/geo/docs/maps-data/data/gazetteer/2020_Gazetteer/2020_Gaz_counties_national.zip\"\n",
"CENSUS_COUNTIES_TXT = TMP_PATH / \"2020_Gaz_counties_national.txt\""
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"unzip_file_from_url(CENSUS_COUNTIES_ZIP_URL, TMP_PATH, TMP_PATH)"
],
"outputs": [],
"metadata": {
"scrolled": true
}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"counties_df = pd.read_csv(CENSUS_COUNTIES_TXT, sep=\"\\t\", dtype={\"GEOID\": \"string\", \"USPS\": \"string\"}, low_memory=False)\n",
"counties_df = counties_df[['USPS', 'GEOID', 'NAME']]\n",
"counties_df.rename(columns={\"USPS\": \"State Abbreviation\", \"NAME\": \"County Name\"}, inplace=True)\n",
"counties_df.head()"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"states_df = pd.read_csv(STATE_CSV, dtype={\"fips\": \"string\", \"state_abbreviation\": \"string\"})\n",
"states_df.rename(columns={\"fips\": \"State Code\", \"state_name\": \"State Name\", \"state_abbreviation\": \"State Abbreviation\"}, inplace=True)\n",
"states_df.head()"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"county_state_merged = counties_df.join(states_df, rsuffix=' Other')\n",
"del county_state_merged[\"State Abbreviation Other\"]\n",
"county_state_merged.head()"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"score_df = pd.read_csv(SCORE_CSV, dtype={\"GEOID10\": \"string\"})\n",
"score_df[\"GEOID\"] = score_df.GEOID10.str[:5]\n",
"score_df.head()"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"score_county_state_merged = score_df.join(county_state_merged, rsuffix='_OTHER')\n",
"del score_county_state_merged[\"GEOID_OTHER\"]\n",
"score_county_state_merged.head()"
],
"outputs": [],
"metadata": {}
},
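  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# Added sanity check (not in the original run): every score row should have\n",
    "# found its county metadata after the merge on GEOID.\n",
    "score_county_state_merged[\"County Name\"].isnull().sum()"
   ],
   "outputs": [],
   "metadata": {}
  },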
{
"cell_type": "code",
"execution_count": null,
"source": [
"score_county_state_merged.to_csv(COUNTY_SCORE_CSV, index=False)"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [],
"outputs": [],
"metadata": {}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

File diff suppressed because it is too large

View file

@ -0,0 +1,59 @@
import os
from pathlib import Path
from subprocess import call

from data_pipeline.utils import get_module_logger, remove_all_from_dir

logger = get_module_logger(__name__)


def generate_tiles(data_path: Path) -> None:
    """Generate the national map tiles (mbtiles files plus mvt directories)
    at a high and a low zoom range from the score GeoJSON files, shelling
    out to the tippecanoe CLI."""

    score_tiles_path = data_path / "score" / "tiles"
    high_tile_path = score_tiles_path / "high"
    low_tile_path = score_tiles_path / "low"
    score_geojson_dir = data_path / "score" / "geojson"

    USA_HIGH_MIN_ZOOM = 5
    USA_HIGH_MAX_ZOOM = 11
    USA_LOW_MIN_ZOOM = 0
    USA_LOW_MAX_ZOOM = 7

    # remove existing tiles output (mbtiles files and tile directories)
    remove_all_from_dir(score_tiles_path)

    # create dirs
    os.mkdir(high_tile_path)
    os.mkdir(low_tile_path)

    # generate high zoom mbtiles file
    logger.info("Generating USA High mbtiles file")
    cmd = "tippecanoe "
    cmd += f"--minimum-zoom={USA_HIGH_MIN_ZOOM} --maximum-zoom={USA_HIGH_MAX_ZOOM} --layer=blocks "
    cmd += f"--output={high_tile_path}/usa_high.mbtiles "
    cmd += str(score_geojson_dir / "usa-high.json")
    call(cmd, shell=True)

    # generate high zoom mvts
    logger.info("Generating USA High mvt folders and files")
    cmd = "tippecanoe "
    cmd += f"--minimum-zoom={USA_HIGH_MIN_ZOOM} --maximum-zoom={USA_HIGH_MAX_ZOOM} --no-tile-compression "
    cmd += f"--output-to-directory={high_tile_path} "
    cmd += str(score_geojson_dir / "usa-high.json")
    call(cmd, shell=True)

    # generate low zoom mbtiles file
    logger.info("Generating USA Low mbtiles file")
    cmd = "tippecanoe "
    cmd += f"--minimum-zoom={USA_LOW_MIN_ZOOM} --maximum-zoom={USA_LOW_MAX_ZOOM} --layer=blocks "
    cmd += f"--output={low_tile_path}/usa_low.mbtiles "
    cmd += str(score_geojson_dir / "usa-low.json")
    call(cmd, shell=True)

    # generate low zoom mvts
    logger.info("Generating USA Low mvt folders and files")
    cmd = "tippecanoe "
    cmd += f"--minimum-zoom={USA_LOW_MIN_ZOOM} --maximum-zoom={USA_LOW_MAX_ZOOM} --no-tile-compression "
    cmd += f"--output-to-directory={low_tile_path} "
    cmd += str(score_geojson_dir / "usa-low.json")
    call(cmd, shell=True)

File diff suppressed because it is too large