{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "a664f981", "metadata": {}, "outputs": [], "source": [ "# Before running this notebook, you must run the following notebooks (in any order):\n", "# 1. `ejscreen_etl.ipynb`\n", "# 2. `census_etl.ipynb`\n", "# 3. `housing_and_transportation_etl.ipynb`\n", "# 4. `hud_housing_etl.ipynb`\n", "\n", "import collections\n", "import functools\n", "from pathlib import Path\n", "import pandas as pd\n", "import csv\n", "import os\n", "import sys\n", "\n", "module_path = os.path.abspath(os.path.join(\"..\"))\n", "if module_path not in sys.path:\n", " sys.path.append(module_path)\n", "\n", "from etl.sources.census.etl_utils import get_state_fips_codes\n", "\n", "# Define some global parameters\n", "GEOID_FIELD_NAME = \"GEOID10\"\n", "GEOID_TRACT_FIELD_NAME = \"GEOID10_TRACT\"\n", "BUCKET_SOCIOECONOMIC = \"Socioeconomic Factors\"\n", "BUCKET_SENSITIVE = \"Sensitive populations\"\n", "BUCKET_ENVIRONMENTAL = \"Environmental effects\"\n", "BUCKET_EXPOSURES = \"Exposures\"\n", "BUCKETS = [\n", " BUCKET_SOCIOECONOMIC,\n", " BUCKET_SENSITIVE,\n", " BUCKET_ENVIRONMENTAL,\n", " BUCKET_EXPOSURES,\n", "]\n", "\n", "# A few specific field names\n", "# TODO: clean this up, I name some fields but not others.\n", "UNEMPLOYED_FIELD_NAME = \"Unemployed civilians (percent)\"\n", "LINGUISTIC_ISOLATION_FIELD_NAME = \"Linguistic isolation (percent)\"\n", "HOUSING_BURDEN_FIELD_NAME = \"Housing burden (percent)\"\n", "POVERTY_FIELD_NAME = \"Poverty (Less than 200% of federal poverty line)\"\n", "HIGH_SCHOOL_FIELD_NAME = (\n", " \"Percent individuals age 25 or over with less than high school degree\"\n", ")\n", "\n", "# There's another aggregation level (a second level of \"buckets\").\n", "AGGREGATION_POLLUTION = \"Pollution Burden\"\n", "AGGREGATION_POPULATION = \"Population Characteristics\"\n", "\n", "PERCENTILE_FIELD_SUFFIX = \" (percentile)\"\n", "MIN_MAX_FIELD_SUFFIX = \" (min-max normalized)\"\n", "\n", "DATA_PATH = 
Path.cwd().parent / \"data\"\n", "SCORE_CSV_PATH = DATA_PATH / \"score\" / \"csv\"\n", "\n", "# Tell pandas to display all columns\n", "pd.set_option(\"display.max_columns\", None)" ] }, { "cell_type": "code", "execution_count": null, "id": "7df430cb", "metadata": { "scrolled": true }, "outputs": [], "source": [ "# EJSCreen csv Load\n", "ejscreen_csv = DATA_PATH / \"dataset\" / \"ejscreen_2020\" / \"usa.csv\"\n", "ejscreen_df = pd.read_csv(ejscreen_csv, dtype={\"ID\": \"string\"}, low_memory=False)\n", "ejscreen_df.rename(columns={\"ID\": GEOID_FIELD_NAME}, inplace=True)\n", "ejscreen_df.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "daba69fb", "metadata": {}, "outputs": [], "source": [ "# Load census data\n", "census_csv = DATA_PATH / \"dataset\" / \"census_acs_2019\" / \"usa.csv\"\n", "census_df = pd.read_csv(\n", " census_csv, dtype={GEOID_FIELD_NAME: \"string\"}, low_memory=False\n", ")\n", "census_df.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "144bdde2", "metadata": {}, "outputs": [], "source": [ "# Load housing and transportation data\n", "housing_and_transportation_index_csv = (\n", " DATA_PATH / \"dataset\" / \"housing_and_transportation_index\" / \"usa.csv\"\n", ")\n", "housing_and_transportation_df = pd.read_csv(\n", " housing_and_transportation_index_csv,\n", " dtype={GEOID_FIELD_NAME: \"string\"},\n", " low_memory=False,\n", ")\n", "housing_and_transportation_df.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "a9202e5d", "metadata": {}, "outputs": [], "source": [ "# Load HUD housing data\n", "hud_housing_csv = DATA_PATH / \"dataset\" / \"hud_housing\" / \"usa.csv\"\n", "hud_housing_df = pd.read_csv(\n", " hud_housing_csv,\n", " dtype={GEOID_TRACT_FIELD_NAME: \"string\"},\n", " low_memory=False,\n", ")\n", "hud_housing_df.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "bf89efd8", "metadata": {}, "outputs": [], "source": [ "# Join all the data sources that use census block 
groups\n", "census_block_group_dfs = [ejscreen_df, census_df, housing_and_transportation_df]\n", "\n", "census_block_group_df = functools.reduce(\n", " lambda left, right: pd.merge(\n", " left=left, right=right, on=GEOID_FIELD_NAME, how=\"outer\"\n", " ),\n", " census_block_group_dfs,\n", ")\n", "\n", "\n", "if len(census_block_group_df) > 220333:\n", " raise ValueError(\"Too many rows in the join.\")\n", "\n", "census_block_group_df.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "e79ec27a", "metadata": {}, "outputs": [], "source": [ "# Sanity check the join.\n", "if len(census_block_group_df[GEOID_FIELD_NAME].str.len().unique()) != 1:\n", " raise ValueError(\n", " f\"One of the input CSVs uses {GEOID_FIELD_NAME} with a different length.\"\n", " )" ] }, { "cell_type": "code", "execution_count": null, "id": "3d0d2915", "metadata": { "scrolled": true }, "outputs": [], "source": [ "# Join all the data sources that use census tracts\n", "# TODO: when there's more than one data source using census tract, reduce/merge them here.\n", "census_tract_df = hud_housing_df\n", "\n", "# Calculate the tract for the CBG data.\n", "census_block_group_df[GEOID_TRACT_FIELD_NAME] = census_block_group_df[\n", " GEOID_FIELD_NAME\n", "].str[0:11]\n", "\n", "df = census_block_group_df.merge(census_tract_df, on=GEOID_TRACT_FIELD_NAME)\n", "\n", "# Validate the merged frame (not the pre-merge CBG frame, which was already checked above).\n", "if len(df) > 220333:\n", " raise ValueError(\"Too many rows in the join.\")\n", "\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "b8567900", "metadata": {}, "outputs": [], "source": [ "# Define a named tuple that will be used for each data set input.\n", "DataSet = collections.namedtuple(\n", " typename=\"DataSet\", field_names=[\"input_field\", \"renamed_field\", \"bucket\"]\n", ")\n", "\n", "data_sets = [\n", " # The following data sets have `bucket=None`, because it's not used in the bucket based score (\"Score C\").\n", " DataSet(\n", " input_field=GEOID_FIELD_NAME,\n", " # Use 
the name `GEOID10` to enable geoplatform.gov's workflow.\n", " renamed_field=GEOID_FIELD_NAME,\n", " bucket=None,\n", " ),\n", " DataSet(\n", " input_field=HOUSING_BURDEN_FIELD_NAME,\n", " renamed_field=HOUSING_BURDEN_FIELD_NAME,\n", " bucket=None,\n", " ),\n", " DataSet(input_field=\"ACSTOTPOP\", renamed_field=\"Total population\", bucket=None),\n", " # The following data sets have buckets, because they're used in the score\n", " DataSet(\n", " input_field=\"CANCER\",\n", " renamed_field=\"Air toxics cancer risk\",\n", " bucket=BUCKET_EXPOSURES,\n", " ),\n", " DataSet(\n", " input_field=\"RESP\",\n", " renamed_field=\"Respiratory hazard index\",\n", " bucket=BUCKET_EXPOSURES,\n", " ),\n", " DataSet(\n", " input_field=\"DSLPM\",\n", " renamed_field=\"Diesel particulate matter\",\n", " bucket=BUCKET_EXPOSURES,\n", " ),\n", " DataSet(\n", " input_field=\"PM25\",\n", " renamed_field=\"Particulate matter (PM2.5)\",\n", " bucket=BUCKET_EXPOSURES,\n", " ),\n", " DataSet(input_field=\"OZONE\", renamed_field=\"Ozone\", bucket=BUCKET_EXPOSURES),\n", " DataSet(\n", " input_field=\"PTRAF\",\n", " renamed_field=\"Traffic proximity and volume\",\n", " bucket=BUCKET_EXPOSURES,\n", " ),\n", " DataSet(\n", " input_field=\"PRMP\",\n", " renamed_field=\"Proximity to RMP sites\",\n", " bucket=BUCKET_ENVIRONMENTAL,\n", " ),\n", " DataSet(\n", " input_field=\"PTSDF\",\n", " renamed_field=\"Proximity to TSDF sites\",\n", " bucket=BUCKET_ENVIRONMENTAL,\n", " ),\n", " DataSet(\n", " input_field=\"PNPL\",\n", " renamed_field=\"Proximity to NPL sites\",\n", " bucket=BUCKET_ENVIRONMENTAL,\n", " ),\n", " DataSet(\n", " input_field=\"PWDIS\",\n", " renamed_field=\"Wastewater discharge\",\n", " bucket=BUCKET_ENVIRONMENTAL,\n", " ),\n", " DataSet(\n", " input_field=\"PRE1960PCT\",\n", " renamed_field=\"Percent pre-1960s housing (lead paint indicator)\",\n", " bucket=BUCKET_ENVIRONMENTAL,\n", " ),\n", " DataSet(\n", " input_field=\"UNDER5PCT\",\n", " renamed_field=\"Individuals under 5 years 
old\",\n", " bucket=BUCKET_SENSITIVE,\n", " ),\n", " DataSet(\n", " input_field=\"OVER64PCT\",\n", " renamed_field=\"Individuals over 64 years old\",\n", " bucket=BUCKET_SENSITIVE,\n", " ),\n", " DataSet(\n", " input_field=LINGUISTIC_ISOLATION_FIELD_NAME,\n", " renamed_field=LINGUISTIC_ISOLATION_FIELD_NAME,\n", " bucket=BUCKET_SENSITIVE,\n", " ),\n", " DataSet(\n", " input_field=\"LINGISOPCT\",\n", " renamed_field=\"Percent of households in linguistic isolation\",\n", " bucket=BUCKET_SOCIOECONOMIC,\n", " ),\n", " DataSet(\n", " input_field=\"LOWINCPCT\",\n", " renamed_field=POVERTY_FIELD_NAME,\n", " bucket=BUCKET_SOCIOECONOMIC,\n", " ),\n", " DataSet(\n", " input_field=\"LESSHSPCT\",\n", " renamed_field=HIGH_SCHOOL_FIELD_NAME,\n", " bucket=BUCKET_SOCIOECONOMIC,\n", " ),\n", " DataSet(\n", " input_field=UNEMPLOYED_FIELD_NAME,\n", " renamed_field=UNEMPLOYED_FIELD_NAME,\n", " bucket=BUCKET_SOCIOECONOMIC,\n", " ),\n", " DataSet(\n", " input_field=\"ht_ami\",\n", " renamed_field=\"Housing + Transportation Costs % Income for the Regional Typical Household\",\n", " bucket=BUCKET_SOCIOECONOMIC,\n", " ),\n", "]" ] }, { "cell_type": "code", "execution_count": null, "id": "e152a655", "metadata": {}, "outputs": [], "source": [ "# Rename columns:\n", "renaming_dict = {data_set.input_field: data_set.renamed_field for data_set in data_sets}\n", "\n", "df.rename(\n", " columns=renaming_dict,\n", " inplace=True,\n", " errors=\"raise\",\n", ")\n", "\n", "columns_to_keep = [data_set.renamed_field for data_set in data_sets]\n", "df = df[columns_to_keep]\n", "\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "1280cbd4", "metadata": {}, "outputs": [], "source": [ "# Convert all columns to numeric.\n", "for data_set in data_sets:\n", " # Skip GEOID_FIELD_NAME, because it's a string.\n", " if data_set.renamed_field == GEOID_FIELD_NAME:\n", " continue\n", " df[f\"{data_set.renamed_field}\"] = pd.to_numeric(df[data_set.renamed_field])" ] }, { "cell_type": "code", 
"execution_count": null, "id": "27677132", "metadata": { "scrolled": true }, "outputs": [], "source": [ "# calculate percentiles\n", "for data_set in data_sets:\n", " df[f\"{data_set.renamed_field}{PERCENTILE_FIELD_SUFFIX}\"] = df[\n", " data_set.renamed_field\n", " ].rank(pct=True)\n", "\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "f2088013", "metadata": {}, "outputs": [], "source": [ "# calculate min max\n", "# Math:\n", "# (\n", "# Observed value\n", "# - minimum of all values\n", "# )\n", "# divided by\n", "# (\n", "# Maximum of all values\n", "# - minimum of all values\n", "# )\n", "for data_set in data_sets:\n", " # Skip GEOID_FIELD_NAME, because it's a string.\n", " if data_set.renamed_field == GEOID_FIELD_NAME:\n", " continue\n", "\n", " min_value = df[data_set.renamed_field].min(skipna=True)\n", "\n", " max_value = df[data_set.renamed_field].max(skipna=True)\n", "\n", " print(\n", " f\"For data set {data_set.renamed_field}, the min value is {min_value} and the max value is {max_value}.\"\n", " )\n", "\n", " df[f\"{data_set.renamed_field}{MIN_MAX_FIELD_SUFFIX}\"] = (\n", " df[data_set.renamed_field] - min_value\n", " ) / (max_value - min_value)\n", "\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "1f7b864f", "metadata": { "scrolled": true }, "outputs": [], "source": [ "# Calculate score \"A\" and score \"B\"\n", "df[\"Score A\"] = df[\n", " [\n", " \"Poverty (Less than 200% of federal poverty line) (percentile)\",\n", " \"Percent individuals age 25 or over with less than high school degree (percentile)\",\n", " ]\n", "].mean(axis=1)\n", "df[\"Score B\"] = (\n", " df[\"Poverty (Less than 200% of federal poverty line) (percentile)\"]\n", " * df[\n", " \"Percent individuals age 25 or over with less than high school degree (percentile)\"\n", " ]\n", ")" ] }, { "cell_type": "code", "execution_count": null, "id": "0c107baf", "metadata": { "scrolled": true }, "outputs": [], "source": [ "# Calculate 
\"CalEnviroScreen for the US\" score\n", "# Average all the percentile values in each bucket into a single score for each of the four buckets.\n", "for bucket in BUCKETS:\n", " fields_in_bucket = [\n", " f\"{data_set.renamed_field}{PERCENTILE_FIELD_SUFFIX}\"\n", " for data_set in data_sets\n", " if data_set.bucket == bucket\n", " ]\n", " df[f\"{bucket}\"] = df[fields_in_bucket].mean(axis=1)\n", "\n", "# Combine the score from the two Exposures and Environmental Effects buckets into a single score called \"Pollution Burden\". The math for this score is: (1.0 * Exposures Score + 0.5 * Environment Effects score) / 1.5.\n", "df[AGGREGATION_POLLUTION] = (\n", " 1.0 * df[f\"{BUCKET_EXPOSURES}\"] + 0.5 * df[f\"{BUCKET_ENVIRONMENTAL}\"]\n", ") / 1.5\n", "\n", "# Average the score from the two Sensitive populations and Socioeconomic factors buckets into a single score called \"Population Characteristics\".\n", "df[AGGREGATION_POPULATION] = df[\n", " [f\"{BUCKET_SENSITIVE}\", f\"{BUCKET_SOCIOECONOMIC}\"]\n", "].mean(axis=1)\n", "\n", "# Multiply the \"Pollution Burden\" score and the \"Population Characteristics\" together to produce the cumulative impact score.\n", "df[\"Score C\"] = df[AGGREGATION_POLLUTION] * df[AGGREGATION_POPULATION]\n", "\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "f70106f5", "metadata": {}, "outputs": [], "source": [ "fields_to_use_in_score = [\n", " UNEMPLOYED_FIELD_NAME,\n", " LINGUISTIC_ISOLATION_FIELD_NAME,\n", " HOUSING_BURDEN_FIELD_NAME,\n", " POVERTY_FIELD_NAME,\n", " HIGH_SCHOOL_FIELD_NAME,\n", "]\n", "\n", "fields_min_max = [f\"{field}{MIN_MAX_FIELD_SUFFIX}\" for field in fields_to_use_in_score]\n", "fields_percentile = [\n", " f\"{field}{PERCENTILE_FIELD_SUFFIX}\" for field in fields_to_use_in_score\n", "]\n", "\n", "# Calculate \"Score D\", which uses min-max normalization\n", "# and calculate \"Score E\", which uses percentile normalization for the same fields\n", "df[\"Score D\"] = 
df[fields_min_max].mean(axis=1)\n", "df[\"Score E\"] = df[fields_percentile].mean(axis=1)\n", "\n", "print(df[\"Score D\"].describe())\n", "print(df[\"Score E\"].describe())" ] }, { "cell_type": "code", "execution_count": null, "id": "729aed12", "metadata": { "scrolled": true }, "outputs": [], "source": [ "# Create percentiles for the scores\n", "for score_field in [\"Score A\", \"Score B\", \"Score C\", \"Score D\", \"Score E\"]:\n", " df[f\"{score_field}{PERCENTILE_FIELD_SUFFIX}\"] = df[score_field].rank(pct=True)\n", " df[f\"{score_field} (top 25th percentile)\"] = (\n", " df[f\"{score_field}{PERCENTILE_FIELD_SUFFIX}\"] >= 0.75\n", " )\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "b3a65af4", "metadata": {}, "outputs": [], "source": [ "# write nationwide csv\n", "df.to_csv(SCORE_CSV_PATH / \"usa.csv\", index=False)" ] }, { "cell_type": "code", "execution_count": null, "id": "58ddd8b3", "metadata": {}, "outputs": [], "source": [ "# write per state csvs\n", "for states_fips in get_state_fips_codes(DATA_PATH):\n", " print(f\"Generating data{states_fips} csv\")\n", " df1 = df[df[GEOID_FIELD_NAME].str[:2] == states_fips]\n", " # we need to name the file data01.csv for ogr2ogr csv merge to work\n", " df1.to_csv(SCORE_CSV_PATH / f\"data{states_fips}.csv\", index=False)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.1" } }, "nbformat": 4, "nbformat_minor": 5 }