Issue 1141: Definition M (#1151)

This commit is contained in:
Lucas Merrill Brown 2022-01-18 14:56:55 -05:00 committed by GitHub
parent a07bf752b0
commit 18f299c5f8
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
21 changed files with 1000 additions and 143 deletions

View file

@ -256,57 +256,57 @@ const AreaDetail = ({properties}:IAreaDetailProps) => {
id: 'climate-change',
titleText: intl.formatMessage(EXPLORE_COPY.SIDE_PANEL_CATEGORY.CLIMATE),
indicators: [expAgLoss, expBldLoss, expPopLoss, lowInc],
isDisadvagtaged: properties[constants.IS_CLIMATE_FACTOR_DISADVANTAGED_L] ?
properties[constants.IS_CLIMATE_FACTOR_DISADVANTAGED_L] : null,
isDisadvagtaged: properties[constants.IS_CLIMATE_FACTOR_DISADVANTAGED_M] ?
properties[constants.IS_CLIMATE_FACTOR_DISADVANTAGED_M] : null,
},
{
id: 'clean-energy',
titleText: intl.formatMessage(EXPLORE_COPY.SIDE_PANEL_CATEGORY.CLEAN_ENERGY),
indicators: [energyBurden, pm25, lowInc],
isDisadvagtaged: properties[constants.IS_ENERGY_FACTOR_DISADVANTAGED_L] ?
properties[constants.IS_ENERGY_FACTOR_DISADVANTAGED_L] : null,
isDisadvagtaged: properties[constants.IS_ENERGY_FACTOR_DISADVANTAGED_M] ?
properties[constants.IS_ENERGY_FACTOR_DISADVANTAGED_M] : null,
},
{
id: 'clean-transport',
titleText: intl.formatMessage(EXPLORE_COPY.SIDE_PANEL_CATEGORY.CLEAN_TRANSPORT),
indicators: [dieselPartMatter, trafficVolume, lowInc],
isDisadvagtaged: properties[constants.IS_TRANSPORT_FACTOR_DISADVANTAGED_L] ?
properties[constants.IS_TRANSPORT_FACTOR_DISADVANTAGED_L] : null,
isDisadvagtaged: properties[constants.IS_TRANSPORT_FACTOR_DISADVANTAGED_M] ?
properties[constants.IS_TRANSPORT_FACTOR_DISADVANTAGED_M] : null,
},
{
id: 'sustain-house',
titleText: intl.formatMessage(EXPLORE_COPY.SIDE_PANEL_CATEGORY.SUSTAIN_HOUSE),
indicators: [houseBurden, leadPaint, lowInc],
isDisadvagtaged: properties[constants.IS_HOUSING_FACTOR_DISADVANTAGED_L] ?
properties[constants.IS_HOUSING_FACTOR_DISADVANTAGED_L] : null,
isDisadvagtaged: properties[constants.IS_HOUSING_FACTOR_DISADVANTAGED_M] ?
properties[constants.IS_HOUSING_FACTOR_DISADVANTAGED_M] : null,
},
{
id: 'leg-pollute',
titleText: intl.formatMessage(EXPLORE_COPY.SIDE_PANEL_CATEGORY.LEG_POLLUTE),
indicators: [proxHaz, proxNPL, proxRMP, lowInc],
isDisadvagtaged: properties[constants.IS_POLLUTION_FACTOR_DISADVANTAGED_L] ?
properties[constants.IS_POLLUTION_FACTOR_DISADVANTAGED_L] : null,
isDisadvagtaged: properties[constants.IS_POLLUTION_FACTOR_DISADVANTAGED_M] ?
properties[constants.IS_POLLUTION_FACTOR_DISADVANTAGED_M] : null,
},
{
id: 'clean-water',
titleText: intl.formatMessage(EXPLORE_COPY.SIDE_PANEL_CATEGORY.CLEAN_WATER),
indicators: [wasteWater, lowInc],
isDisadvagtaged: properties[constants.IS_WATER_FACTOR_DISADVANTAGED_L] ?
properties[constants.IS_WATER_FACTOR_DISADVANTAGED_L] : null,
isDisadvagtaged: properties[constants.IS_WATER_FACTOR_DISADVANTAGED_M] ?
properties[constants.IS_WATER_FACTOR_DISADVANTAGED_M] : null,
},
{
id: 'health-burdens',
titleText: intl.formatMessage(EXPLORE_COPY.SIDE_PANEL_CATEGORY.HEALTH_BURDEN),
indicators: [asthma, diabetes, heartDisease, lifeExpect, lowInc],
isDisadvagtaged: properties[constants.IS_HEALTH_FACTOR_DISADVANTAGED_L] ?
properties[constants.IS_HEALTH_FACTOR_DISADVANTAGED_L] : null,
isDisadvagtaged: properties[constants.IS_HEALTH_FACTOR_DISADVANTAGED_M] ?
properties[constants.IS_HEALTH_FACTOR_DISADVANTAGED_M] : null,
},
{
id: 'work-dev',
titleText: intl.formatMessage(EXPLORE_COPY.SIDE_PANEL_CATEGORY.WORK_DEV),
indicators: [lowMedInc, lingIso, unemploy, poverty, highSchool],
isDisadvagtaged: properties[constants.IS_WORKFORCE_FACTOR_DISADVANTAGED_L] ?
properties[constants.IS_WORKFORCE_FACTOR_DISADVANTAGED_L] : null,
isDisadvagtaged: properties[constants.IS_WORKFORCE_FACTOR_DISADVANTAGED_M] ?
properties[constants.IS_WORKFORCE_FACTOR_DISADVANTAGED_M] : null,
},
];

View file

@ -35,8 +35,8 @@ export const PERFORMANCE_MARKER_MAP_IDLE = 'MAP_IDLE';
export type J40Properties = { [key: string]: any };
// Properties
export const SCORE_PROPERTY_HIGH = 'SL_PFS';
export const SCORE_PROPERTY_LOW = 'L_SCORE';
export const SCORE_PROPERTY_HIGH = 'SM_PFS';
export const SCORE_PROPERTY_LOW = 'M_SCORE';
export const GEOID_PROPERTY = 'GEOID10';
// Indicator values:
@ -70,14 +70,14 @@ export const PROXIMITY_RMP_SITES_PERCENTILE = 'RMP_PFS';
export const PROXIMITY_TSDF_SITES_PERCENTILE = 'TSDF_PFS';
// Category booleans (disadvantaged or not):
export const IS_CLIMATE_FACTOR_DISADVANTAGED_L = 'L_CLT';
export const IS_ENERGY_FACTOR_DISADVANTAGED_L = 'L_ENY';
export const IS_TRANSPORT_FACTOR_DISADVANTAGED_L = 'L_TRN';
export const IS_HOUSING_FACTOR_DISADVANTAGED_L = 'L_HSG';
export const IS_POLLUTION_FACTOR_DISADVANTAGED_L = 'L_PLN';
export const IS_WATER_FACTOR_DISADVANTAGED_L = 'L_WTR';
export const IS_HEALTH_FACTOR_DISADVANTAGED_L = 'L_HLTH';
export const IS_WORKFORCE_FACTOR_DISADVANTAGED_L = 'L_WKFC';
export const IS_CLIMATE_FACTOR_DISADVANTAGED_M = 'M_CLT';
export const IS_ENERGY_FACTOR_DISADVANTAGED_M = 'M_ENY';
export const IS_TRANSPORT_FACTOR_DISADVANTAGED_M = 'M_TRN';
export const IS_HOUSING_FACTOR_DISADVANTAGED_M = 'M_HSG';
export const IS_POLLUTION_FACTOR_DISADVANTAGED_M = 'M_PLN';
export const IS_WATER_FACTOR_DISADVANTAGED_M = 'M_WTR';
export const IS_HEALTH_FACTOR_DISADVANTAGED_M = 'M_HLTH';
export const IS_WORKFORCE_FACTOR_DISADVANTAGED_M = 'M_WKFC';
// Total indicators values:
export const TOTAL_NUMBER_OF_DISADVANTAGE_INDICATORS = 'TC';

View file

@ -90,8 +90,8 @@ DATASET_LIST = [
"class_name": "HudRecapETL",
},
{
"name": "epa_rsei_aggregate",
"module_dir": "epa_rsei_aggregate",
"name": "epa_rsei",
"module_dir": "epa_rsei",
"class_name": "EPARiskScreeningEnvironmentalIndicatorsETL",
},
{

View file

@ -120,16 +120,16 @@ TILES_SCORE_COLUMNS = {
+ field_names.PERCENTILE_FIELD_SUFFIX: "UF_PFS",
field_names.WASTEWATER_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX: "WF_PFS",
field_names.L_WATER: "L_WTR",
field_names.L_WORKFORCE: "L_WKFC",
field_names.L_CLIMATE: "L_CLT",
field_names.L_ENERGY: "L_ENY",
field_names.L_TRANSPORTATION: "L_TRN",
field_names.L_HOUSING: "L_HSG",
field_names.L_POLLUTION: "L_PLN",
field_names.L_HEALTH: "L_HLTH",
field_names.SCORE_L_COMMUNITIES: "SL_C",
field_names.SCORE_L + field_names.PERCENTILE_FIELD_SUFFIX: "SL_PFS",
field_names.M_WATER: "M_WTR",
field_names.M_WORKFORCE: "M_WKFC",
field_names.M_CLIMATE: "M_CLT",
field_names.M_ENERGY: "M_ENY",
field_names.M_TRANSPORTATION: "M_TRN",
field_names.M_HOUSING: "M_HSG",
field_names.M_POLLUTION: "M_PLN",
field_names.M_HEALTH: "M_HLTH",
field_names.SCORE_M_COMMUNITIES: "SM_C",
field_names.SCORE_M + field_names.PERCENTILE_FIELD_SUFFIX: "SM_PFS",
field_names.EXPECTED_POPULATION_LOSS_RATE_LOW_INCOME_FIELD: "EPLRLI",
field_names.EXPECTED_AGRICULTURE_LOSS_RATE_LOW_INCOME_FIELD: "EALRLI",
field_names.EXPECTED_BUILDING_LOSS_RATE_LOW_INCOME_FIELD: "EBLRLI",
@ -151,8 +151,8 @@ TILES_SCORE_COLUMNS = {
field_names.POVERTY_LOW_HS_EDUCATION_FIELD: "PLHSE",
field_names.LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD: "LMILHSE",
field_names.UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD: "ULHSE",
field_names.LOW_HS_EDUCATION_FIELD: "LHE",
field_names.FPL_200_SERIES: "FPL200S",
field_names.LOW_HS_EDUCATION_LOW_COLLEGE_ATTENDANCE_FIELD: "LHE",
field_names.FPL_200_AND_COLLEGE_ATTENDANCE_SERIES: "FPL200S",
field_names.THRESHOLD_COUNT: "TC",
field_names.ISLAND_AREAS_UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD: "IAULHSE",
field_names.ISLAND_AREAS_POVERTY_LOW_HS_EDUCATION_FIELD: "ISPLHSE",
@ -191,10 +191,10 @@ TILES_SCORE_FLOAT_COLUMNS = [
field_names.TSDF_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.TRAFFIC_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.UNEMPLOYMENT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.LOW_HS_EDUCATION_FIELD,
field_names.LOW_HS_EDUCATION_LOW_COLLEGE_ATTENDANCE_FIELD,
field_names.ISLAND_AREAS_LOW_HS_EDUCATION_FIELD,
field_names.WASTEWATER_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.SCORE_L + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.SCORE_M + field_names.PERCENTILE_FIELD_SUFFIX,
]
# Finally we augment with the GEOID10, county, and state
@ -203,9 +203,9 @@ DOWNLOADABLE_SCORE_COLUMNS = [
field_names.COUNTY_FIELD,
field_names.STATE_FIELD,
field_names.THRESHOLD_COUNT,
field_names.SCORE_L_COMMUNITIES,
field_names.SCORE_M_COMMUNITIES,
field_names.TOTAL_POP_FIELD,
field_names.FPL_200_SERIES,
field_names.FPL_200_AND_COLLEGE_ATTENDANCE_SERIES,
field_names.EXPECTED_AGRICULTURE_LOSS_RATE_LOW_INCOME_FIELD,
field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX,

View file

@ -33,12 +33,12 @@ class GeoScoreETL(ExtractTransformLoad):
self.DATA_PATH / "census" / "geojson" / "us.json"
)
# Import the shortened name for Score L percentile ("SL_PFS") that's used on the
# Import the shortened name for Score M percentile ("SM_PFS") that's used on the
# tiles.
self.TARGET_SCORE_SHORT_FIELD = constants.TILES_SCORE_COLUMNS[
field_names.SCORE_L + field_names.PERCENTILE_FIELD_SUFFIX
field_names.SCORE_M + field_names.PERCENTILE_FIELD_SUFFIX
]
self.TARGET_SCORE_RENAME_TO = "L_SCORE"
self.TARGET_SCORE_RENAME_TO = "M_SCORE"
# Import the shortened name for tract ("GTF") that's used on the tiles.
self.TRACT_SHORT_FIELD = constants.TILES_SCORE_COLUMNS[

View file

@ -323,7 +323,7 @@ class PostScoreETL(ExtractTransformLoad):
# Rename score column
downloadable_df_copy = downloadable_df.rename(
columns={
field_names.SCORE_L_COMMUNITIES: "Identified as disadvantaged (v0.1)"
field_names.SCORE_M_COMMUNITIES: "Identified as disadvantaged (v0.1)"
},
inplace=False,
)

File diff suppressed because one or more lines are too long

View file

@ -42,7 +42,7 @@ class CDCSVIIndex(ExtractTransformLoad):
self.df: pd.DataFrame
def extract(self) -> None:
logger.info("Extracting 43 MB CDC SVI INDEX")
logger.info("Downloading 43 MB CDC SVI INDEX")
self.df = pd.read_csv(
filepath_or_buffer=self.CDC_SVI_INDEX_URL,
dtype={self.CDC_SVI_INDEX_TRACTS_FIPS_CODE: "string"},

View file

@ -22,9 +22,7 @@ class EPARiskScreeningEnvironmentalIndicatorsETL(ExtractTransformLoad):
def __init__(self):
self.AGGREGATED_RSEI_SCORE_FILE_URL = "http://abt-rsei.s3.amazonaws.com/microdata2019/census_agg/CensusMicroTracts2019_2019_aggregated.zip"
self.OUTPUT_PATH: Path = (
self.DATA_PATH / "dataset" / "epa_rsei_aggregated"
)
self.OUTPUT_PATH: Path = self.DATA_PATH / "dataset" / "epa_rsei"
self.EPA_RSEI_SCORE_THRESHOLD_CUTOFF = 0.75
self.TRACT_INPUT_COLUMN_NAME = "GEOID10"
self.NUMBER_FACILITIES_INPUT_FIELD = "NUMFACS"
@ -74,12 +72,12 @@ class EPARiskScreeningEnvironmentalIndicatorsETL(ExtractTransformLoad):
unzip_file_from_url(
file_url=self.AGGREGATED_RSEI_SCORE_FILE_URL,
download_path=self.TMP_PATH,
unzipped_file_path=self.TMP_PATH / "epa_rsei_aggregated",
unzipped_file_path=self.TMP_PATH / "epa_rsei",
)
self.df = pd.read_csv(
filepath_or_buffer=self.TMP_PATH
/ "epa_rsei_aggregated"
/ "epa_rsei"
/ "CensusMicroTracts2019_2019_aggregated.csv",
# The following need to remain as strings for all of their digits, not get
# converted to numbers.

View file

@ -33,7 +33,7 @@ class MarylandEJScreenETL(ExtractTransformLoad):
self.df: pd.DataFrame
def extract(self) -> None:
logger.info("Downloading Maryland EJSCREEN Data")
logger.info("Downloading 207MB Maryland EJSCREEN Data")
super().extract(
self.MARYLAND_EJSCREEN_URL,
self.TMP_PATH,

View file

@ -21,6 +21,7 @@
"import requests\n",
"import string\n",
"import sys\n",
"import time\n",
"import typing\n",
"import us\n",
"import zipfile\n",
@ -61,7 +62,10 @@
"# Set some global parameters\n",
"DATA_DIR = pathlib.Path.cwd().parent / \"data\"\n",
"TEMP_DATA_DIR = DATA_DIR / \"tmp\"\n",
"COMPARISON_OUTPUTS_DIR = DATA_DIR / \"comparison_outputs\"\n",
"\n",
"time_str = time.strftime(\"%Y%m%d-%H%M%S\")\n",
"\n",
"COMPARISON_OUTPUTS_DIR = DATA_DIR / \"comparison_outputs\" / time_str\n",
"\n",
"# Make the dirs if they don't exist\n",
"TEMP_DATA_DIR.mkdir(parents=True, exist_ok=True)\n",
@ -109,7 +113,9 @@
"cell_type": "code",
"execution_count": null,
"id": "a251a0fb",
"metadata": {},
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"# Load EJSCREEN Areas of Concern data.\n",
@ -143,7 +149,9 @@
"cell_type": "code",
"execution_count": null,
"id": "e43a9e23",
"metadata": {},
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"# Merge EJSCREEN AoCs into CEJST data.\n",
@ -173,10 +181,13 @@
"source": [
"# Analyze one field at a time (useful for setting thresholds)\n",
"\n",
"quantile = 0.9\n",
"quantile = 0.95\n",
"\n",
"for field in [\n",
" field_names.MEDIAN_HOUSE_VALUE_FIELD,\n",
" field_names.COLLEGE_ATTENDANCE_FIELD,\n",
" field_names.HIGH_SCHOOL_ED_FIELD,\n",
" field_names.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD,\n",
" field_names.POVERTY_LESS_THAN_200_FPL_FIELD,\n",
"]:\n",
" print(f\"\\n~~~~Analysis for field `{field}`~~~~\")\n",
" print(cejst_df[field].describe())\n",
@ -223,7 +234,9 @@
"cell_type": "code",
"execution_count": null,
"id": "d8ec43dc",
"metadata": {},
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"# Load persistent poverty data\n",
@ -256,7 +269,9 @@
"cell_type": "code",
"execution_count": null,
"id": "81826d29",
"metadata": {},
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"# Load mapping inequality data\n",
@ -314,7 +329,9 @@
"cell_type": "code",
"execution_count": null,
"id": "605af1ff",
"metadata": {},
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"# Load alternative energy-related definition\n",
@ -333,7 +350,9 @@
"cell_type": "code",
"execution_count": null,
"id": "fe4a2939",
"metadata": {},
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"# Load Michigan EJSCREEN\n",
@ -356,15 +375,13 @@
"outputs": [],
"source": [
"# Load EPA RSEI EJSCREEN\n",
"epa_rsei_aggregate_data_path = (\n",
" DATA_DIR / \"dataset\" / \"epa_rsei_aggregated\" / \"usa.csv\"\n",
")\n",
"epa_rsei_aggregate_df = pd.read_csv(\n",
" epa_rsei_aggregate_data_path,\n",
"epa_rsei_data_path = DATA_DIR / \"dataset\" / \"epa_rsei\" / \"usa.csv\"\n",
"epa_rsei_df = pd.read_csv(\n",
" epa_rsei_data_path,\n",
" dtype={ExtractTransformLoad.GEOID_TRACT_FIELD_NAME: \"string\"},\n",
")\n",
"\n",
"epa_rsei_aggregate_df.head()"
"epa_rsei_df.head()"
]
},
{
@ -382,7 +399,7 @@
" calenviroscreen_df,\n",
" persistent_poverty_df,\n",
" mapping_inequality_df,\n",
" epa_rsei_aggregate_df,\n",
" epa_rsei_df,\n",
" maryland_ejscreen_df,\n",
" energy_definition_alternative_draft_df,\n",
" michigan_ejscreen_df,\n",
@ -416,7 +433,9 @@
"cell_type": "code",
"execution_count": null,
"id": "2de78f71",
"metadata": {},
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"# Special handling for HOLC.\n",
@ -461,13 +480,41 @@
" field_names.L_NON_WORKFORCE,\n",
"]\n",
"\n",
"definition_m_factors = [\n",
" field_names.M_CLIMATE,\n",
" field_names.M_ENERGY,\n",
" field_names.M_TRANSPORTATION,\n",
" field_names.M_HOUSING,\n",
" field_names.M_POLLUTION,\n",
" field_names.M_WATER,\n",
" field_names.M_HEALTH,\n",
" field_names.M_WORKFORCE,\n",
" # Also include a combined factor for all the non-workforce elements.\n",
" field_names.M_NON_WORKFORCE,\n",
"]\n",
"\n",
"census_tract_indices = (\n",
" [\n",
" Index(\n",
" method_name=\"Definition M\",\n",
" priority_communities_field=field_names.SCORE_M_COMMUNITIES,\n",
" ),\n",
" ]\n",
" + [\n",
" Index(\n",
" method_name=\"Definition L\",\n",
" priority_communities_field=field_names.SCORE_L_COMMUNITIES,\n",
" ),\n",
" ]\n",
" # Insert indices for each of the factors from Definition M.\n",
" # Note: since these involve no renaming, we write them using list comprehension.\n",
" + [\n",
" Index(\n",
" method_name=factor,\n",
" priority_communities_field=factor,\n",
" )\n",
" for factor in definition_m_factors\n",
" ]\n",
" # Insert indices for each of the factors from Definition L.\n",
" # Note: since these involve no renaming, we write them using list comprehension.\n",
" + [\n",
@ -575,6 +622,7 @@
"comparison_fields = [\n",
" field_names.POVERTY_LESS_THAN_100_FPL_FIELD,\n",
" field_names.POVERTY_LESS_THAN_200_FPL_FIELD,\n",
" field_names.COLLEGE_ATTENDANCE_FIELD,\n",
" field_names.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD,\n",
" field_names.LINGUISTIC_ISO_FIELD,\n",
" field_names.UNEMPLOYMENT_FIELD,\n",
@ -584,6 +632,8 @@
" field_names.LIFE_EXPECTANCY_FIELD,\n",
" field_names.HEALTH_INSURANCE_FIELD,\n",
" field_names.PHYS_HEALTH_NOT_GOOD_FIELD,\n",
" field_names.DIABETES_FIELD,\n",
" field_names.LOW_READING_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,\n",
"]"
]
},
@ -874,7 +924,9 @@
"cell_type": "code",
"execution_count": null,
"id": "2bcbcabf",
"metadata": {},
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"directory = COMPARISON_OUTPUTS_DIR / \"tracts_basic_stats\"\n",
@ -1001,24 +1053,28 @@
" E.g., it might show that tracts prioritized by A but not B have a higher average income,\n",
" or that tracts prioritized by B but not A have a lower percent of unemployed people.\n",
" \"\"\"\n",
" df_subset = df[\n",
" [\n",
" method_a_priority_census_tracts_field,\n",
" method_b_priority_census_tracts_field,\n",
" ]\n",
" + comparison_fields\n",
" fields_to_group_by = [\n",
" method_a_priority_census_tracts_field,\n",
" method_b_priority_census_tracts_field,\n",
" ]\n",
"\n",
" df_subset = df[fields_to_group_by + comparison_fields]\n",
"\n",
" grouped_df = df_subset.groupby(\n",
" [\n",
" method_a_priority_census_tracts_field,\n",
" method_b_priority_census_tracts_field,\n",
" ],\n",
" fields_to_group_by,\n",
" dropna=False,\n",
" )\n",
"\n",
" # Run the comparison function on the groups.\n",
" comparison_df = grouped_df.mean().reset_index()\n",
" # Take the mean of all fields.\n",
" comparison_df = grouped_df.mean()\n",
"\n",
" # Also add in the count of census tracts.\n",
" count_field_name = \"Count of census tracts\"\n",
" comparison_df[count_field_name] = grouped_df.size().to_frame(\n",
" count_field_name\n",
" )\n",
"\n",
" comparison_df = comparison_df.reset_index()\n",
"\n",
" criteria_description_field_name = \"Description of criteria\"\n",
" comparison_df[criteria_description_field_name] = comparison_df.apply(\n",
@ -1030,10 +1086,13 @@
" )\n",
"\n",
" # Put criteria description column first.\n",
" new_column_order = [criteria_description_field_name] + [\n",
" col\n",
" for col in comparison_df.columns\n",
" if col != criteria_description_field_name\n",
" columns_to_put_first = (\n",
" [criteria_description_field_name]\n",
" + fields_to_group_by\n",
" + [count_field_name]\n",
" )\n",
" new_column_order = columns_to_put_first + [\n",
" col for col in comparison_df.columns if col not in columns_to_put_first\n",
" ]\n",
"\n",
" comparison_df = comparison_df[new_column_order]\n",
@ -1356,7 +1415,9 @@
"cell_type": "code",
"execution_count": null,
"id": "7d095ebd",
"metadata": {},
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"# Note: this is helpful because this file is long-running, so it alerts the user when the\n",
@ -1369,7 +1430,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@ -1383,7 +1444,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.2"
"version": "3.9.6"
}
},
"nbformat": 4,

View file

@ -28,6 +28,8 @@ SCORE_I = "Score I"
SCORE_I_COMMUNITIES = "Score I (communities)"
SCORE_K = "NMTC (communities)"
SCORE_K_COMMUNITIES = "Score K (communities)"
# Definition L fields
SCORE_L = "Definition L"
SCORE_L_COMMUNITIES = "Definition L (communities)"
L_CLIMATE = "Climate Factor (Definition L)"
@ -39,6 +41,20 @@ L_WATER = "Water Factor (Definition L)"
L_HEALTH = "Health Factor (Definition L)"
L_WORKFORCE = "Workforce Factor (Definition L)"
L_NON_WORKFORCE = "Any Non-Workforce Factor (Definition L)"
# Definition M fields
SCORE_M = "Definition M"
SCORE_M_COMMUNITIES = "Definition M (communities)"
M_CLIMATE = "Climate Factor (Definition M)"
M_ENERGY = "Energy Factor (Definition M)"
M_TRANSPORTATION = "Transportation Factor (Definition M)"
M_HOUSING = "Housing Factor (Definition M)"
M_POLLUTION = "Pollution Factor (Definition M)"
M_WATER = "Water Factor (Definition M)"
M_HEALTH = "Health Factor (Definition M)"
M_WORKFORCE = "Workforce Factor (Definition M)"
M_NON_WORKFORCE = "Any Non-Workforce Factor (Definition M)"
PERCENTILE = 90
MEDIAN_HOUSE_VALUE_PERCENTILE = 90
@ -297,6 +313,8 @@ TRANSPORTATION_COSTS = "Transportation Costs"
#####
# Names for individual factors being exceeded
# TODO: for Definition M, create new output field names (different than those used by
# Definition L) and change all output fields to say low income and low college
# Climate Change
EXPECTED_POPULATION_LOSS_RATE_LOW_INCOME_FIELD = (
f"Greater than or equal to the {PERCENTILE}th percentile"
@ -352,6 +370,8 @@ LOW_LIFE_EXPECTANCY_LOW_INCOME_FIELD = (
)
# Workforce
# TODO: for Definition M, create new output field names (different than those used by
# Definition L) and change all output fields to say low HS and low college
UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD = (
f"Greater than or equal to the {PERCENTILE}th percentile for unemployment"
" and has low HS education"
@ -373,6 +393,9 @@ LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD = (
)
LOW_HS_EDUCATION_FIELD = "Low high school education"
LOW_HS_EDUCATION_LOW_COLLEGE_ATTENDANCE_FIELD = (
"Low high school education and low college attendance"
)
# Workforce for island areas
ISLAND_AREAS_SUFFIX = " in 2009 (island areas)"
@ -420,5 +443,8 @@ LOW_READING_LOW_HS_EDUCATION_FIELD = (
THRESHOLD_COUNT = "Total threshold criteria exceeded"
FPL_200_SERIES = "Is low income?"
FPL_200_AND_COLLEGE_ATTENDANCE_SERIES = (
"Is low income and low college attendance?"
)
# End of names for individual factors being exceeded
####

View file

@ -120,54 +120,6 @@ class ScoreL(Score):
axis=1, skipna=True
)
def add_columns(self) -> pd.DataFrame:
logger.info("Adding Score L")
self.df[field_names.THRESHOLD_COUNT] = 0
self.df[field_names.FPL_200_SERIES] = self._create_low_income_threshold(
self.df
)
self.df[field_names.L_CLIMATE] = self._climate_factor()
self.df[field_names.L_ENERGY] = self._energy_factor()
self.df[field_names.L_TRANSPORTATION] = self._transportation_factor()
self.df[field_names.L_HOUSING] = self._housing_factor()
self.df[field_names.L_POLLUTION] = self._pollution_factor()
self.df[field_names.L_WATER] = self._water_factor()
self.df[field_names.L_HEALTH] = self._health_factor()
self.df[field_names.L_WORKFORCE] = self._workforce_factor()
factors = [
field_names.L_CLIMATE,
field_names.L_ENERGY,
field_names.L_TRANSPORTATION,
field_names.L_HOUSING,
field_names.L_POLLUTION,
field_names.L_WATER,
field_names.L_HEALTH,
field_names.L_WORKFORCE,
]
self.df[field_names.SCORE_L_COMMUNITIES] = self.df[factors].any(axis=1)
# Note: this is purely used for comparison tool analysis, and can be removed at a later date. - LMB.
non_workforce_factors = [
field_names.L_CLIMATE,
field_names.L_ENERGY,
field_names.L_TRANSPORTATION,
field_names.L_HOUSING,
field_names.L_POLLUTION,
field_names.L_WATER,
field_names.L_HEALTH,
]
self.df[field_names.L_NON_WORKFORCE] = self.df[
non_workforce_factors
].any(axis=1)
self.df[
field_names.SCORE_L + field_names.PERCENTILE_FIELD_SUFFIX
] = self.df[field_names.SCORE_L_COMMUNITIES].astype(int)
return self.df
def _climate_factor(self) -> bool:
# In Xth percentile or above for FEMA's Risk Index (Source: FEMA
# AND
@ -689,3 +641,51 @@ class ScoreL(Score):
workforce_combined_criteria_for_states
| workforce_combined_criteria_for_island_areas
)
def add_columns(self) -> pd.DataFrame:
logger.info("Adding Score L")
self.df[field_names.THRESHOLD_COUNT] = 0
self.df[field_names.FPL_200_SERIES] = self._create_low_income_threshold(
self.df
)
self.df[field_names.L_CLIMATE] = self._climate_factor()
self.df[field_names.L_ENERGY] = self._energy_factor()
self.df[field_names.L_TRANSPORTATION] = self._transportation_factor()
self.df[field_names.L_HOUSING] = self._housing_factor()
self.df[field_names.L_POLLUTION] = self._pollution_factor()
self.df[field_names.L_WATER] = self._water_factor()
self.df[field_names.L_HEALTH] = self._health_factor()
self.df[field_names.L_WORKFORCE] = self._workforce_factor()
factors = [
field_names.L_CLIMATE,
field_names.L_ENERGY,
field_names.L_TRANSPORTATION,
field_names.L_HOUSING,
field_names.L_POLLUTION,
field_names.L_WATER,
field_names.L_HEALTH,
field_names.L_WORKFORCE,
]
self.df[field_names.SCORE_L_COMMUNITIES] = self.df[factors].any(axis=1)
# Note: this is purely used for comparison tool analysis, and can be removed at a later date. - LMB.
non_workforce_factors = [
field_names.L_CLIMATE,
field_names.L_ENERGY,
field_names.L_TRANSPORTATION,
field_names.L_HOUSING,
field_names.L_POLLUTION,
field_names.L_WATER,
field_names.L_HEALTH,
]
self.df[field_names.L_NON_WORKFORCE] = self.df[
non_workforce_factors
].any(axis=1)
self.df[
field_names.SCORE_L + field_names.PERCENTILE_FIELD_SUFFIX
] = self.df[field_names.SCORE_L_COMMUNITIES].astype(int)
return self.df

View file

@ -0,0 +1,770 @@
import numpy as np
import pandas as pd
from data_pipeline.score.score import Score
import data_pipeline.score.field_names as field_names
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
class ScoreM(Score):
"""Very similar to Score L, with a few minor modifications."""
def __init__(self, df: pd.DataFrame) -> None:
self.LOW_INCOME_THRESHOLD: float = 0.65
self.MAX_COLLEGE_ATTENDANCE_THRESHOLD: float = 0.20
self.ENVIRONMENTAL_BURDEN_THRESHOLD: float = 0.90
self.MEDIAN_HOUSE_VALUE_THRESHOLD: float = 0.90
self.LACK_OF_HIGH_SCHOOL_MINIMUM_THRESHOLD: float = 0.10
super().__init__(df)
def _combine_island_areas_with_states_and_set_thresholds(
self,
df: pd.DataFrame,
column_from_island_areas: str,
column_from_decennial_census: str,
combined_column_name: str,
threshold_cutoff_for_island_areas: float,
) -> (pd.DataFrame, str):
"""Steps to set thresholds for island areas.
This function is fairly logically complicated. It takes the following steps:
1. Combine the two different fields into a single field.
2. Calculate the 90th percentile cutoff raw value for the combined field.
3. Create a boolean series that is true for any census tract in the island
areas (and only the island areas) that exceeds this cutoff.
For step one, it combines data that is either the island area's Decennial Census
value in 2009 or the state's value in 5-year ACS ending in 2010.
This will be used to generate the percentile cutoff for the 90th percentile.
The stateside decennial census stopped asking economic comparisons,
so this is as close to apples-to-apples as we get. We use 5-year ACS for data
robustness over 1-year ACS.
"""
# Create the combined field.
# TODO: move this combined field percentile calculation to `etl_score`,
# since most other percentile logic is there.
# There should only be one entry in either 2009 or 2019 fields, not one in both.
# But just to be safe, we take the mean and ignore null values so if there
# *were* entries in both, this result would make sense.
df[combined_column_name] = df[
[column_from_island_areas, column_from_decennial_census]
].mean(axis=1, skipna=True)
logger.info(
f"Combined field `{combined_column_name}` has "
f"{df[combined_column_name].isnull().sum()} "
f"({df[combined_column_name].isnull().sum() * 100 / len(df):.2f}%) "
f"missing values for census tracts. "
)
# Calculate the percentile threshold raw value.
raw_threshold = np.nanquantile(
a=df[combined_column_name], q=threshold_cutoff_for_island_areas
)
logger.info(
f"For combined field `{combined_column_name}`, "
f"the {threshold_cutoff_for_island_areas*100:.0f} percentile cutoff is a "
f"raw value of {raw_threshold:.3f}."
)
threshold_column_name = (
f"{column_from_island_areas} exceeds "
f"{threshold_cutoff_for_island_areas*100:.0f}th percentile"
)
df[threshold_column_name] = (
df[column_from_island_areas] >= raw_threshold
)
percent_of_tracts_highlighted = (
100
* df[threshold_column_name].sum()
/ df[column_from_island_areas].notnull().sum()
)
logger.info(
f"For `{threshold_column_name}`, "
f"{df[threshold_column_name].sum()} ("
f"{percent_of_tracts_highlighted:.2f}% of tracts that have non-null data "
f"in the column) have a value of TRUE."
)
return df, threshold_column_name
def _create_low_income_and_low_college_attendance_threshold(
self, df: pd.DataFrame
) -> pd.Series:
"""
Returns a pandas series (really a numpy array)
of booleans based on the condition of the FPL at 200%
is at or more than some established threshold
"""
return (
(
df[
field_names.POVERTY_LESS_THAN_200_FPL_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX
]
>= self.LOW_INCOME_THRESHOLD
)
) & (
(
df[field_names.COLLEGE_ATTENDANCE_FIELD]
<= self.MAX_COLLEGE_ATTENDANCE_THRESHOLD
)
| (
# If college attendance data is null for this tract, just rely on the
# poverty data
df[field_names.COLLEGE_ATTENDANCE_FIELD].isna()
)
)
def _increment_total_eligibility_exceeded(
self, columns_for_subset: list
) -> None:
"""
Increments the total eligible factors for a given tract
"""
self.df[field_names.THRESHOLD_COUNT] += self.df[columns_for_subset].sum(
axis=1, skipna=True
)
def _climate_factor(self) -> bool:
# In Xth percentile or above for FEMA's Risk Index (Source: FEMA
# AND
# Low income: In Nth percentile or above for percent of block group population
# of households where household income is less than or equal to twice the federal
# poverty level. Source: Census's American Community Survey]
climate_eligibility_columns = [
field_names.EXPECTED_POPULATION_LOSS_RATE_LOW_INCOME_FIELD,
field_names.EXPECTED_AGRICULTURE_LOSS_RATE_LOW_INCOME_FIELD,
field_names.EXPECTED_BUILDING_LOSS_RATE_LOW_INCOME_FIELD,
# field_names.EXTREME_HEAT_MEDIAN_HOUSE_VALUE_LOW_INCOME_FIELD,
]
expected_population_loss_threshold = (
self.df[
field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX
]
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
)
expected_agriculture_loss_threshold = (
self.df[
field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX
]
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
)
expected_building_loss_threshold = (
self.df[
field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX
]
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
)
extreme_heat_and_median_house_value_threshold = (
self.df[
field_names.EXTREME_HEAT_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX
]
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
) & (
self.df[
field_names.MEDIAN_HOUSE_VALUE_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX
]
<= self.MEDIAN_HOUSE_VALUE_THRESHOLD
)
self.df[field_names.EXPECTED_POPULATION_LOSS_RATE_LOW_INCOME_FIELD] = (
expected_population_loss_threshold
& self.df[field_names.FPL_200_AND_COLLEGE_ATTENDANCE_SERIES]
)
self.df[field_names.EXPECTED_AGRICULTURE_LOSS_RATE_LOW_INCOME_FIELD] = (
expected_agriculture_loss_threshold
& self.df[field_names.FPL_200_AND_COLLEGE_ATTENDANCE_SERIES]
)
self.df[field_names.EXPECTED_BUILDING_LOSS_RATE_LOW_INCOME_FIELD] = (
expected_building_loss_threshold
& self.df[field_names.FPL_200_AND_COLLEGE_ATTENDANCE_SERIES]
)
self.df[
field_names.EXTREME_HEAT_MEDIAN_HOUSE_VALUE_LOW_INCOME_FIELD
] = (
extreme_heat_and_median_house_value_threshold
& self.df[field_names.FPL_200_AND_COLLEGE_ATTENDANCE_SERIES]
)
self._increment_total_eligibility_exceeded(climate_eligibility_columns)
return self.df[climate_eligibility_columns].any(axis="columns")
def _energy_factor(self) -> bool:
# In Xth percentile or above for DOE's energy cost burden score (Source: LEAD Score)
# AND
# Low income: In Nth percentile or above for percent of block group population
# of households where household income is less than or equal to twice the federal
# poverty level. Source: Census's American Community Survey]
energy_eligibility_columns = [
field_names.PM25_EXPOSURE_LOW_INCOME_FIELD,
field_names.ENERGY_BURDEN_LOW_INCOME_FIELD,
]
energy_burden_threshold = (
self.df[
field_names.ENERGY_BURDEN_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX
]
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
)
pm25_threshold = (
self.df[
field_names.PM25_FIELD + field_names.PERCENTILE_FIELD_SUFFIX
]
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
)
self.df[field_names.PM25_EXPOSURE_LOW_INCOME_FIELD] = (
pm25_threshold
& self.df[field_names.FPL_200_AND_COLLEGE_ATTENDANCE_SERIES]
)
self.df[field_names.ENERGY_BURDEN_LOW_INCOME_FIELD] = (
energy_burden_threshold
& self.df[field_names.FPL_200_AND_COLLEGE_ATTENDANCE_SERIES]
)
self._increment_total_eligibility_exceeded(energy_eligibility_columns)
return self.df[energy_eligibility_columns].any(axis="columns")
def _transportation_factor(self) -> bool:
# In Xth percentile or above for diesel particulate matter (Source: EPA National Air Toxics Assessment (NATA)
# or
# In Xth percentile or above for PM 2.5 (Source: EPA, Office of Air and Radiation (OAR) fusion of model and monitor data)]
# or
# In Xth percentile or above traffic proximity and volume (Source: 2017 U.S. Department of Transportation (DOT) traffic data
# AND
# Low income: In Nth percentile or above for percent of block group population
# of households where household income is less than or equal to twice the federal
# poverty level. Source: Census's American Community Survey]
transportion_eligibility_columns = [
field_names.DIESEL_PARTICULATE_MATTER_LOW_INCOME_FIELD,
field_names.TRAFFIC_PROXIMITY_LOW_INCOME_FIELD,
]
diesel_threshold = (
self.df[
field_names.DIESEL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX
]
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
)
traffic_threshold = (
self.df[
field_names.TRAFFIC_FIELD + field_names.PERCENTILE_FIELD_SUFFIX
]
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
)
self.df[field_names.DIESEL_PARTICULATE_MATTER_LOW_INCOME_FIELD] = (
diesel_threshold
& self.df[field_names.FPL_200_AND_COLLEGE_ATTENDANCE_SERIES]
)
self.df[field_names.TRAFFIC_PROXIMITY_LOW_INCOME_FIELD] = (
traffic_threshold
& self.df[field_names.FPL_200_AND_COLLEGE_ATTENDANCE_SERIES]
)
self._increment_total_eligibility_exceeded(
transportion_eligibility_columns
)
return self.df[transportion_eligibility_columns].any(axis="columns")
def _housing_factor(self) -> bool:
# (
# In Xth percentile or above for lead paint (Source: Census's American Community Surveys
# percent of housing units built pre-1960, used as an indicator of potential lead paint exposure in homes)
# AND
# In Yth percentile or below for Median House Value (Source: Census's American Community Survey)
# )
# or
# In Xth percentile or above for housing cost burden (Source: HUD's Comprehensive Housing Affordability Strategy dataset
# AND
# Low income: In Nth percentile or above for percent of block group population
# of households where household income is less than or equal to twice the federal
# poverty level. Source: Census's American Community Survey]
housing_eligibility_columns = [
field_names.LEAD_PAINT_MEDIAN_HOUSE_VALUE_LOW_INCOME_FIELD,
field_names.HOUSING_BURDEN_LOW_INCOME_FIELD,
]
lead_paint_median_home_value_threshold = (
self.df[
field_names.LEAD_PAINT_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX
]
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
) & (
self.df[
field_names.MEDIAN_HOUSE_VALUE_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX
]
<= self.MEDIAN_HOUSE_VALUE_THRESHOLD
)
housing_burden_threshold = (
self.df[
field_names.HOUSING_BURDEN_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX
]
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
)
# series by series indicators
self.df[field_names.LEAD_PAINT_MEDIAN_HOUSE_VALUE_LOW_INCOME_FIELD] = (
lead_paint_median_home_value_threshold
& self.df[field_names.FPL_200_AND_COLLEGE_ATTENDANCE_SERIES]
)
self.df[field_names.HOUSING_BURDEN_LOW_INCOME_FIELD] = (
housing_burden_threshold
& self.df[field_names.FPL_200_AND_COLLEGE_ATTENDANCE_SERIES]
)
self._increment_total_eligibility_exceeded(housing_eligibility_columns)
return self.df[housing_eligibility_columns].any(axis="columns")
def _pollution_factor(self) -> bool:
# Proximity to Risk Management Plan sites is > X
# AND
# Low income: In Nth percentile or above for percent of block group population
# of households where household income is less than or equal to twice the federal
# poverty level. Source: Census's American Community Survey]
pollution_eligibility_columns = [
field_names.RMP_LOW_INCOME_FIELD,
field_names.SUPERFUND_LOW_INCOME_FIELD,
field_names.HAZARDOUS_WASTE_LOW_INCOME_FIELD,
]
rmp_sites_threshold = (
self.df[field_names.RMP_FIELD + field_names.PERCENTILE_FIELD_SUFFIX]
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
)
npl_sites_threshold = (
self.df[field_names.NPL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX]
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
)
tsdf_sites_threshold = (
self.df[
field_names.TSDF_FIELD + field_names.PERCENTILE_FIELD_SUFFIX
]
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
)
# individual series-by-series
self.df[field_names.RMP_LOW_INCOME_FIELD] = (
rmp_sites_threshold
& self.df[field_names.FPL_200_AND_COLLEGE_ATTENDANCE_SERIES]
)
self.df[field_names.SUPERFUND_LOW_INCOME_FIELD] = (
npl_sites_threshold
& self.df[field_names.FPL_200_AND_COLLEGE_ATTENDANCE_SERIES]
)
self.df[field_names.HAZARDOUS_WASTE_LOW_INCOME_FIELD] = (
tsdf_sites_threshold
& self.df[field_names.FPL_200_AND_COLLEGE_ATTENDANCE_SERIES]
)
self._increment_total_eligibility_exceeded(
pollution_eligibility_columns
)
return self.df[pollution_eligibility_columns].any(axis="columns")
def _water_factor(self) -> bool:
# In Xth percentile or above for wastewater discharge (Source: EPA Risk-Screening Environmental Indicators (RSEI) Model)
# AND
# Low income: In Nth percentile or above for percent of block group population
# of households where household income is less than or equal to twice the federal
# poverty level. Source: Census's American Community Survey]
wastewater_threshold = (
self.df[
field_names.WASTEWATER_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX
]
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
)
self.df[field_names.WASTEWATER_DISCHARGE_LOW_INCOME_FIELD] = (
wastewater_threshold
& self.df[field_names.FPL_200_AND_COLLEGE_ATTENDANCE_SERIES]
)
self._increment_total_eligibility_exceeded(
[field_names.WASTEWATER_DISCHARGE_LOW_INCOME_FIELD]
)
return self.df[field_names.WASTEWATER_DISCHARGE_LOW_INCOME_FIELD]
def _health_factor(self) -> bool:
# In Xth percentile or above for diabetes (Source: CDC Places)
# or
# In Xth percentile or above for asthma (Source: CDC Places)
# or
# In Xth percentile or above for heart disease
# or
# In Xth percentile or above for low life expectancy (Source: CDC Places)
# AND
# Low income: In Nth percentile or above for percent of block group population
# of households where household income is less than or equal to twice the federal
# poverty level. Source: Census's American Community Survey]
health_eligibility_columns = [
field_names.DIABETES_LOW_INCOME_FIELD,
field_names.ASTHMA_LOW_INCOME_FIELD,
field_names.HEART_DISEASE_LOW_INCOME_FIELD,
field_names.LOW_LIFE_EXPECTANCY_LOW_INCOME_FIELD,
# field_names.HEALTHY_FOOD_LOW_INCOME_FIELD,
]
diabetes_threshold = (
self.df[
field_names.DIABETES_FIELD + field_names.PERCENTILE_FIELD_SUFFIX
]
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
)
asthma_threshold = (
self.df[
field_names.ASTHMA_FIELD + field_names.PERCENTILE_FIELD_SUFFIX
]
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
)
heart_disease_threshold = (
self.df[
field_names.HEART_DISEASE_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX
]
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
)
low_life_expectancy_threshold = (
self.df[
field_names.LOW_LIFE_EXPECTANCY_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX
]
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
)
healthy_food_threshold = (
self.df[
field_names.HEALTHY_FOOD_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX
]
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
)
self.df[field_names.DIABETES_LOW_INCOME_FIELD] = (
diabetes_threshold
& self.df[field_names.FPL_200_AND_COLLEGE_ATTENDANCE_SERIES]
)
self.df[field_names.ASTHMA_LOW_INCOME_FIELD] = (
asthma_threshold
& self.df[field_names.FPL_200_AND_COLLEGE_ATTENDANCE_SERIES]
)
self.df[field_names.HEART_DISEASE_LOW_INCOME_FIELD] = (
heart_disease_threshold
& self.df[field_names.FPL_200_AND_COLLEGE_ATTENDANCE_SERIES]
)
self.df[field_names.LOW_LIFE_EXPECTANCY_LOW_INCOME_FIELD] = (
low_life_expectancy_threshold
& self.df[field_names.FPL_200_AND_COLLEGE_ATTENDANCE_SERIES]
)
self.df[field_names.HEALTHY_FOOD_LOW_INCOME_FIELD] = (
healthy_food_threshold
& self.df[field_names.FPL_200_AND_COLLEGE_ATTENDANCE_SERIES]
)
self._increment_total_eligibility_exceeded(health_eligibility_columns)
return self.df[health_eligibility_columns].any(axis="columns")
    def _workforce_factor(self) -> pd.Series:
        """Calculate the workforce factor for Definition M.

        States path: a tract is flagged when unemployment, low median
        income as a percent of AMI, poverty (<=100% FPL), or linguistic
        isolation is at or above the Xth percentile, AND high school
        degree achievement for adults 25+ is below Y% with low (or
        missing) college attendance — the education screen filters out
        university tracts.

        Island-areas path: the same idea, but using decennial census
        fields combined with state fields by a shared helper, and without
        the linguistic isolation indicator.

        Side effects: writes many boolean indicator columns to ``self.df``
        and increments the per-tract threshold-exceeded counter for both
        the states and island-areas column sets.

        Returns:
            pd.Series: boolean, True where the tract meets either the
            states criteria or the island-areas criteria.
        """
        # Where unemployment is above Xth percentile
        # or
        # Where median income as a percent of area median income is above Xth percentile
        # or
        # Where the percent of households at or below 100% of the federal poverty level
        # is above Xth percentile
        # or
        # Where linguistic isolation is above Xth percentile
        # AND
        # Where the high school degree achievement rates for adults 25 years and older
        # is less than Y%
        # (necessary to screen out university tracts)

        # Workforce criteria for states fields.
        workforce_eligibility_columns = [
            field_names.UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD,
            field_names.POVERTY_LOW_HS_EDUCATION_FIELD,
            field_names.LINGUISTIC_ISOLATION_LOW_HS_EDUCATION_FIELD,
            field_names.LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD,
        ]

        # Education screen: low HS attainment AND low (or unknown) college
        # attendance.
        self.df[field_names.LOW_HS_EDUCATION_LOW_COLLEGE_ATTENDANCE_FIELD] = (
            self.df[field_names.HIGH_SCHOOL_ED_FIELD]
            >= self.LACK_OF_HIGH_SCHOOL_MINIMUM_THRESHOLD
        ) & (
            (
                self.df[field_names.COLLEGE_ATTENDANCE_FIELD]
                <= self.MAX_COLLEGE_ATTENDANCE_THRESHOLD
            )
            | (
                # If college attendance data is null for this tract, just rely on the
                # poverty/AMI data
                self.df[field_names.COLLEGE_ATTENDANCE_FIELD].isna()
            )
        )

        # At or above the percentile cutoff for each workforce burden.
        unemployment_threshold = (
            self.df[
                field_names.UNEMPLOYMENT_FIELD
                + field_names.PERCENTILE_FIELD_SUFFIX
            ]
            >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
        )
        low_median_income_threshold = (
            self.df[
                field_names.LOW_MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD
                + field_names.PERCENTILE_FIELD_SUFFIX
            ]
            >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
        )
        linguistic_isolation_threshold = (
            self.df[
                field_names.LINGUISTIC_ISO_FIELD
                + field_names.PERCENTILE_FIELD_SUFFIX
            ]
            >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
        )
        poverty_threshold = (
            self.df[
                field_names.POVERTY_LESS_THAN_100_FPL_FIELD
                + field_names.PERCENTILE_FIELD_SUFFIX
            ]
            >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
        )

        # Each workforce burden also requires the education screen.
        self.df[field_names.LINGUISTIC_ISOLATION_LOW_HS_EDUCATION_FIELD] = (
            linguistic_isolation_threshold
            & self.df[field_names.LOW_HS_EDUCATION_LOW_COLLEGE_ATTENDANCE_FIELD]
        )

        self.df[field_names.POVERTY_LOW_HS_EDUCATION_FIELD] = (
            poverty_threshold
            & self.df[field_names.LOW_HS_EDUCATION_LOW_COLLEGE_ATTENDANCE_FIELD]
        )

        self.df[field_names.LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD] = (
            low_median_income_threshold
            & self.df[field_names.LOW_HS_EDUCATION_LOW_COLLEGE_ATTENDANCE_FIELD]
        )

        self.df[field_names.UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD] = (
            unemployment_threshold
            & self.df[field_names.LOW_HS_EDUCATION_LOW_COLLEGE_ATTENDANCE_FIELD]
        )

        workforce_combined_criteria_for_states = self.df[
            workforce_eligibility_columns
        ].any(axis="columns")

        self._increment_total_eligibility_exceeded(
            workforce_eligibility_columns
        )

        # Now, calculate workforce criteria for island territories.
        island_areas_workforce_eligibility_columns = [
            field_names.ISLAND_AREAS_UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD,
            field_names.ISLAND_AREAS_POVERTY_LOW_HS_EDUCATION_FIELD,
            field_names.ISLAND_AREAS_LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD,
        ]

        # First, combine unemployment.
        # The helper writes the combined column into the df and returns the
        # name of the boolean threshold-criteria field it created.
        (
            self.df,
            island_areas_unemployment_criteria_field_name,
        ) = self._combine_island_areas_with_states_and_set_thresholds(
            df=self.df,
            column_from_island_areas=field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009,
            column_from_decennial_census=field_names.CENSUS_UNEMPLOYMENT_FIELD_2010,
            combined_column_name=field_names.COMBINED_UNEMPLOYMENT_2010,
            threshold_cutoff_for_island_areas=self.ENVIRONMENTAL_BURDEN_THRESHOLD,
        )

        # Next, combine poverty.
        (
            self.df,
            island_areas_poverty_criteria_field_name,
        ) = self._combine_island_areas_with_states_and_set_thresholds(
            df=self.df,
            column_from_island_areas=field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_100_FPL_FIELD_2009,
            column_from_decennial_census=field_names.CENSUS_POVERTY_LESS_THAN_100_FPL_FIELD_2010,
            combined_column_name=field_names.COMBINED_POVERTY_LESS_THAN_100_FPL_FIELD_2010,
            threshold_cutoff_for_island_areas=self.ENVIRONMENTAL_BURDEN_THRESHOLD,
        )

        # Also check whether low area median income is 90th percentile or higher
        # within the islands.
        island_areas_low_median_income_as_a_percent_of_ami_criteria_field_name = (
            f"{field_names.LOW_CENSUS_DECENNIAL_AREA_MEDIAN_INCOME_PERCENT_FIELD_2009} exceeds "
            f"{field_names.PERCENTILE}th percentile"
        )
        self.df[
            island_areas_low_median_income_as_a_percent_of_ami_criteria_field_name
        ] = (
            self.df[
                field_names.LOW_CENSUS_DECENNIAL_AREA_MEDIAN_INCOME_PERCENT_FIELD_2009
                + field_names.PERCENTILE_FIELD_SUFFIX
            ]
            >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
        )

        # Island-areas education screen (decennial HS attainment only; no
        # college attendance component here).
        self.df[field_names.ISLAND_AREAS_LOW_HS_EDUCATION_FIELD] = (
            self.df[field_names.CENSUS_DECENNIAL_HIGH_SCHOOL_ED_FIELD_2009]
            >= self.LACK_OF_HIGH_SCHOOL_MINIMUM_THRESHOLD
        )

        self.df[
            field_names.ISLAND_AREAS_UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD
        ] = (
            self.df[island_areas_unemployment_criteria_field_name]
            & self.df[field_names.ISLAND_AREAS_LOW_HS_EDUCATION_FIELD]
        )

        self.df[field_names.ISLAND_AREAS_POVERTY_LOW_HS_EDUCATION_FIELD] = (
            self.df[island_areas_poverty_criteria_field_name]
            & self.df[field_names.ISLAND_AREAS_LOW_HS_EDUCATION_FIELD]
        )

        self.df[
            field_names.ISLAND_AREAS_LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD
        ] = (
            self.df[
                island_areas_low_median_income_as_a_percent_of_ami_criteria_field_name
            ]
            & self.df[field_names.ISLAND_AREAS_LOW_HS_EDUCATION_FIELD]
        )

        workforce_combined_criteria_for_island_areas = self.df[
            island_areas_workforce_eligibility_columns
        ].any(axis="columns")

        self._increment_total_eligibility_exceeded(
            island_areas_workforce_eligibility_columns
        )

        percent_of_island_tracts_highlighted = (
            100
            * workforce_combined_criteria_for_island_areas.sum()
            # Choosing a random column from island areas to calculate the denominator.
            / self.df[field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009]
            .notnull()
            .sum()
        )

        logger.info(
            f"For workforce criteria in island areas, "
            f"{workforce_combined_criteria_for_island_areas.sum()} ("
            f"{percent_of_island_tracts_highlighted:.2f}% of tracts that have non-null data "
            f"in the column) have a value of TRUE."
        )

        # A tract is included if it meets either the states tract criteria or the
        # island areas tract criteria.
        return (
            workforce_combined_criteria_for_states
            | workforce_combined_criteria_for_island_areas
        )
def add_columns(self) -> pd.DataFrame:
logger.info("Adding Score M")
self.df[field_names.THRESHOLD_COUNT] = 0
self.df[
field_names.FPL_200_AND_COLLEGE_ATTENDANCE_SERIES
] = self._create_low_income_and_low_college_attendance_threshold(
self.df
)
self.df[field_names.M_CLIMATE] = self._climate_factor()
self.df[field_names.M_ENERGY] = self._energy_factor()
self.df[field_names.M_TRANSPORTATION] = self._transportation_factor()
self.df[field_names.M_HOUSING] = self._housing_factor()
self.df[field_names.M_POLLUTION] = self._pollution_factor()
self.df[field_names.M_WATER] = self._water_factor()
self.df[field_names.M_HEALTH] = self._health_factor()
self.df[field_names.M_WORKFORCE] = self._workforce_factor()
factors = [
field_names.M_CLIMATE,
field_names.M_ENERGY,
field_names.M_TRANSPORTATION,
field_names.M_HOUSING,
field_names.M_POLLUTION,
field_names.M_WATER,
field_names.M_HEALTH,
field_names.M_WORKFORCE,
]
self.df[field_names.SCORE_M_COMMUNITIES] = self.df[factors].any(axis=1)
# Note: this is purely used for comparison tool analysis, and can be removed at a later date. - LMB.
non_workforce_factors = [
field_names.M_CLIMATE,
field_names.M_ENERGY,
field_names.M_TRANSPORTATION,
field_names.M_HOUSING,
field_names.M_POLLUTION,
field_names.M_WATER,
field_names.M_HEALTH,
]
self.df[field_names.M_NON_WORKFORCE] = self.df[
non_workforce_factors
].any(axis=1)
self.df[
field_names.SCORE_M + field_names.PERCENTILE_FIELD_SUFFIX
] = self.df[field_names.SCORE_M_COMMUNITIES].astype(int)
return self.df

View file

@ -9,6 +9,7 @@ from data_pipeline.score.score_h import ScoreH
from data_pipeline.score.score_i import ScoreI
from data_pipeline.score.score_k import ScoreK
from data_pipeline.score.score_l import ScoreL
from data_pipeline.score.score_m import ScoreM
from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger
@ -33,6 +34,7 @@ class ScoreRunner:
self.df = ScoreI(df=self.df).add_columns()
self.df = ScoreK(df=self.df).add_columns()
self.df = ScoreL(df=self.df).add_columns()
self.df = ScoreM(df=self.df).add_columns()
# TODO do this with each score instead of in a bundle
# Create percentiles for these index scores