j40-cejst-2/data/data-pipeline/data_pipeline/content/config/scratch.ipynb
Travis Newby a27ca46b1d
Update dependencies to fix safety check failures (#2142)
* Update dependencies

Update dependencies causing safety check to fail

* Remove nb_black from jupyter notebooks

Because of the build issue on M1 macs, nb_black was removed as a dev dependency. This change removes the lines referencing nb_black (%load_ext lab_black) from all jupyter notebooks.
2023-02-02 16:43:59 -06:00

798 lines
41 KiB
Text
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "cf8f39b0-7735-4f7c-9178-61bbf2257951",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "66639c20-be5e-4bf6-9b58-98338874f7cc",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Median value ($) of owner-occupied housing units (percentile)'"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Read codebook.csv from the score/downloadable data directory into `check`.\n",
"# NOTE(review): absolute, machine-specific path -- this cell only runs where that file exists.\n",
"codebook_csv_path = (\n",
"    \"/Users/emmausds/j40/data_pipeline/data/score/downloadable/codebook.csv\"\n",
")\n",
"check = pd.read_csv(codebook_csv_path)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "5e525e4e-6764-4d4d-9119-b4d400ba022f",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>score_name</th>\n",
" <th>csv_field_type</th>\n",
" <th>csv_label</th>\n",
" <th>excel_label</th>\n",
" <th>calculation_notes</th>\n",
" <th>threshold_category</th>\n",
" <th>notes</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>GEOID10_TRACT</td>\n",
" <td>string</td>\n",
" <td>Census tract ID</td>\n",
" <td>Census tract ID</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>County Name</td>\n",
" <td>string</td>\n",
" <td>County Name</td>\n",
" <td>County Name</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>State/Territory</td>\n",
" <td>string</td>\n",
" <td>State/Territory</td>\n",
" <td>State/Territory</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Total threshold criteria exceeded</td>\n",
" <td>int64</td>\n",
" <td>Total threshold criteria exceeded</td>\n",
" <td>Total threshold criteria exceeded</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Lists out the total number of criteria (where ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Definition M (communities)</td>\n",
" <td>bool</td>\n",
" <td>Identified as disadvantaged</td>\n",
" <td>Identified as disadvantaged</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>True / False variable for whether a tract is a...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>77</th>\n",
" <td>Percentage households below 100% of federal po...</td>\n",
" <td>percentage</td>\n",
" <td>Percentage households below 100% of federal po...</td>\n",
" <td>Percentage households below 100% of federal po...</td>\n",
" <td>Because not all data is available for the Nati...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>78</th>\n",
" <td>Greater than or equal to the 90th percentile f...</td>\n",
" <td>bool</td>\n",
" <td>Greater than or equal to the 90th percentile f...</td>\n",
" <td>Greater than or equal to the 90th percentile f...</td>\n",
" <td>Because not all data is available for the Nati...</td>\n",
" <td>training and workforce development</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>79</th>\n",
" <td>Greater than or equal to the 90th percentile f...</td>\n",
" <td>bool</td>\n",
" <td>Greater than or equal to the 90th percentile f...</td>\n",
" <td>Greater than or equal to the 90th percentile f...</td>\n",
" <td>Because not all data is available for the Nati...</td>\n",
" <td>training and workforce development</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>80</th>\n",
" <td>Greater than or equal to the 90th percentile f...</td>\n",
" <td>bool</td>\n",
" <td>Greater than or equal to the 90th percentile f...</td>\n",
" <td>Greater than or equal to the 90th percentile f...</td>\n",
" <td>Because not all data is available for the Nati...</td>\n",
" <td>training and workforce development</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>81</th>\n",
" <td>Percent of population not currently enrolled i...</td>\n",
" <td>percentage</td>\n",
" <td>Percent of residents who are not currently enr...</td>\n",
" <td>Percent of residents who are not currently enr...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>82 rows × 7 columns</p>\n",
"</div>"
],
"text/plain": [
" score_name csv_field_type \\\n",
"0 GEOID10_TRACT string \n",
"1 County Name string \n",
"2 State/Territory string \n",
"3 Total threshold criteria exceeded int64 \n",
"4 Definition M (communities) bool \n",
".. ... ... \n",
"77 Percentage households below 100% of federal po... percentage \n",
"78 Greater than or equal to the 90th percentile f... bool \n",
"79 Greater than or equal to the 90th percentile f... bool \n",
"80 Greater than or equal to the 90th percentile f... bool \n",
"81 Percent of population not currently enrolled i... percentage \n",
"\n",
" csv_label \\\n",
"0 Census tract ID \n",
"1 County Name \n",
"2 State/Territory \n",
"3 Total threshold criteria exceeded \n",
"4 Identified as disadvantaged \n",
".. ... \n",
"77 Percentage households below 100% of federal po... \n",
"78 Greater than or equal to the 90th percentile f... \n",
"79 Greater than or equal to the 90th percentile f... \n",
"80 Greater than or equal to the 90th percentile f... \n",
"81 Percent of residents who are not currently enr... \n",
"\n",
" excel_label \\\n",
"0 Census tract ID \n",
"1 County Name \n",
"2 State/Territory \n",
"3 Total threshold criteria exceeded \n",
"4 Identified as disadvantaged \n",
".. ... \n",
"77 Percentage households below 100% of federal po... \n",
"78 Greater than or equal to the 90th percentile f... \n",
"79 Greater than or equal to the 90th percentile f... \n",
"80 Greater than or equal to the 90th percentile f... \n",
"81 Percent of residents who are not currently enr... \n",
"\n",
" calculation_notes \\\n",
"0 NaN \n",
"1 NaN \n",
"2 NaN \n",
"3 NaN \n",
"4 NaN \n",
".. ... \n",
"77 Because not all data is available for the Nati... \n",
"78 Because not all data is available for the Nati... \n",
"79 Because not all data is available for the Nati... \n",
"80 Because not all data is available for the Nati... \n",
"81 NaN \n",
"\n",
" threshold_category \\\n",
"0 NaN \n",
"1 NaN \n",
"2 NaN \n",
"3 NaN \n",
"4 NaN \n",
".. ... \n",
"77 NaN \n",
"78 training and workforce development \n",
"79 training and workforce development \n",
"80 training and workforce development \n",
"81 NaN \n",
"\n",
" notes \n",
"0 NaN \n",
"1 NaN \n",
"2 NaN \n",
"3 Lists out the total number of criteria (where ... \n",
"4 True / False variable for whether a tract is a... \n",
".. ... \n",
"77 NaN \n",
"78 NaN \n",
"79 NaN \n",
"80 NaN \n",
"81 NaN \n",
"\n",
"[82 rows x 7 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"check"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "d86c867a-1a55-4ec0-82a6-040841406236",
"metadata": {},
"outputs": [],
"source": [
"# Build the `codebook` frame from `to_frame_dict`.\n",
"# NOTE(review): `to_frame_dict` is not defined in any visible cell; it must already exist in the kernel.\n",
"codebook = pd.DataFrame(data=to_frame_dict)"
]
},
{
"cell_type": "code",
"execution_count": 62,
"id": "6215deaf-b004-4da0-a70b-bc54f636601a",
"metadata": {},
"outputs": [],
"source": [
"# Build the `details_to_merge` frame from `mapping_dictionary`.\n",
"# NOTE(review): `mapping_dictionary` is not defined in any visible cell; it must already exist in the kernel.\n",
"details_to_merge = pd.DataFrame(data=mapping_dictionary)"
]
},
{
"cell_type": "code",
"execution_count": 69,
"id": "ac4e65c2-09e6-4978-9440-37b3be057f65",
"metadata": {},
"outputs": [],
"source": [
"# Read columns.csv from the score/shapefile data directory into `shapefile_codes`.\n",
"# NOTE(review): absolute, machine-specific path -- this cell only runs where that file exists.\n",
"shapefile_columns_csv_path = (\n",
"    \"/Users/emmausds/j40/data_pipeline/data/score/shapefile/columns.csv\"\n",
")\n",
"shapefile_codes = pd.read_csv(shapefile_columns_csv_path)"
]
},
{
"cell_type": "code",
"execution_count": 153,
"id": "31cfd9ec-5f5f-4642-a51f-6875c2c279a4",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Expected agricultural loss rate (Natural Hazards Risk Index) (percentile)',\n",
" 'Expected building loss rate (Natural Hazards Risk Index) (percentile)',\n",
" 'Expected population loss rate (Natural Hazards Risk Index) (percentile)',\n",
" 'Energy burden (percentile)',\n",
" 'PM2.5 in the air (percentile)',\n",
" 'Diesel particulate matter exposure (percentile)',\n",
" 'Traffic proximity and volume (percentile)',\n",
" 'Housing burden (percent) (percentile)',\n",
" 'Percent pre-1960s housing (lead paint indicator) (percentile)',\n",
" 'Median value ($) of owner-occupied housing units (percentile)',\n",
" 'Proximity to hazardous waste sites (percentile)',\n",
" 'Proximity to NPL sites (percentile)',\n",
" 'Proximity to Risk Management Plan (RMP) facilities (percentile)',\n",
" 'Wastewater discharge (percentile)',\n",
" 'Current asthma among adults aged greater than or equal to 18 years (percentile)',\n",
" 'Diagnosed diabetes among adults aged greater than or equal to 18 years (percentile)',\n",
" 'Coronary heart disease among adults aged greater than or equal to 18 years (percentile)',\n",
" 'Low life expectancy (percentile)',\n",
" 'Low median household income as a percent of area median income (percentile)',\n",
" 'Linguistic isolation (percent) (percentile)',\n",
" 'Unemployment (percent) (percentile)',\n",
" 'Percent of individuals below 200% Federal Poverty Line (percentile)',\n",
" 'Percent of individuals < 100% Federal Poverty Line (percentile)',\n",
" 'Percent individuals age 25 or over with less than high school degree (percentile)',\n",
" 'Definition M (percentile)',\n",
" 'Low median household income as a percent of territory median income in 2009 (percentile)',\n",
" 'Percentage households below 100% of federal poverty line in 2009 for island areas (percentile)',\n",
" 'Unemployment (percent) in 2009 for island areas (percentile)']"
]
},
"execution_count": 153,
"metadata": {},
"output_type": "execute_result"
}
],
"source": []
},
{
"cell_type": "code",
"execution_count": 154,
"id": "66dde4fc-48e6-4bdf-b3a6-16c766e94d8a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" - column_name: Expected agricultural loss rate (Natural Hazards Risk Index) (percentile)\n",
" notes: all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\n",
" - column_name: Expected building loss rate (Natural Hazards Risk Index) (percentile)\n",
" notes: all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\n",
" - column_name: Expected population loss rate (Natural Hazards Risk Index) (percentile)\n",
" notes: all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\n",
" - column_name: Energy burden (percentile)\n",
" notes: all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\n",
" - column_name: PM2.5 in the air (percentile)\n",
" notes: all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\n",
" - column_name: Diesel particulate matter exposure (percentile)\n",
" notes: all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\n",
" - column_name: Traffic proximity and volume (percentile)\n",
" notes: all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\n",
" - column_name: Housing burden (percent) (percentile)\n",
" notes: all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\n",
" - column_name: Percent pre-1960s housing (lead paint indicator) (percentile)\n",
" notes: all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\n",
" - column_name: Median value ($) of owner-occupied housing units (percentile)\n",
" notes: all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\n",
" - column_name: Proximity to hazardous waste sites (percentile)\n",
" notes: all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\n",
" - column_name: Proximity to NPL sites (percentile)\n",
" notes: all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\n",
" - column_name: Proximity to Risk Management Plan (RMP) facilities (percentile)\n",
" notes: all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\n",
" - column_name: Wastewater discharge (percentile)\n",
" notes: all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\n",
" - column_name: Current asthma among adults aged greater than or equal to 18 years (percentile)\n",
" notes: all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\n",
" - column_name: Diagnosed diabetes among adults aged greater than or equal to 18 years (percentile)\n",
" notes: all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\n",
" - column_name: Coronary heart disease among adults aged greater than or equal to 18 years (percentile)\n",
" notes: all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\n",
" - column_name: Low life expectancy (percentile)\n",
" notes: (1) this percentile is reversed, meaning the lowest raw numbers become the highest percentiles, and (2) all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\n",
" - column_name: Low median household income as a percent of area median income (percentile)\n",
" notes: (1) this percentile is reversed, meaning the lowest raw numbers become the highest percentiles, and (2) all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\n",
" - column_name: Linguistic isolation (percent) (percentile)\n",
" notes: all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\n",
" - column_name: Unemployment (percent) (percentile)\n",
" notes: all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\n",
" - column_name: Percent of individuals below 200% Federal Poverty Line (percentile)\n",
" notes: all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\n",
" - column_name: Percent of individuals < 100% Federal Poverty Line (percentile)\n",
" notes: all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\n",
" - column_name: Percent individuals age 25 or over with less than high school degree (percentile)\n",
" notes: all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\n",
" - column_name: Definition M (percentile)\n",
" notes: all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\n",
" - column_name: Low median household income as a percent of territory median income in 2009 (percentile)\n",
" notes: (1) this percentile is reversed, meaning the lowest raw numbers become the highest percentiles, and (2) all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\n",
" - column_name: Percentage households below 100% of federal poverty line in 2009 for island areas (percentile)\n",
" notes: all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\n",
" - column_name: Unemployment (percent) in 2009 for island areas (percentile)\n",
" notes: all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\n"
]
}
],
"source": [
"# Print a `- column_name:` / `notes:` entry for every index label of\n",
"# `download_codebook` that contains \"(percentile)\".\n",
"floored_note = \"all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\"\n",
"# Labels containing \"Low\" get an extra leading clause about the reversed percentile.\n",
"reversed_note = (\n",
"    \"(1) this percentile is reversed, meaning the lowest raw numbers become the highest percentiles, and (2) \"\n",
"    + floored_note\n",
")\n",
"\n",
"percentile_columns = [\n",
"    name for name in download_codebook.index.to_list() if \"(percentile)\" in name\n",
"]\n",
"for col in percentile_columns:\n",
"    note = reversed_note if \"Low\" in col else floored_note\n",
"    print(f\" - column_name: {col}\")\n",
"    print(f\" notes: {note}\")"
]
},
{
"cell_type": "code",
"execution_count": 143,
"id": "5c08708e-4ebf-4cfe-8efb-7ee6c7930427",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>excel_label</th>\n",
" <th>format</th>\n",
" <th>shapefile_column</th>\n",
" <th>notes</th>\n",
" <th>category</th>\n",
" </tr>\n",
" <tr>\n",
" <th>score_name</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>GEOID10_TRACT</th>\n",
" <td>Census tract ID</td>\n",
" <td>string</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>County Name</th>\n",
" <td>County Name</td>\n",
" <td>string</td>\n",
" <td>CF</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>State/Territory</th>\n",
" <td>State/Territory</td>\n",
" <td>string</td>\n",
" <td>SF</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Total threshold criteria exceeded</th>\n",
" <td>Total threshold criteria exceeded</td>\n",
" <td>int64</td>\n",
" <td>TC</td>\n",
" <td>Lists out the total number of criteria (where ...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Definition M (communities)</th>\n",
" <td>Identified as disadvantaged</td>\n",
" <td>bool</td>\n",
" <td>SM_C</td>\n",
" <td>True / False variable for whether a tract is a...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Unemployment (percent) in 2009 (island areas) and 2010 (states and PR)</th>\n",
" <td>Unemployment (percent) in 2009 (island areas) ...</td>\n",
" <td>percentage</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Percentage households below 100% of federal poverty line in 2009 (island areas) and 2010 (states and PR)</th>\n",
" <td>Percentage households below 100% of federal po...</td>\n",
" <td>percentage</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Greater than or equal to the 90th percentile for unemployment and has low HS education in 2009 (island areas)?</th>\n",
" <td>Greater than or equal to the 90th percentile f...</td>\n",
" <td>bool</td>\n",
" <td>IAULHSE</td>\n",
" <td>island area information comes from the dicenni...</td>\n",
" <td>training and workforce development</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Greater than or equal to the 90th percentile for households at or below 100% federal poverty level and has low HS education in 2009 (island areas)?</th>\n",
" <td>Greater than or equal to the 90th percentile f...</td>\n",
" <td>bool</td>\n",
" <td>IAPLHSE</td>\n",
" <td>island area information comes from the dicenni...</td>\n",
" <td>training and workforce development</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Greater than or equal to the 90th percentile for low median household income as a percent of area median income and has low HS education in 2009 (island areas)?</th>\n",
" <td>Greater than or equal to the 90th percentile f...</td>\n",
" <td>bool</td>\n",
" <td>IALMILHSE</td>\n",
" <td>island area information comes from the dicenni...</td>\n",
" <td>training and workforce development</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>82 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" excel_label \\\n",
"score_name \n",
"GEOID10_TRACT Census tract ID \n",
"County Name County Name \n",
"State/Territory State/Territory \n",
"Total threshold criteria exceeded Total threshold criteria exceeded \n",
"Definition M (communities) Identified as disadvantaged \n",
"... ... \n",
"Unemployment (percent) in 2009 (island areas) a... Unemployment (percent) in 2009 (island areas) ... \n",
"Percentage households below 100% of federal pov... Percentage households below 100% of federal po... \n",
"Greater than or equal to the 90th percentile fo... Greater than or equal to the 90th percentile f... \n",
"Greater than or equal to the 90th percentile fo... Greater than or equal to the 90th percentile f... \n",
"Greater than or equal to the 90th percentile fo... Greater than or equal to the 90th percentile f... \n",
"\n",
" format \\\n",
"score_name \n",
"GEOID10_TRACT string \n",
"County Name string \n",
"State/Territory string \n",
"Total threshold criteria exceeded int64 \n",
"Definition M (communities) bool \n",
"... ... \n",
"Unemployment (percent) in 2009 (island areas) a... percentage \n",
"Percentage households below 100% of federal pov... percentage \n",
"Greater than or equal to the 90th percentile fo... bool \n",
"Greater than or equal to the 90th percentile fo... bool \n",
"Greater than or equal to the 90th percentile fo... bool \n",
"\n",
" shapefile_column \\\n",
"score_name \n",
"GEOID10_TRACT NaN \n",
"County Name CF \n",
"State/Territory SF \n",
"Total threshold criteria exceeded TC \n",
"Definition M (communities) SM_C \n",
"... ... \n",
"Unemployment (percent) in 2009 (island areas) a... NaN \n",
"Percentage households below 100% of federal pov... NaN \n",
"Greater than or equal to the 90th percentile fo... IAULHSE \n",
"Greater than or equal to the 90th percentile fo... IAPLHSE \n",
"Greater than or equal to the 90th percentile fo... IALMILHSE \n",
"\n",
" notes \\\n",
"score_name \n",
"GEOID10_TRACT NaN \n",
"County Name NaN \n",
"State/Territory NaN \n",
"Total threshold criteria exceeded Lists out the total number of criteria (where ... \n",
"Definition M (communities) True / False variable for whether a tract is a... \n",
"... ... \n",
"Unemployment (percent) in 2009 (island areas) a... NaN \n",
"Percentage households below 100% of federal pov... NaN \n",
"Greater than or equal to the 90th percentile fo... island area information comes from the dicenni... \n",
"Greater than or equal to the 90th percentile fo... island area information comes from the dicenni... \n",
"Greater than or equal to the 90th percentile fo... island area information comes from the dicenni... \n",
"\n",
" category \n",
"score_name \n",
"GEOID10_TRACT NaN \n",
"County Name NaN \n",
"State/Territory NaN \n",
"Total threshold criteria exceeded NaN \n",
"Definition M (communities) NaN \n",
"... ... \n",
"Unemployment (percent) in 2009 (island areas) a... NaN \n",
"Percentage households below 100% of federal pov... NaN \n",
"Greater than or equal to the 90th percentile fo... training and workforce development \n",
"Greater than or equal to the 90th percentile fo... training and workforce development \n",
"Greater than or equal to the 90th percentile fo... training and workforce development \n",
"\n",
"[82 rows x 5 columns]"
]
},
"execution_count": 143,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"download_codebook.dropna(subset=[\"format\"]).reset_index()[\"score_name\"]"
]
},
{
"cell_type": "code",
"execution_count": 137,
"id": "7139ce5d-db5e-49dd-8bb3-122c7b73b395",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>excel_label</th>\n",
" <th>format</th>\n",
" <th>shapefile_column</th>\n",
" <th>notes</th>\n",
" <th>category</th>\n",
" </tr>\n",
" <tr>\n",
" <th>score_name</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [excel_label, format, shapefile_column, notes, category]\n",
"Index: []"
]
},
"execution_count": 137,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Select the rows where any of the listed columns equals \"percentile\".\n",
"# A row-wise .any(axis=1) replaces summing boolean Series and testing `> 0`.\n",
"columns_to_check = [\"format\"]\n",
"matches_percentile = (\n",
"    download_codebook[columns_to_check].eq(\"percentile\").any(axis=1)\n",
")\n",
"download_codebook.loc[matches_percentile]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e31ef01c-b225-48f0-bdf5-1efb8d4ed95c",
"metadata": {},
"outputs": [],
"source": [
"# Select the rows where any column whose name contains \"format\" equals \"percentile\".\n",
"# DataFrame.filter returns a DataFrame, so comparing it to a scalar yields a 2-D\n",
"# boolean frame, which .loc rejects with \"Cannot index with multidimensional key\".\n",
"# Collapse it to one boolean per row before indexing.\n",
"format_is_percentile = (\n",
"    download_codebook.filter(like=\"format\") == \"percentile\"\n",
").any(axis=1)\n",
"download_codebook.loc[format_is_percentile]"
]
},
{
"cell_type": "code",
"execution_count": 131,
"id": "73268de4-3378-4ac7-bf85-f483a78c3966",
"metadata": {},
"outputs": [],
"source": [
"# Assemble `download_codebook` by placing three frames side by side (axis=1),\n",
"# each indexed by `score_name`.\n",
"# NOTE(review): cells earlier in the notebook read `download_codebook`, so this\n",
"# cell has to be executed before them.\n",
"codebook_by_score = codebook.set_index(\"score_name\")\n",
"shapefile_by_score = shapefile_codes.rename(\n",
"    columns={\"meaning\": \"shapefile_column\", \"column\": \"score_name\"}\n",
").set_index(\"score_name\")\n",
"details_by_score = details_to_merge.set_index(\"score_name\")\n",
"\n",
"download_codebook = pd.concat(\n",
"    [codebook_by_score, shapefile_by_score, details_by_score],\n",
"    axis=1,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6321ed42-aee6-40fc-8bf8-2a4ce4276eca",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}