j40-cejst-2/data/data-pipeline/data_pipeline/content/config/scratch.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "cf8f39b0-7735-4f7c-9178-61bbf2257951",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "66639c20-be5e-4bf6-9b58-98338874f7cc",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Median value ($) of owner-occupied housing units (percentile)'"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the downloadable codebook CSV into `check` for inspection.\n",
    "# NOTE(review): absolute, user-specific path -- consider a path relative to the repo.\n",
    "codebook_csv = \"/Users/emmausds/j40/data_pipeline/data/score/downloadable/codebook.csv\"\n",
    "check = pd.read_csv(codebook_csv)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "5e525e4e-6764-4d4d-9119-b4d400ba022f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>score_name</th>\n",
       "      <th>csv_field_type</th>\n",
       "      <th>csv_label</th>\n",
       "      <th>excel_label</th>\n",
       "      <th>calculation_notes</th>\n",
       "      <th>threshold_category</th>\n",
       "      <th>notes</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>GEOID10_TRACT</td>\n",
       "      <td>string</td>\n",
       "      <td>Census tract ID</td>\n",
       "      <td>Census tract ID</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>County Name</td>\n",
       "      <td>string</td>\n",
       "      <td>County Name</td>\n",
       "      <td>County Name</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>State/Territory</td>\n",
       "      <td>string</td>\n",
       "      <td>State/Territory</td>\n",
       "      <td>State/Territory</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Total threshold criteria exceeded</td>\n",
       "      <td>int64</td>\n",
       "      <td>Total threshold criteria exceeded</td>\n",
       "      <td>Total threshold criteria exceeded</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Lists out the total number of criteria (where ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Definition M (communities)</td>\n",
       "      <td>bool</td>\n",
       "      <td>Identified as disadvantaged</td>\n",
       "      <td>Identified as disadvantaged</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>True / False variable for whether a tract is a...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>77</th>\n",
       "      <td>Percentage households below 100% of federal po...</td>\n",
       "      <td>percentage</td>\n",
       "      <td>Percentage households below 100% of federal po...</td>\n",
       "      <td>Percentage households below 100% of federal po...</td>\n",
       "      <td>Because not all data is available for the Nati...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>78</th>\n",
       "      <td>Greater than or equal to the 90th percentile f...</td>\n",
       "      <td>bool</td>\n",
       "      <td>Greater than or equal to the 90th percentile f...</td>\n",
       "      <td>Greater than or equal to the 90th percentile f...</td>\n",
       "      <td>Because not all data is available for the Nati...</td>\n",
       "      <td>training and workforce development</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>79</th>\n",
       "      <td>Greater than or equal to the 90th percentile f...</td>\n",
       "      <td>bool</td>\n",
       "      <td>Greater than or equal to the 90th percentile f...</td>\n",
       "      <td>Greater than or equal to the 90th percentile f...</td>\n",
       "      <td>Because not all data is available for the Nati...</td>\n",
       "      <td>training and workforce development</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>80</th>\n",
       "      <td>Greater than or equal to the 90th percentile f...</td>\n",
       "      <td>bool</td>\n",
       "      <td>Greater than or equal to the 90th percentile f...</td>\n",
       "      <td>Greater than or equal to the 90th percentile f...</td>\n",
       "      <td>Because not all data is available for the Nati...</td>\n",
       "      <td>training and workforce development</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>81</th>\n",
       "      <td>Percent of population not currently enrolled i...</td>\n",
       "      <td>percentage</td>\n",
       "      <td>Percent of residents who are not currently enr...</td>\n",
       "      <td>Percent of residents who are not currently enr...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>82 rows × 7 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                           score_name csv_field_type  \\\n",
       "0                                       GEOID10_TRACT         string   \n",
       "1                                         County Name         string   \n",
       "2                                     State/Territory         string   \n",
       "3                   Total threshold criteria exceeded          int64   \n",
       "4                          Definition M (communities)           bool   \n",
       "..                                                ...            ...   \n",
       "77  Percentage households below 100% of federal po...     percentage   \n",
       "78  Greater than or equal to the 90th percentile f...           bool   \n",
       "79  Greater than or equal to the 90th percentile f...           bool   \n",
       "80  Greater than or equal to the 90th percentile f...           bool   \n",
       "81  Percent of population not currently enrolled i...     percentage   \n",
       "\n",
       "                                            csv_label  \\\n",
       "0                                     Census tract ID   \n",
       "1                                         County Name   \n",
       "2                                     State/Territory   \n",
       "3                   Total threshold criteria exceeded   \n",
       "4                         Identified as disadvantaged   \n",
       "..                                                ...   \n",
       "77  Percentage households below 100% of federal po...   \n",
       "78  Greater than or equal to the 90th percentile f...   \n",
       "79  Greater than or equal to the 90th percentile f...   \n",
       "80  Greater than or equal to the 90th percentile f...   \n",
       "81  Percent of residents who are not currently enr...   \n",
       "\n",
       "                                          excel_label  \\\n",
       "0                                     Census tract ID   \n",
       "1                                         County Name   \n",
       "2                                     State/Territory   \n",
       "3                   Total threshold criteria exceeded   \n",
       "4                         Identified as disadvantaged   \n",
       "..                                                ...   \n",
       "77  Percentage households below 100% of federal po...   \n",
       "78  Greater than or equal to the 90th percentile f...   \n",
       "79  Greater than or equal to the 90th percentile f...   \n",
       "80  Greater than or equal to the 90th percentile f...   \n",
       "81  Percent of residents who are not currently enr...   \n",
       "\n",
       "                                    calculation_notes  \\\n",
       "0                                                 NaN   \n",
       "1                                                 NaN   \n",
       "2                                                 NaN   \n",
       "3                                                 NaN   \n",
       "4                                                 NaN   \n",
       "..                                                ...   \n",
       "77  Because not all data is available for the Nati...   \n",
       "78  Because not all data is available for the Nati...   \n",
       "79  Because not all data is available for the Nati...   \n",
       "80  Because not all data is available for the Nati...   \n",
       "81                                                NaN   \n",
       "\n",
       "                    threshold_category  \\\n",
       "0                                  NaN   \n",
       "1                                  NaN   \n",
       "2                                  NaN   \n",
       "3                                  NaN   \n",
       "4                                  NaN   \n",
       "..                                 ...   \n",
       "77                                 NaN   \n",
       "78  training and workforce development   \n",
       "79  training and workforce development   \n",
       "80  training and workforce development   \n",
       "81                                 NaN   \n",
       "\n",
       "                                                notes  \n",
       "0                                                 NaN  \n",
       "1                                                 NaN  \n",
       "2                                                 NaN  \n",
       "3   Lists out the total number of criteria (where ...  \n",
       "4   True / False variable for whether a tract is a...  \n",
       "..                                                ...  \n",
       "77                                                NaN  \n",
       "78                                                NaN  \n",
       "79                                                NaN  \n",
       "80                                                NaN  \n",
       "81                                                NaN  \n",
       "\n",
       "[82 rows x 7 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "check"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "d86c867a-1a55-4ec0-82a6-040841406236",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the `codebook` frame from `to_frame_dict`.\n",
    "# NOTE(review): `to_frame_dict` is not assigned in any cell visible here -- confirm where it is defined.\n",
    "codebook = pd.DataFrame(data=to_frame_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "id": "6215deaf-b004-4da0-a70b-bc54f636601a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the `details_to_merge` frame from `mapping_dictionary`.\n",
    "# NOTE(review): `mapping_dictionary` is not assigned in any cell visible here -- confirm where it is defined.\n",
    "details_to_merge = pd.DataFrame(data=mapping_dictionary)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "id": "ac4e65c2-09e6-4978-9440-37b3be057f65",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the shapefile column listing CSV into `shapefile_codes`.\n",
    "# NOTE(review): absolute, user-specific path -- consider a path relative to the repo.\n",
    "shapefile_columns_csv = \"/Users/emmausds/j40/data_pipeline/data/score/shapefile/columns.csv\"\n",
    "shapefile_codes = pd.read_csv(shapefile_columns_csv)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 153,
   "id": "31cfd9ec-5f5f-4642-a51f-6875c2c279a4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Expected agricultural loss rate (Natural Hazards Risk Index) (percentile)',\n",
       " 'Expected building loss rate (Natural Hazards Risk Index) (percentile)',\n",
       " 'Expected population loss rate (Natural Hazards Risk Index) (percentile)',\n",
       " 'Energy burden (percentile)',\n",
       " 'PM2.5 in the air (percentile)',\n",
       " 'Diesel particulate matter exposure (percentile)',\n",
       " 'Traffic proximity and volume (percentile)',\n",
       " 'Housing burden (percent) (percentile)',\n",
       " 'Percent pre-1960s housing (lead paint indicator) (percentile)',\n",
       " 'Median value ($) of owner-occupied housing units (percentile)',\n",
       " 'Proximity to hazardous waste sites (percentile)',\n",
       " 'Proximity to NPL sites (percentile)',\n",
       " 'Proximity to Risk Management Plan (RMP) facilities (percentile)',\n",
       " 'Wastewater discharge (percentile)',\n",
       " 'Current asthma among adults aged greater than or equal to 18 years (percentile)',\n",
       " 'Diagnosed diabetes among adults aged greater than or equal to 18 years (percentile)',\n",
       " 'Coronary heart disease among adults aged greater than or equal to 18 years (percentile)',\n",
       " 'Low life expectancy (percentile)',\n",
       " 'Low median household income as a percent of area median income (percentile)',\n",
       " 'Linguistic isolation (percent) (percentile)',\n",
       " 'Unemployment (percent) (percentile)',\n",
       " 'Percent of individuals below 200% Federal Poverty Line (percentile)',\n",
       " 'Percent of individuals < 100% Federal Poverty Line (percentile)',\n",
       " 'Percent individuals age 25 or over with less than high school degree (percentile)',\n",
       " 'Definition M (percentile)',\n",
       " 'Low median household income as a percent of territory median income in 2009 (percentile)',\n",
       " 'Percentage households below 100% of federal poverty line in 2009 for island areas (percentile)',\n",
       " 'Unemployment (percent) in 2009 for island areas (percentile)']"
      ]
     },
     "execution_count": 153,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 154,
   "id": "66dde4fc-48e6-4bdf-b3a6-16c766e94d8a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "    - column_name: Expected agricultural loss rate (Natural Hazards Risk Index) (percentile)\n",
      "      notes: all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\n",
      "    - column_name: Expected building loss rate (Natural Hazards Risk Index) (percentile)\n",
      "      notes: all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\n",
      "    - column_name: Expected population loss rate (Natural Hazards Risk Index) (percentile)\n",
      "      notes: all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\n",
      "    - column_name: Energy burden (percentile)\n",
      "      notes: all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\n",
      "    - column_name: PM2.5 in the air (percentile)\n",
      "      notes: all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\n",
      "    - column_name: Diesel particulate matter exposure (percentile)\n",
      "      notes: all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\n",
      "    - column_name: Traffic proximity and volume (percentile)\n",
      "      notes: all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\n",
      "    - column_name: Housing burden (percent) (percentile)\n",
      "      notes: all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\n",
      "    - column_name: Percent pre-1960s housing (lead paint indicator) (percentile)\n",
      "      notes: all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\n",
      "    - column_name: Median value ($) of owner-occupied housing units (percentile)\n",
      "      notes: all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\n",
      "    - column_name: Proximity to hazardous waste sites (percentile)\n",
      "      notes: all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\n",
      "    - column_name: Proximity to NPL sites (percentile)\n",
      "      notes: all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\n",
      "    - column_name: Proximity to Risk Management Plan (RMP) facilities (percentile)\n",
      "      notes: all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\n",
      "    - column_name: Wastewater discharge (percentile)\n",
      "      notes: all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\n",
      "    - column_name: Current asthma among adults aged greater than or equal to 18 years (percentile)\n",
      "      notes: all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\n",
      "    - column_name: Diagnosed diabetes among adults aged greater than or equal to 18 years (percentile)\n",
      "      notes: all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\n",
      "    - column_name: Coronary heart disease among adults aged greater than or equal to 18 years (percentile)\n",
      "      notes: all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\n",
      "    - column_name: Low life expectancy (percentile)\n",
      "      notes: (1) this percentile is reversed, meaning the lowest raw numbers become the highest percentiles, and (2) all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\n",
      "    - column_name: Low median household income as a percent of area median income (percentile)\n",
      "      notes: (1) this percentile is reversed, meaning the lowest raw numbers become the highest percentiles, and (2) all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\n",
      "    - column_name: Linguistic isolation (percent) (percentile)\n",
      "      notes: all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\n",
      "    - column_name: Unemployment (percent) (percentile)\n",
      "      notes: all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\n",
      "    - column_name: Percent of individuals below 200% Federal Poverty Line (percentile)\n",
      "      notes: all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\n",
      "    - column_name: Percent of individuals < 100% Federal Poverty Line (percentile)\n",
      "      notes: all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\n",
      "    - column_name: Percent individuals age 25 or over with less than high school degree (percentile)\n",
      "      notes: all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\n",
      "    - column_name: Definition M (percentile)\n",
      "      notes: all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\n",
      "    - column_name: Low median household income as a percent of territory median income in 2009 (percentile)\n",
      "      notes: (1) this percentile is reversed, meaning the lowest raw numbers become the highest percentiles, and (2) all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\n",
      "    - column_name: Percentage households below 100% of federal poverty line in 2009 for island areas (percentile)\n",
      "      notes: all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\n",
      "    - column_name: Unemployment (percent) in 2009 for island areas (percentile)\n",
      "      notes: all percentiles are floored (rounded down to the nearest percentile). For example, 89.7th percentile is rounded down to 89 for this field.\n"
     ]
    }
   ],
   "source": [
    "# Print a `- column_name:` / `notes:` entry for each index label containing \"(percentile)\".\n",
    "# NOTE(review): `download_codebook` is not assigned in the cells visible here --\n",
    "# confirm it exists in the kernel before this cell runs.\n",
    "floored_note = (\n",
    "    \"all percentiles are floored (rounded down to the nearest percentile). \"\n",
    "    \"For example, 89.7th percentile is rounded down to 89 for this field.\"\n",
    ")\n",
    "# The reversed wording is the floored wording with a fixed prefix, so build it once.\n",
    "reversed_note = (\n",
    "    \"(1) this percentile is reversed, meaning the lowest raw numbers become \"\n",
    "    \"the highest percentiles, and (2) \" + floored_note\n",
    ")\n",
    "\n",
    "percentile_columns = [\n",
    "    label for label in download_codebook.index.to_list() if \"(percentile)\" in label\n",
    "]\n",
    "for col in percentile_columns:\n",
    "    # Labels containing \"Low\" get the reversed-percentile wording; all others get the plain one.\n",
    "    note = reversed_note if \"Low\" in col else floored_note\n",
    "    print(f\"    - column_name: {col}\")\n",
    "    print(f\"      notes: {note}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 143,
   "id": "5c08708e-4ebf-4cfe-8efb-7ee6c7930427",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>excel_label</th>\n",
       "      <th>format</th>\n",
       "      <th>shapefile_column</th>\n",
       "      <th>notes</th>\n",
       "      <th>category</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>score_name</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>GEOID10_TRACT</th>\n",
       "      <td>Census tract ID</td>\n",
       "      <td>string</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>County Name</th>\n",
       "      <td>County Name</td>\n",
       "      <td>string</td>\n",
       "      <td>CF</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>State/Territory</th>\n",
       "      <td>State/Territory</td>\n",
       "      <td>string</td>\n",
       "      <td>SF</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Total threshold criteria exceeded</th>\n",
       "      <td>Total threshold criteria exceeded</td>\n",
       "      <td>int64</td>\n",
       "      <td>TC</td>\n",
       "      <td>Lists out the total number of criteria (where ...</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Definition M (communities)</th>\n",
       "      <td>Identified as disadvantaged</td>\n",
       "      <td>bool</td>\n",
       "      <td>SM_C</td>\n",
       "      <td>True / False variable for whether a tract is a...</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Unemployment (percent) in 2009 (island areas) and 2010 (states and PR)</th>\n",
       "      <td>Unemployment (percent) in 2009 (island areas) ...</td>\n",
       "      <td>percentage</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Percentage households below 100% of federal poverty line in 2009 (island areas) and 2010 (states and PR)</th>\n",
       "      <td>Percentage households below 100% of federal po...</td>\n",
       "      <td>percentage</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Greater than or equal to the 90th percentile for unemployment and has low HS education in 2009 (island areas)?</th>\n",
       "      <td>Greater than or equal to the 90th percentile f...</td>\n",
       "      <td>bool</td>\n",
       "      <td>IAULHSE</td>\n",
       "      <td>island area information comes from the dicenni...</td>\n",
       "      <td>training and workforce development</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Greater than or equal to the 90th percentile for households at or below 100% federal poverty level and has low HS education in 2009 (island areas)?</th>\n",
       "      <td>Greater than or equal to the 90th percentile f...</td>\n",
       "      <td>bool</td>\n",
       "      <td>IAPLHSE</td>\n",
       "      <td>island area information comes from the dicenni...</td>\n",
       "      <td>training and workforce development</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Greater than or equal to the 90th percentile for low median household income as a percent of area median income and has low HS education in 2009 (island areas)?</th>\n",
       "      <td>Greater than or equal to the 90th percentile f...</td>\n",
       "      <td>bool</td>\n",
       "      <td>IALMILHSE</td>\n",
       "      <td>island area information comes from the dicenni...</td>\n",
       "      <td>training and workforce development</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>82 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                                                          excel_label  \\\n",
       "score_name                                                                                              \n",
       "GEOID10_TRACT                                                                         Census tract ID   \n",
       "County Name                                                                               County Name   \n",
       "State/Territory                                                                       State/Territory   \n",
       "Total threshold criteria exceeded                                   Total threshold criteria exceeded   \n",
       "Definition M (communities)                                                Identified as disadvantaged   \n",
       "...                                                                                               ...   \n",
       "Unemployment (percent) in 2009 (island areas) a...  Unemployment (percent) in 2009 (island areas) ...   \n",
       "Percentage households below 100% of federal pov...  Percentage households below 100% of federal po...   \n",
       "Greater than or equal to the 90th percentile fo...  Greater than or equal to the 90th percentile f...   \n",
       "Greater than or equal to the 90th percentile fo...  Greater than or equal to the 90th percentile f...   \n",
       "Greater than or equal to the 90th percentile fo...  Greater than or equal to the 90th percentile f...   \n",
       "\n",
       "                                                        format  \\\n",
       "score_name                                                       \n",
       "GEOID10_TRACT                                           string   \n",
       "County Name                                             string   \n",
       "State/Territory                                         string   \n",
       "Total threshold criteria exceeded                        int64   \n",
       "Definition M (communities)                                bool   \n",
       "...                                                        ...   \n",
       "Unemployment (percent) in 2009 (island areas) a...  percentage   \n",
       "Percentage households below 100% of federal pov...  percentage   \n",
       "Greater than or equal to the 90th percentile fo...        bool   \n",
       "Greater than or equal to the 90th percentile fo...        bool   \n",
       "Greater than or equal to the 90th percentile fo...        bool   \n",
       "\n",
       "                                                   shapefile_column  \\\n",
       "score_name                                                            \n",
       "GEOID10_TRACT                                                   NaN   \n",
       "County Name                                                      CF   \n",
       "State/Territory                                                  SF   \n",
       "Total threshold criteria exceeded                                TC   \n",
       "Definition M (communities)                                     SM_C   \n",
       "...                                                             ...   \n",
       "Unemployment (percent) in 2009 (island areas) a...              NaN   \n",
       "Percentage households below 100% of federal pov...              NaN   \n",
       "Greater than or equal to the 90th percentile fo...          IAULHSE   \n",
       "Greater than or equal to the 90th percentile fo...          IAPLHSE   \n",
       "Greater than or equal to the 90th percentile fo...        IALMILHSE   \n",
       "\n",
       "                                                                                                notes  \\\n",
       "score_name                                                                                              \n",
       "GEOID10_TRACT                                                                                     NaN   \n",
       "County Name                                                                                       NaN   \n",
       "State/Territory                                                                                   NaN   \n",
       "Total threshold criteria exceeded                   Lists out the total number of criteria (where ...   \n",
       "Definition M (communities)                          True / False variable for whether a tract is a...   \n",
       "...                                                                                               ...   \n",
       "Unemployment (percent) in 2009 (island areas) a...                                                NaN   \n",
       "Percentage households below 100% of federal pov...                                                NaN   \n",
       "Greater than or equal to the 90th percentile fo...  island area information comes from the dicenni...   \n",
       "Greater than or equal to the 90th percentile fo...  island area information comes from the dicenni...   \n",
       "Greater than or equal to the 90th percentile fo...  island area information comes from the dicenni...   \n",
       "\n",
       "                                                                              category  \n",
       "score_name                                                                              \n",
       "GEOID10_TRACT                                                                      NaN  \n",
       "County Name                                                                        NaN  \n",
       "State/Territory                                                                    NaN  \n",
       "Total threshold criteria exceeded                                                  NaN  \n",
       "Definition M (communities)                                                         NaN  \n",
       "...                                                                                ...  \n",
       "Unemployment (percent) in 2009 (island areas) a...                                 NaN  \n",
       "Percentage households below 100% of federal pov...                                 NaN  \n",
       "Greater than or equal to the 90th percentile fo...  training and workforce development  \n",
       "Greater than or equal to the 90th percentile fo...  training and workforce development  \n",
       "Greater than or equal to the 90th percentile fo...  training and workforce development  \n",
       "\n",
       "[82 rows x 5 columns]"
      ]
     },
     "execution_count": 143,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "download_codebook.dropna(subset=[\"format\"]).reset_index()[\"score_name\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 137,
   "id": "7139ce5d-db5e-49dd-8bb3-122c7b73b395",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>excel_label</th>\n",
       "      <th>format</th>\n",
       "      <th>shapefile_column</th>\n",
       "      <th>notes</th>\n",
       "      <th>category</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>score_name</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: [excel_label, format, shapefile_column, notes, category]\n",
       "Index: []"
      ]
     },
     "execution_count": 137,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Keep the rows where any of the listed columns equals 'percentile'.\n",
    "download_codebook.loc[\n",
    "    download_codebook[[\"format\"]].eq(\"percentile\").any(axis=\"columns\")\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e31ef01c-b225-48f0-bdf5-1efb8d4ed95c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# `filter(like=...)` returns a DataFrame, so comparing it to a string yields a\n",
    "# 2-D boolean frame, which `.loc` rejects with\n",
    "# \"Cannot index with multidimensional key\". Collapse it to one boolean per row\n",
    "# (True when any matching column equals 'percentile') before indexing.\n",
    "download_codebook.loc[\n",
    "    (download_codebook.filter(like=\"format\") == \"percentile\").any(axis=1)\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "id": "73268de4-3378-4ac7-bf85-f483a78c3966",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Index each lookup table by `score_name`, then join them side by side.\n",
    "download_codebook = pd.concat(\n",
    "    objs=[\n",
    "        frame.set_index(\"score_name\")\n",
    "        for frame in (\n",
    "            codebook,\n",
    "            shapefile_codes.rename(\n",
    "                columns={\"column\": \"score_name\", \"meaning\": \"shapefile_column\"}\n",
    "            ),\n",
    "            details_to_merge,\n",
    "        )\n",
    "    ],\n",
    "    axis=\"columns\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6321ed42-aee6-40fc-8bf8-2a4ce4276eca",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}