"
+ ],
+ "text/plain": [
+ " name FIPS_tract_id \\\n",
+ "4203 Census Tract 3923, Contra Costa County, Califo... 14000US06013392300 \n",
+ "4617 Census Tract 38.06, Kern County, California 14000US06029003806 \n",
+ "5262 Census Tract 2060.20, Los Angeles County, Cali... 14000US06037206020 \n",
+ "5565 Census Tract 2626.01, Los Angeles County, Cali... 14000US06037262601 \n",
+ "12908 Census Tract 303, Fairfield County, Connecticut 14000US09001030300 \n",
+ "14086 Census Tract 102, District of Columbia, Distri... 14000US11001010200 \n",
+ "17668 Census Tract 273.23, Pinellas County, Florida 14000US12103027323 \n",
+ "18102 Census Tract 208.10, Seminole County, Florida 14000US12117020810 \n",
+ "19796 Census Tract 101.02, Liberty County, Georgia 14000US13179010102 \n",
+ "21166 Census Tract 507, Cook County, Illinois 14000US17031050700 \n",
+ "30658 Census Tract 7053, Montgomery County, Maryland 14000US24031705300 \n",
+ "41123 Census Tract 6075.04, Camden County, New Jersey 14000US34007607504 \n",
+ "42632 Census Tract 363.02, Union County, New Jersey 14000US34039036302 \n",
+ "45335 Census Tract 3033.01, Nassau County, New York 14000US36059303301 \n",
+ "51549 Census Tract 69.10, Franklin County, Ohio 14000US39049006910 \n",
+ "62114 Census Tract 1917.01, Bexar County, Texas 14000US48029191701 \n",
+ "66504 Census Tract 22.12, Travis County, Texas 14000US48453002212 \n",
+ "67740 Census Tract 1016.03, Arlington County, Virginia 14000US51013101603 \n",
+ "69094 Census Tract 111, Hampton city, Virginia 14000US51650011100 \n",
+ "69486 Census Tract 432, Virginia Beach city, Virginia 14000US51810043200 \n",
+ "\n",
+ " ratio_pre ratio_post \n",
+ "4203 0.20 1.01 \n",
+ "4617 0.10 1.03 \n",
+ "5262 0.26 1.07 \n",
+ "5565 0.07 1.06 \n",
+ "12908 0.09 1.04 \n",
+ "14086 0.15 1.02 \n",
+ "17668 0.18 1.02 \n",
+ "18102 0.14 1.03 \n",
+ "19796 0.33 1.06 \n",
+ "21166 0.16 1.03 \n",
+ "30658 0.07 1.09 \n",
+ "41123 0.09 1.05 \n",
+ "42632 0.20 1.02 \n",
+ "45335 0.31 1.02 \n",
+ "51549 0.14 1.10 \n",
+ "62114 0.16 1.10 \n",
+ "66504 0.18 1.02 \n",
+ "67740 0.16 1.01 \n",
+ "69094 0.21 1.13 \n",
+ "69486 0.51 1.04 "
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Tracts whose post-revision ratio exceeds 1 (i.e. more than 100%), showing only the identifying and ratio columns\n",
+ "housing_df.loc[housing_df[\"ratio_post\"] > 1, [\"name\", \"FIPS_tract_id\", \"ratio_pre\", \"ratio_post\"]]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Key Takeaways from the 20 tracts with greater than 100%\n",
+ "\n",
+ "1. Current college enrollment is not prevalent across all tracts\n",
+ "2. The results may make sense given the median value (USD) of an owned property for any individual, relative to all other low-income households (regardless of whether owned or rented). These may be areas of gentrification, for example. Further analysis through time may be pertinent as a follow-up (not included in this notebook)."
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.2"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/data/data-pipeline/data_pipeline/ipython/hud_eda_se_12_12_2011_relative_differences_between_methodologies-ranking-percentile.ipynb b/data/data-pipeline/data_pipeline/ipython/hud_eda_se_12_12_2011_relative_differences_between_methodologies-ranking-percentile.ipynb
index 7d13e42c..deff9367 100644
--- a/data/data-pipeline/data_pipeline/ipython/hud_eda_se_12_12_2011_relative_differences_between_methodologies-ranking-percentile.ipynb
+++ b/data/data-pipeline/data_pipeline/ipython/hud_eda_se_12_12_2011_relative_differences_between_methodologies-ranking-percentile.ipynb
@@ -1527,7 +1527,7 @@
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3 (ipykernel)",
+ "display_name": "Python 3",
"language": "python",
"name": "python3"
},
@@ -1541,7 +1541,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.9.9"
+ "version": "3.6.2"
}
},
"nbformat": 4,
diff --git a/data/data-pipeline/data_pipeline/ipython/hud_eda_se_12_22_2021-revised-denominator_explore_austin_af_analysis.ipynb b/data/data-pipeline/data_pipeline/ipython/hud_eda_se_12_22_2021-revised-denominator_explore_austin_af_analysis.ipynb
index 1fb5df81..1eda2136 100644
--- a/data/data-pipeline/data_pipeline/ipython/hud_eda_se_12_22_2021-revised-denominator_explore_austin_af_analysis.ipynb
+++ b/data/data-pipeline/data_pipeline/ipython/hud_eda_se_12_22_2021-revised-denominator_explore_austin_af_analysis.ipynb
@@ -737,7 +737,7 @@
],
"source": [
"plt.figure(figsize=(12, 8))\n",
- "plt.title('Relative Housing Burden for Low-Income Hosuing Only')\n",
+ "plt.title('Relative Housing Burden for Low-Income Housing Only')\n",
"# Set x-axis label\n",
"plt.xlabel('Ratio')\n",
"# Set y-axis label\n",
@@ -785,147 +785,6 @@
"sns.histplot(housing_df[\"ratio_pre\"])"
]
},
- {
- "cell_type": "code",
- "execution_count": 14,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- "
\n",
- "
\n",
- "
FIPS_tract_id
\n",
- "
name
\n",
- "
state
\n",
- "
cnty
\n",
- "
tract
\n",
- "
numerator_pre
\n",
- "
denominator_pre
\n",
- "
denominator_post
\n",
- "
ratio_pre
\n",
- "
ratio_post
\n",
- "
\n",
- " \n",
- " \n",
- "
\n",
- "
29008
\n",
- "
14000US22071004402
\n",
- "
Census Tract 44.02, Orleans Parish, Louisiana
\n",
- "
22
\n",
- "
71
\n",
- "
4402
\n",
- "
75
\n",
- "
75
\n",
- "
165
\n",
- "
1.00
\n",
- "
0.45
\n",
- "
\n",
- "
\n",
- "
37514
\n",
- "
14000US29001951000
\n",
- "
Census Tract 9510, Adair County, Missouri
\n",
- "
29
\n",
- "
1
\n",
- "
951000
\n",
- "
55
\n",
- "
55
\n",
- "
75
\n",
- "
1.00
\n",
- "
0.73
\n",
- "
\n",
- "
\n",
- "
43932
\n",
- "
14000US36027640002
\n",
- "
Census Tract 6400.02, Dutchess County, New York
\n",
- "
36
\n",
- "
27
\n",
- "
640002
\n",
- "
48
\n",
- "
50
\n",
- "
50
\n",
- "
0.96
\n",
- "
0.96
\n",
- "
\n",
- "
\n",
- "
71681
\n",
- "
14000US55025001102
\n",
- "
Census Tract 11.02, Dane County, Wisconsin
\n",
- "
55
\n",
- "
25
\n",
- "
1102
\n",
- "
60
\n",
- "
60
\n",
- "
89
\n",
- "
1.00
\n",
- "
0.67
\n",
- "
\n",
- "
\n",
- "
71689
\n",
- "
14000US55025001603
\n",
- "
Census Tract 16.03, Dane County, Wisconsin
\n",
- "
55
\n",
- "
25
\n",
- "
1603
\n",
- "
1460
\n",
- "
1599
\n",
- "
1934
\n",
- "
0.91
\n",
- "
0.75
\n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " FIPS_tract_id name \\\n",
- "29008 14000US22071004402 Census Tract 44.02, Orleans Parish, Louisiana \n",
- "37514 14000US29001951000 Census Tract 9510, Adair County, Missouri \n",
- "43932 14000US36027640002 Census Tract 6400.02, Dutchess County, New York \n",
- "71681 14000US55025001102 Census Tract 11.02, Dane County, Wisconsin \n",
- "71689 14000US55025001603 Census Tract 16.03, Dane County, Wisconsin \n",
- "\n",
- " state cnty tract numerator_pre denominator_pre denominator_post \\\n",
- "29008 22 71 4402 75 75 165 \n",
- "37514 29 1 951000 55 55 75 \n",
- "43932 36 27 640002 48 50 50 \n",
- "71681 55 25 1102 60 60 89 \n",
- "71689 55 25 1603 1460 1599 1934 \n",
- "\n",
- " ratio_pre ratio_post \n",
- "29008 1.00 0.45 \n",
- "37514 1.00 0.73 \n",
- "43932 0.96 0.96 \n",
- "71681 1.00 0.67 \n",
- "71689 0.91 0.75 "
- ]
- },
- "execution_count": 14,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# So only 4 that are > 90%\n",
- "housing_df[housing_df[\"ratio_pre\"] > 0.90]"
- ]
- },
{
"cell_type": "code",
"execution_count": 15,
@@ -2617,18 +2476,11 @@
"source": [
"seg_austin_2013.statistic"
]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
}
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3 (ipykernel)",
+ "display_name": "Python 3",
"language": "python",
"name": "python3"
},
@@ -2642,7 +2494,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.7.4"
+ "version": "3.6.2"
}
},
"nbformat": 4,
diff --git a/data/data-pipeline/data_pipeline/ipython/hud_eda_se_12_26_2011_relative_differences_between_methodologies-ranking-percentile-comparison-difference-presentation.ipynb b/data/data-pipeline/data_pipeline/ipython/hud_eda_se_12_26_2011_relative_differences_between_methodologies-ranking-percentile-comparison-difference-presentation.ipynb
new file mode 100644
index 00000000..d8af856a
--- /dev/null
+++ b/data/data-pipeline/data_pipeline/ipython/hud_eda_se_12_26_2011_relative_differences_between_methodologies-ranking-percentile-comparison-difference-presentation.ipynb
@@ -0,0 +1,2992 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Extreme Housing Burden \n",
+ "\n",
+ "The Extreme Housing Burden indicator represents the proportion of low-income households that have to spend more than half their income on rent. These households experience higher levels of stress, report lower health, and may delay medical treatment because of its high cost.\n",
+ "\n",
+ "The Extreme Housing Burden indicator measures the percent of households in a census tract that are:\n",
+ "\n",
+ "1. Making less than 80% of the Area Median Family Income as determined by the Department of Housing and Urban Development (HUD), and\n",
+ "2. Paying greater than 50% of their income to housing costs. \n",
+ "\n",
+ "This data is sourced from the 2014-2018 Comprehensive Housing Affordability Strategy dataset from the Department of Housing and Urban Development (HUD) using the census tract geographic summary level, and contains cost burdens for households by percent HUD-adjusted median family income (HAMFI) category. This data can be found [here](https://www.huduser.gov/portal/datasets/cp.html). \n",
+ "\n",
+ "Because CHAS data is based on American Community Survey (ACS) estimates, which come from a sample of the population, they may be unreliable if based on a small sample or population size.\n",
+ "\n",
+ "The standard error and relative standard error were used to evaluate the reliability of each estimate using CalEnviroScreen’s methodology. \n",
+ "\n",
+ "Census tract estimates that met either of the following criteria were considered reliable and included in the analysis [(CalEnviroScreen, 2017, page 129)](https://oehha.ca.gov/media/downloads/calenviroscreen/report/ces3report.pdf):\n",
+ "\n",
+ "- Relative standard error less than 50 (meaning the standard error was less than half of the estimate), OR \n",
+ "- Standard error less than the mean standard error of all census tract estimates \n",
+ "\n",
+ "Formulas for calculating the standard error of sums, proportions, and ratios come from the [American Community Survey Office](https://www2.census.gov/programs-surveys/acs/tech_docs/accuracy/MultiyearACSAccuracyofData2013.pdf).\n",
+ "\n",
+ "Note that this code creates a score and rank by state, for every state."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The relevant variables in table 8 of the CHAS dataset are the following (CHAS data dictionary available [here](https://www.huduser.gov/portal/datasets/cp/CHAS-data-dictionary-14-18.xlsx)):\n",
+ "\n",
+ "| Name | Label |\n",
+ "|---------|-----------------------------------------------------|\n",
+ "|T1_est1 | Total Occupied housing units | \n",
+ "|T8_est10 | Owner occupied less than or equal to 30% of HAMFI cost burden greater than 50% |\n",
+ "|T8_est23 |Owner occupied greater than 30% but less than or equal to 50% of HAMFI\tcost burden greater than 50%|\n",
+ "|T8_est36 |Owner occupied\tgreater than 50% but less than or equal to 80% of HAMFI\tcost burden greater than 50%|\n",
+ "|T8_est76 | Renter occupied less than or equal to 30% of HAMFI cost burden greater than 50% |\n",
+ "|T8_est89 |Renter occupied\tgreater than 30% but less than or equal to 50% of HAMFI\tcost burden greater than 50%|\n",
+ "|T8_est102|Renter occupied\tgreater than 50% but less than or equal to 80% of HAMFI\tcost burden greater than 50%|\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Percentiles Comparison"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Current methodology for housing burden where tracts meet the environmental burden threshold (n = 7323 tracts)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 85,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 85,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "
"
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Open a 12x8 inch figure and label its axes before drawing the histogram\n",
+ "plt.figure(figsize=(12, 8))\n",
+ "plt.gca().set(\n",
+ "    title=\"Distribution of Percentiles for Housing Burden (Score L)\",\n",
+ "    xlabel=\"Percentile (although currently not represented as a percentage)\",\n",
+ "    ylabel=\"Relative Frequency in Support\",\n",
+ ")\n",
+ "\n",
+ "sns.histplot(non_null_df[\"current_methodology_percentile_rank\"])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Below is the summarization of the tracts that are not common between Score L and the CalEnviroScreen's Ranked Percentile \n",
+ "\n",
+ "Here n = 5013 tracts, out of the set of 67,813 tracts with reliable population estimates (as per CalEnviroScreen's methodology). Of these, 2,741 tracts met Score L's threshold for environmental burden (I will be reviewing this with further summaries), but were not in the burdened threshold using CalEnviroScreen's Ranked Percentile methodology. Conversely, 2,272 did _not_ meet Score L's threshold for environmental burden (again, I will be reviewing this with further summaries), but were in the burdened threshold using CalEnviroScreen's Ranked Percentile methodology.\n",
+ "\n",
+ "It should be noted that 53 tracts, because of unreliable population estimates - as per CalEnviroScreen's methodology - were not included in the threshold computation. However, these same 53 tracts are considered burdened in Score L. This is equivalent to 3145 households.\n",
+ "\n",
+ "Below is a summary of the aggregate 5013 tracts. Each column presents a grouped statistic of the median percentage of the specified population characteristic across all tracts in that state. To present trends and contrasts between states, I prepared bar graphs relative to the midpoint value of the medians in that given column (really the median of medians). The attached Excel file represents the same data, but I highlight the cells where the given median for the state is greater than the median of all of the state samples for any given population characteristic."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 86,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
Percent of individuals < 200% Federal Poverty Line (median across all tracts)
Percent individuals age 25 or over with less than high school degree (median across all tracts)
Percent enrollment in college or graduate school (median across all tracts)
Percent Black or African American alone (median across all tracts)
Percent American Indian and Alaska Native alone (median across all tracts)
Percent Non-Hispanic White (median across all tracts)
Percent Hispanic or Latino (median across all tracts)
Total Number of Unique Tracts
Total Owned and Rented Burdened Households (Current Aggregation Methodology)
"
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Pairwise correlations between the two rankings and the percentage columns\n",
+ "import seaborn as sns\n",
+ "corr = merged_df[[\"hbrd_rank\", \"current_methodology_percentile_rank\", *percent_cols]].corr()\n",
+ "\n",
+ "# Boolean mask that hides the upper triangle (diagonal included) of the matrix\n",
+ "mask = np.triu(np.ones(corr.shape, dtype=bool))\n",
+ "\n",
+ "# Canvas for the heatmap; it becomes the current axes that seaborn draws on\n",
+ "f, ax = plt.subplots(figsize=(15, 12))\n",
+ "\n",
+ "# Diverging palette so that the sign of a correlation is visible at a glance\n",
+ "cmap = sns.diverging_palette(230, 20, as_cmap=True)\n",
+ "\n",
+ "# Square cells, colour scale centred on 0 and capped at 0.3, shrunken colour bar\n",
+ "sns.heatmap(\n",
+ "    corr,\n",
+ "    mask=mask,\n",
+ "    cmap=cmap,\n",
+ "    vmax=0.3,\n",
+ "    center=0,\n",
+ "    square=True,\n",
+ "    linewidths=0.5,\n",
+ "    cbar_kws={\"shrink\": 0.5},\n",
+ ")"
]
},
{
diff --git a/data/data-pipeline/data_pipeline/ipython/hud_eda_se_12_27-highlevel-summarization-variance.ipynb b/data/data-pipeline/data_pipeline/ipython/hud_eda_se_12_27-highlevel-summarization-variance.ipynb
new file mode 100644
index 00000000..caa4202d
--- /dev/null
+++ b/data/data-pipeline/data_pipeline/ipython/hud_eda_se_12_27-highlevel-summarization-variance.ipynb
@@ -0,0 +1,8783 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Whereas we have presented statistics that describe the central tendencies of a data set, we are also interested in ones that describe the spread or variability of the data values. A statistic that could be used for this purpose would be one that measures the average value of the squares of the distances between the data values and the sample mean. This is accomplished by the sample variance, which for technical reasons divides the sum of the squares of the differences by $n-1$ rather than $n$, where $n$ is the size of the data set."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Indicator reviewed: \n",
+ "\n",
+ "Socioeconomic Factors Indicator reviewed\n",
+ "* [Extreme Housing Burden](#housingburden)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Packages"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import math\n",
+ "import numpy as np\n",
+ "import os\n",
+ "import pandas as pd\n",
+ "\n",
+ "import seaborn as sns\n",
+ "import matplotlib.pyplot as plt\n",
+ "import pandas as pd\n",
+ "%matplotlib inline"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### ETL process for acquiring relevant tables"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### NOTE: If you ran the ETL Process to acquire Table 8 in the other notebook of this draft PR you do not need to run the ETL cell block again"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Copy and adapt certain sections of code from data_pipeline.utils\n",
+ "\n",
+ "# Imported here so that this optional ETL cell runs on a fresh kernel\n",
+ "import os\n",
+ "import shutil\n",
+ "import sys\n",
+ "import zipfile\n",
+ "from pathlib import Path\n",
+ "\n",
+ "import requests\n",
+ "\n",
+ "\n",
+ "def download_hud_dataset():\n",
+ "    \"\"\"Download the zipped HUD CHAS archive and save it in the working directory.\n",
+ "\n",
+ "    The archive is written to ``HUD_ZIPPED.csv`` (a zip file despite the name).\n",
+ "    Calls ``sys.exit`` with a diagnostic message when the server does not answer\n",
+ "    with HTTP 200.\n",
+ "    \"\"\"\n",
+ "    DOWNLOAD_FILENAME = \"HUD_ZIPPED.csv\"\n",
+ "    HOUSING_FTP_URL = \"https://www.huduser.gov/portal/datasets/cp/2014thru2018-140-csv.zip\"\n",
+ "    response = requests.get(HOUSING_FTP_URL, verify=True)\n",
+ "    if response.status_code != 200:\n",
+ "        # Report the URL that was actually requested (the previous message referenced an undefined name)\n",
+ "        sys.exit(\n",
+ "            f\"HTTP response {response.status_code} from url {HOUSING_FTP_URL}. Info: {response.content}\"\n",
+ "        )\n",
+ "\n",
+ "    # Write the contents to disk; the context manager closes the handle even if the write fails.\n",
+ "    with open(DOWNLOAD_FILENAME, \"wb\") as file:\n",
+ "        file.write(response.content)\n",
+ "\n",
+ "\n",
+ "def extract_zipped_download(zip_file_path, unzipped_path):\n",
+ "    \"\"\"Extract every member of the archive at zip_file_path into unzipped_path, then delete the archive.\"\"\"\n",
+ "    with zipfile.ZipFile(zip_file_path, \"r\") as zip_ref:\n",
+ "        zip_ref.extractall(unzipped_path)\n",
+ "    # cleanup temporary file\n",
+ "    os.remove(zip_file_path)\n",
+ "\n",
+ "\n",
+ "def up_one_directory(path):\n",
+ "    \"\"\"Move path into the directory two levels above it; do nothing when no such directory exists.\"\"\"\n",
+ "    try:\n",
+ "        # from Python 3.6\n",
+ "        parent_dir = Path(path).parents[1]\n",
+ "        # for Python 3.4/3.5, use str to convert the path to string\n",
+ "        # parent_dir = str(Path(path).parents[1])\n",
+ "        shutil.move(path, parent_dir)\n",
+ "    except IndexError:\n",
+ "        # no upper directory\n",
+ "        pass\n",
+ "\n",
+ "\n",
+ "# Download the archive, unpack it next to the notebook, keep only Table8.csv and remove the rest\n",
+ "CURRENT_DIRECTORY = os.getcwd()\n",
+ "download_hud_dataset()\n",
+ "extract_zipped_download(CURRENT_DIRECTORY + \"/HUD_ZIPPED.csv\", CURRENT_DIRECTORY)\n",
+ "up_one_directory(CURRENT_DIRECTORY + \"/140/Table8.csv\")\n",
+ "shutil.rmtree(\"./140/\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Extreme Housing Burden \n",
+ "\n",
+ "The Extreme Housing Burden indicator represents the proportion of low-income households that have to spend more than half their income on rent. These households experience higher levels of stress, report lower health, and may delay medical treatment because of its high cost.\n",
+ "\n",
+ "The Extreme Housing Burden indicator measures the percent of households in a census tract that are:\n",
+ "\n",
+ "1. Making less than 80% of the Area Median Family Income as determined by the Department of Housing and Urban Development (HUD), and\n",
+ "2. Paying greater than 50% of their income to housing costs. \n",
+ "\n",
+ "This data is sourced from the 2014-2018 Comprehensive Housing Affordability Strategy dataset from the Department of Housing and Urban Development (HUD) using the census tract geographic summary level, and contains cost burdens for households by percent HUD-adjusted median family income (HAMFI) category. This data can be found [here](https://www.huduser.gov/portal/datasets/cp.html). \n",
+ "\n",
+ "Because CHAS data is based on American Community Survey (ACS) estimates, which come from a sample of the population, they may be unreliable if based on a small sample or population size.\n",
+ "\n",
+ "The standard error and relative standard error were used to evaluate the reliability of each estimate using CalEnviroScreen’s methodology. \n",
+ "\n",
+ "Census tract estimates that met either of the following criteria were considered reliable and included in the analysis [(CalEnviroScreen, 2017, page 129)](https://oehha.ca.gov/media/downloads/calenviroscreen/report/ces3report.pdf):\n",
+ "\n",
+ "- Relative standard error less than 50 (meaning the standard error was less than half of the estimate), OR \n",
+ "- Standard error less than the mean standard error of all census tract estimates \n",
+ "\n",
+ "Formulas for calculating the standard error of sums, proportions, and ratios come from the [American Community Survey Office](https://www2.census.gov/programs-surveys/acs/tech_docs/accuracy/MultiyearACSAccuracyofData2013.pdf).\n",
+ "\n",
+ "Note that this code creates a score and rank by state, for every state."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The relevant variables in table 8 of the CHAS dataset are the following (CHAS data dictionary available [here](https://www.huduser.gov/portal/datasets/cp/CHAS-data-dictionary-14-18.xlsx)):\n",
+ "\n",
+ "| Name | Label |\n",
+ "|---------|-----------------------------------------------------|\n",
+ "|T1_est1 | Total Occupied housing units | \n",
+ "|T8_est10 | Owner occupied less than or equal to 30% of HAMFI cost burden greater than 50% |\n",
+ "|T8_est23 |Owner occupied greater than 30% but less than or equal to 50% of HAMFI\tcost burden greater than 50%|\n",
+ "|T8_est36 |Owner occupied\tgreater than 50% but less than or equal to 80% of HAMFI\tcost burden greater than 50%|\n",
+ "|T8_est76 | Renter occupied less than or equal to 30% of HAMFI cost burden greater than 50% |\n",
+ "|T8_est89 |Renter occupied\tgreater than 30% but less than or equal to 50% of HAMFI\tcost burden greater than 50%|\n",
+ "|T8_est102|Renter occupied\tgreater than 50% but less than or equal to 80% of HAMFI\tcost burden greater than 50%|\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Below I also propose an alternate means for ranking census tracts\n",
+ "### These steps are outlined and commented below"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/sarahluw/.pyenv/versions/3.6.2/envs/my-virtual-env-3.6.2/lib/python3.6/site-packages/pandas/core/series.py:726: RuntimeWarning: invalid value encountered in sqrt\n",
+ " result = getattr(ufunc, method)(*inputs, **kwargs)\n",
+ "/Users/sarahluw/.pyenv/versions/3.6.2/envs/my-virtual-env-3.6.2/lib/python3.6/site-packages/pandas/core/indexing.py:670: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " iloc._setitem_with_indexer(indexer, value)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Read in the data from https://www.huduser.gov/portal/datasets/cp.html\n",
+ "housing = pd.read_csv(\"Table8.csv\",\n",
+ "                      encoding = \"ISO-8859-1\",\n",
+ "                      dtype = {'Tract_ID': object, 'st': object, 'geoid': object})\n",
+ "\n",
+ "# Remove Puerto Rico (state FIPS code 72). Taking an explicit copy keeps the column\n",
+ "# assignments below from operating on a slice of the frame that was read in.\n",
+ "housing = housing.loc[housing['st'] != '72'].copy()\n",
+ "\n",
+ "# Combine owner and renter occupied low-income households that make less than 80% of HAMFI into one variable\n",
+ "housing['summed'] = (housing['T8_est10'] +\n",
+ "                     housing['T8_est23'] +\n",
+ "                     housing['T8_est36'] +\n",
+ "                     housing['T8_est76'] +\n",
+ "                     housing['T8_est89'] +\n",
+ "                     housing['T8_est102'])\n",
+ "\n",
+ "# Create a variable for the standard error of the summed variables:\n",
+ "# each margin of error is divided by 1.645 and the results are combined in quadrature\n",
+ "housing['summed_se'] = np.sqrt((housing['T8_moe10'] / 1.645)**2 +\n",
+ "                               (housing['T8_moe23'] / 1.645)**2 +\n",
+ "                               (housing['T8_moe36'] / 1.645)**2 +\n",
+ "                               (housing['T8_moe76'] / 1.645)**2 +\n",
+ "                               (housing['T8_moe89'] / 1.645)**2 +\n",
+ "                               (housing['T8_moe102'] / 1.645)**2)\n",
+ "\n",
+ "# Keep only the trailing 11 characters of the geoid (the tract FIPS code without its prefix)\n",
+ "housing['geoid'] = housing['geoid'].str[-11:]\n",
+ "\n",
+ "# Estimate the proportion of households that are low-income and housing burdened.\n",
+ "# Division by a zero household count yields inf, which is converted to nan; the result is\n",
+ "# assigned back explicitly instead of relying on an in-place replace of a column selection.\n",
+ "housing['hbrd_score'] = (housing['summed'] / housing['T8_est1']).replace(np.inf, np.nan)\n",
+ "\n",
+ "# Create function for calculating the standard error, using the proportions standard error formula\n",
+ "# if the value under the radical is negative, use the ratio standard error formula\n",
+ "def se_prop(x, y, se_x, moe_y):\n",
+ "    \"\"\"Standard error of the proportion x / y.\n",
+ "\n",
+ "    x, y   : numerator and denominator estimates\n",
+ "    se_x   : standard error of the numerator\n",
+ "    moe_y  : margin of error of the denominator (divided by 1.645 to obtain its standard error)\n",
+ "\n",
+ "    Returns a numpy array holding, element by element, the proportion formula\n",
+ "    (difference under the radical) or, where that difference is negative, the\n",
+ "    ratio formula (sum under the radical).\n",
+ "    \"\"\"\n",
+ "    se_y = moe_y / 1.645\n",
+ "    ratio_term = ((x**2)/(y**2))*(se_y**2)\n",
+ "    test = se_x**2 - ratio_term\n",
+ "    # Select the radicand BEFORE taking the square root so that sqrt is never applied\n",
+ "    # to a negative number (which previously emitted a RuntimeWarning)\n",
+ "    radicand = np.where(test < 0, se_x**2 + ratio_term, test)\n",
+ "    return np.asarray((1/y) * np.sqrt(radicand))\n",
+ "\n",
+ "housing['se'] = se_prop(housing['summed'], housing['T8_est1'], housing['summed_se'], housing['T8_moe1'])\n",
+ "\n",
+ "# Calculate the relative standard error (standard error as a percentage of the estimate).\n",
+ "# A housing burden of 0 gives an infinite rse, which is converted to nan.\n",
+ "housing['rse'] = (housing['se'] / housing['hbrd_score'] * 100).replace(np.inf, np.nan)\n",
+ "\n",
+ "# Calculate the mean standard error for each state and broadcast it to every tract of that state\n",
+ "# (a single grouped transform avoids the chained assignment that raised SettingWithCopyWarning)\n",
+ "housing['mean_state_se'] = housing.groupby('st')['se'].transform('mean')\n",
+ "\n",
+ "# Find census tract estimates that fail BOTH reliability criteria and are thus considered unreliable:\n",
+ "# RSE of 50 or more AND\n",
+ "# SE greater than or equal to the mean state SE\n",
+ "# Convert these scores to nan\n",
+ "# NOTE: the second condition compares the standard error itself (not the rse) with the state mean\n",
+ "housing.loc[(housing['rse'] >= 50) & (housing['se'] >= housing['mean_state_se']), 'hbrd_score'] = np.nan\n",
+ "\n",
+ "# Rename columns\n",
+ "housing = housing.rename(columns = {'geoid' :'FIPS_tract_id',\n",
+ "                                    'st' : 'state'\n",
+ "                                   })\n",
+ "\n",
+ "# Calculate percentile rank for census tracts with a score above 0, set percentile to 0 if score is 0, for each state\n",
+ "# (tracts whose score is nan keep a nan rank)\n",
+ "housing['hbrd_rank'] = housing.loc[housing['hbrd_score'] != 0].groupby('state')['hbrd_score'].rank(\n",
+ "    na_option = 'keep',\n",
+ "    pct = True) * 100\n",
+ "\n",
+ "housing.loc[housing['hbrd_score'] == 0, 'hbrd_rank'] = 0\n",
+ "\n",
+ "# Create final housing burden df\n",
+ "housingburden = housing.copy()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
source
\n",
+ "
sumlevel
\n",
+ "
FIPS_tract_id
\n",
+ "
name
\n",
+ "
state
\n",
+ "
cnty
\n",
+ "
tract
\n",
+ "
T8_est1
\n",
+ "
T8_est2
\n",
+ "
T8_est3
\n",
+ "
...
\n",
+ "
T8_moe131
\n",
+ "
T8_moe132
\n",
+ "
T8_moe133
\n",
+ "
summed
\n",
+ "
summed_se
\n",
+ "
hbrd_score
\n",
+ "
se
\n",
+ "
rse
\n",
+ "
mean_state_se
\n",
+ "
hbrd_rank
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
0
\n",
+ "
2014thru2018
\n",
+ "
140
\n",
+ "
01001020100
\n",
+ "
Census Tract 201, Autauga County, Alabama
\n",
+ "
01
\n",
+ "
1
\n",
+ "
20100
\n",
+ "
765
\n",
+ "
570
\n",
+ "
50
\n",
+ "
...
\n",
+ "
12
\n",
+ "
12
\n",
+ "
12
\n",
+ "
80
\n",
+ "
31.721807
\n",
+ "
0.104575
\n",
+ "
0.041032
\n",
+ "
39.237314
\n",
+ "
0.036604
\n",
+ "
46.298077
\n",
+ "
\n",
+ "
\n",
+ "
1
\n",
+ "
2014thru2018
\n",
+ "
140
\n",
+ "
01001020200
\n",
+ "
Census Tract 202, Autauga County, Alabama
\n",
+ "
01
\n",
+ "
1
\n",
+ "
20200
\n",
+ "
720
\n",
+ "
465
\n",
+ "
65
\n",
+ "
...
\n",
+ "
12
\n",
+ "
12
\n",
+ "
12
\n",
+ "
138
\n",
+ "
45.531874
\n",
+ "
0.191667
\n",
+ "
0.061614
\n",
+ "
32.146659
\n",
+ "
0.036604
\n",
+ "
83.269231
\n",
+ "
\n",
+ "
\n",
+ "
2
\n",
+ "
2014thru2018
\n",
+ "
140
\n",
+ "
01001020300
\n",
+ "
Census Tract 203, Autauga County, Alabama
\n",
+ "
01
\n",
+ "
1
\n",
+ "
20300
\n",
+ "
1295
\n",
+ "
840
\n",
+ "
60
\n",
+ "
...
\n",
+ "
12
\n",
+ "
12
\n",
+ "
12
\n",
+ "
170
\n",
+ "
53.722921
\n",
+ "
0.131274
\n",
+ "
0.040927
\n",
+ "
31.176999
\n",
+ "
0.036604
\n",
+ "
63.653846
\n",
+ "
\n",
+ "
\n",
+ "
3
\n",
+ "
2014thru2018
\n",
+ "
140
\n",
+ "
01001020400
\n",
+ "
Census Tract 204, Autauga County, Alabama
\n",
+ "
01
\n",
+ "
1
\n",
+ "
20400
\n",
+ "
1640
\n",
+ "
1260
\n",
+ "
15
\n",
+ "
...
\n",
+ "
12
\n",
+ "
12
\n",
+ "
12
\n",
+ "
145
\n",
+ "
46.288510
\n",
+ "
0.088415
\n",
+ "
0.027822
\n",
+ "
31.467397
\n",
+ "
0.036604
\n",
+ "
34.615385
\n",
+ "
\n",
+ "
\n",
+ "
4
\n",
+ "
2014thru2018
\n",
+ "
140
\n",
+ "
01001020500
\n",
+ "
Census Tract 205, Autauga County, Alabama
\n",
+ "
01
\n",
+ "
1
\n",
+ "
20500
\n",
+ "
4175
\n",
+ "
2320
\n",
+ "
175
\n",
+ "
...
\n",
+ "
17
\n",
+ "
17
\n",
+ "
17
\n",
+ "
595
\n",
+ "
147.221693
\n",
+ "
0.142515
\n",
+ "
0.034760
\n",
+ "
24.390193
\n",
+ "
0.036604
\n",
+ "
68.221154
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 280 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " source sumlevel FIPS_tract_id \\\n",
+ "0 2014thru2018 140 01001020100 \n",
+ "1 2014thru2018 140 01001020200 \n",
+ "2 2014thru2018 140 01001020300 \n",
+ "3 2014thru2018 140 01001020400 \n",
+ "4 2014thru2018 140 01001020500 \n",
+ "\n",
+ " name state cnty tract T8_est1 \\\n",
+ "0 Census Tract 201, Autauga County, Alabama 01 1 20100 765 \n",
+ "1 Census Tract 202, Autauga County, Alabama 01 1 20200 720 \n",
+ "2 Census Tract 203, Autauga County, Alabama 01 1 20300 1295 \n",
+ "3 Census Tract 204, Autauga County, Alabama 01 1 20400 1640 \n",
+ "4 Census Tract 205, Autauga County, Alabama 01 1 20500 4175 \n",
+ "\n",
+ " T8_est2 T8_est3 ... T8_moe131 T8_moe132 T8_moe133 summed summed_se \\\n",
+ "0 570 50 ... 12 12 12 80 31.721807 \n",
+ "1 465 65 ... 12 12 12 138 45.531874 \n",
+ "2 840 60 ... 12 12 12 170 53.722921 \n",
+ "3 1260 15 ... 12 12 12 145 46.288510 \n",
+ "4 2320 175 ... 17 17 17 595 147.221693 \n",
+ "\n",
+ " hbrd_score se rse mean_state_se hbrd_rank \n",
+ "0 0.104575 0.041032 39.237314 0.036604 46.298077 \n",
+ "1 0.191667 0.061614 32.146659 0.036604 83.269231 \n",
+ "2 0.131274 0.040927 31.176999 0.036604 63.653846 \n",
+ "3 0.088415 0.027822 31.467397 0.036604 34.615385 \n",
+ "4 0.142515 0.034760 24.390193 0.036604 68.221154 \n",
+ "\n",
+ "[5 rows x 280 columns]"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "housingburden.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(73056, 280)"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "housingburden.shape"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### As desired we see a uniform distribution for the percentile rank for burdened households"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Now we compute for a baseline comparison "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Owner occupied numerator fields\n",
+ "OWNER_OCCUPIED_NUMERATOR_FIELDS = [\n",
+ " # Column Name\n",
+ " # Line_Type\n",
+ " # Tenure\n",
+ " # Household income\n",
+ " # Cost burden\n",
+ " # Facilities\n",
+ " \"T8_est7\",\n",
+ " # Subtotal\n",
+ " # Owner occupied\n",
+ " # less than or equal to 30% of HAMFI\n",
+ " # greater than 30% but less than or equal to 50%\n",
+ " # All\n",
+ " \"T8_est10\",\n",
+ " # Subtotal\n",
+ " # Owner occupied\n",
+ " # less than or equal to 30% of HAMFI\n",
+ " # greater than 50%\n",
+ " # All\n",
+ " \"T8_est20\",\n",
+ " \n",
+ " # Subtotal\n",
+ " # Owner occupied\n",
+ " # greater than 30% but less than or equal to 50% of HAMFI\n",
+ " # greater than 30% but less than or equal to 50%\n",
+ " # All\n",
+ " \"T8_est23\",\n",
+ " # Subtotal\n",
+ " # Owner occupied\n",
+ " # greater than 30% but less than or equal to 50% of HAMFI\n",
+ " # greater than 50%\n",
+ " # All\n",
+ " \"T8_est33\",\n",
+ " # Subtotal\n",
+ " # Owner occupied\n",
+ " # greater than 50% but less than or equal to 80% of HAMFI\n",
+ " # greater than 30% but less than or equal to 50%\n",
+ " # All\n",
+ " \"T8_est36\",\n",
+ " # Subtotal\n",
+ " # Owner occupied\n",
+ " # greater than 50% but less than or equal to 80% of HAMFI\n",
+ " # greater than 50%\n",
+ " # All\n",
+ "]\n",
+ "\n",
+ "# These rows have the values where HAMFI was not computed, b/c of no or negative income.\n",
+ "OWNER_OCCUPIED_NOT_COMPUTED_FIELDS = [\n",
+ " # Column Name\n",
+ " # Line_Type\n",
+ " # Tenure\n",
+ " # Household income\n",
+ " # Cost burden\n",
+ " # Facilities\n",
+ " \"T8_est13\",\n",
+ " # Subtotal\n",
+ " # Owner occupied\n",
+ " # less than or equal to 30% of HAMFI\n",
+ " # not computed (no/negative income)\n",
+ " # All\n",
+ " \"T8_est26\",\n",
+ " # Subtotal\n",
+ " # Owner occupied\n",
+ " # greater than 30% but less than or equal to 50% of HAMFI\n",
+ " # not computed (no/negative income)\n",
+ " # All\n",
+ " \"T8_est39\",\n",
+ " # Subtotal\n",
+ " # Owner occupied\n",
+ " # greater than 50% but less than or equal to 80% of HAMFI\n",
+ " # not computed (no/negative income)\n",
+ " # All\n",
+ " \"T8_est52\",\n",
+ " # Subtotal\n",
+ " # Owner occupied\n",
+ " # greater than 80% but less than or equal to 100% of HAMFI\n",
+ " # not computed (no/negative income)\n",
+ " # All\n",
+ " \"T8_est65\",\n",
+ " # Subtotal\n",
+ " # Owner occupied\n",
+ " # greater than 100% of HAMFI\n",
+ " # not computed (no/negative income)\n",
+ " # All\n",
+ "]\n",
+ "\n",
+ "OWNER_OCCUPIED_POPULATION_FIELD = \"T8_est2\"\n",
+ "# Subtotal\n",
+ "# Owner occupied\n",
+ "# All\n",
+ "# All\n",
+ "# All\n",
+ "\n",
+ "OWNER_OCCUPIED_POPULATION_HAMFI_FIELD = \"T8_est3\"\n",
+ "# Subtotal\n",
+ "# Owner occupied \n",
+ "# All\n",
+ "# All\n",
+ "# All\n",
+ "\n",
+ "# Renter occupied numerator fields\n",
+ "RENTER_OCCUPIED_NUMERATOR_FIELDS = [\n",
+ " # Column Name\n",
+ " # Line_Type\n",
+ " # Tenure\n",
+ " # Household income\n",
+ " # Cost burden\n",
+ " # Facilities\n",
+ " \"T8_est73\",\n",
+ " # Subtotal\n",
+ " # Renter occupied\n",
+ " # less than or equal to 30% of HAMFI\n",
+ " # greater than 30% but less than or equal to 50%\n",
+ " # All\n",
+ " \"T8_est76\",\n",
+ " # Subtotal\n",
+ " # Renter occupied\n",
+ " # less than or equal to 30% of HAMFI\n",
+ " # greater than 50%\n",
+ " # All\n",
+ " \"T8_est86\",\n",
+ " # Subtotal\n",
+ " # Renter occupied\n",
+ " # greater than 30% but less than or equal to 50% of HAMFI\n",
+ " # greater than 30% but less than or equal to 50%\n",
+ " # All\n",
+ " \"T8_est89\",\n",
+ " # Subtotal\n",
+ " # Renter occupied\n",
+ " # greater than 30% but less than or equal to 50% of HAMFI\n",
+ " # greater than 50%\n",
+ " # All\n",
+ " \"T8_est99\",\n",
+ " # Subtotal\n",
+ " # Renter occupied\tgreater than 50% but less than or equal to 80% of HAMFI\n",
+ " # greater than 30% but less than or equal to 50%\n",
+ " # All\n",
+ " \"T8_est102\",\n",
+ " # Subtotal\n",
+ " # Renter occupied\n",
+ " # greater than 50% but less than or equal to 80% of HAMFI\n",
+ " # greater than 50%\n",
+ " # All\n",
+ "]\n",
+ "\n",
+ "# These rows have the values where HAMFI was not computed, b/c of no or negative income.\n",
+ "RENTER_OCCUPIED_NOT_COMPUTED_FIELDS = [\n",
+ " # Column Name\n",
+ " # Line_Type\n",
+ " # Tenure\n",
+ " # Household income\n",
+ " # Cost burden\n",
+ " # Facilities\n",
+ " \"T8_est79\",\n",
+ " # Subtotal\n",
+ " # Renter occupied\tless than or equal to 30% of HAMFI\n",
+ " # not computed (no/negative income)\n",
+ " # All\n",
+ " \"T8_est92\",\n",
+ " # Subtotal\n",
+ " # Renter occupied\tgreater than 30% but less than or equal to 50% of HAMFI\n",
+ " # not computed (no/negative income)\n",
+ " # All\n",
+ " \"T8_est105\",\n",
+ " # Subtotal\n",
+ " # Renter occupied\n",
+ " # greater than 50% but less than or equal to 80% of HAMFI\n",
+ " # not computed (no/negative income)\n",
+ " # All\n",
+ " \"T8_est118\",\n",
+ " # Subtotal\n",
+ " # Renter occupied\tgreater than 80% but less than or equal to 100% of HAMFI\n",
+ " # not computed (no/negative income)\n",
+ " # All\n",
+ " \"T8_est131\",\n",
+ " # Subtotal\n",
+ " # Renter occupied\n",
+ " # greater than 100% of HAMFI\n",
+ " # not computed (no/negative income)\n",
+ " # All\n",
+ "]\n",
+ "\n",
+ "# T8_est68\tSubtotalRenter occupied\tAll\tAll\tAll\n",
+ "RENTER_OCCUPIED_POPULATION_FIELD = \"T8_est68\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "housingburden[\"current_summed_methodology\"] = housingburden[\n",
+ " OWNER_OCCUPIED_NUMERATOR_FIELDS\n",
+ "].sum(axis=1) + housingburden[RENTER_OCCUPIED_NUMERATOR_FIELDS].sum(axis=1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "housingburden[\"current_methodology_denominator\"] = (\n",
+ " housingburden[OWNER_OCCUPIED_POPULATION_FIELD]\n",
+ " + housingburden[RENTER_OCCUPIED_POPULATION_FIELD]\n",
+ " - housingburden[OWNER_OCCUPIED_NOT_COMPUTED_FIELDS].sum(axis=1)\n",
+ " - housingburden[RENTER_OCCUPIED_NOT_COMPUTED_FIELDS].sum(axis=1)\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "housingburden[\"current_methodology_denominator_sans_not_computed\"] = (\n",
+ " housingburden[OWNER_OCCUPIED_POPULATION_FIELD]\n",
+ " + housingburden[RENTER_OCCUPIED_POPULATION_FIELD]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "housingburden[\"current_methodology_percent\"] = np.round(\n",
+ " (housingburden[\"current_summed_methodology\"] / housingburden[\"current_methodology_denominator\"] ), 2) * 100"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Now we construct the distribution of differences in the number of owned and rented burdened households\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Percentiles Comparison"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Select the comparison columns into an independent frame. The explicit .copy()\n",
+ "# means the derived columns assigned in later cells are written to final_df itself\n",
+ "# rather than to a slice of housingburden (which triggers SettingWithCopyWarning).\n",
+ "final_df = housingburden[\n",
+ "    [\n",
+ "        'FIPS_tract_id',\n",
+ "        'state',\n",
+ "        'hbrd_rank',\n",
+ "        'hbrd_score',\n",
+ "        'summed',\n",
+ "        'current_summed_methodology',\n",
+ "        'T8_est1',\n",
+ "        'current_methodology_denominator_sans_not_computed',\n",
+ "        'current_methodology_denominator',\n",
+ "        'current_methodology_percent',\n",
+ "    ]\n",
+ "].copy()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "##### First notice here that **T8_est1** and **current_methodology_denominator** should represent the same or similar aggregates. In general, we can see that the current computation results in a differential that undercounts the total owner-occupied and renter-occupied households."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/sarahluw/.pyenv/versions/3.6.2/envs/my-virtual-env-3.6.2/lib/python3.6/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " \n"
+ ]
+ }
+ ],
+ "source": [
+ "final_df[\"differences_aggregate_denominator\"] = (\n",
+ " final_df[\"current_methodology_denominator\"] - final_df[\"T8_est1\"] \n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/sarahluw/.pyenv/versions/3.6.2/envs/my-virtual-env-3.6.2/lib/python3.6/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " \n"
+ ]
+ }
+ ],
+ "source": [
+ "# Difference between the denominator built WITHOUT subtracting the \"not computed\"\n",
+ "# fields (owner + renter subtotals only) and T8_est1. The variant that does subtract\n",
+ "# them is already stored in \"differences_aggregate_denominator\".\n",
+ "final_df[\"differences_aggregate_denominator_sans_not_computed\"] = (\n",
+ "    final_df[\"current_methodology_denominator_sans_not_computed\"] - final_df[\"T8_est1\"]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "
"
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "plt.figure(figsize=(12, 8))\n",
+ "plt.title('Distribution of differences between aggregate totals that normalizes tabulation of poverty households (with removal of not computed fields) ')\n",
+ "# Set x-axis label\n",
+ "plt.xlabel('Aggregate differences in total owner and renter occupied low-income households')\n",
+ "# Set y-axis label\n",
+ "plt.ylabel('Relative Frequency in Support')\n",
+ "\n",
+ "sns.histplot(final_df[\"differences_aggregate_denominator_sans_not_computed\"])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/sarahluw/.pyenv/versions/3.6.2/envs/my-virtual-env-3.6.2/lib/python3.6/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " after removing the cwd from sys.path.\n"
+ ]
+ }
+ ],
+ "source": [
+ "final_df[\"current_methodology_percentile_rank\"] = final_df[\"current_methodology_percent\"].rank(\n",
+ " pct=True,\n",
+ " # Set ascending to the parameter value.\n",
+ " ascending=True,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/sarahluw/.pyenv/versions/3.6.2/envs/my-virtual-env-3.6.2/lib/python3.6/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " \"\"\"Entry point for launching an IPython kernel.\n",
+ "/Users/sarahluw/.pyenv/versions/3.6.2/envs/my-virtual-env-3.6.2/lib/python3.6/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " after removing the cwd from sys.path.\n"
+ ]
+ }
+ ],
+ "source": [
+ "final_df[\"new_threshold_exceeded\"] = (final_df['hbrd_rank'] >= 90)\n",
+ "\n",
+ "final_df[\"current_threshold_exceeded\"] = (final_df[\n",
+ " 'current_methodology_percentile_rank'] >= 0.90)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Only include non-NA tracts for comparison purposes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# first save NA tracts that were considered unreliable\n",
+ "ineligible_tracts = list(final_df[final_df[\"hbrd_rank\"].isna()][\"FIPS_tract_id\"].values)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "5243"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "len(ineligible_tracts)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### How many tracts are ineligible according to CalEnviroScreen but are considered in Score L?\n",
+ "\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "final_current_methodology = final_df[final_df[\"current_methodology_percentile_rank\"] >= 0.90]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(7323, 15)"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "final_current_methodology.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(53, 15)"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# 53 tracts\n",
+ "final_current_methodology[\n",
+ " final_current_methodology.FIPS_tract_id.isin(ineligible_tracts)].shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "non_null_df = final_df.copy()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(7323, 15)"
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# number of tracts eligible\n",
+ "non_null_df[non_null_df[\"current_methodology_percentile_rank\"] >= 0.90].shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "
"
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "plt.figure(figsize=(12, 8))\n",
+ "plt.title(\"Distribution of Percentiles for Housing Burden (Score L)\")\n",
+ "# Set x-axis label\n",
+ "plt.xlabel('Percentile (although currently not represented as a percentage)')\n",
+ "# Set y-axis label\n",
+ "plt.ylabel('Relative Frequency in Support')\n",
+ "\n",
+ "sns.histplot(non_null_df[\"current_methodology_percentile_rank\"])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "False 65733\n",
+ "True 7323\n",
+ "Name: current_threshold_exceeded, dtype: int64"
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "non_null_df[\"current_threshold_exceeded\"].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "False 66255\n",
+ "True 6801\n",
+ "Name: new_threshold_exceeded, dtype: int64"
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "non_null_df[\"new_threshold_exceeded\"].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Tracts burdened under the Score L criterion but not under CalEnviroScreen"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(2794, 15)"
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# this includes ineligible (np.nan) tracts too\n",
+ "predicate_1 = (non_null_df['current_threshold_exceeded'] == True) & (non_null_df['new_threshold_exceeded'] != True)\n",
+ "\n",
+ "non_null_df[predicate_1].shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Where Score L is considered burdened but not in CalEnviroScreen\n",
+ "score_l_considered_burdened = non_null_df[predicate_1]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import us\n",
+ "\n",
+ "mapping = us.states.mapping('fips', 'abbr')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Inserted after the basic stats definition.\n",
+ "# Load demographic data\n",
+ "import pathlib\n",
+ "\n",
+ "DATA_DIR = pathlib.Path.cwd().parent / \"data\"\n",
+ "COMPARISON_OUTPUTS_DIR = DATA_DIR / \"comparison_outputs\"\n",
+ "\n",
+ "demographics_path = DATA_DIR / \"dataset\" / \"census_acs_2019\" / \"usa.csv\"\n",
+ "\n",
+ "demographics_df = pd.read_csv(\n",
+ " demographics_path,\n",
+ " dtype={\"GEOID10_TRACT\": \"string\"},\n",
+ ")\n",
+ "\n",
+ "# Set some field names\n",
+ "BLACK_FIELD_NAME = \"Black or African American alone\"\n",
+ "AMERICAN_INDIAN_FIELD_NAME = \"American Indian and Alaska Native alone\"\n",
+ "ASIAN_FIELD_NAME = \"Asian alone\"\n",
+ "HAWAIIAN_FIELD_NAME = \"Native Hawaiian and Other Pacific alone\"\n",
+ "TWO_OR_MORE_RACES_FIELD_NAME = \"Two or more races\"\n",
+ "NON_HISPANIC_WHITE_FIELD_NAME = \"Non-Hispanic White\"\n",
+ "HISPANIC_FIELD_NAME = \"Hispanic or Latino\"\n",
+ "PERCENT_PREFIX = \"Percent \"\n",
+ "\n",
+ "RE_OUTPUT_FIELDS = [\n",
+ " BLACK_FIELD_NAME,\n",
+ " AMERICAN_INDIAN_FIELD_NAME,\n",
+ " ASIAN_FIELD_NAME,\n",
+ " HAWAIIAN_FIELD_NAME,\n",
+ " TWO_OR_MORE_RACES_FIELD_NAME,\n",
+ " NON_HISPANIC_WHITE_FIELD_NAME,\n",
+ " HISPANIC_FIELD_NAME,\n",
+ "]\n",
+ "\n",
+ "RE_PERCENT_OUTPUT_FIELDS = [PERCENT_PREFIX + field for field in RE_OUTPUT_FIELDS]\n",
+ "\n",
+ "columns_to_keep = (\n",
+ " [\"GEOID10_TRACT\"]\n",
+ " + RE_OUTPUT_FIELDS\n",
+ " + RE_PERCENT_OUTPUT_FIELDS\n",
+ " + ['Percent of individuals < 200% Federal Poverty Line', \n",
+ " 'Median value ($) of owner-occupied housing units',\n",
+ " 'Percent individuals age 25 or over with less than high school degree',\n",
+ " 'Percent enrollment in college or graduate school',\n",
+ " 'Linguistic isolation (percent)']\n",
+ ")\n",
+ "\n",
+ "\n",
+ "def highlight_deviations(s):\n",
+ " # unbiased estimate of variance N-1\n",
+ " # \n",
+ " is_greater_than_std = s > np.var(s, ddof=1) \n",
+ " return ['color: pink; background-color:#7272FE'\n",
+ " if cell else '' for cell in is_greater_than_std]\n",
+ "# Join the demographics in.\n",
+ "merged_df_score_l = score_l_considered_burdened.merge(\n",
+ " demographics_df[columns_to_keep],\n",
+ " left_on=\"FIPS_tract_id\",\n",
+ " right_on=\"GEOID10_TRACT\",\n",
+ " how=\"inner\"\n",
+ ")\n",
+ "\n",
+ "# these are not converted into percent 0 - 100 scale\n",
+ "percent_cols = [x for x in merged_df_score_l.columns if \n",
+ " 'Percent' in x or '(percent)' in x\n",
+ " ]\n",
+ "\n",
+ "merged_df_score_l[\n",
+ " percent_cols] = merged_df_score_l[\n",
+ " percent_cols].apply(lambda x: x * 100)\n",
+ "\n",
+ "# Translate each tract's state FIPS code into its postal abbreviation in one\n",
+ "# vectorized lookup instead of a row-by-row iterrows/.loc loop. Codes that are not\n",
+ "# keys of `mapping` become NaN, and the column is created even for an empty frame.\n",
+ "merged_df_score_l['state_name'] = (\n",
+ "    merged_df_score_l['state'].astype(str).map(mapping)\n",
+ ")\n",
+ "\n",
+ "grouped_stats_score_l = merged_df_score_l.groupby([\"state_name\"]).agg({\n",
+ " 'GEOID10_TRACT': ['nunique'],\n",
+ " 'Percent of individuals < 200% Federal Poverty Line': [np.median, np.std],\n",
+ " 'Median value ($) of owner-occupied housing units': [np.median, np.std],\n",
+ " 'Percent individuals age 25 or over with less than high school degree': [np.median, np.std],\n",
+ " 'Percent enrollment in college or graduate school': [np.median, np.std],\n",
+ " 'Percent Black or African American alone': [np.median, np.std],\n",
+ " 'Percent American Indian and Alaska Native alone': [np.median, np.std],\n",
+ " 'Percent Non-Hispanic White': [np.median, np.std], \n",
+ " 'Linguistic isolation (percent)': [np.median, np.std],\n",
+ " 'Percent Hispanic or Latino': [np.median, np.std],\n",
+ " 'hbrd_rank': [np.median, np.std],\n",
+ " 'current_methodology_percent': [np.median, np.std],\n",
+ " 'current_summed_methodology': [np.median, np.std, np.sum]\n",
+ "}).reset_index()\n",
+ "\n",
+ "\n",
+ "grouped_stats_score_l.columns = [' '.join(col).strip() for \n",
+ " col in grouped_stats_score_l.columns.values]\n",
+ "\n",
+ "grouped_stats_score_l = grouped_stats_score_l[[x for x in grouped_stats_score_l \n",
+ " if \"std\" in x and \n",
+ " \"Percent\" in x] + [\"state_name\", \"current_summed_methodology std\"]]\n",
+ "\n",
+ "grouped_stats_score_l.set_index(\"state_name\", inplace=True)\n",
+ "\n",
+ "grouped_stats_score_l = grouped_stats_score_l.rename(columns={\n",
+ "'Percent of individuals < 200% Federal Poverty Line std'\n",
+ " : 'Percent of individuals < 200% Federal Poverty Line (variance across all tracts)', \n",
+ "'Percent individuals age 25 or over with less than high school degree std':\n",
+ "'Percent individuals age 25 or over with less than high school degree (variance across all tracts)', \n",
+ "'Percent enrollment in college or graduate school std'\n",
+ " :'Percent enrollment in college or graduate school (variance across all tracts)',\n",
+ "'Percent Black or African American alone std':\n",
+ " 'Percent Black or African American alone (variance across all tracts)',\n",
+ "'Percent American Indian and Alaska Native alone std':\n",
+ " 'Percent American Indian and Alaska Native alone (variance across all tracts)',\n",
+ "'Percent Non-Hispanic White std':\n",
+ " 'Percent Non-Hispanic White (variance across all tracts)',\n",
+ "'Percent Hispanic or Latino std':\n",
+ " 'Percent Hispanic or Latino (variance across all tracts)',\n",
+ "'GEOID10_TRACT std': \"Total Number of Unique Tracts\",\n",
+ "\"current_summed_methodology std\": \"Variance: Owned and Rented Burdened Households (Current Aggregation Methodology)\" \n",
+ "})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
Percent of individuals < 200% Federal Poverty Line (variance across all tracts)
Percent individuals age 25 or over with less than high school degree (variance across all tracts)
Percent enrollment in college or graduate school (variance across all tracts)
Percent Black or African American alone (variance across all tracts)
Percent American Indian and Alaska Native alone (variance across all tracts)
Percent Non-Hispanic White (variance across all tracts)
Percent Hispanic or Latino (variance across all tracts)
Variance: Owned and Rented Burdened Households (Current Aggregation Methodology)
state_name
\n",
+ "
\n",
+ "
AK
\n",
+ "
nan
\n",
+ "
nan
\n",
+ "
nan
\n",
+ "
nan
\n",
+ "
nan
\n",
+ "
nan
\n",
+ "
nan
\n",
+ "
nan
\n",
+ "
\n",
+ "
\n",
+ "
AL
\n",
+ "
13.096420
\n",
+ "
6.644986
\n",
+ "
18.768604
\n",
+ "
22.959179
\n",
+ "
0.223087
\n",
+ "
19.330296
\n",
+ "
8.007999
\n",
+ "
270.819158
\n",
+ "
\n",
+ "
\n",
+ "
AR
\n",
+ "
10.127382
\n",
+ "
2.668122
\n",
+ "
18.672822
\n",
+ "
27.711878
\n",
+ "
0.128748
\n",
+ "
25.898087
\n",
+ "
10.756406
\n",
+ "
209.143292
\n",
+ "
\n",
+ "
\n",
+ "
AZ
\n",
+ "
10.486301
\n",
+ "
12.746237
\n",
+ "
10.384138
\n",
+ "
6.623942
\n",
+ "
3.516648
\n",
+ "
20.546898
\n",
+ "
22.871205
\n",
+ "
256.224598
\n",
+ "
\n",
+ "
\n",
+ "
CA
\n",
+ "
12.549906
\n",
+ "
14.188819
\n",
+ "
6.086854
\n",
+ "
10.530709
\n",
+ "
1.368453
\n",
+ "
15.571862
\n",
+ "
23.690174
\n",
+ "
256.209407
\n",
+ "
\n",
+ "
\n",
+ "
CO
\n",
+ "
10.722463
\n",
+ "
11.736295
\n",
+ "
3.463817
\n",
+ "
10.505513
\n",
+ "
1.945477
\n",
+ "
14.866184
\n",
+ "
20.168076
\n",
+ "
332.327502
\n",
+ "
\n",
+ "
\n",
+ "
CT
\n",
+ "
15.465593
\n",
+ "
8.918645
\n",
+ "
15.309077
\n",
+ "
15.980431
\n",
+ "
1.014369
\n",
+ "
18.551638
\n",
+ "
16.610853
\n",
+ "
308.514106
\n",
+ "
\n",
+ "
\n",
+ "
DC
\n",
+ "
8.961433
\n",
+ "
5.354725
\n",
+ "
9.025222
\n",
+ "
15.403556
\n",
+ "
0.762443
\n",
+ "
6.219621
\n",
+ "
11.580841
\n",
+ "
205.735892
\n",
+ "
\n",
+ "
\n",
+ "
DE
\n",
+ "
14.287035
\n",
+ "
5.871034
\n",
+ "
1.561105
\n",
+ "
39.316172
\n",
+ "
0.528421
\n",
+ "
23.042214
\n",
+ "
16.399730
\n",
+ "
186.876965
\n",
+ "
\n",
+ "
\n",
+ "
FL
\n",
+ "
11.714190
\n",
+ "
10.517453
\n",
+ "
11.329509
\n",
+ "
28.076545
\n",
+ "
1.098837
\n",
+ "
22.576143
\n",
+ "
27.968331
\n",
+ "
336.712451
\n",
+ "
\n",
+ "
\n",
+ "
GA
\n",
+ "
12.454072
\n",
+ "
10.852123
\n",
+ "
10.962984
\n",
+ "
28.469164
\n",
+ "
1.329249
\n",
+ "
15.984924
\n",
+ "
20.115423
\n",
+ "
397.215028
\n",
+ "
\n",
+ "
\n",
+ "
HI
\n",
+ "
13.353624
\n",
+ "
10.272067
\n",
+ "
4.015181
\n",
+ "
1.586014
\n",
+ "
0.424435
\n",
+ "
12.917891
\n",
+ "
7.466775
\n",
+ "
234.634661
\n",
+ "
\n",
+ "
\n",
+ "
IA
\n",
+ "
nan
\n",
+ "
nan
\n",
+ "
nan
\n",
+ "
nan
\n",
+ "
nan
\n",
+ "
nan
\n",
+ "
nan
\n",
+ "
nan
\n",
+ "
\n",
+ "
\n",
+ "
ID
\n",
+ "
2.004231
\n",
+ "
2.748204
\n",
+ "
11.360449
\n",
+ "
0.175057
\n",
+ "
0.037512
\n",
+ "
22.369833
\n",
+ "
19.631435
\n",
+ "
659.730627
\n",
+ "
\n",
+ "
\n",
+ "
IL
\n",
+ "
13.006770
\n",
+ "
12.071458
\n",
+ "
13.097981
\n",
+ "
36.591041
\n",
+ "
0.845755
\n",
+ "
19.745136
\n",
+ "
33.788104
\n",
+ "
262.443072
\n",
+ "
\n",
+ "
\n",
+ "
IN
\n",
+ "
15.355784
\n",
+ "
8.296981
\n",
+ "
31.632866
\n",
+ "
28.682654
\n",
+ "
0.642208
\n",
+ "
27.218018
\n",
+ "
7.313774
\n",
+ "
427.114975
\n",
+ "
\n",
+ "
\n",
+ "
KS
\n",
+ "
0.105077
\n",
+ "
14.301996
\n",
+ "
0.843603
\n",
+ "
12.411843
\n",
+ "
0.074661
\n",
+ "
0.480065
\n",
+ "
6.454849
\n",
+ "
308.298557
\n",
+ "
\n",
+ "
\n",
+ "
KY
\n",
+ "
6.181253
\n",
+ "
9.634419
\n",
+ "
38.657259
\n",
+ "
11.530427
\n",
+ "
0.635681
\n",
+ "
12.385339
\n",
+ "
10.232876
\n",
+ "
336.240655
\n",
+ "
\n",
+ "
\n",
+ "
LA
\n",
+ "
12.026310
\n",
+ "
10.236242
\n",
+ "
14.053154
\n",
+ "
20.922732
\n",
+ "
0.560395
\n",
+ "
14.902354
\n",
+ "
9.279719
\n",
+ "
305.110126
\n",
+ "
\n",
+ "
\n",
+ "
MA
\n",
+ "
11.892144
\n",
+ "
8.977388
\n",
+ "
13.075436
\n",
+ "
20.118430
\n",
+ "
1.210053
\n",
+ "
22.399207
\n",
+ "
20.572927
\n",
+ "
253.379191
\n",
+ "
\n",
+ "
\n",
+ "
MD
\n",
+ "
13.540719
\n",
+ "
8.718223
\n",
+ "
8.779790
\n",
+ "
27.535370
\n",
+ "
2.603511
\n",
+ "
19.685054
\n",
+ "
14.383766
\n",
+ "
259.769712
\n",
+ "
\n",
+ "
\n",
+ "
ME
\n",
+ "
nan
\n",
+ "
nan
\n",
+ "
nan
\n",
+ "
nan
\n",
+ "
nan
\n",
+ "
nan
\n",
+ "
nan
\n",
+ "
nan
\n",
+ "
\n",
+ "
\n",
+ "
MI
\n",
+ "
12.206230
\n",
+ "
10.414820
\n",
+ "
8.344255
\n",
+ "
32.771988
\n",
+ "
1.274467
\n",
+ "
23.474760
\n",
+ "
15.933231
\n",
+ "
243.462106
\n",
+ "
\n",
+ "
\n",
+ "
MN
\n",
+ "
12.561878
\n",
+ "
12.686021
\n",
+ "
2.782432
\n",
+ "
6.117358
\n",
+ "
2.959547
\n",
+ "
5.976334
\n",
+ "
7.635743
\n",
+ "
372.645274
\n",
+ "
\n",
+ "
\n",
+ "
MO
\n",
+ "
13.801131
\n",
+ "
6.159233
\n",
+ "
28.603340
\n",
+ "
32.635864
\n",
+ "
0.684768
\n",
+ "
27.405787
\n",
+ "
8.255299
\n",
+ "
432.829801
\n",
+ "
\n",
+ "
\n",
+ "
MS
\n",
+ "
12.689076
\n",
+ "
14.917930
\n",
+ "
3.427233
\n",
+ "
31.276492
\n",
+ "
0.196270
\n",
+ "
18.861527
\n",
+ "
9.070286
\n",
+ "
261.432326
\n",
+ "
\n",
+ "
\n",
+ "
NC
\n",
+ "
9.108144
\n",
+ "
11.824309
\n",
+ "
27.548123
\n",
+ "
21.370015
\n",
+ "
0.743425
\n",
+ "
20.209928
\n",
+ "
15.247821
\n",
+ "
347.903709
\n",
+ "
\n",
+ "
\n",
+ "
NE
\n",
+ "
13.801541
\n",
+ "
31.910941
\n",
+ "
5.980482
\n",
+ "
5.475750
\n",
+ "
0.314786
\n",
+ "
26.539885
\n",
+ "
35.131407
\n",
+ "
280.014285
\n",
+ "
\n",
+ "
\n",
+ "
NH
\n",
+ "
13.905494
\n",
+ "
5.839268
\n",
+ "
2.002443
\n",
+ "
6.864877
\n",
+ "
0.045579
\n",
+ "
19.519998
\n",
+ "
9.904841
\n",
+ "
107.145929
\n",
+ "
\n",
+ "
\n",
+ "
NJ
\n",
+ "
12.751987
\n",
+ "
10.917427
\n",
+ "
7.627366
\n",
+ "
29.765513
\n",
+ "
0.828170
\n",
+ "
20.250063
\n",
+ "
26.757247
\n",
+ "
250.668671
\n",
+ "
\n",
+ "
\n",
+ "
NV
\n",
+ "
8.890420
\n",
+ "
13.933522
\n",
+ "
9.220581
\n",
+ "
8.618445
\n",
+ "
1.541162
\n",
+ "
12.816087
\n",
+ "
20.475023
\n",
+ "
281.744159
\n",
+ "
\n",
+ "
\n",
+ "
NY
\n",
+ "
15.367349
\n",
+ "
10.482482
\n",
+ "
9.360769
\n",
+ "
27.298293
\n",
+ "
1.159137
\n",
+ "
23.472950
\n",
+ "
24.695955
\n",
+ "
355.029729
\n",
+ "
\n",
+ "
\n",
+ "
OH
\n",
+ "
12.115013
\n",
+ "
7.563112
\n",
+ "
17.611845
\n",
+ "
28.132683
\n",
+ "
0.520454
\n",
+ "
22.310130
\n",
+ "
8.078694
\n",
+ "
232.867404
\n",
+ "
\n",
+ "
\n",
+ "
OK
\n",
+ "
16.722990
\n",
+ "
12.331665
\n",
+ "
26.929758
\n",
+ "
10.812809
\n",
+ "
2.692355
\n",
+ "
22.097798
\n",
+ "
21.618860
\n",
+ "
282.164508
\n",
+ "
\n",
+ "
\n",
+ "
OR
\n",
+ "
8.049124
\n",
+ "
7.106145
\n",
+ "
2.336371
\n",
+ "
6.059180
\n",
+ "
1.664208
\n",
+ "
16.590982
\n",
+ "
17.607034
\n",
+ "
339.545074
\n",
+ "
\n",
+ "
\n",
+ "
PA
\n",
+ "
13.320894
\n",
+ "
8.621532
\n",
+ "
12.548759
\n",
+ "
29.858866
\n",
+ "
0.511653
\n",
+ "
24.167443
\n",
+ "
25.464024
\n",
+ "
244.351269
\n",
+ "
\n",
+ "
\n",
+ "
RI
\n",
+ "
9.497666
\n",
+ "
10.185129
\n",
+ "
7.827082
\n",
+ "
7.240360
\n",
+ "
1.648013
\n",
+ "
16.325544
\n",
+ "
18.332274
\n",
+ "
162.327648
\n",
+ "
\n",
+ "
\n",
+ "
SC
\n",
+ "
5.626563
\n",
+ "
8.208185
\n",
+ "
4.638011
\n",
+ "
21.833560
\n",
+ "
0.298131
\n",
+ "
20.698122
\n",
+ "
4.274111
\n",
+ "
230.949161
\n",
+ "
\n",
+ "
\n",
+ "
SD
\n",
+ "
nan
\n",
+ "
nan
\n",
+ "
nan
\n",
+ "
nan
\n",
+ "
nan
\n",
+ "
nan
\n",
+ "
nan
\n",
+ "
nan
\n",
+ "
\n",
+ "
\n",
+ "
TN
\n",
+ "
11.489832
\n",
+ "
9.057370
\n",
+ "
18.859428
\n",
+ "
33.167614
\n",
+ "
0.212381
\n",
+ "
25.728519
\n",
+ "
13.148975
\n",
+ "
353.619736
\n",
+ "
\n",
+ "
\n",
+ "
TX
\n",
+ "
10.942711
\n",
+ "
13.820835
\n",
+ "
6.522051
\n",
+ "
24.081024
\n",
+ "
0.963748
\n",
+ "
13.660377
\n",
+ "
26.095366
\n",
+ "
338.926031
\n",
+ "
\n",
+ "
\n",
+ "
UT
\n",
+ "
26.412869
\n",
+ "
11.331583
\n",
+ "
59.414547
\n",
+ "
0.127636
\n",
+ "
1.377046
\n",
+ "
31.890933
\n",
+ "
34.367822
\n",
+ "
372.645274
\n",
+ "
\n",
+ "
\n",
+ "
VA
\n",
+ "
8.769725
\n",
+ "
8.298224
\n",
+ "
21.723519
\n",
+ "
24.526696
\n",
+ "
0.354601
\n",
+ "
18.392528
\n",
+ "
14.595742
\n",
+ "
336.832852
\n",
+ "
\n",
+ "
\n",
+ "
WA
\n",
+ "
15.505374
\n",
+ "
12.217954
\n",
+ "
3.154185
\n",
+ "
8.710185
\n",
+ "
0.981045
\n",
+ "
20.333769
\n",
+ "
22.762020
\n",
+ "
268.429693
\n",
+ "
\n",
+ "
\n",
+ "
WI
\n",
+ "
9.369116
\n",
+ "
14.496039
\n",
+ "
9.633473
\n",
+ "
33.212954
\n",
+ "
1.195305
\n",
+ "
15.495687
\n",
+ "
22.123829
\n",
+ "
215.928941
\n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 48,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Show the per-state table with an in-cell bar behind every column;\n",
+ "# the colour pair is (negative, positive).\n",
+ "grouped_stats_score_l.style.bar(\n",
+ "    subset=grouped_stats_score_l.columns,\n",
+ "    align='mid',\n",
+ "    color=['#d65f5f', '#5fba7d'],\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['Percent of individuals < 200% Federal Poverty Line (variance across all tracts)',\n",
+ " 'Percent individuals age 25 or over with less than high school degree (variance across all tracts)',\n",
+ " 'Percent enrollment in college or graduate school (variance across all tracts)',\n",
+ " 'Percent Black or African American alone (variance across all tracts)',\n",
+ " 'Percent American Indian and Alaska Native alone (variance across all tracts)',\n",
+ " 'Percent Non-Hispanic White (variance across all tracts)',\n",
+ " 'Percent Hispanic or Latino (variance across all tracts)'],\n",
+ " dtype='object')"
+ ]
+ },
+ "execution_count": 46,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# List the column labels of the per-state summary table.\n",
+ "grouped_stats_score_l.columns"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "If the samples are drawn from the population in question - a big assumption - and the sampling follows the usual rules, we take the mean:\n",
+ "\n",
+ "The mean of the distribution of sample medians (the median is an order statistic) is the best estimate of the population median. It's the best, least-biased estimator. The answer is not the median of that distribution.\n",
+ "\n",
+ "Nor is the question whether the mean is an estimate of the median - that is a completely unrelated matter.\n",
+ "\n",
+ "Even if the original population is skewed, the sampling distribution of a statistic will be approximately normal - recall the central limit theorem for more details.\n",
+ "\n",
+ "The standard error of that mean should tell you what you need to know to confidently make statements about the true population median across all states."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "list_of_columns = list(grouped_stats_score_l.columns)\n",
+ "# Average each per-state statistic across states. This uses .mean() so the row is\n",
+ "# comparable with values_2 and values_3, which are built with .mean() as well\n",
+ "# (the previous .var() produced a different statistic for this cohort only).\n",
+ "values_1 = list(grouped_stats_score_l[list_of_columns].mean())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 53,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Append two boolean cohort flags - presumably\n",
+ "# [current_threshold_exceeded, new_threshold_exceeded]; TODO confirm.\n",
+ "# Not idempotent: re-running this cell appends the flags again.\n",
+ "values_1 += [True, False]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 54,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[17.256903160392767,\n",
+ " 21.04537908818882,\n",
+ " 134.1363662670021,\n",
+ " 126.27122759971488,\n",
+ " 0.6672345565394615,\n",
+ " 37.71963763383144,\n",
+ " 66.32917757533399,\n",
+ " 8194.046353700449,\n",
+ " True,\n",
+ " False]"
+ ]
+ },
+ "execution_count": 54,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Inspect the assembled values: the statistics followed by the two boolean flags.\n",
+ "values_1"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# CalEnviroScreen Burden"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 55,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Tracts that exceed the new threshold but not the current one.\n",
+ "predicate_2 = (\n",
+ "    non_null_df['current_threshold_exceeded'].eq(False)\n",
+ "    & non_null_df['new_threshold_exceeded'].eq(True)\n",
+ ")\n",
+ "\n",
+ "cal_ej_screen_burdened = non_null_df.loc[predicate_2]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 56,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Join the demographics in.\n",
+ "merged_df_score_ejcreen = cal_ej_screen_burdened.merge(\n",
+ "    demographics_df[columns_to_keep],\n",
+ "    left_on=\"FIPS_tract_id\",\n",
+ "    right_on=\"GEOID10_TRACT\",\n",
+ "    how=\"inner\"\n",
+ ")\n",
+ "\n",
+ "# these are not converted into percent 0 - 100 scale, so rescale them here\n",
+ "percent_cols = [x for x in merged_df_score_ejcreen.columns if\n",
+ "                'Percent' in x or '(percent)' in x]\n",
+ "merged_df_score_ejcreen[percent_cols] = merged_df_score_ejcreen[percent_cols] * 100\n",
+ "\n",
+ "# Vectorized lookup of the state label (replaces a row-by-row iterrows loop).\n",
+ "# Codes absent from `mapping` yield None, exactly as mapping.get(..., None) did,\n",
+ "# and the column is created even when the merge result is empty.\n",
+ "merged_df_score_ejcreen['state_name'] = (\n",
+ "    merged_df_score_ejcreen['state'].astype(str).map(mapping.get)\n",
+ ")\n",
+ "\n",
+ "# Source column -> label used in the summary table. The statistic is the\n",
+ "# per-state standard deviation across tracts; the historical 'variance' wording\n",
+ "# is kept because other cells select columns by these exact labels.\n",
+ "std_column_labels = {\n",
+ "    'Percent of individuals < 200% Federal Poverty Line':\n",
+ "        'Percent of individuals < 200% Federal Poverty Line (variance across all tracts)',\n",
+ "    'Percent individuals age 25 or over with less than high school degree':\n",
+ "        'Percent individuals age 25 or over with less than high school degree (variance across all tracts)',\n",
+ "    'Percent enrollment in college or graduate school':\n",
+ "        'Percent enrollment in college or graduate school (variance across all tracts)',\n",
+ "    'Percent Black or African American alone':\n",
+ "        'Percent Black or African American alone (variance across all tracts)',\n",
+ "    'Percent American Indian and Alaska Native alone':\n",
+ "        'Percent American Indian and Alaska Native alone (variance across all tracts)',\n",
+ "    'Percent Non-Hispanic White':\n",
+ "        'Percent Non-Hispanic White (variance across all tracts)',\n",
+ "    'Percent Hispanic or Latino':\n",
+ "        'Percent Hispanic or Latino (variance across all tracts)',\n",
+ "    'current_summed_methodology':\n",
+ "        \"Variance: Owned and Rented Burdened Households (Current Aggregation Methodology)\",\n",
+ "}\n",
+ "\n",
+ "# Only the per-state standard deviations are kept, so compute them directly\n",
+ "# instead of aggregating medians/sums that are discarded afterwards.\n",
+ "# GroupBy.std() uses the sample standard deviation (ddof=1), which is what\n",
+ "# np.std inside .agg() resolved to; single-tract states therefore give NaN.\n",
+ "grouped_stats_score_ej_screen = (\n",
+ "    merged_df_score_ejcreen\n",
+ "    .groupby(\"state_name\")[list(std_column_labels)]\n",
+ "    .std()\n",
+ "    .rename(columns=std_column_labels)\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 58,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
Percent of individuals < 200% Federal Poverty Line (variance across all tracts)
Percent individuals age 25 or over with less than high school degree (variance across all tracts)
Percent enrollment in college or graduate school (variance across all tracts)
Percent Black or African American alone (variance across all tracts)
Percent American Indian and Alaska Native alone (variance across all tracts)
Percent Non-Hispanic White (variance across all tracts)
Percent Hispanic or Latino (variance across all tracts)
Variance: Owned and Rented Burdened Households (Current Aggregation Methodology)
state_name
\n",
+ "
\n",
+ "
AK
\n",
+ "
13.520895
\n",
+ "
4.615880
\n",
+ "
4.223458
\n",
+ "
4.322703
\n",
+ "
6.659459
\n",
+ "
20.079234
\n",
+ "
4.363742
\n",
+ "
259.106717
\n",
+ "
\n",
+ "
\n",
+ "
AL
\n",
+ "
10.125709
\n",
+ "
9.439090
\n",
+ "
11.872275
\n",
+ "
26.542148
\n",
+ "
0.835248
\n",
+ "
22.611835
\n",
+ "
7.001758
\n",
+ "
248.330726
\n",
+ "
\n",
+ "
\n",
+ "
AR
\n",
+ "
12.281127
\n",
+ "
7.975936
\n",
+ "
12.495043
\n",
+ "
30.476852
\n",
+ "
1.371498
\n",
+ "
24.599422
\n",
+ "
9.514320
\n",
+ "
192.426182
\n",
+ "
\n",
+ "
\n",
+ "
AZ
\n",
+ "
12.946036
\n",
+ "
15.447142
\n",
+ "
15.468539
\n",
+ "
5.601762
\n",
+ "
4.027090
\n",
+ "
24.939213
\n",
+ "
27.758437
\n",
+ "
248.404072
\n",
+ "
\n",
+ "
\n",
+ "
CA
\n",
+ "
12.487873
\n",
+ "
12.922233
\n",
+ "
12.461067
\n",
+ "
12.222976
\n",
+ "
0.789972
\n",
+ "
25.345393
\n",
+ "
23.401828
\n",
+ "
249.579452
\n",
+ "
\n",
+ "
\n",
+ "
CO
\n",
+ "
12.350719
\n",
+ "
10.420574
\n",
+ "
13.955004
\n",
+ "
7.747167
\n",
+ "
3.147796
\n",
+ "
24.315130
\n",
+ "
22.744562
\n",
+ "
263.838721
\n",
+ "
\n",
+ "
\n",
+ "
CT
\n",
+ "
6.712412
\n",
+ "
5.502453
\n",
+ "
2.335335
\n",
+ "
14.655833
\n",
+ "
0.221906
\n",
+ "
16.956516
\n",
+ "
9.128131
\n",
+ "
273.860001
\n",
+ "
\n",
+ "
\n",
+ "
DC
\n",
+ "
5.839941
\n",
+ "
2.632636
\n",
+ "
8.647272
\n",
+ "
28.325298
\n",
+ "
0.000000
\n",
+ "
16.555237
\n",
+ "
7.311294
\n",
+ "
73.539105
\n",
+ "
\n",
+ "
\n",
+ "
DE
\n",
+ "
8.080726
\n",
+ "
4.024877
\n",
+ "
13.093286
\n",
+ "
25.002276
\n",
+ "
0.601911
\n",
+ "
19.263794
\n",
+ "
7.124579
\n",
+ "
206.473727
\n",
+ "
\n",
+ "
\n",
+ "
FL
\n",
+ "
12.291765
\n",
+ "
8.443308
\n",
+ "
11.206603
\n",
+ "
23.985808
\n",
+ "
0.650698
\n",
+ "
26.291813
\n",
+ "
30.672446
\n",
+ "
263.872563
\n",
+ "
\n",
+ "
\n",
+ "
GA
\n",
+ "
10.406307
\n",
+ "
8.469707
\n",
+ "
11.379100
\n",
+ "
22.869892
\n",
+ "
0.866079
\n",
+ "
17.937022
\n",
+ "
9.939746
\n",
+ "
291.040808
\n",
+ "
\n",
+ "
\n",
+ "
HI
\n",
+ "
8.953741
\n",
+ "
4.119816
\n",
+ "
4.230463
\n",
+ "
5.903393
\n",
+ "
0.567517
\n",
+ "
20.414592
\n",
+ "
7.826224
\n",
+ "
195.415711
\n",
+ "
\n",
+ "
\n",
+ "
IA
\n",
+ "
10.607442
\n",
+ "
8.335171
\n",
+ "
12.622566
\n",
+ "
9.346253
\n",
+ "
0.885379
\n",
+ "
15.040902
\n",
+ "
8.480330
\n",
+ "
244.410783
\n",
+ "
\n",
+ "
\n",
+ "
ID
\n",
+ "
11.219832
\n",
+ "
7.881505
\n",
+ "
18.597185
\n",
+ "
1.203050
\n",
+ "
1.182857
\n",
+ "
9.968177
\n",
+ "
9.736982
\n",
+ "
324.961525
\n",
+ "
\n",
+ "
\n",
+ "
IL
\n",
+ "
14.026649
\n",
+ "
10.140918
\n",
+ "
13.752964
\n",
+ "
37.268619
\n",
+ "
0.698281
\n",
+ "
25.750998
\n",
+ "
23.486043
\n",
+ "
247.083538
\n",
+ "
\n",
+ "
\n",
+ "
IN
\n",
+ "
10.234744
\n",
+ "
10.471140
\n",
+ "
15.462955
\n",
+ "
27.768428
\n",
+ "
0.562845
\n",
+ "
26.950047
\n",
+ "
11.500188
\n",
+ "
339.918101
\n",
+ "
\n",
+ "
\n",
+ "
KS
\n",
+ "
10.066779
\n",
+ "
10.860425
\n",
+ "
13.582433
\n",
+ "
19.523683
\n",
+ "
2.086300
\n",
+ "
25.585689
\n",
+ "
17.249613
\n",
+ "
243.666729
\n",
+ "
\n",
+ "
\n",
+ "
KY
\n",
+ "
11.979435
\n",
+ "
8.389199
\n",
+ "
8.836230
\n",
+ "
24.464310
\n",
+ "
0.664383
\n",
+ "
24.782279
\n",
+ "
5.415088
\n",
+ "
223.379490
\n",
+ "
\n",
+ "
\n",
+ "
LA
\n",
+ "
13.215265
\n",
+ "
6.155072
\n",
+ "
12.233619
\n",
+ "
24.269290
\n",
+ "
0.146634
\n",
+ "
20.015625
\n",
+ "
3.790697
\n",
+ "
157.788981
\n",
+ "
\n",
+ "
\n",
+ "
MA
\n",
+ "
10.359133
\n",
+ "
10.151970
\n",
+ "
18.937744
\n",
+ "
24.069256
\n",
+ "
0.348046
\n",
+ "
26.092471
\n",
+ "
19.800689
\n",
+ "
197.928531
\n",
+ "
\n",
+ "
\n",
+ "
MD
\n",
+ "
12.835824
\n",
+ "
10.586967
\n",
+ "
10.320206
\n",
+ "
31.095121
\n",
+ "
0.687262
\n",
+ "
23.988533
\n",
+ "
15.032420
\n",
+ "
207.664458
\n",
+ "
\n",
+ "
\n",
+ "
ME
\n",
+ "
13.685119
\n",
+ "
7.099519
\n",
+ "
12.289457
\n",
+ "
4.392563
\n",
+ "
0.765678
\n",
+ "
5.400656
\n",
+ "
1.556741
\n",
+ "
268.108982
\n",
+ "
\n",
+ "
\n",
+ "
MI
\n",
+ "
11.787155
\n",
+ "
7.968431
\n",
+ "
14.673247
\n",
+ "
36.100852
\n",
+ "
0.794458
\n",
+ "
30.927621
\n",
+ "
6.316568
\n",
+ "
222.209551
\n",
+ "
\n",
+ "
\n",
+ "
MN
\n",
+ "
13.723010
\n",
+ "
9.739200
\n",
+ "
13.172282
\n",
+ "
14.211796
\n",
+ "
2.745100
\n",
+ "
24.799241
\n",
+ "
9.580859
\n",
+ "
236.647262
\n",
+ "
\n",
+ "
\n",
+ "
MO
\n",
+ "
13.926854
\n",
+ "
8.268839
\n",
+ "
9.004131
\n",
+ "
32.274333
\n",
+ "
0.522477
\n",
+ "
31.766367
\n",
+ "
14.184567
\n",
+ "
221.671125
\n",
+ "
\n",
+ "
\n",
+ "
MS
\n",
+ "
11.128920
\n",
+ "
9.303949
\n",
+ "
11.954882
\n",
+ "
23.550758
\n",
+ "
0.457355
\n",
+ "
21.335399
\n",
+ "
3.288823
\n",
+ "
233.658743
\n",
+ "
\n",
+ "
\n",
+ "
MT
\n",
+ "
10.119607
\n",
+ "
3.559417
\n",
+ "
14.846792
\n",
+ "
1.429825
\n",
+ "
4.761037
\n",
+ "
5.548337
\n",
+ "
1.904990
\n",
+ "
284.061423
\n",
+ "
\n",
+ "
\n",
+ "
NC
\n",
+ "
12.884796
\n",
+ "
9.594227
\n",
+ "
15.499084
\n",
+ "
24.784252
\n",
+ "
1.981525
\n",
+ "
24.213073
\n",
+ "
8.841596
\n",
+ "
299.583580
\n",
+ "
\n",
+ "
\n",
+ "
ND
\n",
+ "
9.021654
\n",
+ "
5.289198
\n",
+ "
19.126451
\n",
+ "
7.711368
\n",
+ "
1.653742
\n",
+ "
12.206947
\n",
+ "
3.783639
\n",
+ "
202.671137
\n",
+ "
\n",
+ "
\n",
+ "
NE
\n",
+ "
13.838250
\n",
+ "
12.069916
\n",
+ "
14.020891
\n",
+ "
10.927751
\n",
+ "
1.369888
\n",
+ "
20.152953
\n",
+ "
18.238305
\n",
+ "
228.345148
\n",
+ "
\n",
+ "
\n",
+ "
NH
\n",
+ "
10.990331
\n",
+ "
6.745657
\n",
+ "
7.744067
\n",
+ "
4.439421
\n",
+ "
0.279980
\n",
+ "
12.905667
\n",
+ "
7.710063
\n",
+ "
247.971051
\n",
+ "
\n",
+ "
\n",
+ "
NJ
\n",
+ "
12.662421
\n",
+ "
4.374319
\n",
+ "
3.457423
\n",
+ "
34.125623
\n",
+ "
0.062942
\n",
+ "
27.425144
\n",
+ "
23.794351
\n",
+ "
121.752090
\n",
+ "
\n",
+ "
\n",
+ "
NM
\n",
+ "
13.703794
\n",
+ "
11.957349
\n",
+ "
8.582104
\n",
+ "
3.015090
\n",
+ "
11.554536
\n",
+ "
18.853656
\n",
+ "
21.865586
\n",
+ "
219.885214
\n",
+ "
\n",
+ "
\n",
+ "
NV
\n",
+ "
13.591475
\n",
+ "
14.487448
\n",
+ "
10.431661
\n",
+ "
16.143358
\n",
+ "
1.001012
\n",
+ "
23.914023
\n",
+ "
22.030263
\n",
+ "
163.780341
\n",
+ "
\n",
+ "
\n",
+ "
NY
\n",
+ "
13.790519
\n",
+ "
8.052658
\n",
+ "
14.377376
\n",
+ "
35.581151
\n",
+ "
1.505131
\n",
+ "
23.625736
\n",
+ "
21.544354
\n",
+ "
149.887672
\n",
+ "
\n",
+ "
\n",
+ "
OH
\n",
+ "
12.318027
\n",
+ "
7.705678
\n",
+ "
9.509734
\n",
+ "
30.867198
\n",
+ "
0.765228
\n",
+ "
27.528987
\n",
+ "
8.507085
\n",
+ "
189.476546
\n",
+ "
\n",
+ "
\n",
+ "
OK
\n",
+ "
15.699078
\n",
+ "
10.370803
\n",
+ "
13.945978
\n",
+ "
22.133927
\n",
+ "
3.633287
\n",
+ "
20.906816
\n",
+ "
17.779218
\n",
+ "
211.075700
\n",
+ "
\n",
+ "
\n",
+ "
OR
\n",
+ "
9.306317
\n",
+ "
6.689131
\n",
+ "
7.925363
\n",
+ "
3.815764
\n",
+ "
1.439716
\n",
+ "
13.331581
\n",
+ "
10.691046
\n",
+ "
268.643525
\n",
+ "
\n",
+ "
\n",
+ "
PA
\n",
+ "
14.894647
\n",
+ "
8.630037
\n",
+ "
9.495274
\n",
+ "
34.093890
\n",
+ "
0.876892
\n",
+ "
30.973814
\n",
+ "
18.200768
\n",
+ "
237.806669
\n",
+ "
\n",
+ "
\n",
+ "
RI
\n",
+ "
11.146019
\n",
+ "
4.435617
\n",
+ "
32.918187
\n",
+ "
7.412547
\n",
+ "
0.241216
\n",
+ "
23.036924
\n",
+ "
16.407429
\n",
+ "
257.420168
\n",
+ "
\n",
+ "
\n",
+ "
SC
\n",
+ "
12.230901
\n",
+ "
9.140531
\n",
+ "
15.994662
\n",
+ "
28.200957
\n",
+ "
0.541914
\n",
+ "
24.205295
\n",
+ "
8.326607
\n",
+ "
299.090600
\n",
+ "
\n",
+ "
\n",
+ "
SD
\n",
+ "
10.762269
\n",
+ "
6.350754
\n",
+ "
18.344069
\n",
+ "
6.482646
\n",
+ "
17.759617
\n",
+ "
17.513471
\n",
+ "
3.146443
\n",
+ "
231.027858
\n",
+ "
\n",
+ "
\n",
+ "
TN
\n",
+ "
13.134597
\n",
+ "
8.462203
\n",
+ "
18.732469
\n",
+ "
31.389712
\n",
+ "
0.724456
\n",
+ "
27.535573
\n",
+ "
8.915858
\n",
+ "
260.655771
\n",
+ "
\n",
+ "
\n",
+ "
TX
\n",
+ "
13.578461
\n",
+ "
14.435436
\n",
+ "
10.950813
\n",
+ "
21.904895
\n",
+ "
0.885379
\n",
+ "
22.327223
\n",
+ "
28.187346
\n",
+ "
326.165285
\n",
+ "
\n",
+ "
\n",
+ "
UT
\n",
+ "
11.688057
\n",
+ "
9.505571
\n",
+ "
11.614578
\n",
+ "
2.352014
\n",
+ "
2.323857
\n",
+ "
18.718422
\n",
+ "
16.466781
\n",
+ "
255.040636
\n",
+ "
\n",
+ "
\n",
+ "
VA
\n",
+ "
12.373895
\n",
+ "
8.151552
\n",
+ "
13.045575
\n",
+ "
25.425311
\n",
+ "
0.558601
\n",
+ "
22.041004
\n",
+ "
14.043601
\n",
+ "
239.139561
\n",
+ "
\n",
+ "
\n",
+ "
VT
\n",
+ "
10.180275
\n",
+ "
5.438204
\n",
+ "
21.609560
\n",
+ "
4.073187
\n",
+ "
0.985815
\n",
+ "
10.981419
\n",
+ "
1.490384
\n",
+ "
224.502561
\n",
+ "
\n",
+ "
\n",
+ "
WA
\n",
+ "
12.497007
\n",
+ "
8.339197
\n",
+ "
10.260304
\n",
+ "
7.753262
\n",
+ "
1.636725
\n",
+ "
19.165341
\n",
+ "
13.734670
\n",
+ "
298.822930
\n",
+ "
\n",
+ "
\n",
+ "
WI
\n",
+ "
13.738968
\n",
+ "
10.500006
\n",
+ "
12.713038
\n",
+ "
31.859261
\n",
+ "
1.823214
\n",
+ "
29.772296
\n",
+ "
24.227007
\n",
+ "
151.094338
\n",
+ "
\n",
+ "
\n",
+ "
WV
\n",
+ "
13.313674
\n",
+ "
6.107206
\n",
+ "
17.045398
\n",
+ "
11.091388
\n",
+ "
0.344967
\n",
+ "
12.586416
\n",
+ "
3.074087
\n",
+ "
227.309836
\n",
+ "
\n",
+ "
\n",
+ "
WY
\n",
+ "
8.312618
\n",
+ "
7.088938
\n",
+ "
14.926238
\n",
+ "
1.773146
\n",
+ "
3.098632
\n",
+ "
10.233686
\n",
+ "
9.157789
\n",
+ "
255.577994
\n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 58,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Show the per-state table with an in-cell bar behind every column;\n",
+ "# the colour pair is (negative, positive).\n",
+ "grouped_stats_score_ej_screen.style.bar(\n",
+ "    subset=grouped_stats_score_ej_screen.columns,\n",
+ "    align='mid',\n",
+ "    color=['#d65f5f', '#5fba7d'],\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 59,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(2272, 36)"
+ ]
+ },
+ "execution_count": 59,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Sanity check: (rows, columns) of the frame after the demographics join.\n",
+ "merged_df_score_ejcreen.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 60,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Mean across states of each per-state statistic, in list_of_columns order.\n",
+ "values_2 = list(grouped_stats_score_ej_screen.loc[:, list_of_columns].mean())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 61,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Append the cohort flags; [False, True] mirrors predicate_2\n",
+ "# (current_threshold_exceeded == False, new_threshold_exceeded == True).\n",
+ "# Not idempotent: re-running this cell appends the flags again.\n",
+ "values_2 += [False, True]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Both Met Criterion"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 62,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Tracts that exceed both the current and the new threshold\n",
+ "# (an intersection of the two criteria, despite the `union_df` name).\n",
+ "predicate_3 = (\n",
+ "    non_null_df['current_threshold_exceeded'].eq(True)\n",
+ "    & non_null_df['new_threshold_exceeded'].eq(True)\n",
+ ")\n",
+ "\n",
+ "union_df = non_null_df.loc[predicate_3]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 63,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(4529, 15)"
+ ]
+ },
+ "execution_count": 63,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Sanity check: (rows, columns) of the tracts meeting both criteria.\n",
+ "union_df.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 66,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Join the demographics in.\n",
+ "merged_df_both_met = union_df.merge(\n",
+ "    demographics_df[columns_to_keep],\n",
+ "    left_on=\"FIPS_tract_id\",\n",
+ "    right_on=\"GEOID10_TRACT\",\n",
+ "    how=\"inner\"\n",
+ ")\n",
+ "\n",
+ "# these are not converted into percent 0 - 100 scale, so rescale them here\n",
+ "percent_cols = [x for x in merged_df_both_met.columns if\n",
+ "                'Percent' in x or '(percent)' in x]\n",
+ "merged_df_both_met[percent_cols] = merged_df_both_met[percent_cols] * 100\n",
+ "\n",
+ "# Vectorized lookup of the state label (replaces a row-by-row iterrows loop).\n",
+ "# Codes absent from `mapping` yield None, exactly as mapping.get(..., None) did,\n",
+ "# and the column is created even when the merge result is empty.\n",
+ "merged_df_both_met['state_name'] = (\n",
+ "    merged_df_both_met['state'].astype(str).map(mapping.get)\n",
+ ")\n",
+ "\n",
+ "# Source column -> label used in the summary table. The statistic is the\n",
+ "# per-state standard deviation across tracts; the historical 'variance' wording\n",
+ "# is kept because other cells select columns by these exact labels.\n",
+ "std_column_labels = {\n",
+ "    'Percent of individuals < 200% Federal Poverty Line':\n",
+ "        'Percent of individuals < 200% Federal Poverty Line (variance across all tracts)',\n",
+ "    'Percent individuals age 25 or over with less than high school degree':\n",
+ "        'Percent individuals age 25 or over with less than high school degree (variance across all tracts)',\n",
+ "    'Percent enrollment in college or graduate school':\n",
+ "        'Percent enrollment in college or graduate school (variance across all tracts)',\n",
+ "    'Percent Black or African American alone':\n",
+ "        'Percent Black or African American alone (variance across all tracts)',\n",
+ "    'Percent American Indian and Alaska Native alone':\n",
+ "        'Percent American Indian and Alaska Native alone (variance across all tracts)',\n",
+ "    'Percent Non-Hispanic White':\n",
+ "        'Percent Non-Hispanic White (variance across all tracts)',\n",
+ "    'Percent Hispanic or Latino':\n",
+ "        'Percent Hispanic or Latino (variance across all tracts)',\n",
+ "    'current_summed_methodology':\n",
+ "        \"Variance: Owned and Rented Burdened Households (Current Aggregation Methodology)\",\n",
+ "}\n",
+ "\n",
+ "# Only the per-state standard deviations are kept, so compute them directly\n",
+ "# instead of aggregating medians/sums that are discarded afterwards.\n",
+ "# GroupBy.std() uses the sample standard deviation (ddof=1), which is what\n",
+ "# np.std inside .agg() resolved to; single-tract states therefore give NaN.\n",
+ "grouped_stats_both_met = (\n",
+ "    merged_df_both_met\n",
+ "    .groupby(\"state_name\")[list(std_column_labels)]\n",
+ "    .std()\n",
+ "    .rename(columns=std_column_labels)\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 68,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
Percent of individuals < 200% Federal Poverty Line (variance across all tracts)
Percent individuals age 25 or over with less than high school degree (variance across all tracts)
Percent enrollment in college or graduate school (variance across all tracts)
Percent Black or African American alone (variance across all tracts)
Percent American Indian and Alaska Native alone (variance across all tracts)
Percent Non-Hispanic White (variance across all tracts)
Percent Hispanic or Latino (variance across all tracts)
Variance: Owned and Rented Burdened Households (Current Aggregation Methodology)
state_name
\n",
+ "
\n",
+ "
AK
\n",
+ "
5.362564
\n",
+ "
6.866730
\n",
+ "
4.713743
\n",
+ "
4.359215
\n",
+ "
4.461964
\n",
+ "
4.464435
\n",
+ "
5.648686
\n",
+ "
200.852350
\n",
+ "
\n",
+ "
\n",
+ "
AL
\n",
+ "
10.762316
\n",
+ "
8.759849
\n",
+ "
26.357589
\n",
+ "
29.542667
\n",
+ "
0.983361
\n",
+ "
26.170044
\n",
+ "
6.445044
\n",
+ "
241.026314
\n",
+ "
\n",
+ "
\n",
+ "
AR
\n",
+ "
8.955506
\n",
+ "
6.661830
\n",
+ "
21.335498
\n",
+ "
29.046109
\n",
+ "
0.376642
\n",
+ "
26.230211
\n",
+ "
7.373333
\n",
+ "
386.784673
\n",
+ "
\n",
+ "
\n",
+ "
AZ
\n",
+ "
9.734770
\n",
+ "
13.667529
\n",
+ "
21.385000
\n",
+ "
5.929329
\n",
+ "
3.031099
\n",
+ "
18.395655
\n",
+ "
21.505953
\n",
+ "
264.572097
\n",
+ "
\n",
+ "
\n",
+ "
CA
\n",
+ "
12.839478
\n",
+ "
15.519094
\n",
+ "
15.965302
\n",
+ "
12.108554
\n",
+ "
1.274808
\n",
+ "
19.065738
\n",
+ "
26.046888
\n",
+ "
300.670483
\n",
+ "
\n",
+ "
\n",
+ "
CO
\n",
+ "
12.013508
\n",
+ "
12.057231
\n",
+ "
22.895904
\n",
+ "
11.301122
\n",
+ "
2.011565
\n",
+ "
24.484957
\n",
+ "
21.863004
\n",
+ "
381.523930
\n",
+ "
\n",
+ "
\n",
+ "
CT
\n",
+ "
10.322097
\n",
+ "
10.498181
\n",
+ "
15.641987
\n",
+ "
17.945122
\n",
+ "
1.126035
\n",
+ "
17.177253
\n",
+ "
19.111784
\n",
+ "
294.901340
\n",
+ "
\n",
+ "
\n",
+ "
DC
\n",
+ "
9.353163
\n",
+ "
5.744292
\n",
+ "
21.807513
\n",
+ "
22.181755
\n",
+ "
0.865375
\n",
+ "
16.106402
\n",
+ "
4.547917
\n",
+ "
187.708717
\n",
+ "
\n",
+ "
\n",
+ "
DE
\n",
+ "
10.044318
\n",
+ "
9.207932
\n",
+ "
32.720012
\n",
+ "
29.569930
\n",
+ "
0.572551
\n",
+ "
27.481490
\n",
+ "
15.873234
\n",
+ "
110.123365
\n",
+ "
\n",
+ "
\n",
+ "
FL
\n",
+ "
11.807584
\n",
+ "
10.359419
\n",
+ "
14.810645
\n",
+ "
31.244787
\n",
+ "
0.955248
\n",
+ "
16.990042
\n",
+ "
33.349320
\n",
+ "
409.998796
\n",
+ "
\n",
+ "
\n",
+ "
GA
\n",
+ "
11.579126
\n",
+ "
11.217565
\n",
+ "
21.469723
\n",
+ "
28.052059
\n",
+ "
1.598846
\n",
+ "
18.907615
\n",
+ "
16.634868
\n",
+ "
413.546272
\n",
+ "
\n",
+ "
\n",
+ "
HI
\n",
+ "
7.488105
\n",
+ "
3.886126
\n",
+ "
6.737730
\n",
+ "
7.302510
\n",
+ "
0.600968
\n",
+ "
15.793943
\n",
+ "
8.150455
\n",
+ "
267.260404
\n",
+ "
\n",
+ "
\n",
+ "
IA
\n",
+ "
9.367440
\n",
+ "
12.733213
\n",
+ "
33.960889
\n",
+ "
19.537493
\n",
+ "
2.646528
\n",
+ "
22.302319
\n",
+ "
11.377196
\n",
+ "
527.841747
\n",
+ "
\n",
+ "
\n",
+ "
ID
\n",
+ "
13.135462
\n",
+ "
12.427059
\n",
+ "
23.083662
\n",
+ "
0.424381
\n",
+ "
2.924023
\n",
+ "
21.731829
\n",
+ "
23.764339
\n",
+ "
974.017967
\n",
+ "
\n",
+ "
\n",
+ "
IL
\n",
+ "
14.365266
\n",
+ "
9.961048
\n",
+ "
18.641863
\n",
+ "
34.710751
\n",
+ "
0.667958
\n",
+ "
19.929638
\n",
+ "
24.171428
\n",
+ "
300.228779
\n",
+ "
\n",
+ "
\n",
+ "
IN
\n",
+ "
10.502570
\n",
+ "
10.009589
\n",
+ "
26.208742
\n",
+ "
29.408204
\n",
+ "
0.531106
\n",
+ "
28.719219
\n",
+ "
10.778399
\n",
+ "
331.278547
\n",
+ "
\n",
+ "
\n",
+ "
KS
\n",
+ "
9.343448
\n",
+ "
11.563216
\n",
+ "
31.571527
\n",
+ "
27.818353
\n",
+ "
1.680673
\n",
+ "
27.077467
\n",
+ "
18.532743
\n",
+ "
424.193116
\n",
+ "
\n",
+ "
\n",
+ "
KY
\n",
+ "
10.353541
\n",
+ "
10.491346
\n",
+ "
29.524277
\n",
+ "
31.184962
\n",
+ "
0.885588
\n",
+ "
28.091972
\n",
+ "
14.085954
\n",
+ "
289.382905
\n",
+ "
\n",
+ "
\n",
+ "
LA
\n",
+ "
11.485870
\n",
+ "
8.786543
\n",
+ "
15.562703
\n",
+ "
20.008507
\n",
+ "
0.527876
\n",
+ "
15.520856
\n",
+ "
7.542826
\n",
+ "
305.445267
\n",
+ "
\n",
+ "
\n",
+ "
MA
\n",
+ "
13.415892
\n",
+ "
11.856250
\n",
+ "
21.686145
\n",
+ "
25.576649
\n",
+ "
0.763304
\n",
+ "
23.189013
\n",
+ "
27.384592
\n",
+ "
261.976125
\n",
+ "
\n",
+ "
\n",
+ "
MD
\n",
+ "
14.611597
\n",
+ "
9.260909
\n",
+ "
17.774458
\n",
+ "
28.430971
\n",
+ "
0.755644
\n",
+ "
17.883547
\n",
+ "
15.157175
\n",
+ "
308.477828
\n",
+ "
\n",
+ "
\n",
+ "
ME
\n",
+ "
5.983994
\n",
+ "
8.778858
\n",
+ "
25.805900
\n",
+ "
9.016565
\n",
+ "
0.598179
\n",
+ "
12.236258
\n",
+ "
2.361052
\n",
+ "
354.752227
\n",
+ "
\n",
+ "
\n",
+ "
MI
\n",
+ "
11.364613
\n",
+ "
10.104722
\n",
+ "
24.604378
\n",
+ "
35.740705
\n",
+ "
0.749980
\n",
+ "
29.139941
\n",
+ "
9.723557
\n",
+ "
270.032843
\n",
+ "
\n",
+ "
\n",
+ "
MN
\n",
+ "
11.155877
\n",
+ "
10.702122
\n",
+ "
25.205802
\n",
+ "
20.883018
\n",
+ "
2.011065
\n",
+ "
26.127868
\n",
+ "
7.197320
\n",
+ "
373.996267
\n",
+ "
\n",
+ "
\n",
+ "
MO
\n",
+ "
10.137324
\n",
+ "
8.592364
\n",
+ "
22.712005
\n",
+ "
35.098282
\n",
+ "
0.526103
\n",
+ "
29.599300
\n",
+ "
10.088432
\n",
+ "
324.569103
\n",
+ "
\n",
+ "
\n",
+ "
MS
\n",
+ "
9.854816
\n",
+ "
6.811832
\n",
+ "
24.155295
\n",
+ "
23.449468
\n",
+ "
0.638943
\n",
+ "
20.710178
\n",
+ "
3.181963
\n",
+ "
235.058305
\n",
+ "
\n",
+ "
\n",
+ "
MT
\n",
+ "
4.115267
\n",
+ "
7.633925
\n",
+ "
31.654537
\n",
+ "
0.466736
\n",
+ "
6.593054
\n",
+ "
12.042715
\n",
+ "
6.102896
\n",
+ "
260.969730
\n",
+ "
\n",
+ "
\n",
+ "
NC
\n",
+ "
10.390897
\n",
+ "
10.501628
\n",
+ "
23.544817
\n",
+ "
24.377819
\n",
+ "
1.166064
\n",
+ "
23.259549
\n",
+ "
13.346942
\n",
+ "
342.769565
\n",
+ "
\n",
+ "
\n",
+ "
ND
\n",
+ "
13.065586
\n",
+ "
5.245493
\n",
+ "
34.210278
\n",
+ "
3.511070
\n",
+ "
5.104930
\n",
+ "
5.105851
\n",
+ "
2.359395
\n",
+ "
297.654162
\n",
+ "
\n",
+ "
\n",
+ "
NE
\n",
+ "
8.350455
\n",
+ "
8.893226
\n",
+ "
21.948548
\n",
+ "
23.075337
\n",
+ "
0.585448
\n",
+ "
21.662061
\n",
+ "
5.582370
\n",
+ "
202.499665
\n",
+ "
\n",
+ "
\n",
+ "
NH
\n",
+ "
11.683527
\n",
+ "
11.311405
\n",
+ "
37.639709
\n",
+ "
4.998337
\n",
+ "
0.403837
\n",
+ "
16.842827
\n",
+ "
13.079522
\n",
+ "
419.520096
\n",
+ "
\n",
+ "
\n",
+ "
NJ
\n",
+ "
12.203217
\n",
+ "
10.473662
\n",
+ "
7.610944
\n",
+ "
30.106950
\n",
+ "
0.811802
\n",
+ "
21.229334
\n",
+ "
26.540235
\n",
+ "
275.002256
\n",
+ "
\n",
+ "
\n",
+ "
NM
\n",
+ "
11.397865
\n",
+ "
11.011395
\n",
+ "
26.601550
\n",
+ "
2.963015
\n",
+ "
4.720711
\n",
+ "
10.616521
\n",
+ "
16.064664
\n",
+ "
411.970186
\n",
+ "
\n",
+ "
\n",
+ "
NV
\n",
+ "
10.157125
\n",
+ "
10.395277
\n",
+ "
7.134588
\n",
+ "
9.786716
\n",
+ "
1.241392
\n",
+ "
15.752480
\n",
+ "
18.739589
\n",
+ "
211.298685
\n",
+ "
\n",
+ "
\n",
+ "
NY
\n",
+ "
13.488737
\n",
+ "
9.898986
\n",
+ "
8.697032
\n",
+ "
26.867533
\n",
+ "
1.609412
\n",
+ "
28.820923
\n",
+ "
26.174584
\n",
+ "
370.123164
\n",
+ "
\n",
+ "
\n",
+ "
OH
\n",
+ "
10.825822
\n",
+ "
9.178838
\n",
+ "
26.332892
\n",
+ "
32.219266
\n",
+ "
1.052610
\n",
+ "
28.183901
\n",
+ "
9.513140
\n",
+ "
261.267771
\n",
+ "
\n",
+ "
\n",
+ "
OK
\n",
+ "
11.690984
\n",
+ "
11.778162
\n",
+ "
29.823430
\n",
+ "
25.811025
\n",
+ "
3.282949
\n",
+ "
22.038322
\n",
+ "
16.282388
\n",
+ "
313.721214
\n",
+ "
\n",
+ "
\n",
+ "
OR
\n",
+ "
11.672362
\n",
+ "
8.560431
\n",
+ "
26.402550
\n",
+ "
4.438473
\n",
+ "
1.999473
\n",
+ "
11.497117
\n",
+ "
11.710487
\n",
+ "
390.637347
\n",
+ "
\n",
+ "
\n",
+ "
PA
\n",
+ "
12.482215
\n",
+ "
11.954000
\n",
+ "
23.383473
\n",
+ "
30.664060
\n",
+ "
1.107837
\n",
+ "
25.268762
\n",
+ "
26.123990
\n",
+ "
265.380774
\n",
+ "
\n",
+ "
\n",
+ "
RI
\n",
+ "
9.292574
\n",
+ "
8.244244
\n",
+ "
8.293391
\n",
+ "
11.734409
\n",
+ "
1.084689
\n",
+ "
29.769755
\n",
+ "
26.781181
\n",
+ "
253.312496
\n",
+ "
\n",
+ "
\n",
+ "
SC
\n",
+ "
10.633839
\n",
+ "
10.503194
\n",
+ "
25.396787
\n",
+ "
29.221893
\n",
+ "
0.617377
\n",
+ "
24.669126
\n",
+ "
12.992547
\n",
+ "
230.947955
\n",
+ "
\n",
+ "
\n",
+ "
SD
\n",
+ "
nan
\n",
+ "
nan
\n",
+ "
nan
\n",
+ "
nan
\n",
+ "
nan
\n",
+ "
nan
\n",
+ "
nan
\n",
+ "
nan
\n",
+ "
\n",
+ "
\n",
+ "
TN
\n",
+ "
9.858797
\n",
+ "
8.620668
\n",
+ "
18.128720
\n",
+ "
27.453093
\n",
+ "
0.411158
\n",
+ "
20.791161
\n",
+ "
11.842790
\n",
+ "
344.209468
\n",
+ "
\n",
+ "
\n",
+ "
TX
\n",
+ "
10.613484
\n",
+ "
14.759169
\n",
+ "
22.114764
\n",
+ "
22.973454
\n",
+ "
0.898911
\n",
+ "
19.668712
\n",
+ "
26.558668
\n",
+ "
366.266727
\n",
+ "
\n",
+ "
\n",
+ "
UT
\n",
+ "
13.791322
\n",
+ "
10.330279
\n",
+ "
33.796530
\n",
+ "
6.021641
\n",
+ "
1.631398
\n",
+ "
23.358438
\n",
+ "
17.502232
\n",
+ "
212.805390
\n",
+ "
\n",
+ "
\n",
+ "
VA
\n",
+ "
14.130616
\n",
+ "
11.286115
\n",
+ "
23.743179
\n",
+ "
32.125657
\n",
+ "
0.984275
\n",
+ "
22.722607
\n",
+ "
16.743508
\n",
+ "
315.503703
\n",
+ "
\n",
+ "
\n",
+ "
VT
\n",
+ "
9.574526
\n",
+ "
5.947329
\n",
+ "
23.245865
\n",
+ "
3.339003
\n",
+ "
0.026920
\n",
+ "
5.228800
\n",
+ "
1.019496
\n",
+ "
268.424260
\n",
+ "
\n",
+ "
\n",
+ "
WA
\n",
+ "
13.330610
\n",
+ "
9.590603
\n",
+ "
28.396426
\n",
+ "
8.269962
\n",
+ "
2.267946
\n",
+ "
19.266576
\n",
+ "
12.581541
\n",
+ "
510.235920
\n",
+ "
\n",
+ "
\n",
+ "
WI
\n",
+ "
12.645607
\n",
+ "
11.622318
\n",
+ "
27.077443
\n",
+ "
36.808805
\n",
+ "
0.896910
\n",
+ "
30.237449
\n",
+ "
23.002879
\n",
+ "
303.044555
\n",
+ "
\n",
+ "
\n",
+ "
WV
\n",
+ "
14.195970
\n",
+ "
8.054310
\n",
+ "
33.530553
\n",
+ "
14.305199
\n",
+ "
0.324366
\n",
+ "
12.987162
\n",
+ "
2.963561
\n",
+ "
256.028444
\n",
+ "
\n",
+ "
\n",
+ "
WY
\n",
+ "
4.924008
\n",
+ "
0.138082
\n",
+ "
28.231683
\n",
+ "
0.664380
\n",
+ "
0.198144
\n",
+ "
4.387199
\n",
+ "
1.966222
\n",
+ "
361.222000
\n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 68,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "grouped_stats_both_met.style.bar(\n",
+ " subset=grouped_stats_both_met.columns, align='mid', color=['#d65f5f', '#5fba7d'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 67,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Mean of each statistic in list_of_columns for the group where both thresholds were met,\n",
+ "# followed by the two threshold flags. Both steps live in one cell so that re-running it\n",
+ "# rebuilds the list instead of appending the flags a second time.\n",
+ "values_3 = list(grouped_stats_both_met[list_of_columns].mean())\n",
+ "\n",
+ "values_3.extend([True, True])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Neither Met Criterion "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 244,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Select the tracts for which neither threshold flag is set.\n",
+ "current_not_exceeded = non_null_df['current_threshold_exceeded'].eq(False)\n",
+ "new_not_exceeded = non_null_df['new_threshold_exceeded'].eq(False)\n",
+ "predicate_4 = current_not_exceeded & new_not_exceeded\n",
+ "\n",
+ "negation_union_df = non_null_df.loc[predicate_4]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 245,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Join the demographics in.\n",
+ "merged_df_negation_union = negation_union_df.merge(\n",
+ "    demographics_df[columns_to_keep],\n",
+ "    left_on=\"FIPS_tract_id\",\n",
+ "    right_on=\"GEOID10_TRACT\",\n",
+ "    how=\"inner\"\n",
+ ")\n",
+ "\n",
+ "# These columns are not yet on a 0 - 100 percent scale, so multiply them by 100.\n",
+ "percent_cols = [x for x in merged_df_negation_union.columns if\n",
+ "                'Percent' in x or '(percent)' in x\n",
+ "               ]\n",
+ "\n",
+ "merged_df_negation_union[percent_cols] = merged_df_negation_union[percent_cols] * 100\n",
+ "\n",
+ "# Vectorized state lookup (replaces a row-by-row iterrows/.loc loop): stringify the state\n",
+ "# code and look it up in `mapping`; codes without an entry become None.\n",
+ "merged_df_negation_union['state_name'] = (\n",
+ "    merged_df_negation_union['state'].astype(str).map(mapping.get)\n",
+ ")\n",
+ "\n",
+ "# Aggregations are named by string: pandas currently treats np.median / np.std / np.sum as\n",
+ "# aliases of these and warns that the aliasing will go away, so the strings pin the behavior.\n",
+ "grouped_stats_both_not_met = merged_df_negation_union.groupby([\"state_name\"]).agg({\n",
+ "    'GEOID10_TRACT': 'nunique',\n",
+ "    'Percent of individuals < 200% Federal Poverty Line': ['median', 'std'],\n",
+ "    'Median value ($) of owner-occupied housing units': ['median', 'std'],\n",
+ "    'Percent individuals age 25 or over with less than high school degree': ['median', 'std'],\n",
+ "    'Percent enrollment in college or graduate school': ['median', 'std'],\n",
+ "    'Percent Black or African American alone': ['median', 'std'],\n",
+ "    'Percent American Indian and Alaska Native alone': ['median', 'std'],\n",
+ "    'Percent Non-Hispanic White': ['median', 'std'],\n",
+ "    'Linguistic isolation (percent)': ['median', 'std'],\n",
+ "    'Percent Hispanic or Latino': ['median', 'std'],\n",
+ "    'hbrd_rank': ['median', 'std'],\n",
+ "    'current_methodology_percent': ['median', 'std'],\n",
+ "    'current_summed_methodology': ['median', 'std', 'sum']\n",
+ "}).reset_index()\n",
+ "\n",
+ "\n",
+ "# Flatten the (column, aggregation) MultiIndex into single 'column aggregation' labels.\n",
+ "grouped_stats_both_not_met.columns = [' '.join(col).strip() for\n",
+ "                                      col in grouped_stats_both_not_met.columns.values]\n",
+ "\n",
+ "# Keep the medians of the 'Percent ...' columns plus the tract count and household total,\n",
+ "# and index the result by state.\n",
+ "median_percent_cols = [x for x in grouped_stats_both_not_met.columns\n",
+ "                       if \"median\" in x and \"Percent\" in x]\n",
+ "\n",
+ "grouped_stats_both_not_met = grouped_stats_both_not_met[\n",
+ "    median_percent_cols + [\"GEOID10_TRACT nunique\",\n",
+ "                           'current_summed_methodology sum', \"state_name\"]\n",
+ "].set_index(\"state_name\")\n",
+ "\n",
+ "grouped_stats_both_not_met = grouped_stats_both_not_met.rename(columns={\n",
+ "    'Percent of individuals < 200% Federal Poverty Line median':\n",
+ "        'Percent of individuals < 200% Federal Poverty Line (median across all tracts)',\n",
+ "    'Percent individuals age 25 or over with less than high school degree median':\n",
+ "        'Percent individuals age 25 or over with less than high school degree (median across all tracts)',\n",
+ "    'Percent enrollment in college or graduate school median':\n",
+ "        'Percent enrollment in college or graduate school (median across all tracts)',\n",
+ "    'Percent Black or African American alone median':\n",
+ "        'Percent Black or African American alone (median across all tracts)',\n",
+ "    'Percent American Indian and Alaska Native alone median':\n",
+ "        'Percent American Indian and Alaska Native alone (median across all tracts)',\n",
+ "    'Percent Non-Hispanic White median':\n",
+ "        'Percent Non-Hispanic White (median across all tracts)',\n",
+ "    'Percent Hispanic or Latino median':\n",
+ "        'Percent Hispanic or Latino (median across all tracts)',\n",
+ "    'GEOID10_TRACT nunique': \"Total Number of Unique Tracts\",\n",
+ "    \"current_summed_methodology sum\": \"Total Owned and Rented Burdened Households (Current Aggregation Methodology)\"\n",
+ "})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 246,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Mean of each statistic in list_of_columns for the group where neither threshold was met,\n",
+ "# followed by the two threshold flags.\n",
+ "values_4 = list(grouped_stats_both_not_met[list_of_columns].mean()) + [False, False]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 247,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
0
\n",
+ "
1
\n",
+ "
2
\n",
+ "
3
\n",
+ "
4
\n",
+ "
5
\n",
+ "
6
\n",
+ "
7
\n",
+ "
8
\n",
+ "
9
\n",
+ "
10
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
0
\n",
+ "
57.052724
\n",
+ "
19.383813
\n",
+ "
8.132533
\n",
+ "
33.692806
\n",
+ "
1.221395
\n",
+ "
31.321886
\n",
+ "
19.378348
\n",
+ "
62.088889
\n",
+ "
40949.200000
\n",
+ "
True
\n",
+ "
False
\n",
+ "
\n",
+ "
\n",
+ "
1
\n",
+ "
47.082927
\n",
+ "
13.224927
\n",
+ "
8.685441
\n",
+ "
23.061683
\n",
+ "
0.772400
\n",
+ "
48.240042
\n",
+ "
12.553371
\n",
+ "
44.549020
\n",
+ "
24719.098039
\n",
+ "
False
\n",
+ "
True
\n",
+ "
\n",
+ "
\n",
+ "
2
\n",
+ "
62.331263
\n",
+ "
17.689811
\n",
+ "
11.824205
\n",
+ "
33.455525
\n",
+ "
0.921755
\n",
+ "
34.495797
\n",
+ "
15.855933
\n",
+ "
88.803922
\n",
+ "
60757.901961
\n",
+ "
True
\n",
+ "
True
\n",
+ "
\n",
+ "
\n",
+ "
3
\n",
+ "
27.736502
\n",
+ "
8.785280
\n",
+ "
5.847120
\n",
+ "
5.729181
\n",
+ "
0.537429
\n",
+ "
75.916536
\n",
+ "
7.463468
\n",
+ "
1244.333333
\n",
+ "
443680.568627
\n",
+ "
False
\n",
+ "
False
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 0 1 2 3 4 5 6 \\\n",
+ "0 57.052724 19.383813 8.132533 33.692806 1.221395 31.321886 19.378348 \n",
+ "1 47.082927 13.224927 8.685441 23.061683 0.772400 48.240042 12.553371 \n",
+ "2 62.331263 17.689811 11.824205 33.455525 0.921755 34.495797 15.855933 \n",
+ "3 27.736502 8.785280 5.847120 5.729181 0.537429 75.916536 7.463468 \n",
+ "\n",
+ " 7 8 9 10 \n",
+ "0 62.088889 40949.200000 True False \n",
+ "1 44.549020 24719.098039 False True \n",
+ "2 88.803922 60757.901961 True True \n",
+ "3 1244.333333 443680.568627 False False "
+ ]
+ },
+ "execution_count": 247,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Assemble the four value lists into a DataFrame, one row per list.\n",
+ "final_summary_data_values = [values_1, values_2, values_3, values_4]\n",
+ "\n",
+ "df = pd.DataFrame(data=final_summary_data_values)\n",
+ "\n",
+ "df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 248,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Labels for the summary frame: the per-state statistic names plus the two threshold flags.\n",
+ "cols = [\n",
+ "    *grouped_stats_both_not_met.columns,\n",
+ "    'Score L Relative Housing Burden for all households met burden threshold',\n",
+ "    'CalEnviroScreen Housing Burden Met Burden Threshold (Ranked Percentile)',\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 249,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Replace the current column labels, position by position, with the names in `cols`.\n",
+ "positional_to_named = {df.columns[position]: cols[position] for position in range(len(cols))}\n",
+ "df.rename(columns=positional_to_named, inplace=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 250,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Index the summary by the two threshold flags.\n",
+ "threshold_flag_columns = [\n",
+ "    'Score L Relative Housing Burden for all households met burden threshold',\n",
+ "    'CalEnviroScreen Housing Burden Met Burden Threshold (Ranked Percentile)',\n",
+ "]\n",
+ "df = df.set_index(threshold_flag_columns)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 251,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Relabel the summary columns as cross-state means. Every key must match an existing\n",
+ "# '(median across all tracts)' column label exactly: rename() silently skips keys it cannot\n",
+ "# find, which previously left the high-school and college-enrollment columns unrenamed.\n",
+ "# NOTE(review): the last two columns appear to be means of per-state counts/totals rather\n",
+ "# than of medians - confirm whether their 'Mean of median' labels should be reworded.\n",
+ "df = df.rename(columns={\n",
+ "    'Percent of individuals < 200% Federal Poverty Line (median across all tracts)':\n",
+ "        'Percent of individuals < 200% Federal Poverty Line (mean of median across all states)',\n",
+ "    'Percent individuals age 25 or over with less than high school degree (median across all tracts)':\n",
+ "        'Percent individuals age 25 or over with less than high school degree (mean of median across all states)',\n",
+ "    'Percent enrollment in college or graduate school (median across all tracts)':\n",
+ "        'Percent enrollment in college or graduate school (mean of median across all states)',\n",
+ "    'Percent Black or African American alone (median across all tracts)':\n",
+ "        'Percent Black or African American alone (mean of median across all states)',\n",
+ "    'Percent American Indian and Alaska Native alone (median across all tracts)':\n",
+ "        'Percent American Indian and Alaska Native alone (mean of median across all states)',\n",
+ "    'Percent Non-Hispanic White (median across all tracts)':\n",
+ "        'Percent Non-Hispanic White (mean of median across all states)',\n",
+ "    'Percent Hispanic or Latino (median across all tracts)':\n",
+ "        'Percent Hispanic or Latino (mean of median across all states)',\n",
+ "    \"Total Number of Unique Tracts\": \"Total Number of Tracts - Mean of median across all states\",\n",
+ "    \"Total Owned and Rented Burdened Households (Current Aggregation Methodology)\": \"Total Owned and Rented Burdened Households (Current Aggregation Methodology)- Mean of median across all states\"\n",
+ "})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 252,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
Percent of individuals < 200% Federal Poverty Line (mean of median across all states)
\n",
+ "
Percent individuals age 25 or over with less than high school degree (median across all tracts)
\n",
+ "
Percent enrollment in college or graduate school (median across all tracts)
\n",
+ "
Percent Black or African American alone (mean of median across all states)
\n",
+ "
Percent American Indian and Alaska Native alone (mean of median across all states)
\n",
+ "
Percent Non-Hispanic White (mean of median across all states)
\n",
+ "
Percent Hispanic or Latino (mean of median across all states)
\n",
+ "
Total Number of Tracts - Mean of median across all states
\n",
+ "
Total Owned and Rented Burdened Households (Current Aggregation Methodology)- Mean of median across all states
\n",
+ "
\n",
+ "
\n",
+ "
Score L Relative Housing Burden for all households met burden threshold
\n",
+ "
CalEnviroScreen Housing Burden Met Burden Threshold (Ranked Percentile)
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
True
\n",
+ "
False
\n",
+ "
57.052724
\n",
+ "
19.383813
\n",
+ "
8.132533
\n",
+ "
33.692806
\n",
+ "
1.221395
\n",
+ "
31.321886
\n",
+ "
19.378348
\n",
+ "
62.088889
\n",
+ "
40949.200000
\n",
+ "
\n",
+ "
\n",
+ "
False
\n",
+ "
True
\n",
+ "
47.082927
\n",
+ "
13.224927
\n",
+ "
8.685441
\n",
+ "
23.061683
\n",
+ "
0.772400
\n",
+ "
48.240042
\n",
+ "
12.553371
\n",
+ "
44.549020
\n",
+ "
24719.098039
\n",
+ "
\n",
+ "
\n",
+ "
True
\n",
+ "
True
\n",
+ "
62.331263
\n",
+ "
17.689811
\n",
+ "
11.824205
\n",
+ "
33.455525
\n",
+ "
0.921755
\n",
+ "
34.495797
\n",
+ "
15.855933
\n",
+ "
88.803922
\n",
+ "
60757.901961
\n",
+ "
\n",
+ "
\n",
+ "
False
\n",
+ "
False
\n",
+ "
27.736502
\n",
+ "
8.785280
\n",
+ "
5.847120
\n",
+ "
5.729181
\n",
+ "
0.537429
\n",
+ "
75.916536
\n",
+ "
7.463468
\n",
+ "
1244.333333
\n",
+ "
443680.568627
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Percent of individuals < 200% Federal Poverty Line (mean of median across all states) \\\n",
+ "Score L Relative Housing Burden for all househo... CalEnviroScreen Housing Burden Met Burden Thres... \n",
+ "True False 57.052724 \n",
+ "False True 47.082927 \n",
+ "True True 62.331263 \n",
+ "False False 27.736502 \n",
+ "\n",
+ " Percent individuals age 25 or over with less than high school degree (median across all tracts) \\\n",
+ "Score L Relative Housing Burden for all househo... CalEnviroScreen Housing Burden Met Burden Thres... \n",
+ "True False 19.383813 \n",
+ "False True 13.224927 \n",
+ "True True 17.689811 \n",
+ "False False 8.785280 \n",
+ "\n",
+ " Percent enrollment in college or graduate school (median across all tracts) \\\n",
+ "Score L Relative Housing Burden for all househo... CalEnviroScreen Housing Burden Met Burden Thres... \n",
+ "True False 8.132533 \n",
+ "False True 8.685441 \n",
+ "True True 11.824205 \n",
+ "False False 5.847120 \n",
+ "\n",
+ " Percent Black or African American alone (mean of median across all states) \\\n",
+ "Score L Relative Housing Burden for all househo... CalEnviroScreen Housing Burden Met Burden Thres... \n",
+ "True False 33.692806 \n",
+ "False True 23.061683 \n",
+ "True True 33.455525 \n",
+ "False False 5.729181 \n",
+ "\n",
+ " Percent American Indian and Alaska Native alone (mean of median across all states) \\\n",
+ "Score L Relative Housing Burden for all househo... CalEnviroScreen Housing Burden Met Burden Thres... \n",
+ "True False 1.221395 \n",
+ "False True 0.772400 \n",
+ "True True 0.921755 \n",
+ "False False 0.537429 \n",
+ "\n",
+ " Percent Non-Hispanic White (mean of median across all states) \\\n",
+ "Score L Relative Housing Burden for all househo... CalEnviroScreen Housing Burden Met Burden Thres... \n",
+ "True False 31.321886 \n",
+ "False True 48.240042 \n",
+ "True True 34.495797 \n",
+ "False False 75.916536 \n",
+ "\n",
+ " Percent Hispanic or Latino (mean of median across all states) \\\n",
+ "Score L Relative Housing Burden for all househo... CalEnviroScreen Housing Burden Met Burden Thres... \n",
+ "True False 19.378348 \n",
+ "False True 12.553371 \n",
+ "True True 15.855933 \n",
+ "False False 7.463468 \n",
+ "\n",
+ " Total Number of Tracts - Mean of median across all states \\\n",
+ "Score L Relative Housing Burden for all househo... CalEnviroScreen Housing Burden Met Burden Thres... \n",
+ "True False 62.088889 \n",
+ "False True 44.549020 \n",
+ "True True 88.803922 \n",
+ "False False 1244.333333 \n",
+ "\n",
+ " Total Owned and Rented Burdened Households (Current Aggregation Methodology)- Mean of median across all states \n",
+ "Score L Relative Housing Burden for all househo... CalEnviroScreen Housing Burden Met Burden Thres... \n",
+ "True False 40949.200000 \n",
+ "False True 24719.098039 \n",
+ "True True 60757.901961 \n",
+ "False False 443680.568627 "
+ ]
+ },
+ "execution_count": 252,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 211,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def highlight_min(s, props=''):\n",
+ "    \"\"\"Return `props` where `s` equals its NaN-ignoring minimum and '' elsewhere.\"\"\"\n",
+ "    smallest = np.nanmin(s.values)\n",
+ "    return np.where(s == smallest, props, '')\n",
+ "\n",
+ "def highlight_max(s, props=''):\n",
+ "    \"\"\"Return `props` where `s` equals its NaN-ignoring maximum and '' elsewhere.\"\"\"\n",
+ "    largest = np.nanmax(s.values)\n",
+ "    return np.where(s == largest, props, '')\n",
+ "\n",
+ "# Format every column with two decimals; missing values are rendered as 'MISSING'.\n",
+ "two_decimals = {column: \"{:.2f}\" for column in df.columns}\n",
+ "df_styled = df.style.format(na_rep='MISSING', formatter=two_decimals)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 255,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Column-wise highlighting: the maximum gets a dark-blue background, the minimum a red one.\n",
+ "max_props = 'color:#FFFFFF;background-color:#00008b'\n",
+ "min_props = 'color:#FFFFFF;background-color:#FF0000'\n",
+ "df_styled_max_min = (\n",
+ "    df_styled\n",
+ "    .apply(highlight_max, props=max_props, axis=0)\n",
+ "    .apply(highlight_min, props=min_props, axis=0)\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 256,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_styled_max_min.to_excel(\"summary_all_states.xlsx\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 257,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
Percent of individuals < 200% Federal Poverty Line (mean of median across all states)
Percent individuals age 25 or over with less than high school degree (median across all tracts)
Percent enrollment in college or graduate school (median across all tracts)
Percent Black or African American alone (mean of median across all states)
Percent American Indian and Alaska Native alone (mean of median across all states)
Percent Non-Hispanic White (mean of median across all states)
Percent Hispanic or Latino (mean of median across all states)
Total Number of Tracts - Mean of median across all states
Total Owned and Rented Burdened Households (Current Aggregation Methodology)- Mean of median across all states
Score L Relative Housing Burden for all households met burden threshold
CalEnviroScreen Housing Burden Met Burden Threshold (Ranked Percentile)
Percent of individuals < 200% Federal Poverty Line (median across all tracts)
\n",
+ "
Percent individuals age 25 or over with less than high school degree (median across all tracts)
\n",
+ "
Percent enrollment in college or graduate school (median across all tracts)
\n",
+ "
Percent Black or African American alone (median across all tracts)
\n",
+ "
Percent American Indian and Alaska Native alone (median across all tracts)
\n",
+ "
Percent Non-Hispanic White (median across all tracts)
\n",
+ "
Percent Hispanic or Latino (median across all tracts)
\n",
+ "
Total Number of Unique Tracts
\n",
+ "
Total Owned and Rented Burdened Households (Current Aggregation Methodology)
\n",
+ "
\n",
+ "
\n",
+ "
state_name
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
AK
\n",
+ "
33.563081
\n",
+ "
10.407240
\n",
+ "
9.485638
\n",
+ "
4.553571
\n",
+ "
21.279762
\n",
+ "
48.839286
\n",
+ "
9.375000
\n",
+ "
1
\n",
+ "
825
\n",
+ "
\n",
+ "
\n",
+ "
AL
\n",
+ "
63.335854
\n",
+ "
20.273617
\n",
+ "
7.895408
\n",
+ "
79.236090
\n",
+ "
0.000000
\n",
+ "
8.436779
\n",
+ "
1.986055
\n",
+ "
14
\n",
+ "
7107
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Percent of individuals < 200% Federal Poverty Line (median across all tracts) \\\n",
+ "state_name \n",
+ "AK 33.563081 \n",
+ "AL 63.335854 \n",
+ "\n",
+ " Percent individuals age 25 or over with less than high school degree (median across all tracts) \\\n",
+ "state_name \n",
+ "AK 10.407240 \n",
+ "AL 20.273617 \n",
+ "\n",
+ " Percent enrollment in college or graduate school (median across all tracts) \\\n",
+ "state_name \n",
+ "AK 9.485638 \n",
+ "AL 7.895408 \n",
+ "\n",
+ " Percent Black or African American alone (median across all tracts) \\\n",
+ "state_name \n",
+ "AK 4.553571 \n",
+ "AL 79.236090 \n",
+ "\n",
+ " Percent American Indian and Alaska Native alone (median across all tracts) \\\n",
+ "state_name \n",
+ "AK 21.279762 \n",
+ "AL 0.000000 \n",
+ "\n",
+ " Percent Non-Hispanic White (median across all tracts) \\\n",
+ "state_name \n",
+ "AK 48.839286 \n",
+ "AL 8.436779 \n",
+ "\n",
+ " Percent Hispanic or Latino (median across all tracts) \\\n",
+ "state_name \n",
+ "AK 9.375000 \n",
+ "AL 1.986055 \n",
+ "\n",
+ " Total Number of Unique Tracts \\\n",
+ "state_name \n",
+ "AK 1 \n",
+ "AL 14 \n",
+ "\n",
+ " Total Owned and Rented Burdened Households (Current Aggregation Methodology) \n",
+ "state_name \n",
+ "AK 825 \n",
+ "AL 7107 "
+ ]
+ },
+ "execution_count": 272,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_1.head(2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 273,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Columns compared between the two per-state tables: the median columns first, then the totals.\n",
+ "median_columns = [\n",
+ "    'Percent of individuals < 200% Federal Poverty Line (median across all tracts)',\n",
+ "    'Percent individuals age 25 or over with less than high school degree (median across all tracts)',\n",
+ "    'Percent enrollment in college or graduate school (median across all tracts)',\n",
+ "    'Percent Black or African American alone (median across all tracts)',\n",
+ "    'Percent American Indian and Alaska Native alone (median across all tracts)',\n",
+ "    'Percent Non-Hispanic White (median across all tracts)',\n",
+ "    'Percent Hispanic or Latino (median across all tracts)',\n",
+ "]\n",
+ "total_columns = [\n",
+ "    'Total Number of Unique Tracts',\n",
+ "    'Total Owned and Rented Burdened Households (Current Aggregation Methodology)',\n",
+ "]\n",
+ "candidate_columns = median_columns + total_columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 276,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Element-wise df_1 - df_2 on the candidate columns; cells that come out NaN\n",
+ "# (e.g. a state present in only one of the frames) are reported as 0.\n",
+ "diff = df_1[candidate_columns].sub(df_2[candidate_columns]).fillna(0)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 277,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
Percent of individuals < 200% Federal Poverty Line (median across all tracts)
\n",
+ "
Percent individuals age 25 or over with less than high school degree (median across all tracts)
\n",
+ "
Percent enrollment in college or graduate school (median across all tracts)
\n",
+ "
Percent Black or African American alone (median across all tracts)
\n",
+ "
Percent American Indian and Alaska Native alone (median across all tracts)
\n",
+ "
Percent Non-Hispanic White (median across all tracts)
\n",
+ "
Percent Hispanic or Latino (median across all tracts)
\n",
+ "
Total Number of Unique Tracts
\n",
+ "
Total Owned and Rented Burdened Households (Current Aggregation Methodology)
\n",
+ "
\n",
+ "
\n",
+ "
state_name
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
AK
\n",
+ "
0.985223
\n",
+ "
-0.274578
\n",
+ "
3.206568
\n",
+ "
-1.085258
\n",
+ "
8.721076
\n",
+ "
-6.803172
\n",
+ "
-1.778285
\n",
+ "
-10.0
\n",
+ "
-4179.0
\n",
+ "
\n",
+ "
\n",
+ "
AL
\n",
+ "
6.211917
\n",
+ "
4.283941
\n",
+ "
0.331430
\n",
+ "
6.212278
\n",
+ "
0.000000
\n",
+ "
-9.011147
\n",
+ "
-0.630017
\n",
+ "
-28.0
\n",
+ "
-15422.0
\n",
+ "
\n",
+ "
\n",
+ "
AR
\n",
+ "
5.432052
\n",
+ "
1.601861
\n",
+ "
3.942754
\n",
+ "
20.319049
\n",
+ "
0.102459
\n",
+ "
-23.263854
\n",
+ "
-0.896646
\n",
+ "
-35.0
\n",
+ "
-16843.0
\n",
+ "
\n",
+ "
\n",
+ "
AZ
\n",
+ "
13.594871
\n",
+ "
11.979424
\n",
+ "
-0.272040
\n",
+ "
1.361388
\n",
+ "
-0.915677
\n",
+ "
-8.953361
\n",
+ "
8.757832
\n",
+ "
-39.0
\n",
+ "
-23253.0
\n",
+ "
\n",
+ "
\n",
+ "
CA
\n",
+ "
15.717598
\n",
+ "
16.799587
\n",
+ "
-0.702951
\n",
+ "
0.247647
\n",
+ "
0.415820
\n",
+ "
-21.870519
\n",
+ "
40.216194
\n",
+ "
783.0
\n",
+ "
542144.0
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Percent of individuals < 200% Federal Poverty Line (median across all tracts) \\\n",
+ "state_name \n",
+ "AK 0.985223 \n",
+ "AL 6.211917 \n",
+ "AR 5.432052 \n",
+ "AZ 13.594871 \n",
+ "CA 15.717598 \n",
+ "\n",
+ " Percent individuals age 25 or over with less than high school degree (median across all tracts) \\\n",
+ "state_name \n",
+ "AK -0.274578 \n",
+ "AL 4.283941 \n",
+ "AR 1.601861 \n",
+ "AZ 11.979424 \n",
+ "CA 16.799587 \n",
+ "\n",
+ " Percent enrollment in college or graduate school (median across all tracts) \\\n",
+ "state_name \n",
+ "AK 3.206568 \n",
+ "AL 0.331430 \n",
+ "AR 3.942754 \n",
+ "AZ -0.272040 \n",
+ "CA -0.702951 \n",
+ "\n",
+ " Percent Black or African American alone (median across all tracts) \\\n",
+ "state_name \n",
+ "AK -1.085258 \n",
+ "AL 6.212278 \n",
+ "AR 20.319049 \n",
+ "AZ 1.361388 \n",
+ "CA 0.247647 \n",
+ "\n",
+ " Percent American Indian and Alaska Native alone (median across all tracts) \\\n",
+ "state_name \n",
+ "AK 8.721076 \n",
+ "AL 0.000000 \n",
+ "AR 0.102459 \n",
+ "AZ -0.915677 \n",
+ "CA 0.415820 \n",
+ "\n",
+ " Percent Non-Hispanic White (median across all tracts) \\\n",
+ "state_name \n",
+ "AK -6.803172 \n",
+ "AL -9.011147 \n",
+ "AR -23.263854 \n",
+ "AZ -8.953361 \n",
+ "CA -21.870519 \n",
+ "\n",
+ " Percent Hispanic or Latino (median across all tracts) \\\n",
+ "state_name \n",
+ "AK -1.778285 \n",
+ "AL -0.630017 \n",
+ "AR -0.896646 \n",
+ "AZ 8.757832 \n",
+ "CA 40.216194 \n",
+ "\n",
+ " Total Number of Unique Tracts \\\n",
+ "state_name \n",
+ "AK -10.0 \n",
+ "AL -28.0 \n",
+ "AR -35.0 \n",
+ "AZ -39.0 \n",
+ "CA 783.0 \n",
+ "\n",
+ " Total Owned and Rented Burdened Households (Current Aggregation Methodology) \n",
+ "state_name \n",
+ "AK -4179.0 \n",
+ "AL -15422.0 \n",
+ "AR -16843.0 \n",
+ "AZ -23253.0 \n",
+ "CA 542144.0 "
+ ]
+ },
+ "execution_count": 277,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "diff.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 279,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
Percent of individuals < 200% Federal Poverty Line (median across all tracts)
Percent individuals age 25 or over with less than high school degree (median across all tracts)
Percent enrollment in college or graduate school (median across all tracts)
Percent Black or African American alone (median across all tracts)
Percent American Indian and Alaska Native alone (median across all tracts)
Percent Non-Hispanic White (median across all tracts)
Percent Hispanic or Latino (median across all tracts)
Total Number of Unique Tracts
Total Owned and Rented Burdened Households (Current Aggregation Methodology)
state_name
\n",
+ "
\n",
+ "
AK
\n",
+ "
0.985223
\n",
+ "
-0.274578
\n",
+ "
3.206568
\n",
+ "
-1.085258
\n",
+ "
8.721076
\n",
+ "
-6.803172
\n",
+ "
-1.778285
\n",
+ "
-10.000000
\n",
+ "
-4179.000000
\n",
+ "
\n",
+ "
\n",
+ "
AL
\n",
+ "
6.211917
\n",
+ "
4.283941
\n",
+ "
0.331430
\n",
+ "
6.212278
\n",
+ "
0.000000
\n",
+ "
-9.011147
\n",
+ "
-0.630017
\n",
+ "
-28.000000
\n",
+ "
-15422.000000
\n",
+ "
\n",
+ "
\n",
+ "
AR
\n",
+ "
5.432052
\n",
+ "
1.601861
\n",
+ "
3.942754
\n",
+ "
20.319049
\n",
+ "
0.102459
\n",
+ "
-23.263854
\n",
+ "
-0.896646
\n",
+ "
-35.000000
\n",
+ "
-16843.000000
\n",
+ "
\n",
+ "
\n",
+ "
AZ
\n",
+ "
13.594871
\n",
+ "
11.979424
\n",
+ "
-0.272040
\n",
+ "
1.361388
\n",
+ "
-0.915677
\n",
+ "
-8.953361
\n",
+ "
8.757832
\n",
+ "
-39.000000
\n",
+ "
-23253.000000
\n",
+ "
\n",
+ "
\n",
+ "
CA
\n",
+ "
15.717598
\n",
+ "
16.799587
\n",
+ "
-0.702951
\n",
+ "
0.247647
\n",
+ "
0.415820
\n",
+ "
-21.870519
\n",
+ "
40.216194
\n",
+ "
783.000000
\n",
+ "
542144.000000
\n",
+ "
\n",
+ "
\n",
+ "
CO
\n",
+ "
5.267963
\n",
+ "
0.816207
\n",
+ "
0.089370
\n",
+ "
5.338649
\n",
+ "
0.149809
\n",
+ "
-24.425903
\n",
+ "
10.133080
\n",
+ "
-34.000000
\n",
+ "
-17296.000000
\n",
+ "
\n",
+ "
\n",
+ "
CT
\n",
+ "
20.337916
\n",
+ "
6.865214
\n",
+ "
0.574592
\n",
+ "
-7.952396
\n",
+ "
0.000000
\n",
+ "
-9.384233
\n",
+ "
3.734253
\n",
+ "
48.000000
\n",
+ "
33786.000000
\n",
+ "
\n",
+ "
\n",
+ "
DC
\n",
+ "
20.225619
\n",
+ "
9.277701
\n",
+ "
-10.589364
\n",
+ "
65.946763
\n",
+ "
0.000000
\n",
+ "
-49.789447
\n",
+ "
-6.124777
\n",
+ "
13.000000
\n",
+ "
9673.000000
\n",
+ "
\n",
+ "
\n",
+ "
DE
\n",
+ "
1.370275
\n",
+ "
-1.231413
\n",
+ "
-4.499021
\n",
+ "
-27.806872
\n",
+ "
0.581898
\n",
+ "
9.572051
\n",
+ "
6.704117
\n",
+ "
-8.000000
\n",
+ "
-2816.000000
\n",
+ "
\n",
+ "
\n",
+ "
FL
\n",
+ "
12.798641
\n",
+ "
7.913089
\n",
+ "
-0.464450
\n",
+ "
13.453116
\n",
+ "
0.000000
\n",
+ "
-19.567955
\n",
+ "
2.960044
\n",
+ "
88.000000
\n",
+ "
71042.000000
\n",
+ "
\n",
+ "
\n",
+ "
GA
\n",
+ "
0.623927
\n",
+ "
2.733393
\n",
+ "
-0.799951
\n",
+ "
1.511527
\n",
+ "
-0.093487
\n",
+ "
-9.680710
\n",
+ "
2.172740
\n",
+ "
-3.000000
\n",
+ "
12615.000000
\n",
+ "
\n",
+ "
\n",
+ "
HI
\n",
+ "
4.897141
\n",
+ "
9.101461
\n",
+ "
0.265372
\n",
+ "
-0.896060
\n",
+ "
-0.094805
\n",
+ "
-35.939385
\n",
+ "
0.379145
\n",
+ "
7.000000
\n",
+ "
4771.000000
\n",
+ "
\n",
+ "
\n",
+ "
IA
\n",
+ "
10.953716
\n",
+ "
-0.708470
\n",
+ "
-4.742017
\n",
+ "
-1.623474
\n",
+ "
0.732012
\n",
+ "
19.552402
\n",
+ "
-7.997991
\n",
+ "
-53.000000
\n",
+ "
-25766.000000
\n",
+ "
\n",
+ "
\n",
+ "
ID
\n",
+ "
4.722145
\n",
+ "
9.643493
\n",
+ "
7.189788
\n",
+ "
-0.311804
\n",
+ "
-1.015539
\n",
+ "
-15.992569
\n",
+ "
19.816490
\n",
+ "
-23.000000
\n",
+ "
-16597.000000
\n",
+ "
\n",
+ "
\n",
+ "
IL
\n",
+ "
5.155379
\n",
+ "
5.303919
\n",
+ "
-0.630714
\n",
+ "
5.037600
\n",
+ "
0.000000
\n",
+ "
-25.049480
\n",
+ "
6.050435
\n",
+ "
93.000000
\n",
+ "
54402.000000
\n",
+ "
\n",
+ "
\n",
+ "
IN
\n",
+ "
2.113662
\n",
+ "
2.437293
\n",
+ "
0.719657
\n",
+ "
7.315713
\n",
+ "
0.000000
\n",
+ "
1.347745
\n",
+ "
0.492553
\n",
+ "
-36.000000
\n",
+ "
-16444.000000
\n",
+ "
\n",
+ "
\n",
+ "
KS
\n",
+ "
5.264502
\n",
+ "
-7.562943
\n",
+ "
0.627460
\n",
+ "
12.245890
\n",
+ "
0.466634
\n",
+ "
1.465143
\n",
+ "
-0.964424
\n",
+ "
-46.000000
\n",
+ "
-21169.000000
\n",
+ "
\n",
+ "
\n",
+ "
KY
\n",
+ "
8.794436
\n",
+ "
6.683970
\n",
+ "
0.862572
\n",
+ "
-0.575122
\n",
+ "
0.578035
\n",
+ "
-6.133101
\n",
+ "
5.455541
\n",
+ "
-57.000000
\n",
+ "
-27542.000000
\n",
+ "
\n",
+ "
\n",
+ "
LA
\n",
+ "
8.343117
\n",
+ "
3.494850
\n",
+ "
1.398804
\n",
+ "
7.964377
\n",
+ "
0.000000
\n",
+ "
-11.833643
\n",
+ "
-1.607310
\n",
+ "
27.000000
\n",
+ "
19739.000000
\n",
+ "
\n",
+ "
\n",
+ "
MA
\n",
+ "
8.936772
\n",
+ "
7.732666
\n",
+ "
1.005028
\n",
+ "
8.415698
\n",
+ "
0.000000
\n",
+ "
-19.359667
\n",
+ "
15.170475
\n",
+ "
38.000000
\n",
+ "
28222.000000
\n",
+ "
\n",
+ "
\n",
+ "
MD
\n",
+ "
9.100530
\n",
+ "
3.424516
\n",
+ "
-1.341899
\n",
+ "
21.848353
\n",
+ "
0.000000
\n",
+ "
-10.271062
\n",
+ "
-2.199229
\n",
+ "
-26.000000
\n",
+ "
-8900.000000
\n",
+ "
\n",
+ "
\n",
+ "
ME
\n",
+ "
12.565009
\n",
+ "
9.122611
\n",
+ "
-3.922820
\n",
+ "
-0.880265
\n",
+ "
-0.439467
\n",
+ "
-5.913473
\n",
+ "
-0.866007
\n",
+ "
-25.000000
\n",
+ "
-14441.000000
\n",
+ "
\n",
+ "
\n",
+ "
MI
\n",
+ "
6.453785
\n",
+ "
1.068588
\n",
+ "
0.689832
\n",
+ "
6.708850
\n",
+ "
0.000000
\n",
+ "
-8.072578
\n",
+ "
-1.350049
\n",
+ "
-23.000000
\n",
+ "
-6868.000000
\n",
+ "
\n",
+ "
\n",
+ "
MN
\n",
+ "
15.920965
\n",
+ "
17.740780
\n",
+ "
3.141607
\n",
+ "
22.498320
\n",
+ "
1.758343
\n",
+ "
-38.332396
\n",
+ "
22.509297
\n",
+ "
-87.000000
\n",
+ "
-46742.000000
\n",
+ "
\n",
+ "
\n",
+ "
MO
\n",
+ "
4.976759
\n",
+ "
-0.085646
\n",
+ "
-3.577408
\n",
+ "
47.102090
\n",
+ "
0.000000
\n",
+ "
-35.919908
\n",
+ "
-2.378652
\n",
+ "
-30.000000
\n",
+ "
-13248.000000
\n",
+ "
\n",
+ "
\n",
+ "
MS
\n",
+ "
8.484879
\n",
+ "
1.040226
\n",
+ "
-0.572744
\n",
+ "
24.863184
\n",
+ "
0.000000
\n",
+ "
-22.951000
\n",
+ "
0.051741
\n",
+ "
-27.000000
\n",
+ "
-12524.000000
\n",
+ "
\n",
+ "
\n",
+ "
MT
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
\n",
+ "
\n",
+ "
NC
\n",
+ "
8.153384
\n",
+ "
11.292289
\n",
+ "
-2.811982
\n",
+ "
6.496331
\n",
+ "
0.078513
\n",
+ "
-19.504668
\n",
+ "
11.151508
\n",
+ "
-71.000000
\n",
+ "
-45011.000000
\n",
+ "
\n",
+ "
\n",
+ "
ND
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
\n",
+ "
\n",
+ "
NE
\n",
+ "
7.484343
\n",
+ "
37.676269
\n",
+ "
-9.271518
\n",
+ "
-7.562947
\n",
+ "
0.634817
\n",
+ "
-37.670893
\n",
+ "
58.015686
\n",
+ "
-27.000000
\n",
+ "
-13452.000000
\n",
+ "
\n",
+ "
\n",
+ "
NH
\n",
+ "
11.333692
\n",
+ "
7.394888
\n",
+ "
-1.695591
\n",
+ "
2.079689
\n",
+ "
0.000000
\n",
+ "
-14.216572
\n",
+ "
9.124264
\n",
+ "
-16.000000
\n",
+ "
-11851.000000
\n",
+ "
\n",
+ "
\n",
+ "
NJ
\n",
+ "
5.927327
\n",
+ "
8.713014
\n",
+ "
-2.337923
\n",
+ "
-10.341142
\n",
+ "
0.000000
\n",
+ "
-5.623117
\n",
+ "
25.764242
\n",
+ "
151.000000
\n",
+ "
102626.000000
\n",
+ "
\n",
+ "
\n",
+ "
NM
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
\n",
+ "
\n",
+ "
NV
\n",
+ "
7.485453
\n",
+ "
2.632552
\n",
+ "
-0.783104
\n",
+ "
-0.500327
\n",
+ "
0.076985
\n",
+ "
2.512022
\n",
+ "
8.687485
\n",
+ "
12.000000
\n",
+ "
14693.000000
\n",
+ "
\n",
+ "
\n",
+ "
NY
\n",
+ "
12.369088
\n",
+ "
8.434444
\n",
+ "
-0.870792
\n",
+ "
-10.437037
\n",
+ "
0.000000
\n",
+ "
4.477567
\n",
+ "
15.929982
\n",
+ "
573.000000
\n",
+ "
407383.000000
\n",
+ "
\n",
+ "
\n",
+ "
OH
\n",
+ "
5.166431
\n",
+ "
2.626470
\n",
+ "
0.160945
\n",
+ "
16.322108
\n",
+ "
0.000000
\n",
+ "
-16.400598
\n",
+ "
0.628654
\n",
+ "
-21.000000
\n",
+ "
355.000000
\n",
+ "
\n",
+ "
\n",
+ "
OK
\n",
+ "
17.267863
\n",
+ "
5.687201
\n",
+ "
-2.733525
\n",
+ "
5.227426
\n",
+ "
-1.254840
\n",
+ "
-4.629739
\n",
+ "
13.684527
\n",
+ "
-60.000000
\n",
+ "
-25920.000000
\n",
+ "
\n",
+ "
\n",
+ "
OR
\n",
+ "
10.312702
\n",
+ "
7.677240
\n",
+ "
-0.667578
\n",
+ "
1.177527
\n",
+ "
-0.110343
\n",
+ "
-15.168623
\n",
+ "
16.277201
\n",
+ "
-35.000000
\n",
+ "
-26320.000000
\n",
+ "
\n",
+ "
\n",
+ "
PA
\n",
+ "
9.568567
\n",
+ "
6.789819
\n",
+ "
-1.100045
\n",
+ "
2.768551
\n",
+ "
0.000000
\n",
+ "
-7.188058
\n",
+ "
6.730014
\n",
+ "
-63.000000
\n",
+ "
-33293.000000
\n",
+ "
\n",
+ "
\n",
+ "
RI
\n",
+ "
28.097665
\n",
+ "
18.885313
\n",
+ "
-28.064680
\n",
+ "
9.569074
\n",
+ "
0.344864
\n",
+ "
-42.404295
\n",
+ "
39.740670
\n",
+ "
5.000000
\n",
+ "
4138.000000
\n",
+ "
\n",
+ "
\n",
+ "
SC
\n",
+ "
3.709508
\n",
+ "
2.304021
\n",
+ "
3.766892
\n",
+ "
19.048750
\n",
+ "
0.000000
\n",
+ "
-12.734147
\n",
+ "
1.243649
\n",
+ "
-43.000000
\n",
+ "
-22081.000000
\n",
+ "
\n",
+ "
\n",
+ "
SD
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
\n",
+ "
\n",
+ "
TN
\n",
+ "
8.692949
\n",
+ "
4.908793
\n",
+ "
-1.487781
\n",
+ "
31.162224
\n",
+ "
0.000000
\n",
+ "
-26.640514
\n",
+ "
2.135541
\n",
+ "
-31.000000
\n",
+ "
-13021.000000
\n",
+ "
\n",
+ "
\n",
+ "
TX
\n",
+ "
8.229632
\n",
+ "
4.785835
\n",
+ "
-0.018445
\n",
+ "
7.608903
\n",
+ "
0.000000
\n",
+ "
-6.991503
\n",
+ "
11.472775
\n",
+ "
-103.000000
\n",
+ "
-58670.000000
\n",
+ "
\n",
+ "
\n",
+ "
UT
\n",
+ "
12.401135
\n",
+ "
9.279480
\n",
+ "
2.393714
\n",
+ "
-1.095225
\n",
+ "
2.211103
\n",
+ "
-27.444168
\n",
+ "
33.552393
\n",
+ "
-31.000000
\n",
+ "
-18553.000000
\n",
+ "
\n",
+ "
\n",
+ "
VA
\n",
+ "
12.747983
\n",
+ "
4.210496
\n",
+ "
-2.294801
\n",
+ "
20.581487
\n",
+ "
0.000000
\n",
+ "
-12.192675
\n",
+ "
2.823834
\n",
+ "
-48.000000
\n",
+ "
-24962.000000
\n",
+ "
\n",
+ "
\n",
+ "
VT
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
\n",
+ "
\n",
+ "
WA
\n",
+ "
0.836843
\n",
+ "
3.285992
\n",
+ "
-2.853985
\n",
+ "
1.365010
\n",
+ "
0.242227
\n",
+ "
-14.708699
\n",
+ "
5.595930
\n",
+ "
-84.000000
\n",
+ "
-61277.000000
\n",
+ "
\n",
+ "
\n",
+ "
WI
\n",
+ "
15.552119
\n",
+ "
4.422237
\n",
+ "
-0.796047
\n",
+ "
40.591511
\n",
+ "
-0.526995
\n",
+ "
-20.282972
\n",
+ "
-3.363747
\n",
+ "
-17.000000
\n",
+ "
-8795.000000
\n",
+ "
\n",
+ "
\n",
+ "
WV
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
\n",
+ "
\n",
+ "
WY
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 279,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "def style_negative(v, props=''):\n",
+ "    \"\"\"Return the CSS string `props` when the cell value is negative, else None (no styling).\"\"\"\n",
+ "    return props if v < 0 else None\n",
+ "\n",
+ "\n",
+ "def style_zero(v, props=''):\n",
+ "    \"\"\"Return the CSS string `props` when the cell value is exactly zero, else None.\"\"\"\n",
+ "    return props if v == 0 else None\n",
+ "\n",
+ "\n",
+ "# Negative differences are drawn in red and zero entries are dimmed.\n",
+ "# NOTE(review): the previous dimming predicate, (v < 0.0) and (v > 0.0), could never be\n",
+ "# true, so the opacity rule was never applied; dimming exact zeros is the assumed intent.\n",
+ "diff_style = (\n",
+ "    diff.style\n",
+ "    .applymap(style_negative, props='color:red;')\n",
+ "    .applymap(style_zero, props='opacity: 20%;')\n",
+ ")\n",
+ "diff_style"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.2"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/data/data-pipeline/data_pipeline/ipython/hud_eda_se_12_27-highlevel-summarization.ipynb b/data/data-pipeline/data_pipeline/ipython/hud_eda_se_12_27-highlevel-summarization.ipynb
new file mode 100644
index 00000000..9c671466
--- /dev/null
+++ b/data/data-pipeline/data_pipeline/ipython/hud_eda_se_12_27-highlevel-summarization.ipynb
@@ -0,0 +1,3434 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Methodology to address fundamental problem 1 itemized in Issue 1024 - follow-up comparing tabulations and relative household burden. This time I extend the 12-11 notebook to look at how the percentile ranks affect the proportion of tracts considered burdened versus the current methodology."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Indicator reviewed: \n",
+ "\n",
+ "Socioeconomic Factors Indicator reviewed\n",
+ "* [Extreme Housing Burden](#housingburden)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Packages"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import math\n",
+ "import os\n",
+ "import shutil\n",
+ "import sys\n",
+ "import zipfile\n",
+ "from pathlib import Path\n",
+ "\n",
+ "import matplotlib.pyplot as plt\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import requests\n",
+ "import seaborn as sns\n",
+ "\n",
+ "%matplotlib inline"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### ETL process for acquiring relevant tables"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### NOTE: If you ran the ETL Process to acquire Table 8 in the other notebook of this draft PR you do not need to run the ETL cell block again"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Helpers copied and adapted from data_pipeline.utils for fetching the HUD CHAS extract.\n",
+ "\n",
+ "def download_hud_dataset():\n",
+ "    \"\"\"Download the zipped 2014-2018 tract-level CHAS archive into the working directory.\n",
+ "\n",
+ "    The archive is saved as ``HUD_ZIPPED.csv`` (it is a zip file despite the name).\n",
+ "    Terminates via ``sys.exit`` with a descriptive message on any non-200 response.\n",
+ "    \"\"\"\n",
+ "    DOWNLOAD_FILENAME = \"HUD_ZIPPED.csv\"\n",
+ "    HOUSING_FTP_URL = \"https://www.huduser.gov/portal/datasets/cp/2014thru2018-140-csv.zip\"\n",
+ "    response = requests.get(HOUSING_FTP_URL, verify=True)\n",
+ "    if response.status_code != 200:\n",
+ "        # The message previously interpolated an undefined name (file_url), so a failed\n",
+ "        # download surfaced as a NameError instead of reporting the HTTP status.\n",
+ "        sys.exit(\n",
+ "            f\"HTTP response {response.status_code} from url {HOUSING_FTP_URL}. Info: {response.content}\"\n",
+ "        )\n",
+ "\n",
+ "    # Write the contents to disk; the context manager closes the handle even on error.\n",
+ "    with open(DOWNLOAD_FILENAME, \"wb\") as file:\n",
+ "        file.write(response.content)\n",
+ "\n",
+ "\n",
+ "def extract_zipped_download(zip_file_path, unzipped_path):\n",
+ "    \"\"\"Extract every member of `zip_file_path` into `unzipped_path`, then delete the archive.\"\"\"\n",
+ "    with zipfile.ZipFile(zip_file_path, \"r\") as zip_ref:\n",
+ "        zip_ref.extractall(unzipped_path)\n",
+ "    # cleanup temporary file\n",
+ "    os.remove(zip_file_path)\n",
+ "\n",
+ "\n",
+ "def up_one_directory(path):\n",
+ "    \"\"\"Move the file at `path` out of its containing folder into that folder's parent.\n",
+ "\n",
+ "    Does nothing when `path` is too shallow to have such a parent (IndexError).\n",
+ "    \"\"\"\n",
+ "    try:\n",
+ "        # parents[0] is the containing folder; parents[1] is one level above it.\n",
+ "        parent_dir = Path(path).parents[1]\n",
+ "        shutil.move(path, parent_dir)\n",
+ "    except IndexError:\n",
+ "        # no upper directory\n",
+ "        pass\n",
+ "\n",
+ "\n",
+ "# Fetch the archive, unpack it next to the notebook, keep only Table8.csv and\n",
+ "# remove the rest of the extracted ./140/ folder.\n",
+ "CURRENT_DIRECTORY = os.getcwd()\n",
+ "download_hud_dataset()\n",
+ "extract_zipped_download(CURRENT_DIRECTORY + \"/HUD_ZIPPED.csv\", CURRENT_DIRECTORY)\n",
+ "up_one_directory(CURRENT_DIRECTORY + \"/140/Table8.csv\")\n",
+ "shutil.rmtree(\"./140/\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Extreme Housing Burden \n",
+ "\n",
+ "The Extreme Housing Burden indicator represents the proportion of low-income households that have to spend more than half their income on rent. These households experience higher levels of stress, report lower health, and may delay medical treatment because of its high cost.\n",
+ "\n",
+ "The Extreme Housing Burden indicator measures the percent of households in a census tract that are:\n",
+ "\n",
+ "1. Making less than 80% of the Area Median Family Income as determined by the Department of Housing and Urban Development (HUD), and\n",
+ "2. Paying greater than 50% of their income to housing costs. \n",
+ "\n",
+ "This data is sourced from the 2014-2018 Comprehensive Housing Affordability Strategy dataset from the Department of Housing and Urban Development (HUD) using the census tract geographic summary level, and contains cost burdens for households by percent HUD-adjusted median family income (HAMFI) category. This data can be found [here](https://www.huduser.gov/portal/datasets/cp.html). \n",
+ "\n",
+ "Because CHAS data is based on American Community Survey (ACS) estimates, which come from a sample of the population, they may be unreliable if based on a small sample or population size.\n",
+ "\n",
+ "The standard error and relative standard error were used to evaluate the reliability of each estimate using CalEnviroScreen’s methodology. \n",
+ "\n",
+ "Census tract estimates that met either of the following criteria were considered reliable and included in the analysis [(CalEnviroScreen, 2017, page 129)](https://oehha.ca.gov/media/downloads/calenviroscreen/report/ces3report.pdf ):\n",
+ "\n",
+ "- Relative standard error less than 50 (meaning the standard error was less than half of the estimate), OR \n",
+ "- Standard error less than the mean standard error of all census tract estimates \n",
+ "\n",
+ "Formulas for calculating the standard error of sums, proportions, and ratio come from the [American Communities Survey Office](https://www2.census.gov/programs-surveys/acs/tech_docs/accuracy/MultiyearACSAccuracyofData2013.pdf).\n",
+ "\n",
+ "Note that this code creates a score and rank by state, for every state."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The relevant variables in table 8 of the CHAS dataset are the following (CHAS data dictionary available [here](https://www.huduser.gov/portal/datasets/cp/CHAS-data-dictionary-14-18.xlsx)):\n",
+ "\n",
+ "| Name | Label |\n",
+ "|---------|-----------------------------------------------------|\n",
+ "|T8_est1 | Total Occupied housing units | \n",
+ "|T8_est10 | Owner occupied less than or equal to 30% of HAMFI cost burden greater than 50% |\n",
+ "|T8_est23 |Owner occupied greater than 30% but less than or equal to 50% of HAMFI\tcost burden greater than 50%|\n",
+ "|T8_est36 |Owner occupied\tgreater than 50% but less than or equal to 80% of HAMFI\tcost burden greater than 50%|\n",
+ "|T8_est76 | Renter occupied less than or equal to 30% of HAMFI cost burden greater than 50% |\n",
+ "|T8_est89 |Renter occupied\tgreater than 30% but less than or equal to 50% of HAMFI\tcost burden greater than 50%|\n",
+ "|T8_est102|Renter occupied\tgreater than 50% but less than or equal to 80% of HAMFI\tcost burden greater than 50%|\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Below I also propose an alternate means for ranking census tracts\n",
+ "### These steps are outlined and commented below"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/sarahluw/.pyenv/versions/3.6.2/envs/my-virtual-env-3.6.2/lib/python3.6/site-packages/pandas/core/series.py:726: RuntimeWarning: invalid value encountered in sqrt\n",
+ " result = getattr(ufunc, method)(*inputs, **kwargs)\n",
+ "/Users/sarahluw/.pyenv/versions/3.6.2/envs/my-virtual-env-3.6.2/lib/python3.6/site-packages/pandas/core/indexing.py:670: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " iloc._setitem_with_indexer(indexer, value)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Read in the data from https://www.huduser.gov/portal/datasets/cp.html\n",
+ "housing = pd.read_csv(\"Table8.csv\",\n",
+ "                      encoding = \"ISO-8859-1\",\n",
+ "                      dtype = {'Tract_ID': object, 'st': object, 'geoid': object})\n",
+ "\n",
+ "# Drop Puerto Rico (state FIPS code 72); this is the only code filtered out here.\n",
+ "# .copy() gives an independent frame so the column assignments below do not warn.\n",
+ "housing = housing[housing['st'] != '72'].copy()\n",
+ "\n",
+ "# Combine owner and renter occupied low-income households that make less than 80% of HAMFI into one variable\n",
+ "housing['summed'] = (housing['T8_est10'] +\n",
+ "                     housing['T8_est23'] +\n",
+ "                     housing['T8_est36'] +\n",
+ "                     housing['T8_est76'] +\n",
+ "                     housing['T8_est89'] +\n",
+ "                     housing['T8_est102'])\n",
+ "\n",
+ "# Standard error of the summed variable: each margin of error is converted to a\n",
+ "# standard error (divide by 1.645) and the SEs are combined in quadrature.\n",
+ "housing['summed_se'] = np.sqrt((housing['T8_moe10'] / 1.645)**2 +\n",
+ "                               (housing['T8_moe23'] / 1.645)**2 +\n",
+ "                               (housing['T8_moe36'] / 1.645)**2 +\n",
+ "                               (housing['T8_moe76'] / 1.645)**2 +\n",
+ "                               (housing['T8_moe89'] / 1.645)**2 +\n",
+ "                               (housing['T8_moe102'] / 1.645)**2)\n",
+ "\n",
+ "# Keep only the last 11 characters of geoid (the tract FIPS code)\n",
+ "housing['geoid'] = housing['geoid'].str[-11:]\n",
+ "\n",
+ "# Estimate of the proportion of all occupied units that are low-income and heavily cost burdened\n",
+ "housing['hbrd_score'] = housing['summed'] / housing['T8_est1']\n",
+ "\n",
+ "# Change rates where the denominator is 0 (infinite ratio) to nan\n",
+ "housing['hbrd_score'] = housing['hbrd_score'].replace(np.inf, np.nan)\n",
+ "\n",
+ "\n",
+ "def se_prop(x, y, se_x, moe_y):\n",
+ "    \"\"\"Standard error of the proportion x / y.\n",
+ "\n",
+ "    x, y  -- numerator and denominator estimates\n",
+ "    se_x  -- standard error of x\n",
+ "    moe_y -- margin of error of y (divided by 1.645 to obtain its standard error)\n",
+ "\n",
+ "    Uses the proportion formula; where the value under the radical would be\n",
+ "    negative, the ratio formula (same terms, added instead of subtracted) is\n",
+ "    used instead. Returns a numpy array.\n",
+ "    \"\"\"\n",
+ "    se_y = moe_y / 1.645\n",
+ "    y_term = ((x**2) / (y**2)) * (se_y**2)\n",
+ "    radicand = se_x**2 - y_term\n",
+ "    # Pick the radicand before taking the root so np.sqrt never sees a negative value;\n",
+ "    # evaluating both branches first is what raised the 'invalid value' RuntimeWarning.\n",
+ "    radicand = np.where(radicand < 0, se_x**2 + y_term, radicand)\n",
+ "    return np.asarray((1 / y) * np.sqrt(radicand))\n",
+ "\n",
+ "\n",
+ "housing['se'] = se_prop(housing['summed'], housing['T8_est1'], housing['summed_se'], housing['T8_moe1'])\n",
+ "\n",
+ "# Calculate the relative standard error (in percent)\n",
+ "housing['rse'] = housing['se'] / housing['hbrd_score']*100\n",
+ "\n",
+ "# Change infinite rse's where the housing burden is 0 to np.nan\n",
+ "housing['rse'] = housing['rse'].replace(np.inf, np.nan)\n",
+ "\n",
+ "# Mean standard error of the tracts in each state, broadcast back to every tract\n",
+ "housing['mean_state_se'] = housing.groupby('st')['se'].transform('mean')\n",
+ "\n",
+ "# An estimate is considered reliable when EITHER criterion holds:\n",
+ "#   - RSE less than 50, OR\n",
+ "#   - SE less than the mean SE of its state\n",
+ "# so it is unreliable only when it fails both; those scores are converted to nan.\n",
+ "# (The second test previously compared rse, not se, against mean_state_se, which made\n",
+ "# it almost always true because rse is on a percent scale.)\n",
+ "housing.loc[(housing['rse'] >= 50) & (housing['se'] >= housing['mean_state_se']), 'hbrd_score'] = np.nan\n",
+ "\n",
+ "# Rename columns\n",
+ "housing = housing.rename(columns = {'geoid' :'FIPS_tract_id',\n",
+ "                                    'st' : 'state'\n",
+ "                                    })\n",
+ "\n",
+ "# Calculate percentile rank for census tracts with a score above 0, set percentile to 0 if score is 0, for each state\n",
+ "nonzero_scores = housing.loc[housing['hbrd_score'] != 0, ['hbrd_score', 'state']]\n",
+ "housing['hbrd_rank'] = nonzero_scores.groupby('state')['hbrd_score'].rank(\n",
+ "    na_option = 'keep',\n",
+ "    pct = True) * 100\n",
+ "\n",
+ "housing.loc[housing['hbrd_score'] == 0, 'hbrd_rank'] = 0\n",
+ "\n",
+ "# Create final housing burden df\n",
+ "housingburden = housing.copy()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
source
\n",
+ "
sumlevel
\n",
+ "
FIPS_tract_id
\n",
+ "
name
\n",
+ "
state
\n",
+ "
cnty
\n",
+ "
tract
\n",
+ "
T8_est1
\n",
+ "
T8_est2
\n",
+ "
T8_est3
\n",
+ "
...
\n",
+ "
T8_moe131
\n",
+ "
T8_moe132
\n",
+ "
T8_moe133
\n",
+ "
summed
\n",
+ "
summed_se
\n",
+ "
hbrd_score
\n",
+ "
se
\n",
+ "
rse
\n",
+ "
mean_state_se
\n",
+ "
hbrd_rank
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
0
\n",
+ "
2014thru2018
\n",
+ "
140
\n",
+ "
01001020100
\n",
+ "
Census Tract 201, Autauga County, Alabama
\n",
+ "
01
\n",
+ "
1
\n",
+ "
20100
\n",
+ "
765
\n",
+ "
570
\n",
+ "
50
\n",
+ "
...
\n",
+ "
12
\n",
+ "
12
\n",
+ "
12
\n",
+ "
80
\n",
+ "
31.721807
\n",
+ "
0.104575
\n",
+ "
0.041032
\n",
+ "
39.237314
\n",
+ "
0.036604
\n",
+ "
46.298077
\n",
+ "
\n",
+ "
\n",
+ "
1
\n",
+ "
2014thru2018
\n",
+ "
140
\n",
+ "
01001020200
\n",
+ "
Census Tract 202, Autauga County, Alabama
\n",
+ "
01
\n",
+ "
1
\n",
+ "
20200
\n",
+ "
720
\n",
+ "
465
\n",
+ "
65
\n",
+ "
...
\n",
+ "
12
\n",
+ "
12
\n",
+ "
12
\n",
+ "
138
\n",
+ "
45.531874
\n",
+ "
0.191667
\n",
+ "
0.061614
\n",
+ "
32.146659
\n",
+ "
0.036604
\n",
+ "
83.269231
\n",
+ "
\n",
+ "
\n",
+ "
2
\n",
+ "
2014thru2018
\n",
+ "
140
\n",
+ "
01001020300
\n",
+ "
Census Tract 203, Autauga County, Alabama
\n",
+ "
01
\n",
+ "
1
\n",
+ "
20300
\n",
+ "
1295
\n",
+ "
840
\n",
+ "
60
\n",
+ "
...
\n",
+ "
12
\n",
+ "
12
\n",
+ "
12
\n",
+ "
170
\n",
+ "
53.722921
\n",
+ "
0.131274
\n",
+ "
0.040927
\n",
+ "
31.176999
\n",
+ "
0.036604
\n",
+ "
63.653846
\n",
+ "
\n",
+ "
\n",
+ "
3
\n",
+ "
2014thru2018
\n",
+ "
140
\n",
+ "
01001020400
\n",
+ "
Census Tract 204, Autauga County, Alabama
\n",
+ "
01
\n",
+ "
1
\n",
+ "
20400
\n",
+ "
1640
\n",
+ "
1260
\n",
+ "
15
\n",
+ "
...
\n",
+ "
12
\n",
+ "
12
\n",
+ "
12
\n",
+ "
145
\n",
+ "
46.288510
\n",
+ "
0.088415
\n",
+ "
0.027822
\n",
+ "
31.467397
\n",
+ "
0.036604
\n",
+ "
34.615385
\n",
+ "
\n",
+ "
\n",
+ "
4
\n",
+ "
2014thru2018
\n",
+ "
140
\n",
+ "
01001020500
\n",
+ "
Census Tract 205, Autauga County, Alabama
\n",
+ "
01
\n",
+ "
1
\n",
+ "
20500
\n",
+ "
4175
\n",
+ "
2320
\n",
+ "
175
\n",
+ "
...
\n",
+ "
17
\n",
+ "
17
\n",
+ "
17
\n",
+ "
595
\n",
+ "
147.221693
\n",
+ "
0.142515
\n",
+ "
0.034760
\n",
+ "
24.390193
\n",
+ "
0.036604
\n",
+ "
68.221154
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 280 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " source sumlevel FIPS_tract_id \\\n",
+ "0 2014thru2018 140 01001020100 \n",
+ "1 2014thru2018 140 01001020200 \n",
+ "2 2014thru2018 140 01001020300 \n",
+ "3 2014thru2018 140 01001020400 \n",
+ "4 2014thru2018 140 01001020500 \n",
+ "\n",
+ " name state cnty tract T8_est1 \\\n",
+ "0 Census Tract 201, Autauga County, Alabama 01 1 20100 765 \n",
+ "1 Census Tract 202, Autauga County, Alabama 01 1 20200 720 \n",
+ "2 Census Tract 203, Autauga County, Alabama 01 1 20300 1295 \n",
+ "3 Census Tract 204, Autauga County, Alabama 01 1 20400 1640 \n",
+ "4 Census Tract 205, Autauga County, Alabama 01 1 20500 4175 \n",
+ "\n",
+ " T8_est2 T8_est3 ... T8_moe131 T8_moe132 T8_moe133 summed summed_se \\\n",
+ "0 570 50 ... 12 12 12 80 31.721807 \n",
+ "1 465 65 ... 12 12 12 138 45.531874 \n",
+ "2 840 60 ... 12 12 12 170 53.722921 \n",
+ "3 1260 15 ... 12 12 12 145 46.288510 \n",
+ "4 2320 175 ... 17 17 17 595 147.221693 \n",
+ "\n",
+ " hbrd_score se rse mean_state_se hbrd_rank \n",
+ "0 0.104575 0.041032 39.237314 0.036604 46.298077 \n",
+ "1 0.191667 0.061614 32.146659 0.036604 83.269231 \n",
+ "2 0.131274 0.040927 31.176999 0.036604 63.653846 \n",
+ "3 0.088415 0.027822 31.467397 0.036604 34.615385 \n",
+ "4 0.142515 0.034760 24.390193 0.036604 68.221154 \n",
+ "\n",
+ "[5 rows x 280 columns]"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "housingburden.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(73056, 280)"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "housingburden.shape"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### As desired we see a uniform distribution for the percentile rank for burdened households"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Now we compute for a baseline comparison "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# CHAS Table 8 columns used by the current (baseline) methodology.\n",
+ "# Trailing comments read: tenure | household income | cost burden.\n",
+ "# Every field listed is a 'Subtotal' line with facilities = 'All'.\n",
+ "\n",
+ "# Owner occupied numerator fields\n",
+ "OWNER_OCCUPIED_NUMERATOR_FIELDS = [\n",
+ "    \"T8_est7\",   # Owner occupied | <= 30% of HAMFI | > 30% but <= 50%\n",
+ "    \"T8_est10\",  # Owner occupied | <= 30% of HAMFI | > 50%\n",
+ "    \"T8_est20\",  # Owner occupied | > 30% but <= 50% of HAMFI | > 30% but <= 50%\n",
+ "    \"T8_est23\",  # Owner occupied | > 30% but <= 50% of HAMFI | > 50%\n",
+ "    \"T8_est33\",  # Owner occupied | > 50% but <= 80% of HAMFI | > 30% but <= 50%\n",
+ "    \"T8_est36\",  # Owner occupied | > 50% but <= 80% of HAMFI | > 50%\n",
+ "]\n",
+ "\n",
+ "# These rows have the values where HAMFI was not computed, b/c of no or negative income.\n",
+ "OWNER_OCCUPIED_NOT_COMPUTED_FIELDS = [\n",
+ "    \"T8_est13\",  # Owner occupied | <= 30% of HAMFI | not computed (no/negative income)\n",
+ "    \"T8_est26\",  # Owner occupied | > 30% but <= 50% of HAMFI | not computed (no/negative income)\n",
+ "    \"T8_est39\",  # Owner occupied | > 50% but <= 80% of HAMFI | not computed (no/negative income)\n",
+ "    \"T8_est52\",  # Owner occupied | > 80% but <= 100% of HAMFI | not computed (no/negative income)\n",
+ "    \"T8_est65\",  # Owner occupied | > 100% of HAMFI | not computed (no/negative income)\n",
+ "]\n",
+ "\n",
+ "# Owner occupied | All | All\n",
+ "OWNER_OCCUPIED_POPULATION_FIELD = \"T8_est2\"\n",
+ "\n",
+ "# NOTE(review): previously described as 'Owner occupied | All | All', identical to T8_est2;\n",
+ "# this is presumably the owner-occupied <= 30% of HAMFI subtotal. Confirm against the CHAS data dictionary.\n",
+ "OWNER_OCCUPIED_POPULATION_HAMFI_FIELD = \"T8_est3\"\n",
+ "\n",
+ "# Renter occupied numerator fields\n",
+ "RENTER_OCCUPIED_NUMERATOR_FIELDS = [\n",
+ "    \"T8_est73\",   # Renter occupied | <= 30% of HAMFI | > 30% but <= 50%\n",
+ "    \"T8_est76\",   # Renter occupied | <= 30% of HAMFI | > 50%\n",
+ "    \"T8_est86\",   # Renter occupied | > 30% but <= 50% of HAMFI | > 30% but <= 50%\n",
+ "    \"T8_est89\",   # Renter occupied | > 30% but <= 50% of HAMFI | > 50%\n",
+ "    \"T8_est99\",   # Renter occupied | > 50% but <= 80% of HAMFI | > 30% but <= 50%\n",
+ "    \"T8_est102\",  # Renter occupied | > 50% but <= 80% of HAMFI | > 50%\n",
+ "]\n",
+ "\n",
+ "# These rows have the values where HAMFI was not computed, b/c of no or negative income.\n",
+ "RENTER_OCCUPIED_NOT_COMPUTED_FIELDS = [\n",
+ "    \"T8_est79\",   # Renter occupied | <= 30% of HAMFI | not computed (no/negative income)\n",
+ "    \"T8_est92\",   # Renter occupied | > 30% but <= 50% of HAMFI | not computed (no/negative income)\n",
+ "    \"T8_est105\",  # Renter occupied | > 50% but <= 80% of HAMFI | not computed (no/negative income)\n",
+ "    \"T8_est118\",  # Renter occupied | > 80% but <= 100% of HAMFI | not computed (no/negative income)\n",
+ "    \"T8_est131\",  # Renter occupied | > 100% of HAMFI | not computed (no/negative income)\n",
+ "]\n",
+ "\n",
+ "# Renter occupied | All | All\n",
+ "RENTER_OCCUPIED_POPULATION_FIELD = \"T8_est68\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Numerator of the current methodology: row-wise total of the owner numerator fields\n",
+ "# plus the row-wise total of the renter numerator fields.\n",
+ "housingburden[\"current_summed_methodology\"] = (\n",
+ "    housingburden[OWNER_OCCUPIED_NUMERATOR_FIELDS].sum(axis=1)\n",
+ "    .add(housingburden[RENTER_OCCUPIED_NUMERATOR_FIELDS].sum(axis=1))\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Denominator of the current methodology: owner plus renter occupied households,\n",
+ "# less the households whose cost burden was not computed (no/negative income).\n",
+ "housingburden[\"current_methodology_denominator\"] = (\n",
+ "    housingburden[OWNER_OCCUPIED_POPULATION_FIELD]\n",
+ "    .add(housingburden[RENTER_OCCUPIED_POPULATION_FIELD])\n",
+ "    .sub(housingburden[OWNER_OCCUPIED_NOT_COMPUTED_FIELDS].sum(axis=1))\n",
+ "    .sub(housingburden[RENTER_OCCUPIED_NOT_COMPUTED_FIELDS].sum(axis=1))\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Same denominator without subtracting the not-computed households:\n",
+ "# simply owner occupied plus renter occupied households.\n",
+ "housingburden[\"current_methodology_denominator_sans_not_computed\"] = housingburden[\n",
+ "    OWNER_OCCUPIED_POPULATION_FIELD\n",
+ "].add(housingburden[RENTER_OCCUPIED_POPULATION_FIELD])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Share of burdened households under the current methodology: the ratio is rounded\n",
+ "# to two decimals and then scaled to a 0-100 percent value.\n",
+ "housingburden[\"current_methodology_percent\"] = (\n",
+ "    housingburden[\"current_summed_methodology\"]\n",
+ "    .div(housingburden[\"current_methodology_denominator\"])\n",
+ "    .round(2)\n",
+ "    .mul(100)\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Now we construct the distribution of differences in the number of owned and rented burdened households\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Percentiles Comparison"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Columns needed for the comparison. An explicit .copy() makes final_df an independent\n",
+ "# frame, so the columns assigned to it in later cells are written to final_df itself\n",
+ "# instead of a slice of housingburden (which triggers SettingWithCopyWarning).\n",
+ "final_df = housingburden[['FIPS_tract_id', 'state','hbrd_rank','hbrd_score', 'summed',\n",
+ "                          'current_summed_methodology', 'T8_est1',\n",
+ "                          \"current_methodology_denominator_sans_not_computed\",\n",
+ "                          'current_methodology_denominator', 'current_methodology_percent']].copy()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "##### First notice here that **T8_est1** and **current_methodology_denominator** should represent the same or similar aggregates. In general, we can see that the current computation results in a differential that undercounts the total owner-occupied and rental households."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/sarahluw/.pyenv/versions/3.6.2/envs/my-virtual-env-3.6.2/lib/python3.6/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " \n"
+ ]
+ }
+ ],
+ "source": [
+ "# Per-tract gap between the current-methodology denominator and total occupied units (T8_est1).\n",
+ "final_df[\"differences_aggregate_denominator\"] = final_df[\n",
+ "    \"current_methodology_denominator\"\n",
+ "].sub(final_df[\"T8_est1\"])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/sarahluw/.pyenv/versions/3.6.2/envs/my-virtual-env-3.6.2/lib/python3.6/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " \n"
+ ]
+ }
+ ],
+ "source": [
+ "# Per-tract gap between the denominator WITHOUT the not-computed subtraction and T8_est1.\n",
+ "# This previously reused current_methodology_denominator, which made the column an exact\n",
+ "# duplicate of differences_aggregate_denominator.\n",
+ "final_df[\"differences_aggregate_denominator_sans_not_computed\"] = (\n",
+ "    final_df[\"current_methodology_denominator_sans_not_computed\"] - final_df[\"T8_est1\"]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "
"
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Histogram of the per-tract denominator differences, drawn on an explicit Axes.\n",
+ "fig, ax = plt.subplots(figsize=(12, 8))\n",
+ "ax.set_title('Distribution of differences between aggregate totals that normalizes tabulation of poverty households (with removal of not computed fields) ')\n",
+ "ax.set_xlabel('Aggregate differences in total owner and renter occupied low-income households')\n",
+ "ax.set_ylabel('Relative Frequency in Support')\n",
+ "\n",
+ "sns.histplot(final_df[\"differences_aggregate_denominator_sans_not_computed\"], ax=ax)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/sarahluw/.pyenv/versions/3.6.2/envs/my-virtual-env-3.6.2/lib/python3.6/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " after removing the cwd from sys.path.\n"
+ ]
+ }
+ ],
+ "source": [
+ "# rank(pct=True) expresses each tract's rank as a fraction between 0 and 1\n",
+ "# (ascending, so larger percentages receive larger ranks).\n",
+ "# assign() builds a new frame, avoiding the SettingWithCopyWarning that\n",
+ "# in-place column assignment raises on this sliced DataFrame.\n",
+ "final_df = final_df.assign(\n",
+ "    current_methodology_percentile_rank=final_df[\"current_methodology_percent\"].rank(\n",
+ "        pct=True,\n",
+ "        ascending=True,\n",
+ "    )\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/sarahluw/.pyenv/versions/3.6.2/envs/my-virtual-env-3.6.2/lib/python3.6/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " \"\"\"Entry point for launching an IPython kernel.\n",
+ "/Users/sarahluw/.pyenv/versions/3.6.2/envs/my-virtual-env-3.6.2/lib/python3.6/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " after removing the cwd from sys.path.\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Flag tracts at or above the 90th percentile under each methodology.\n",
+ "# assign() returns a new frame, so neither flag is written onto a slice\n",
+ "# (the in-place version emits SettingWithCopyWarning for both columns).\n",
+ "# NOTE(review): hbrd_rank is compared against 90 while the Score L rank is\n",
+ "# compared against 0.90 - confirm hbrd_rank really is on a 0-100 scale.\n",
+ "final_df = final_df.assign(\n",
+ "    new_threshold_exceeded=final_df['hbrd_rank'] >= 90,\n",
+ "    current_threshold_exceeded=final_df['current_methodology_percentile_rank'] >= 0.90,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Only include non-NA tracts for comparison purposes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# first save NA tracts that were considered unreliable\n",
+ "ineligible_tracts = list(final_df[final_df[\"hbrd_rank\"].isna()][\"FIPS_tract_id\"].values)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "5243"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "len(ineligible_tracts)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### How many tracts are ineligible according to CalEnviroScreen but are considered in Score L?\n",
+ "\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "final_current_methodology = final_df[final_df[\"current_methodology_percentile_rank\"] >= 0.90]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(7323, 15)"
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "final_current_methodology.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(53, 15)"
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# 53 tracts\n",
+ "final_current_methodology[\n",
+ " final_current_methodology.FIPS_tract_id.isin(ineligible_tracts)].shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "non_null_df = final_df.copy()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(7323, 15)"
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# number of tracts eligible\n",
+ "non_null_df[non_null_df[\"current_methodology_percentile_rank\"] >= 0.90].shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "
"
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "plt.figure(figsize=(12, 8))\n",
+ "plt.title(\"Distribution of Percentiles for Housing Burden (Score L)\")\n",
+ "# Set x-axis label\n",
+ "plt.xlabel('Percentile (although currently not represented as a percentage)')\n",
+ "# Set y-axis label\n",
+ "plt.ylabel('Relative Frequency in Support')\n",
+ "\n",
+ "sns.histplot(non_null_df[\"current_methodology_percentile_rank\"])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "False 65733\n",
+ "True 7323\n",
+ "Name: current_threshold_exceeded, dtype: int64"
+ ]
+ },
+ "execution_count": 28,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "non_null_df[\"current_threshold_exceeded\"].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "False 66255\n",
+ "True 6801\n",
+ "Name: new_threshold_exceeded, dtype: int64"
+ ]
+ },
+ "execution_count": 29,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "non_null_df[\"new_threshold_exceeded\"].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Burdened under the Score L criterion but not under CalEnviroScreen"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 217,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(2794, 15)"
+ ]
+ },
+ "execution_count": 217,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# this includes ineligible np.nan values too\n",
+ "predicate_1 = (non_null_df['current_threshold_exceeded'] == True) & (non_null_df['new_threshold_exceeded'] != True)\n",
+ "\n",
+ "non_null_df[predicate_1].shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 218,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Where Score L is considered burdened but not in CalEnviroScreen\n",
+ "score_l_considered_burdened = non_null_df[predicate_1]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import us\n",
+ "\n",
+ "mapping = us.states.mapping('fips', 'abbr')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 219,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Inserted after the basic stats definition.\n",
+ "# Load demographic data\n",
+ "import pathlib\n",
+ "\n",
+ "DATA_DIR = pathlib.Path.cwd().parent / \"data\"\n",
+ "COMPARISON_OUTPUTS_DIR = DATA_DIR / \"comparison_outputs\"\n",
+ "\n",
+ "demographics_path = DATA_DIR / \"dataset\" / \"census_acs_2019\" / \"usa.csv\"\n",
+ "\n",
+ "demographics_df = pd.read_csv(\n",
+ " demographics_path,\n",
+ " dtype={\"GEOID10_TRACT\": \"string\"},\n",
+ ")\n",
+ "\n",
+ "# Set some field names\n",
+ "BLACK_FIELD_NAME = \"Black or African American alone\"\n",
+ "AMERICAN_INDIAN_FIELD_NAME = \"American Indian and Alaska Native alone\"\n",
+ "ASIAN_FIELD_NAME = \"Asian alone\"\n",
+ "HAWAIIAN_FIELD_NAME = \"Native Hawaiian and Other Pacific alone\"\n",
+ "TWO_OR_MORE_RACES_FIELD_NAME = \"Two or more races\"\n",
+ "NON_HISPANIC_WHITE_FIELD_NAME = \"Non-Hispanic White\"\n",
+ "HISPANIC_FIELD_NAME = \"Hispanic or Latino\"\n",
+ "PERCENT_PREFIX = \"Percent \"\n",
+ "\n",
+ "RE_OUTPUT_FIELDS = [\n",
+ " BLACK_FIELD_NAME,\n",
+ " AMERICAN_INDIAN_FIELD_NAME,\n",
+ " ASIAN_FIELD_NAME,\n",
+ " HAWAIIAN_FIELD_NAME,\n",
+ " TWO_OR_MORE_RACES_FIELD_NAME,\n",
+ " NON_HISPANIC_WHITE_FIELD_NAME,\n",
+ " HISPANIC_FIELD_NAME,\n",
+ "]\n",
+ "\n",
+ "RE_PERCENT_OUTPUT_FIELDS = [PERCENT_PREFIX + field for field in RE_OUTPUT_FIELDS]\n",
+ "\n",
+ "columns_to_keep = (\n",
+ " [\"GEOID10_TRACT\"]\n",
+ " + RE_OUTPUT_FIELDS\n",
+ " + RE_PERCENT_OUTPUT_FIELDS\n",
+ " + ['Percent of individuals < 200% Federal Poverty Line', \n",
+ " 'Median value ($) of owner-occupied housing units',\n",
+ " 'Percent individuals age 25 or over with less than high school degree',\n",
+ " 'Percent enrollment in college or graduate school',\n",
+ " 'Linguistic isolation (percent)']\n",
+ ")\n",
+ "\n",
+ "\n",
+ "def highlight_medians(s):\n",
+ " # highlight if the current median is greater than the median of medians for that series\n",
+ " is_greater_than_median = s > s.median()\n",
+ " return ['color: pink; background-color:#7272FE'\n",
+ " if cell else '' for cell in is_greater_than_median]\n",
+ "# Join the demographics in.\n",
+ "merged_df_score_l = score_l_considered_burdened.merge(\n",
+ " demographics_df[columns_to_keep],\n",
+ " left_on=\"FIPS_tract_id\",\n",
+ " right_on=\"GEOID10_TRACT\",\n",
+ " how=\"inner\"\n",
+ ")\n",
+ "\n",
+ "# these are not converted into percent 0 - 100 scale\n",
+ "percent_cols = [x for x in merged_df_score_l.columns if \n",
+ " 'Percent' in x or '(percent)' in x\n",
+ " ]\n",
+ "\n",
+ "merged_df_score_l[\n",
+ " percent_cols] = merged_df_score_l[\n",
+ " percent_cols].apply(lambda x: x * 100)\n",
+ "\n",
+ "# Translate each tract's state FIPS code to its postal abbreviation in one\n",
+ "# vectorized pass instead of a row-by-row iterrows() loop. Codes missing from\n",
+ "# `mapping` become NaN and are therefore left out of the groupby on state_name.\n",
+ "merged_df_score_l['state_name'] = (\n",
+ "    merged_df_score_l['state'].astype(str).map(mapping)\n",
+ ")\n",
+ "\n",
+ "grouped_stats_score_l = merged_df_score_l.groupby([\"state_name\"]).agg({\n",
+ " 'GEOID10_TRACT': 'nunique',\n",
+ " 'Percent of individuals < 200% Federal Poverty Line': [np.median, np.std],\n",
+ " 'Median value ($) of owner-occupied housing units': [np.median, np.std],\n",
+ " 'Percent individuals age 25 or over with less than high school degree': [np.median, np.std],\n",
+ " 'Percent enrollment in college or graduate school': [np.median, np.std],\n",
+ " 'Percent Black or African American alone': [np.median, np.std],\n",
+ " 'Percent American Indian and Alaska Native alone': [np.median, np.std],\n",
+ " 'Percent Non-Hispanic White': [np.median, np.std], \n",
+ " 'Linguistic isolation (percent)': [np.median, np.std],\n",
+ " 'Percent Hispanic or Latino': [np.median, np.std],\n",
+ " 'hbrd_rank': [np.median, np.std],\n",
+ " 'current_methodology_percent': [np.median, np.std],\n",
+ " 'current_summed_methodology': [np.median, np.std, np.sum]\n",
+ "}).reset_index()\n",
+ "\n",
+ "\n",
+ "grouped_stats_score_l.columns = [' '.join(col).strip() for \n",
+ " col in grouped_stats_score_l.columns.values]\n",
+ "\n",
+ "grouped_stats_score_l = grouped_stats_score_l[[x for x in grouped_stats_score_l \n",
+ " if \"median\" in x and \n",
+ " \"Percent\" in x] + [\"GEOID10_TRACT nunique\", \n",
+ " 'current_summed_methodology sum', \"state_name\"]]\n",
+ "\n",
+ "grouped_stats_score_l.set_index(\"state_name\", inplace=True)\n",
+ "\n",
+ "grouped_stats_score_l = grouped_stats_score_l.rename(columns={\n",
+ "'Percent of individuals < 200% Federal Poverty Line median'\n",
+ " : 'Percent of individuals < 200% Federal Poverty Line (median across all tracts)', \n",
+ "'Percent individuals age 25 or over with less than high school degree median':\n",
+ "'Percent individuals age 25 or over with less than high school degree (median across all tracts)', \n",
+ "'Percent enrollment in college or graduate school median'\n",
+ " :'Percent enrollment in college or graduate school (median across all tracts)',\n",
+ "'Percent Black or African American alone median':\n",
+ " 'Percent Black or African American alone (median across all tracts)',\n",
+ "'Percent American Indian and Alaska Native alone median':\n",
+ " 'Percent American Indian and Alaska Native alone (median across all tracts)',\n",
+ "'Percent Non-Hispanic White median':\n",
+ " 'Percent Non-Hispanic White (median across all tracts)',\n",
+ "'Percent Hispanic or Latino median':\n",
+ " 'Percent Hispanic or Latino (median across all tracts)',\n",
+ "'GEOID10_TRACT nunique': \"Total Number of Unique Tracts\",\n",
+ "\"current_summed_methodology sum\": \"Total Owned and Rented Burdened Households (Current Aggregation Methodology)\" \n",
+ "})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 220,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['Percent of individuals < 200% Federal Poverty Line (median across all tracts)',\n",
+ " 'Percent individuals age 25 or over with less than high school degree (median across all tracts)',\n",
+ " 'Percent enrollment in college or graduate school (median across all tracts)',\n",
+ " 'Percent Black or African American alone (median across all tracts)',\n",
+ " 'Percent American Indian and Alaska Native alone (median across all tracts)',\n",
+ " 'Percent Non-Hispanic White (median across all tracts)',\n",
+ " 'Percent Hispanic or Latino (median across all tracts)',\n",
+ " 'Total Number of Unique Tracts',\n",
+ " 'Total Owned and Rented Burdened Households (Current Aggregation Methodology)'],\n",
+ " dtype='object')"
+ ]
+ },
+ "execution_count": 220,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "grouped_stats_score_l.columns"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "If the per-state values are samples from the population in question - a big assumption - and the sampling follows the usual rules, we take the mean:\n",
+ "\n",
+ "The mean of the distribution of medians (the median being an order statistic) is the best estimate of the population median; it is treated here as the best unbiased estimator. The answer is not the median of that distribution.\n",
+ "\n",
+ "Nor is the question whether the mean is an estimate of the median - that is a completely unrelated matter.\n",
+ "\n",
+ "Even if the original population is skewed, the distribution of a sampling statistic will be approximately normal - recall the central limit theorem for more details.\n",
+ "\n",
+ "The standard error of that mean should give you what you need to make confident statements about the true population median across all states."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 221,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "list_of_columns = list(grouped_stats_score_l.columns)\n",
+ "values_1 = list(grouped_stats_score_l[list_of_columns].mean())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 222,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "values_1.extend([True, False])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 223,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[57.05272445515469,\n",
+ " 19.383812769449793,\n",
+ " 8.132533443213486,\n",
+ " 33.69280603472747,\n",
+ " 1.2213947311592683,\n",
+ " 31.32188596954498,\n",
+ " 19.378347515748676,\n",
+ " 62.08888888888889,\n",
+ " 40949.2,\n",
+ " True,\n",
+ " False]"
+ ]
+ },
+ "execution_count": 223,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "values_1"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# CalEnviroScreen Burden"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 224,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "predicate_2 = (non_null_df['current_threshold_exceeded'] == False) & (non_null_df['new_threshold_exceeded'] == True)\n",
+ "\n",
+ "cal_ej_screen_burdened = non_null_df[predicate_2]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 225,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Join the demographics in.\n",
+ "merged_df_score_ejcreen = cal_ej_screen_burdened.merge(\n",
+ " demographics_df[columns_to_keep],\n",
+ " left_on=\"FIPS_tract_id\",\n",
+ " right_on=\"GEOID10_TRACT\",\n",
+ " how=\"inner\"\n",
+ ")\n",
+ "\n",
+ "# these are not converted into percent 0 - 100 scale\n",
+ "percent_cols = [x for x in merged_df_score_ejcreen.columns if \n",
+ " 'Percent' in x or '(percent)' in x\n",
+ " ]\n",
+ "\n",
+ "merged_df_score_ejcreen[\n",
+ " percent_cols] = merged_df_score_ejcreen[\n",
+ " percent_cols].apply(lambda x: x * 100)\n",
+ "\n",
+ "# Translate each tract's state FIPS code to its postal abbreviation in one\n",
+ "# vectorized pass instead of a row-by-row iterrows() loop. Codes missing from\n",
+ "# `mapping` become NaN and are therefore left out of the groupby on state_name.\n",
+ "merged_df_score_ejcreen['state_name'] = (\n",
+ "    merged_df_score_ejcreen['state'].astype(str).map(mapping)\n",
+ ")\n",
+ "\n",
+ "grouped_stats_score_ej_screen = merged_df_score_ejcreen.groupby([\"state_name\"]).agg({\n",
+ " 'GEOID10_TRACT': 'nunique',\n",
+ " 'Percent of individuals < 200% Federal Poverty Line': [np.median, np.std],\n",
+ " 'Median value ($) of owner-occupied housing units': [np.median, np.std],\n",
+ " 'Percent individuals age 25 or over with less than high school degree': [np.median, np.std],\n",
+ " 'Percent enrollment in college or graduate school': [np.median, np.std],\n",
+ " 'Percent Black or African American alone': [np.median, np.std],\n",
+ " 'Percent American Indian and Alaska Native alone': [np.median, np.std],\n",
+ " 'Percent Non-Hispanic White': [np.median, np.std], \n",
+ " 'Linguistic isolation (percent)': [np.median, np.std],\n",
+ " 'Percent Hispanic or Latino': [np.median, np.std],\n",
+ " 'hbrd_rank': [np.median, np.std],\n",
+ " 'current_methodology_percent': [np.median, np.std],\n",
+ " 'current_summed_methodology': [np.median, np.std, np.sum]\n",
+ "}).reset_index()\n",
+ "\n",
+ "\n",
+ "grouped_stats_score_ej_screen.columns = [' '.join(col).strip() for \n",
+ " col in grouped_stats_score_ej_screen.columns.values]\n",
+ "\n",
+ "grouped_stats_score_ej_screen = grouped_stats_score_ej_screen[[x for x in grouped_stats_score_ej_screen \n",
+ " if \"median\" in x and \n",
+ " \"Percent\" in x] + [\"GEOID10_TRACT nunique\", \n",
+ " 'current_summed_methodology sum', \"state_name\"]]\n",
+ "\n",
+ "grouped_stats_score_ej_screen.set_index(\"state_name\", inplace=True)\n",
+ "\n",
+ "grouped_stats_score_ej_screen = grouped_stats_score_ej_screen.rename(columns={\n",
+ "'Percent of individuals < 200% Federal Poverty Line median'\n",
+ " : 'Percent of individuals < 200% Federal Poverty Line (median across all tracts)', \n",
+ "'Percent individuals age 25 or over with less than high school degree median':\n",
+ "'Percent individuals age 25 or over with less than high school degree (median across all tracts)', \n",
+ "'Percent enrollment in college or graduate school median'\n",
+ " :'Percent enrollment in college or graduate school (median across all tracts)',\n",
+ "'Percent Black or African American alone median':\n",
+ " 'Percent Black or African American alone (median across all tracts)',\n",
+ "'Percent American Indian and Alaska Native alone median':\n",
+ " 'Percent American Indian and Alaska Native alone (median across all tracts)',\n",
+ "'Percent Non-Hispanic White median':\n",
+ " 'Percent Non-Hispanic White (median across all tracts)',\n",
+ "'Percent Hispanic or Latino median':\n",
+ " 'Percent Hispanic or Latino (median across all tracts)',\n",
+ "'GEOID10_TRACT nunique': \"Total Number of Unique Tracts\",\n",
+ "\"current_summed_methodology sum\": \"Total Owned and Rented Burdened Households (Current Aggregation Methodology)\" \n",
+ "})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 235,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(2272, 36)"
+ ]
+ },
+ "execution_count": 235,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "merged_df_score_ejcreen.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 236,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "2272"
+ ]
+ },
+ "execution_count": 236,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "grouped_stats_score_ej_screen['Total Number of Unique Tracts'].sum()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 237,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "values_2 = list(grouped_stats_score_ej_screen[list_of_columns].mean())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 238,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "values_2.extend([False, True])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Both Met Criterion"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 239,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "predicate_3 = (non_null_df['current_threshold_exceeded'] == True) & (non_null_df['new_threshold_exceeded'] == True)\n",
+ "\n",
+ "union_df = non_null_df[predicate_3]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 240,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(4529, 15)"
+ ]
+ },
+ "execution_count": 240,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "union_df.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 241,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Join the demographics in.\n",
+ "merged_df_both_met = union_df.merge(\n",
+ " demographics_df[columns_to_keep],\n",
+ " left_on=\"FIPS_tract_id\",\n",
+ " right_on=\"GEOID10_TRACT\",\n",
+ " how=\"inner\"\n",
+ ")\n",
+ "\n",
+ "# these are not converted into percent 0 - 100 scale\n",
+ "percent_cols = [x for x in merged_df_both_met.columns if \n",
+ " 'Percent' in x or '(percent)' in x\n",
+ " ]\n",
+ "\n",
+ "merged_df_both_met[\n",
+ " percent_cols] = merged_df_both_met[\n",
+ " percent_cols].apply(lambda x: x * 100)\n",
+ "\n",
+ "# Translate each tract's state FIPS code to its postal abbreviation in one\n",
+ "# vectorized pass instead of a row-by-row iterrows() loop. Codes missing from\n",
+ "# `mapping` become NaN and are therefore left out of the groupby on state_name.\n",
+ "merged_df_both_met['state_name'] = (\n",
+ "    merged_df_both_met['state'].astype(str).map(mapping)\n",
+ ")\n",
+ "\n",
+ "grouped_stats_both_met = merged_df_both_met.groupby([\"state_name\"]).agg({\n",
+ " 'GEOID10_TRACT': 'nunique',\n",
+ " 'Percent of individuals < 200% Federal Poverty Line': [np.median, np.std],\n",
+ " 'Median value ($) of owner-occupied housing units': [np.median, np.std],\n",
+ " 'Percent individuals age 25 or over with less than high school degree': [np.median, np.std],\n",
+ " 'Percent enrollment in college or graduate school': [np.median, np.std],\n",
+ " 'Percent Black or African American alone': [np.median, np.std],\n",
+ " 'Percent American Indian and Alaska Native alone': [np.median, np.std],\n",
+ " 'Percent Non-Hispanic White': [np.median, np.std], \n",
+ " 'Linguistic isolation (percent)': [np.median, np.std],\n",
+ " 'Percent Hispanic or Latino': [np.median, np.std],\n",
+ " 'hbrd_rank': [np.median, np.std],\n",
+ " 'current_methodology_percent': [np.median, np.std],\n",
+ " 'current_summed_methodology': [np.median, np.std, np.sum]\n",
+ "}).reset_index()\n",
+ "\n",
+ "\n",
+ "grouped_stats_both_met.columns = [' '.join(col).strip() for \n",
+ " col in grouped_stats_both_met.columns.values]\n",
+ "\n",
+ "grouped_stats_both_met = grouped_stats_both_met[[x for x in grouped_stats_both_met \n",
+ " if \"median\" in x and \n",
+ " \"Percent\" in x] + [\"GEOID10_TRACT nunique\", \n",
+ " 'current_summed_methodology sum', \"state_name\"]]\n",
+ "\n",
+ "grouped_stats_both_met.set_index(\"state_name\", inplace=True)\n",
+ "\n",
+ "grouped_stats_both_met = grouped_stats_both_met.rename(columns={\n",
+ "'Percent of individuals < 200% Federal Poverty Line median'\n",
+ " : 'Percent of individuals < 200% Federal Poverty Line (median across all tracts)', \n",
+ "'Percent individuals age 25 or over with less than high school degree median':\n",
+ "'Percent individuals age 25 or over with less than high school degree (median across all tracts)', \n",
+ "'Percent enrollment in college or graduate school median'\n",
+ " :'Percent enrollment in college or graduate school (median across all tracts)',\n",
+ "'Percent Black or African American alone median':\n",
+ " 'Percent Black or African American alone (median across all tracts)',\n",
+ "'Percent American Indian and Alaska Native alone median':\n",
+ " 'Percent American Indian and Alaska Native alone (median across all tracts)',\n",
+ "'Percent Non-Hispanic White median':\n",
+ " 'Percent Non-Hispanic White (median across all tracts)',\n",
+ "'Percent Hispanic or Latino median':\n",
+ " 'Percent Hispanic or Latino (median across all tracts)',\n",
+ "'GEOID10_TRACT nunique': \"Total Number of Unique Tracts\",\n",
+ "\"current_summed_methodology sum\": \"Total Owned and Rented Burdened Households (Current Aggregation Methodology)\" \n",
+ "})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 242,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "values_3 = list(grouped_stats_both_met[list_of_columns].mean())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 243,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "values_3.extend([True, True])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Neither Met Criterion "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 244,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "predicate_4 = (non_null_df[\n",
+ " 'current_threshold_exceeded'] == False) & (\n",
+ " non_null_df['new_threshold_exceeded'] == False)\n",
+ "\n",
+ "negation_union_df = non_null_df[predicate_4]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 245,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Join the demographics in.\n",
+ "merged_df_negation_union = negation_union_df.merge(\n",
+ " demographics_df[columns_to_keep],\n",
+ " left_on=\"FIPS_tract_id\",\n",
+ " right_on=\"GEOID10_TRACT\",\n",
+ " how=\"inner\"\n",
+ ")\n",
+ "\n",
+ "# these are not converted into percent 0 - 100 scale\n",
+ "percent_cols = [x for x in merged_df_negation_union.columns if \n",
+ " 'Percent' in x or '(percent)' in x\n",
+ " ]\n",
+ "\n",
+ "merged_df_negation_union[\n",
+ " percent_cols] = merged_df_negation_union[\n",
+ " percent_cols].apply(lambda x: x * 100)\n",
+ "\n",
+ "# Translate each tract's state FIPS code to its postal abbreviation in one\n",
+ "# vectorized pass instead of a row-by-row iterrows() loop. Codes missing from\n",
+ "# `mapping` become NaN and are therefore left out of the groupby on state_name.\n",
+ "merged_df_negation_union['state_name'] = (\n",
+ "    merged_df_negation_union['state'].astype(str).map(mapping)\n",
+ ")\n",
+ "\n",
+ "grouped_stats_both_not_met = merged_df_negation_union.groupby([\"state_name\"]).agg({\n",
+ " 'GEOID10_TRACT': 'nunique',\n",
+ " 'Percent of individuals < 200% Federal Poverty Line': [np.median, np.std],\n",
+ " 'Median value ($) of owner-occupied housing units': [np.median, np.std],\n",
+ " 'Percent individuals age 25 or over with less than high school degree': [np.median, np.std],\n",
+ " 'Percent enrollment in college or graduate school': [np.median, np.std],\n",
+ " 'Percent Black or African American alone': [np.median, np.std],\n",
+ " 'Percent American Indian and Alaska Native alone': [np.median, np.std],\n",
+ " 'Percent Non-Hispanic White': [np.median, np.std], \n",
+ " 'Linguistic isolation (percent)': [np.median, np.std],\n",
+ " 'Percent Hispanic or Latino': [np.median, np.std],\n",
+ " 'hbrd_rank': [np.median, np.std],\n",
+ " 'current_methodology_percent': [np.median, np.std],\n",
+ " 'current_summed_methodology': [np.median, np.std, np.sum]\n",
+ "}).reset_index()\n",
+ "\n",
+ "\n",
+ "grouped_stats_both_not_met.columns = [' '.join(col).strip() for \n",
+ " col in grouped_stats_both_not_met.columns.values]\n",
+ "\n",
+ "grouped_stats_both_not_met = grouped_stats_both_not_met[[x for x in grouped_stats_both_not_met \n",
+ " if \"median\" in x and \n",
+ " \"Percent\" in x] + [\"GEOID10_TRACT nunique\", \n",
+ " 'current_summed_methodology sum', \"state_name\"]]\n",
+ "\n",
+ "grouped_stats_both_not_met.set_index(\"state_name\", inplace=True)\n",
+ "\n",
+ "grouped_stats_both_not_met = grouped_stats_both_not_met.rename(columns={\n",
+ "'Percent of individuals < 200% Federal Poverty Line median'\n",
+ " : 'Percent of individuals < 200% Federal Poverty Line (median across all tracts)', \n",
+ "'Percent individuals age 25 or over with less than high school degree median':\n",
+ "'Percent individuals age 25 or over with less than high school degree (median across all tracts)', \n",
+ "'Percent enrollment in college or graduate school median'\n",
+ " :'Percent enrollment in college or graduate school (median across all tracts)',\n",
+ "'Percent Black or African American alone median':\n",
+ " 'Percent Black or African American alone (median across all tracts)',\n",
+ "'Percent American Indian and Alaska Native alone median':\n",
+ " 'Percent American Indian and Alaska Native alone (median across all tracts)',\n",
+ "'Percent Non-Hispanic White median':\n",
+ " 'Percent Non-Hispanic White (median across all tracts)',\n",
+ "'Percent Hispanic or Latino median':\n",
+ " 'Percent Hispanic or Latino (median across all tracts)',\n",
+ "'GEOID10_TRACT nunique': \"Total Number of Unique Tracts\",\n",
+ "\"current_summed_methodology sum\": \"Total Owned and Rented Burdened Households (Current Aggregation Methodology)\" \n",
+ "})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 246,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "values_4 = list(grouped_stats_both_not_met[list_of_columns].mean())\n",
+ "\n",
+ "values_4.extend([False, False])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 247,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
0
\n",
+ "
1
\n",
+ "
2
\n",
+ "
3
\n",
+ "
4
\n",
+ "
5
\n",
+ "
6
\n",
+ "
7
\n",
+ "
8
\n",
+ "
9
\n",
+ "
10
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
0
\n",
+ "
57.052724
\n",
+ "
19.383813
\n",
+ "
8.132533
\n",
+ "
33.692806
\n",
+ "
1.221395
\n",
+ "
31.321886
\n",
+ "
19.378348
\n",
+ "
62.088889
\n",
+ "
40949.200000
\n",
+ "
True
\n",
+ "
False
\n",
+ "
\n",
+ "
\n",
+ "
1
\n",
+ "
47.082927
\n",
+ "
13.224927
\n",
+ "
8.685441
\n",
+ "
23.061683
\n",
+ "
0.772400
\n",
+ "
48.240042
\n",
+ "
12.553371
\n",
+ "
44.549020
\n",
+ "
24719.098039
\n",
+ "
False
\n",
+ "
True
\n",
+ "
\n",
+ "
\n",
+ "
2
\n",
+ "
62.331263
\n",
+ "
17.689811
\n",
+ "
11.824205
\n",
+ "
33.455525
\n",
+ "
0.921755
\n",
+ "
34.495797
\n",
+ "
15.855933
\n",
+ "
88.803922
\n",
+ "
60757.901961
\n",
+ "
True
\n",
+ "
True
\n",
+ "
\n",
+ "
\n",
+ "
3
\n",
+ "
27.736502
\n",
+ "
8.785280
\n",
+ "
5.847120
\n",
+ "
5.729181
\n",
+ "
0.537429
\n",
+ "
75.916536
\n",
+ "
7.463468
\n",
+ "
1244.333333
\n",
+ "
443680.568627
\n",
+ "
False
\n",
+ "
False
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 0 1 2 3 4 5 6 \\\n",
+ "0 57.052724 19.383813 8.132533 33.692806 1.221395 31.321886 19.378348 \n",
+ "1 47.082927 13.224927 8.685441 23.061683 0.772400 48.240042 12.553371 \n",
+ "2 62.331263 17.689811 11.824205 33.455525 0.921755 34.495797 15.855933 \n",
+ "3 27.736502 8.785280 5.847120 5.729181 0.537429 75.916536 7.463468 \n",
+ "\n",
+ " 7 8 9 10 \n",
+ "0 62.088889 40949.200000 True False \n",
+ "1 44.549020 24719.098039 False True \n",
+ "2 88.803922 60757.901961 True True \n",
+ "3 1244.333333 443680.568627 False False "
+ ]
+ },
+ "execution_count": 247,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "final_summary_data_values = [values_1, values_2, values_3, values_4]\n",
+ " \n",
+ "# Create the pandas DataFrame\n",
+ "df = pd.DataFrame(final_summary_data_values)\n",
+ " \n",
+ "df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 248,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "cols = list(grouped_stats_both_not_met.columns)\n",
+ "cols.extend([\n",
+ " 'Score L Relative Housing Burden for all households met burden threshold', \n",
+ " 'CalEnviroScreen Housing Burden Met Burden Threshold (Ranked Percentile)'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 249,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df.rename(columns={df.columns[idx]: cols[idx] for idx, _ in enumerate(cols)}, inplace=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 250,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df = df.set_index([\n",
+ " 'Score L Relative Housing Burden for all households met burden threshold', \n",
+ " 'CalEnviroScreen Housing Burden Met Burden Threshold (Ranked Percentile)'], inplace=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 251,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Relabel the summary columns from per-tract medians to '(mean of median across all states)'.\n",
+ "# Every key must match an existing column label exactly: DataFrame.rename silently skips\n",
+ "# keys it cannot find, which previously left the education and enrollment columns unrenamed.\n",
+ "df = df.rename(columns={\n",
+ "'Percent of individuals < 200% Federal Poverty Line (median across all tracts)'\n",
+ " : 'Percent of individuals < 200% Federal Poverty Line (mean of median across all states)', \n",
+ "'Percent individuals age 25 or over with less than high school degree (median across all tracts)':\n",
+ "'Percent individuals age 25 or over with less than high school degree (mean of median across all states)', \n",
+ "'Percent enrollment in college or graduate school (median across all tracts)'\n",
+ " :'Percent enrollment in college or graduate school (mean of median across all states)',\n",
+ "'Percent Black or African American alone (median across all tracts)':\n",
+ " 'Percent Black or African American alone (mean of median across all states)',\n",
+ "'Percent American Indian and Alaska Native alone (median across all tracts)':\n",
+ " 'Percent American Indian and Alaska Native alone (mean of median across all states)',\n",
+ "'Percent Non-Hispanic White (median across all tracts)':\n",
+ " 'Percent Non-Hispanic White (mean of median across all states)',\n",
+ "'Percent Hispanic or Latino (median across all tracts)':\n",
+ " 'Percent Hispanic or Latino (mean of median across all states)',\n",
+ "\"Total Number of Unique Tracts\": \"Total Number of Tracts - Mean of median across all states\",\n",
+ "\"Total Owned and Rented Burdened Households (Current Aggregation Methodology)\": \"Total Owned and Rented Burdened Households (Current Aggregation Methodology)- Mean of median across all states\"\n",
+ "})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 252,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
Percent of individuals < 200% Federal Poverty Line (mean of median across all states)
\n",
+ "
Percent individuals age 25 or over with less than high school degree (median across all tracts)
\n",
+ "
Percent enrollment in college or graduate school (median across all tracts)
\n",
+ "
Percent Black or African American alone (mean of median across all states)
\n",
+ "
Percent American Indian and Alaska Native alone (mean of median across all states)
\n",
+ "
Percent Non-Hispanic White (mean of median across all states)
\n",
+ "
Percent Hispanic or Latino (mean of median across all states)
\n",
+ "
Total Number of Tracts - Mean of median across all states
\n",
+ "
Total Owned and Rented Burdened Households (Current Aggregation Methodology)- Mean of median across all states
\n",
+ "
\n",
+ "
\n",
+ "
Score L Relative Housing Burden for all households met burden threshold
\n",
+ "
CalEnviroScreen Housing Burden Met Burden Threshold (Ranked Percentile)
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
True
\n",
+ "
False
\n",
+ "
57.052724
\n",
+ "
19.383813
\n",
+ "
8.132533
\n",
+ "
33.692806
\n",
+ "
1.221395
\n",
+ "
31.321886
\n",
+ "
19.378348
\n",
+ "
62.088889
\n",
+ "
40949.200000
\n",
+ "
\n",
+ "
\n",
+ "
False
\n",
+ "
True
\n",
+ "
47.082927
\n",
+ "
13.224927
\n",
+ "
8.685441
\n",
+ "
23.061683
\n",
+ "
0.772400
\n",
+ "
48.240042
\n",
+ "
12.553371
\n",
+ "
44.549020
\n",
+ "
24719.098039
\n",
+ "
\n",
+ "
\n",
+ "
True
\n",
+ "
True
\n",
+ "
62.331263
\n",
+ "
17.689811
\n",
+ "
11.824205
\n",
+ "
33.455525
\n",
+ "
0.921755
\n",
+ "
34.495797
\n",
+ "
15.855933
\n",
+ "
88.803922
\n",
+ "
60757.901961
\n",
+ "
\n",
+ "
\n",
+ "
False
\n",
+ "
False
\n",
+ "
27.736502
\n",
+ "
8.785280
\n",
+ "
5.847120
\n",
+ "
5.729181
\n",
+ "
0.537429
\n",
+ "
75.916536
\n",
+ "
7.463468
\n",
+ "
1244.333333
\n",
+ "
443680.568627
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Percent of individuals < 200% Federal Poverty Line (mean of median across all states) \\\n",
+ "Score L Relative Housing Burden for all househo... CalEnviroScreen Housing Burden Met Burden Thres... \n",
+ "True False 57.052724 \n",
+ "False True 47.082927 \n",
+ "True True 62.331263 \n",
+ "False False 27.736502 \n",
+ "\n",
+ " Percent individuals age 25 or over with less than high school degree (median across all tracts) \\\n",
+ "Score L Relative Housing Burden for all househo... CalEnviroScreen Housing Burden Met Burden Thres... \n",
+ "True False 19.383813 \n",
+ "False True 13.224927 \n",
+ "True True 17.689811 \n",
+ "False False 8.785280 \n",
+ "\n",
+ " Percent enrollment in college or graduate school (median across all tracts) \\\n",
+ "Score L Relative Housing Burden for all househo... CalEnviroScreen Housing Burden Met Burden Thres... \n",
+ "True False 8.132533 \n",
+ "False True 8.685441 \n",
+ "True True 11.824205 \n",
+ "False False 5.847120 \n",
+ "\n",
+ " Percent Black or African American alone (mean of median across all states) \\\n",
+ "Score L Relative Housing Burden for all househo... CalEnviroScreen Housing Burden Met Burden Thres... \n",
+ "True False 33.692806 \n",
+ "False True 23.061683 \n",
+ "True True 33.455525 \n",
+ "False False 5.729181 \n",
+ "\n",
+ " Percent American Indian and Alaska Native alone (mean of median across all states) \\\n",
+ "Score L Relative Housing Burden for all househo... CalEnviroScreen Housing Burden Met Burden Thres... \n",
+ "True False 1.221395 \n",
+ "False True 0.772400 \n",
+ "True True 0.921755 \n",
+ "False False 0.537429 \n",
+ "\n",
+ " Percent Non-Hispanic White (mean of median across all states) \\\n",
+ "Score L Relative Housing Burden for all househo... CalEnviroScreen Housing Burden Met Burden Thres... \n",
+ "True False 31.321886 \n",
+ "False True 48.240042 \n",
+ "True True 34.495797 \n",
+ "False False 75.916536 \n",
+ "\n",
+ " Percent Hispanic or Latino (mean of median across all states) \\\n",
+ "Score L Relative Housing Burden for all househo... CalEnviroScreen Housing Burden Met Burden Thres... \n",
+ "True False 19.378348 \n",
+ "False True 12.553371 \n",
+ "True True 15.855933 \n",
+ "False False 7.463468 \n",
+ "\n",
+ " Total Number of Tracts - Mean of median across all states \\\n",
+ "Score L Relative Housing Burden for all househo... CalEnviroScreen Housing Burden Met Burden Thres... \n",
+ "True False 62.088889 \n",
+ "False True 44.549020 \n",
+ "True True 88.803922 \n",
+ "False False 1244.333333 \n",
+ "\n",
+ " Total Owned and Rented Burdened Households (Current Aggregation Methodology)- Mean of median across all states \n",
+ "Score L Relative Housing Burden for all househo... CalEnviroScreen Housing Burden Met Burden Thres... \n",
+ "True False 40949.200000 \n",
+ "False True 24719.098039 \n",
+ "True True 60757.901961 \n",
+ "False False 443680.568627 "
+ ]
+ },
+ "execution_count": 252,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 211,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def highlight_min(s, props=''):\n",
+ "    \"\"\"Return per-cell CSS for `s`: `props` where a value equals the NaN-ignoring minimum, '' elsewhere.\"\"\"\n",
+ "    smallest = np.nanmin(s.values)\n",
+ "    return np.where(s == smallest, props, '')\n",
+ "\n",
+ "def highlight_max(s, props=''):\n",
+ "    \"\"\"Return per-cell CSS for `s`: `props` where a value equals the NaN-ignoring maximum, '' elsewhere.\"\"\"\n",
+ "    largest = np.nanmax(s.values)\n",
+ "    return np.where(s == largest, props, '')\n",
+ "\n",
+ "# Show every column with two decimals; missing values are rendered as 'MISSING'.\n",
+ "df_styled = df.style.format(\n",
+ "    formatter={column: \"{:.2f}\" for column in df.columns},\n",
+ "    na_rep='MISSING')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 255,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_styled_max_min = df_styled.apply(highlight_max, props='color:#FFFFFF;background-color:#00008b', axis=0)\\\n",
+ " .apply(highlight_min, props='color:#FFFFFF;background-color:#FF0000', axis=0)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 256,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_styled_max_min.to_excel(\"summary_all_states.xlsx\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 257,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
Percent of individuals < 200% Federal Poverty Line (mean of median across all states)
Percent individuals age 25 or over with less than high school degree (median across all tracts)
Percent enrollment in college or graduate school (median across all tracts)
Percent Black or African American alone (mean of median across all states)
Percent American Indian and Alaska Native alone (mean of median across all states)
Percent Non-Hispanic White (mean of median across all states)
Percent Hispanic or Latino (mean of median across all states)
Total Number of Tracts - Mean of median across all states
Total Owned and Rented Burdened Households (Current Aggregation Methodology)- Mean of median across all states
Score L Relative Housing Burden for all households met burden threshold
CalEnviroScreen Housing Burden Met Burden Threshold (Ranked Percentile)
Percent of individuals < 200% Federal Poverty Line (median across all tracts)
\n",
+ "
Percent individuals age 25 or over with less than high school degree (median across all tracts)
\n",
+ "
Percent enrollment in college or graduate school (median across all tracts)
\n",
+ "
Percent Black or African American alone (median across all tracts)
\n",
+ "
Percent American Indian and Alaska Native alone (median across all tracts)
\n",
+ "
Percent Non-Hispanic White (median across all tracts)
\n",
+ "
Percent Hispanic or Latino (median across all tracts)
\n",
+ "
Total Number of Unique Tracts
\n",
+ "
Total Owned and Rented Burdened Households (Current Aggregation Methodology)
\n",
+ "
\n",
+ "
\n",
+ "
state_name
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
AK
\n",
+ "
33.563081
\n",
+ "
10.407240
\n",
+ "
9.485638
\n",
+ "
4.553571
\n",
+ "
21.279762
\n",
+ "
48.839286
\n",
+ "
9.375000
\n",
+ "
1
\n",
+ "
825
\n",
+ "
\n",
+ "
\n",
+ "
AL
\n",
+ "
63.335854
\n",
+ "
20.273617
\n",
+ "
7.895408
\n",
+ "
79.236090
\n",
+ "
0.000000
\n",
+ "
8.436779
\n",
+ "
1.986055
\n",
+ "
14
\n",
+ "
7107
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Percent of individuals < 200% Federal Poverty Line (median across all tracts) \\\n",
+ "state_name \n",
+ "AK 33.563081 \n",
+ "AL 63.335854 \n",
+ "\n",
+ " Percent individuals age 25 or over with less than high school degree (median across all tracts) \\\n",
+ "state_name \n",
+ "AK 10.407240 \n",
+ "AL 20.273617 \n",
+ "\n",
+ " Percent enrollment in college or graduate school (median across all tracts) \\\n",
+ "state_name \n",
+ "AK 9.485638 \n",
+ "AL 7.895408 \n",
+ "\n",
+ " Percent Black or African American alone (median across all tracts) \\\n",
+ "state_name \n",
+ "AK 4.553571 \n",
+ "AL 79.236090 \n",
+ "\n",
+ " Percent American Indian and Alaska Native alone (median across all tracts) \\\n",
+ "state_name \n",
+ "AK 21.279762 \n",
+ "AL 0.000000 \n",
+ "\n",
+ " Percent Non-Hispanic White (median across all tracts) \\\n",
+ "state_name \n",
+ "AK 48.839286 \n",
+ "AL 8.436779 \n",
+ "\n",
+ " Percent Hispanic or Latino (median across all tracts) \\\n",
+ "state_name \n",
+ "AK 9.375000 \n",
+ "AL 1.986055 \n",
+ "\n",
+ " Total Number of Unique Tracts \\\n",
+ "state_name \n",
+ "AK 1 \n",
+ "AL 14 \n",
+ "\n",
+ " Total Owned and Rented Burdened Households (Current Aggregation Methodology) \n",
+ "state_name \n",
+ "AK 825 \n",
+ "AL 7107 "
+ ]
+ },
+ "execution_count": 272,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_1.head(2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 273,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "candidate_columns = ['Percent of individuals < 200% Federal Poverty Line (median across all tracts)',\n",
+ " 'Percent individuals age 25 or over with less than high school degree (median across all tracts)',\n",
+ " 'Percent enrollment in college or graduate school (median across all tracts)',\n",
+ " 'Percent Black or African American alone (median across all tracts)',\n",
+ " 'Percent American Indian and Alaska Native alone (median across all tracts)',\n",
+ " 'Percent Non-Hispanic White (median across all tracts)',\n",
+ " 'Percent Hispanic or Latino (median across all tracts)',\n",
+ " 'Total Number of Unique Tracts',\n",
+ " 'Total Owned and Rented Burdened Households (Current Aggregation Methodology)']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 276,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "diff = (df_1[candidate_columns] - df_2[candidate_columns]).fillna(0)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 277,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
Percent of individuals < 200% Federal Poverty Line (median across all tracts)
\n",
+ "
Percent individuals age 25 or over with less than high school degree (median across all tracts)
\n",
+ "
Percent enrollment in college or graduate school (median across all tracts)
\n",
+ "
Percent Black or African American alone (median across all tracts)
\n",
+ "
Percent American Indian and Alaska Native alone (median across all tracts)
\n",
+ "
Percent Non-Hispanic White (median across all tracts)
\n",
+ "
Percent Hispanic or Latino (median across all tracts)
\n",
+ "
Total Number of Unique Tracts
\n",
+ "
Total Owned and Rented Burdened Households (Current Aggregation Methodology)
\n",
+ "
\n",
+ "
\n",
+ "
state_name
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
AK
\n",
+ "
0.985223
\n",
+ "
-0.274578
\n",
+ "
3.206568
\n",
+ "
-1.085258
\n",
+ "
8.721076
\n",
+ "
-6.803172
\n",
+ "
-1.778285
\n",
+ "
-10.0
\n",
+ "
-4179.0
\n",
+ "
\n",
+ "
\n",
+ "
AL
\n",
+ "
6.211917
\n",
+ "
4.283941
\n",
+ "
0.331430
\n",
+ "
6.212278
\n",
+ "
0.000000
\n",
+ "
-9.011147
\n",
+ "
-0.630017
\n",
+ "
-28.0
\n",
+ "
-15422.0
\n",
+ "
\n",
+ "
\n",
+ "
AR
\n",
+ "
5.432052
\n",
+ "
1.601861
\n",
+ "
3.942754
\n",
+ "
20.319049
\n",
+ "
0.102459
\n",
+ "
-23.263854
\n",
+ "
-0.896646
\n",
+ "
-35.0
\n",
+ "
-16843.0
\n",
+ "
\n",
+ "
\n",
+ "
AZ
\n",
+ "
13.594871
\n",
+ "
11.979424
\n",
+ "
-0.272040
\n",
+ "
1.361388
\n",
+ "
-0.915677
\n",
+ "
-8.953361
\n",
+ "
8.757832
\n",
+ "
-39.0
\n",
+ "
-23253.0
\n",
+ "
\n",
+ "
\n",
+ "
CA
\n",
+ "
15.717598
\n",
+ "
16.799587
\n",
+ "
-0.702951
\n",
+ "
0.247647
\n",
+ "
0.415820
\n",
+ "
-21.870519
\n",
+ "
40.216194
\n",
+ "
783.0
\n",
+ "
542144.0
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Percent of individuals < 200% Federal Poverty Line (median across all tracts) \\\n",
+ "state_name \n",
+ "AK 0.985223 \n",
+ "AL 6.211917 \n",
+ "AR 5.432052 \n",
+ "AZ 13.594871 \n",
+ "CA 15.717598 \n",
+ "\n",
+ " Percent individuals age 25 or over with less than high school degree (median across all tracts) \\\n",
+ "state_name \n",
+ "AK -0.274578 \n",
+ "AL 4.283941 \n",
+ "AR 1.601861 \n",
+ "AZ 11.979424 \n",
+ "CA 16.799587 \n",
+ "\n",
+ " Percent enrollment in college or graduate school (median across all tracts) \\\n",
+ "state_name \n",
+ "AK 3.206568 \n",
+ "AL 0.331430 \n",
+ "AR 3.942754 \n",
+ "AZ -0.272040 \n",
+ "CA -0.702951 \n",
+ "\n",
+ " Percent Black or African American alone (median across all tracts) \\\n",
+ "state_name \n",
+ "AK -1.085258 \n",
+ "AL 6.212278 \n",
+ "AR 20.319049 \n",
+ "AZ 1.361388 \n",
+ "CA 0.247647 \n",
+ "\n",
+ " Percent American Indian and Alaska Native alone (median across all tracts) \\\n",
+ "state_name \n",
+ "AK 8.721076 \n",
+ "AL 0.000000 \n",
+ "AR 0.102459 \n",
+ "AZ -0.915677 \n",
+ "CA 0.415820 \n",
+ "\n",
+ " Percent Non-Hispanic White (median across all tracts) \\\n",
+ "state_name \n",
+ "AK -6.803172 \n",
+ "AL -9.011147 \n",
+ "AR -23.263854 \n",
+ "AZ -8.953361 \n",
+ "CA -21.870519 \n",
+ "\n",
+ " Percent Hispanic or Latino (median across all tracts) \\\n",
+ "state_name \n",
+ "AK -1.778285 \n",
+ "AL -0.630017 \n",
+ "AR -0.896646 \n",
+ "AZ 8.757832 \n",
+ "CA 40.216194 \n",
+ "\n",
+ " Total Number of Unique Tracts \\\n",
+ "state_name \n",
+ "AK -10.0 \n",
+ "AL -28.0 \n",
+ "AR -35.0 \n",
+ "AZ -39.0 \n",
+ "CA 783.0 \n",
+ "\n",
+ " Total Owned and Rented Burdened Households (Current Aggregation Methodology) \n",
+ "state_name \n",
+ "AK -4179.0 \n",
+ "AL -15422.0 \n",
+ "AR -16843.0 \n",
+ "AZ -23253.0 \n",
+ "CA 542144.0 "
+ ]
+ },
+ "execution_count": 277,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "diff.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 279,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
Percent of individuals < 200% Federal Poverty Line (median across all tracts)
Percent individuals age 25 or over with less than high school degree (median across all tracts)
Percent enrollment in college or graduate school (median across all tracts)
Percent Black or African American alone (median across all tracts)
Percent American Indian and Alaska Native alone (median across all tracts)
Percent Non-Hispanic White (median across all tracts)
Percent Hispanic or Latino (median across all tracts)
Total Number of Unique Tracts
Total Owned and Rented Burdened Households (Current Aggregation Methodology)
state_name
\n",
+ "
\n",
+ "
AK
\n",
+ "
0.985223
\n",
+ "
-0.274578
\n",
+ "
3.206568
\n",
+ "
-1.085258
\n",
+ "
8.721076
\n",
+ "
-6.803172
\n",
+ "
-1.778285
\n",
+ "
-10.000000
\n",
+ "
-4179.000000
\n",
+ "
\n",
+ "
\n",
+ "
AL
\n",
+ "
6.211917
\n",
+ "
4.283941
\n",
+ "
0.331430
\n",
+ "
6.212278
\n",
+ "
0.000000
\n",
+ "
-9.011147
\n",
+ "
-0.630017
\n",
+ "
-28.000000
\n",
+ "
-15422.000000
\n",
+ "
\n",
+ "
\n",
+ "
AR
\n",
+ "
5.432052
\n",
+ "
1.601861
\n",
+ "
3.942754
\n",
+ "
20.319049
\n",
+ "
0.102459
\n",
+ "
-23.263854
\n",
+ "
-0.896646
\n",
+ "
-35.000000
\n",
+ "
-16843.000000
\n",
+ "
\n",
+ "
\n",
+ "
AZ
\n",
+ "
13.594871
\n",
+ "
11.979424
\n",
+ "
-0.272040
\n",
+ "
1.361388
\n",
+ "
-0.915677
\n",
+ "
-8.953361
\n",
+ "
8.757832
\n",
+ "
-39.000000
\n",
+ "
-23253.000000
\n",
+ "
\n",
+ "
\n",
+ "
CA
\n",
+ "
15.717598
\n",
+ "
16.799587
\n",
+ "
-0.702951
\n",
+ "
0.247647
\n",
+ "
0.415820
\n",
+ "
-21.870519
\n",
+ "
40.216194
\n",
+ "
783.000000
\n",
+ "
542144.000000
\n",
+ "
\n",
+ "
\n",
+ "
CO
\n",
+ "
5.267963
\n",
+ "
0.816207
\n",
+ "
0.089370
\n",
+ "
5.338649
\n",
+ "
0.149809
\n",
+ "
-24.425903
\n",
+ "
10.133080
\n",
+ "
-34.000000
\n",
+ "
-17296.000000
\n",
+ "
\n",
+ "
\n",
+ "
CT
\n",
+ "
20.337916
\n",
+ "
6.865214
\n",
+ "
0.574592
\n",
+ "
-7.952396
\n",
+ "
0.000000
\n",
+ "
-9.384233
\n",
+ "
3.734253
\n",
+ "
48.000000
\n",
+ "
33786.000000
\n",
+ "
\n",
+ "
\n",
+ "
DC
\n",
+ "
20.225619
\n",
+ "
9.277701
\n",
+ "
-10.589364
\n",
+ "
65.946763
\n",
+ "
0.000000
\n",
+ "
-49.789447
\n",
+ "
-6.124777
\n",
+ "
13.000000
\n",
+ "
9673.000000
\n",
+ "
\n",
+ "
\n",
+ "
DE
\n",
+ "
1.370275
\n",
+ "
-1.231413
\n",
+ "
-4.499021
\n",
+ "
-27.806872
\n",
+ "
0.581898
\n",
+ "
9.572051
\n",
+ "
6.704117
\n",
+ "
-8.000000
\n",
+ "
-2816.000000
\n",
+ "
\n",
+ "
\n",
+ "
FL
\n",
+ "
12.798641
\n",
+ "
7.913089
\n",
+ "
-0.464450
\n",
+ "
13.453116
\n",
+ "
0.000000
\n",
+ "
-19.567955
\n",
+ "
2.960044
\n",
+ "
88.000000
\n",
+ "
71042.000000
\n",
+ "
\n",
+ "
\n",
+ "
GA
\n",
+ "
0.623927
\n",
+ "
2.733393
\n",
+ "
-0.799951
\n",
+ "
1.511527
\n",
+ "
-0.093487
\n",
+ "
-9.680710
\n",
+ "
2.172740
\n",
+ "
-3.000000
\n",
+ "
12615.000000
\n",
+ "
\n",
+ "
\n",
+ "
HI
\n",
+ "
4.897141
\n",
+ "
9.101461
\n",
+ "
0.265372
\n",
+ "
-0.896060
\n",
+ "
-0.094805
\n",
+ "
-35.939385
\n",
+ "
0.379145
\n",
+ "
7.000000
\n",
+ "
4771.000000
\n",
+ "
\n",
+ "
\n",
+ "
IA
\n",
+ "
10.953716
\n",
+ "
-0.708470
\n",
+ "
-4.742017
\n",
+ "
-1.623474
\n",
+ "
0.732012
\n",
+ "
19.552402
\n",
+ "
-7.997991
\n",
+ "
-53.000000
\n",
+ "
-25766.000000
\n",
+ "
\n",
+ "
\n",
+ "
ID
\n",
+ "
4.722145
\n",
+ "
9.643493
\n",
+ "
7.189788
\n",
+ "
-0.311804
\n",
+ "
-1.015539
\n",
+ "
-15.992569
\n",
+ "
19.816490
\n",
+ "
-23.000000
\n",
+ "
-16597.000000
\n",
+ "
\n",
+ "
\n",
+ "
IL
\n",
+ "
5.155379
\n",
+ "
5.303919
\n",
+ "
-0.630714
\n",
+ "
5.037600
\n",
+ "
0.000000
\n",
+ "
-25.049480
\n",
+ "
6.050435
\n",
+ "
93.000000
\n",
+ "
54402.000000
\n",
+ "
\n",
+ "
\n",
+ "
IN
\n",
+ "
2.113662
\n",
+ "
2.437293
\n",
+ "
0.719657
\n",
+ "
7.315713
\n",
+ "
0.000000
\n",
+ "
1.347745
\n",
+ "
0.492553
\n",
+ "
-36.000000
\n",
+ "
-16444.000000
\n",
+ "
\n",
+ "
\n",
+ "
KS
\n",
+ "
5.264502
\n",
+ "
-7.562943
\n",
+ "
0.627460
\n",
+ "
12.245890
\n",
+ "
0.466634
\n",
+ "
1.465143
\n",
+ "
-0.964424
\n",
+ "
-46.000000
\n",
+ "
-21169.000000
\n",
+ "
\n",
+ "
\n",
+ "
KY
\n",
+ "
8.794436
\n",
+ "
6.683970
\n",
+ "
0.862572
\n",
+ "
-0.575122
\n",
+ "
0.578035
\n",
+ "
-6.133101
\n",
+ "
5.455541
\n",
+ "
-57.000000
\n",
+ "
-27542.000000
\n",
+ "
\n",
+ "
\n",
+ "
LA
\n",
+ "
8.343117
\n",
+ "
3.494850
\n",
+ "
1.398804
\n",
+ "
7.964377
\n",
+ "
0.000000
\n",
+ "
-11.833643
\n",
+ "
-1.607310
\n",
+ "
27.000000
\n",
+ "
19739.000000
\n",
+ "
\n",
+ "
\n",
+ "
MA
\n",
+ "
8.936772
\n",
+ "
7.732666
\n",
+ "
1.005028
\n",
+ "
8.415698
\n",
+ "
0.000000
\n",
+ "
-19.359667
\n",
+ "
15.170475
\n",
+ "
38.000000
\n",
+ "
28222.000000
\n",
+ "
\n",
+ "
\n",
+ "
MD
\n",
+ "
9.100530
\n",
+ "
3.424516
\n",
+ "
-1.341899
\n",
+ "
21.848353
\n",
+ "
0.000000
\n",
+ "
-10.271062
\n",
+ "
-2.199229
\n",
+ "
-26.000000
\n",
+ "
-8900.000000
\n",
+ "
\n",
+ "
\n",
+ "
ME
\n",
+ "
12.565009
\n",
+ "
9.122611
\n",
+ "
-3.922820
\n",
+ "
-0.880265
\n",
+ "
-0.439467
\n",
+ "
-5.913473
\n",
+ "
-0.866007
\n",
+ "
-25.000000
\n",
+ "
-14441.000000
\n",
+ "
\n",
+ "
\n",
+ "
MI
\n",
+ "
6.453785
\n",
+ "
1.068588
\n",
+ "
0.689832
\n",
+ "
6.708850
\n",
+ "
0.000000
\n",
+ "
-8.072578
\n",
+ "
-1.350049
\n",
+ "
-23.000000
\n",
+ "
-6868.000000
\n",
+ "
\n",
+ "
\n",
+ "
MN
\n",
+ "
15.920965
\n",
+ "
17.740780
\n",
+ "
3.141607
\n",
+ "
22.498320
\n",
+ "
1.758343
\n",
+ "
-38.332396
\n",
+ "
22.509297
\n",
+ "
-87.000000
\n",
+ "
-46742.000000
\n",
+ "
\n",
+ "
\n",
+ "
MO
\n",
+ "
4.976759
\n",
+ "
-0.085646
\n",
+ "
-3.577408
\n",
+ "
47.102090
\n",
+ "
0.000000
\n",
+ "
-35.919908
\n",
+ "
-2.378652
\n",
+ "
-30.000000
\n",
+ "
-13248.000000
\n",
+ "
\n",
+ "
\n",
+ "
MS
\n",
+ "
8.484879
\n",
+ "
1.040226
\n",
+ "
-0.572744
\n",
+ "
24.863184
\n",
+ "
0.000000
\n",
+ "
-22.951000
\n",
+ "
0.051741
\n",
+ "
-27.000000
\n",
+ "
-12524.000000
\n",
+ "
\n",
+ "
\n",
+ "
MT
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
\n",
+ "
\n",
+ "
NC
\n",
+ "
8.153384
\n",
+ "
11.292289
\n",
+ "
-2.811982
\n",
+ "
6.496331
\n",
+ "
0.078513
\n",
+ "
-19.504668
\n",
+ "
11.151508
\n",
+ "
-71.000000
\n",
+ "
-45011.000000
\n",
+ "
\n",
+ "
\n",
+ "
ND
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
\n",
+ "
\n",
+ "
NE
\n",
+ "
7.484343
\n",
+ "
37.676269
\n",
+ "
-9.271518
\n",
+ "
-7.562947
\n",
+ "
0.634817
\n",
+ "
-37.670893
\n",
+ "
58.015686
\n",
+ "
-27.000000
\n",
+ "
-13452.000000
\n",
+ "
\n",
+ "
\n",
+ "
NH
\n",
+ "
11.333692
\n",
+ "
7.394888
\n",
+ "
-1.695591
\n",
+ "
2.079689
\n",
+ "
0.000000
\n",
+ "
-14.216572
\n",
+ "
9.124264
\n",
+ "
-16.000000
\n",
+ "
-11851.000000
\n",
+ "
\n",
+ "
\n",
+ "
NJ
\n",
+ "
5.927327
\n",
+ "
8.713014
\n",
+ "
-2.337923
\n",
+ "
-10.341142
\n",
+ "
0.000000
\n",
+ "
-5.623117
\n",
+ "
25.764242
\n",
+ "
151.000000
\n",
+ "
102626.000000
\n",
+ "
\n",
+ "
\n",
+ "
NM
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
\n",
+ "
\n",
+ "
NV
\n",
+ "
7.485453
\n",
+ "
2.632552
\n",
+ "
-0.783104
\n",
+ "
-0.500327
\n",
+ "
0.076985
\n",
+ "
2.512022
\n",
+ "
8.687485
\n",
+ "
12.000000
\n",
+ "
14693.000000
\n",
+ "
\n",
+ "
\n",
+ "
NY
\n",
+ "
12.369088
\n",
+ "
8.434444
\n",
+ "
-0.870792
\n",
+ "
-10.437037
\n",
+ "
0.000000
\n",
+ "
4.477567
\n",
+ "
15.929982
\n",
+ "
573.000000
\n",
+ "
407383.000000
\n",
+ "
\n",
+ "
\n",
+ "
OH
\n",
+ "
5.166431
\n",
+ "
2.626470
\n",
+ "
0.160945
\n",
+ "
16.322108
\n",
+ "
0.000000
\n",
+ "
-16.400598
\n",
+ "
0.628654
\n",
+ "
-21.000000
\n",
+ "
355.000000
\n",
+ "
\n",
+ "
\n",
+ "
OK
\n",
+ "
17.267863
\n",
+ "
5.687201
\n",
+ "
-2.733525
\n",
+ "
5.227426
\n",
+ "
-1.254840
\n",
+ "
-4.629739
\n",
+ "
13.684527
\n",
+ "
-60.000000
\n",
+ "
-25920.000000
\n",
+ "
\n",
+ "
\n",
+ "
OR
\n",
+ "
10.312702
\n",
+ "
7.677240
\n",
+ "
-0.667578
\n",
+ "
1.177527
\n",
+ "
-0.110343
\n",
+ "
-15.168623
\n",
+ "
16.277201
\n",
+ "
-35.000000
\n",
+ "
-26320.000000
\n",
+ "
\n",
+ "
\n",
+ "
PA
\n",
+ "
9.568567
\n",
+ "
6.789819
\n",
+ "
-1.100045
\n",
+ "
2.768551
\n",
+ "
0.000000
\n",
+ "
-7.188058
\n",
+ "
6.730014
\n",
+ "
-63.000000
\n",
+ "
-33293.000000
\n",
+ "
\n",
+ "
\n",
+ "
RI
\n",
+ "
28.097665
\n",
+ "
18.885313
\n",
+ "
-28.064680
\n",
+ "
9.569074
\n",
+ "
0.344864
\n",
+ "
-42.404295
\n",
+ "
39.740670
\n",
+ "
5.000000
\n",
+ "
4138.000000
\n",
+ "
\n",
+ "
\n",
+ "
SC
\n",
+ "
3.709508
\n",
+ "
2.304021
\n",
+ "
3.766892
\n",
+ "
19.048750
\n",
+ "
0.000000
\n",
+ "
-12.734147
\n",
+ "
1.243649
\n",
+ "
-43.000000
\n",
+ "
-22081.000000
\n",
+ "
\n",
+ "
\n",
+ "
SD
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
\n",
+ "
\n",
+ "
TN
\n",
+ "
8.692949
\n",
+ "
4.908793
\n",
+ "
-1.487781
\n",
+ "
31.162224
\n",
+ "
0.000000
\n",
+ "
-26.640514
\n",
+ "
2.135541
\n",
+ "
-31.000000
\n",
+ "
-13021.000000
\n",
+ "
\n",
+ "
\n",
+ "
TX
\n",
+ "
8.229632
\n",
+ "
4.785835
\n",
+ "
-0.018445
\n",
+ "
7.608903
\n",
+ "
0.000000
\n",
+ "
-6.991503
\n",
+ "
11.472775
\n",
+ "
-103.000000
\n",
+ "
-58670.000000
\n",
+ "
\n",
+ "
\n",
+ "
UT
\n",
+ "
12.401135
\n",
+ "
9.279480
\n",
+ "
2.393714
\n",
+ "
-1.095225
\n",
+ "
2.211103
\n",
+ "
-27.444168
\n",
+ "
33.552393
\n",
+ "
-31.000000
\n",
+ "
-18553.000000
\n",
+ "
\n",
+ "
\n",
+ "
VA
\n",
+ "
12.747983
\n",
+ "
4.210496
\n",
+ "
-2.294801
\n",
+ "
20.581487
\n",
+ "
0.000000
\n",
+ "
-12.192675
\n",
+ "
2.823834
\n",
+ "
-48.000000
\n",
+ "
-24962.000000
\n",
+ "
\n",
+ "
\n",
+ "
VT
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
\n",
+ "
\n",
+ "
WA
\n",
+ "
0.836843
\n",
+ "
3.285992
\n",
+ "
-2.853985
\n",
+ "
1.365010
\n",
+ "
0.242227
\n",
+ "
-14.708699
\n",
+ "
5.595930
\n",
+ "
-84.000000
\n",
+ "
-61277.000000
\n",
+ "
\n",
+ "
\n",
+ "
WI
\n",
+ "
15.552119
\n",
+ "
4.422237
\n",
+ "
-0.796047
\n",
+ "
40.591511
\n",
+ "
-0.526995
\n",
+ "
-20.282972
\n",
+ "
-3.363747
\n",
+ "
-17.000000
\n",
+ "
-8795.000000
\n",
+ "
\n",
+ "
\n",
+ "
WV
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
\n",
+ "
\n",
+ "
WY
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
0.000000
\n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 279,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "def style_negative(v, props=''):\n",
+ "    \"\"\"Return `props` as the cell's CSS when `v` is negative, otherwise None (no styling).\"\"\"\n",
+ "    return props if v < 0 else None\n",
+ "\n",
+ "# Negative differences are shown in red; cells that are exactly zero are dimmed.\n",
+ "# NOTE(review): the previous dimming test `(v < 0.0) and (v > 0.0)` could never be true, so\n",
+ "# nothing was ever dimmed. `v == 0.0` is presumably the intent (zeros include the gaps filled\n",
+ "# by fillna(0) when `diff` is built) - confirm, or widen to a band around zero if preferred.\n",
+ "diff_style = diff.style.applymap(style_negative, props='color:red;')\\\n",
+ "    .applymap(lambda v: 'opacity: 20%;' if v == 0.0 else None)\n",
+ "diff_style"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.2"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/data/data-pipeline/data_pipeline/ipython/hud_eda_se_12_27_2011_relative_differences_between_methodologies-ranking-percentile.ipynb b/data/data-pipeline/data_pipeline/ipython/hud_eda_se_12_27_2011_relative_differences_between_methodologies-ranking-percentile.ipynb
new file mode 100644
index 00000000..3ca6fd6a
--- /dev/null
+++ b/data/data-pipeline/data_pipeline/ipython/hud_eda_se_12_27_2011_relative_differences_between_methodologies-ranking-percentile.ipynb
@@ -0,0 +1,5701 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Methodology to address fundamental problem 1 itemized in Issue 1024 - follow-up comparing tabulations and relative household burden. This time I extend the 12-11 notebook to look at how the percentile ranks affect the proportion of tracts considered burdened versus the current methodology."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Indicator reviewed: \n",
+ "\n",
+ "Socioeconomic Factors Indicator reviewed\n",
+ "* [Extreme Housing Burden](#housingburden)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Packages"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import math\n",
+ "import numpy as np\n",
+ "import os\n",
+ "import pandas as pd\n",
+ "\n",
+ "import seaborn as sns\n",
+ "import matplotlib.pyplot as plt\n",
+ "import pandas as pd\n",
+ "%matplotlib inline"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### ETL process for acquiring relevant tables"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### NOTE: If you ran the ETL Process to acquire Table 8 in the other notebook of this draft PR you do not need to run the ETL cell block again"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Copy and adapt certain sections of code from data_pipeline.utils (the imports below are not in the Packages cell)\n",
+ "import shutil\n",
+ "import sys\n",
+ "import zipfile\n",
+ "from pathlib import Path\n",
+ "import requests\n",
+ "\n",
+ "def download_hud_dataset():\n",
+ "    \"\"\"Download the zipped HUD dataset into the working directory as HUD_ZIPPED.csv.\"\"\"\n",
+ "    DOWNLOAD_FILENAME = \"HUD_ZIPPED.csv\"\n",
+ "    HOUSING_FTP_URL = \"https://www.huduser.gov/portal/datasets/cp/2014thru2018-140-csv.zip\"\n",
+ "    response = requests.get(HOUSING_FTP_URL, verify=True)\n",
+ "    if response.status_code != 200:\n",
+ "        sys.exit(\n",
+ "            f\"HTTP response {response.status_code} from url {HOUSING_FTP_URL}. Info: {response.content}\"\n",
+ "        )\n",
+ "    # Write the contents to disk; the context manager closes the file even if the write fails.\n",
+ "    with open(DOWNLOAD_FILENAME, \"wb\") as file:\n",
+ "        file.write(response.content)\n",
+ "\n",
+ "def extract_zipped_download(zip_file_path, unzipped_path):\n",
+ "    \"\"\"Extract zip_file_path into unzipped_path, then delete the archive.\"\"\"\n",
+ "    with zipfile.ZipFile(zip_file_path, \"r\") as zip_ref:\n",
+ "        zip_ref.extractall(unzipped_path)\n",
+ "    # cleanup temporary file\n",
+ "    os.remove(zip_file_path)\n",
+ "\n",
+ "def up_one_directory(path):\n",
+ "    \"\"\"Move path into the directory above the one that contains it; do nothing if there is none.\"\"\"\n",
+ "    try:\n",
+ "        parent_dir = Path(path).parents[1]\n",
+ "        shutil.move(path, parent_dir)\n",
+ "    except IndexError:\n",
+ "        pass  # no upper directory\n",
+ "\n",
+ "CURRENT_DIRECTORY = os.getcwd()\n",
+ "download_hud_dataset()\n",
+ "extract_zipped_download(CURRENT_DIRECTORY + \"/HUD_ZIPPED.csv\", CURRENT_DIRECTORY)\n",
+ "up_one_directory(CURRENT_DIRECTORY + \"/140/Table8.csv\")\n",
+ "shutil.rmtree(\"./140/\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Extreme Housing Burden \n",
+ "\n",
+ "The Extreme Housing Burden indicator represents the proportion of low-income households that have to spend more than half their income on rent. These households experience higher levels of stress, report lower health, and may delay medical treatment because of its high cost.\n",
+ "\n",
+ "The Extreme Housing Burden indicator measures the percent of households in a census tract that are:\n",
+ "\n",
+ "1. Making less than 80% of the Area Median Family Income as determined by the Department of Housing and Urban Development (HUD), and\n",
+ "2. Paying greater than 50% of their income to housing costs. \n",
+ "\n",
+ "This data is sourced from the 2014-2018 Comprehensive Housing Affordability Strategy dataset from the Department of Housing and Urban Development (HUD) using the census tract geographic summary level, and contains cost burdens for households by percent HUD-adjusted median family income (HAMFI) category. This data can be found [here](https://www.huduser.gov/portal/datasets/cp.html). \n",
+ "\n",
+ "Because CHAS data are based on American Community Survey (ACS) estimates, which come from a sample of the population, the estimates may be unreliable when based on a small sample or population size.\n",
+ "\n",
+ "The standard error and relative standard error were used to evaluate the reliability of each estimate using CalEnviroScreen’s methodology. \n",
+ "\n",
+ "Census tract estimates that met either of the following criteria were considered reliable and included in the analysis [(CalEnviroScreen, 2017, page 129)](https://oehha.ca.gov/media/downloads/calenviroscreen/report/ces3report.pdf ):\n",
+ "\n",
+ "- Relative standard error less than 50 (meaning the standard error was less than half of the estimate), OR \n",
+ "- Standard error less than the mean standard error of all census tract estimates \n",
+ "\n",
+ "Formulas for calculating the standard error of sums, proportions, and ratios come from the [American Community Survey Office](https://www2.census.gov/programs-surveys/acs/tech_docs/accuracy/MultiyearACSAccuracyofData2013.pdf).\n",
+ "\n",
+ "Note that this code creates a score and rank by state, for every state."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The relevant variables in table 8 of the CHAS dataset are the following (CHAS data dictionary available [here](https://www.huduser.gov/portal/datasets/cp/CHAS-data-dictionary-14-18.xlsx)):\n",
+ "\n",
+ "| Name | Label |\n",
+ "|---------|-----------------------------------------------------|\n",
+ "|T8_est1 | Total Occupied housing units | \n",
+ "|T8_est10 | Owner occupied less than or equal to 30% of HAMFI cost burden greater than 50% |\n",
+ "|T8_est23 |Owner occupied greater than 30% but less than or equal to 50% of HAMFI\tcost burden greater than 50%|\n",
+ "|T8_est36 |Owner occupied\tgreater than 50% but less than or equal to 80% of HAMFI\tcost burden greater than 50%|\n",
+ "|T8_est76 | Renter occupied less than or equal to 30% of HAMFI cost burden greater than 50% |\n",
+ "|T8_est89 |Renter occupied\tgreater than 30% but less than or equal to 50% of HAMFI\tcost burden greater than 50%|\n",
+ "|T8_est102|Renter occupied\tgreater than 50% but less than or equal to 80% of HAMFI\tcost burden greater than 50%|\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Below I also propose an alternate means for ranking census tracts\n",
+ "### These steps are outlined and commented below"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/sarahluw/.pyenv/versions/3.6.2/envs/my-virtual-env-3.6.2/lib/python3.6/site-packages/pandas/core/series.py:726: RuntimeWarning: invalid value encountered in sqrt\n",
+ " result = getattr(ufunc, method)(*inputs, **kwargs)\n",
+ "/Users/sarahluw/.pyenv/versions/3.6.2/envs/my-virtual-env-3.6.2/lib/python3.6/site-packages/pandas/core/indexing.py:670: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " iloc._setitem_with_indexer(indexer, value)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Read in the data from https://www.huduser.gov/portal/datasets/cp.html\n",
+ "housing = pd.read_csv(\"Table8.csv\", \n",
+ " encoding = \"ISO-8859-1\", \n",
+ " dtype = {'Tract_ID': object, 'st': object, 'geoid': object})\n",
+ "\n",
+ "# Remove the rows for Puerto Rico (state FIPS code '72'); no other area is filtered out here:\n",
+ "housing.drop(housing.loc[housing['st'] == '72'].index, inplace = True)\n",
+ "\n",
+ "# Combine owner and renter occupied low-income households that make less than 80% of HAMFI into one variable\n",
+ "housing['summed'] = (housing['T8_est10'] + \n",
+ " housing['T8_est23'] + \n",
+ " housing['T8_est36'] + \n",
+ " housing['T8_est76'] + \n",
+ " housing['T8_est89'] + \n",
+ " housing['T8_est102'])\n",
+ "\n",
+ "# Create a variable for the standard error of the summed variables\n",
+ "housing['summed_se'] = np.sqrt((housing['T8_moe10'] / 1.645)**2 + \n",
+ " (housing['T8_moe23'] / 1.645)**2 + \n",
+ " (housing['T8_moe36'] / 1.645)**2 + \n",
+ " (housing['T8_moe76'] / 1.645)**2 + \n",
+ " (housing['T8_moe89'] / 1.645)**2 + \n",
+ " (housing['T8_moe102'] / 1.645)**2)\n",
+ "\n",
+ "# Remove the first 7 digits in the FIPS Census Tract ID \n",
+ "housing['geoid'] = housing['geoid'].str[-11:]\n",
+ "\n",
+ "# Find the estimate of the proportion of the population that is heavily rent burdened\n",
+ "housing['hbrd_score'] = housing['summed'] / housing['T8_est1']\n",
+ "\n",
+ "# Change rates where the population is 0 to nan\n",
+ "housing['hbrd_score'].replace(np.inf, np.nan, inplace = True)\n",
+ "\n",
+ "# Standard error of the proportion x / y, using the proportions standard error formula;\n",
+ "# if the value under the radical is negative, the ratio standard error formula (sum instead of difference) is used\n",
+ "def se_prop(x, y, se_x, moe_y):\n",
+ "    se_y = moe_y / 1.645  # margin of error -> standard error (same 1.645 divisor as summed_se)\n",
+ "    test = se_x**2 - (((x**2)/(y**2))*((se_y)**2))\n",
+ "    # Pick the radicand before taking the root so np.sqrt never sees a negative number\n",
+ "    # (np.where on two finished roots evaluated both and warned 'invalid value encountered in sqrt').\n",
+ "    radicand = np.where(test < 0, se_x**2 + (((x**2)/(y**2))*(se_y**2)), test)\n",
+ "    return np.asarray((1/y) * np.sqrt(radicand))\n",
+ "\n",
+ "housing['se'] = se_prop(housing['summed'], housing['T8_est1'], housing['summed_se'], housing['T8_moe1'])\n",
+ "\n",
+ "# Calculate the relative standard error\n",
+ "housing['rse'] = housing['se'] / housing['hbrd_score']*100\n",
+ "\n",
+ "# Change infinite rse's where the housing burden is 0 to np.nan\n",
+ "# (assigned back to the column instead of replacing in place on a column selection)\n",
+ "housing['rse'] = housing['rse'].replace(np.inf, np.nan)\n",
+ "\n",
+ "# Calculate the mean standard error for each state (NaN standard errors are skipped);\n",
+ "# groupby/transform avoids the chained .loc assignment that raised SettingWithCopyWarning\n",
+ "housing['mean_state_se'] = housing.groupby('st')['se'].transform('mean')\n",
+ "\n",
+ "# Find census tract estimates that meet both of the following criteria and are thus considered unreliable estimates:\n",
+ "# RSE greater than or equal to 50 AND\n",
+ "# SE greater than or equal to the mean state SE of housing burdened low income households\n",
+ "# (i.e. neither 'RSE < 50' nor 'SE < mean state SE' holds; the SE, not the RSE, is compared with the mean SE)\n",
+ "# Convert these scores to nan\n",
+ "unreliable = (housing['rse'] >= 50) & (housing['se'] >= housing['mean_state_se'])\n",
+ "housing.loc[unreliable, 'hbrd_score'] = np.nan\n",
+ "\n",
+ "# Rename columns\n",
+ "housing = housing.rename(columns = {'geoid' :'FIPS_tract_id',\n",
+ " 'st' : 'state'\n",
+ " })\n",
+ "\n",
+ "# Calculate percentile rank for census tracts with a score above 0, set percentile to 0 if score is 0, for each state\n",
+ "housing['hbrd_rank'] = housing[\n",
+ " housing['hbrd_score'] != 0][['hbrd_score',\n",
+ " 'state']].groupby('state').rank( \n",
+ " na_option = 'keep', \n",
+ " pct = True) * 100\n",
+ "\n",
+ "housing.loc[housing['hbrd_score'] == 0, 'hbrd_rank'] = 0\n",
+ "\n",
+ "# Create final housing burden df\n",
+ "housingburden = housing.copy()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
source
\n",
+ "
sumlevel
\n",
+ "
FIPS_tract_id
\n",
+ "
name
\n",
+ "
state
\n",
+ "
cnty
\n",
+ "
tract
\n",
+ "
T8_est1
\n",
+ "
T8_est2
\n",
+ "
T8_est3
\n",
+ "
...
\n",
+ "
T8_moe131
\n",
+ "
T8_moe132
\n",
+ "
T8_moe133
\n",
+ "
summed
\n",
+ "
summed_se
\n",
+ "
hbrd_score
\n",
+ "
se
\n",
+ "
rse
\n",
+ "
mean_state_se
\n",
+ "
hbrd_rank
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
0
\n",
+ "
2014thru2018
\n",
+ "
140
\n",
+ "
01001020100
\n",
+ "
Census Tract 201, Autauga County, Alabama
\n",
+ "
01
\n",
+ "
1
\n",
+ "
20100
\n",
+ "
765
\n",
+ "
570
\n",
+ "
50
\n",
+ "
...
\n",
+ "
12
\n",
+ "
12
\n",
+ "
12
\n",
+ "
80
\n",
+ "
31.721807
\n",
+ "
0.104575
\n",
+ "
0.041032
\n",
+ "
39.237314
\n",
+ "
0.036604
\n",
+ "
46.298077
\n",
+ "
\n",
+ "
\n",
+ "
1
\n",
+ "
2014thru2018
\n",
+ "
140
\n",
+ "
01001020200
\n",
+ "
Census Tract 202, Autauga County, Alabama
\n",
+ "
01
\n",
+ "
1
\n",
+ "
20200
\n",
+ "
720
\n",
+ "
465
\n",
+ "
65
\n",
+ "
...
\n",
+ "
12
\n",
+ "
12
\n",
+ "
12
\n",
+ "
138
\n",
+ "
45.531874
\n",
+ "
0.191667
\n",
+ "
0.061614
\n",
+ "
32.146659
\n",
+ "
0.036604
\n",
+ "
83.269231
\n",
+ "
\n",
+ "
\n",
+ "
2
\n",
+ "
2014thru2018
\n",
+ "
140
\n",
+ "
01001020300
\n",
+ "
Census Tract 203, Autauga County, Alabama
\n",
+ "
01
\n",
+ "
1
\n",
+ "
20300
\n",
+ "
1295
\n",
+ "
840
\n",
+ "
60
\n",
+ "
...
\n",
+ "
12
\n",
+ "
12
\n",
+ "
12
\n",
+ "
170
\n",
+ "
53.722921
\n",
+ "
0.131274
\n",
+ "
0.040927
\n",
+ "
31.176999
\n",
+ "
0.036604
\n",
+ "
63.653846
\n",
+ "
\n",
+ "
\n",
+ "
3
\n",
+ "
2014thru2018
\n",
+ "
140
\n",
+ "
01001020400
\n",
+ "
Census Tract 204, Autauga County, Alabama
\n",
+ "
01
\n",
+ "
1
\n",
+ "
20400
\n",
+ "
1640
\n",
+ "
1260
\n",
+ "
15
\n",
+ "
...
\n",
+ "
12
\n",
+ "
12
\n",
+ "
12
\n",
+ "
145
\n",
+ "
46.288510
\n",
+ "
0.088415
\n",
+ "
0.027822
\n",
+ "
31.467397
\n",
+ "
0.036604
\n",
+ "
34.615385
\n",
+ "
\n",
+ "
\n",
+ "
4
\n",
+ "
2014thru2018
\n",
+ "
140
\n",
+ "
01001020500
\n",
+ "
Census Tract 205, Autauga County, Alabama
\n",
+ "
01
\n",
+ "
1
\n",
+ "
20500
\n",
+ "
4175
\n",
+ "
2320
\n",
+ "
175
\n",
+ "
...
\n",
+ "
17
\n",
+ "
17
\n",
+ "
17
\n",
+ "
595
\n",
+ "
147.221693
\n",
+ "
0.142515
\n",
+ "
0.034760
\n",
+ "
24.390193
\n",
+ "
0.036604
\n",
+ "
68.221154
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 280 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " source sumlevel FIPS_tract_id \\\n",
+ "0 2014thru2018 140 01001020100 \n",
+ "1 2014thru2018 140 01001020200 \n",
+ "2 2014thru2018 140 01001020300 \n",
+ "3 2014thru2018 140 01001020400 \n",
+ "4 2014thru2018 140 01001020500 \n",
+ "\n",
+ " name state cnty tract T8_est1 \\\n",
+ "0 Census Tract 201, Autauga County, Alabama 01 1 20100 765 \n",
+ "1 Census Tract 202, Autauga County, Alabama 01 1 20200 720 \n",
+ "2 Census Tract 203, Autauga County, Alabama 01 1 20300 1295 \n",
+ "3 Census Tract 204, Autauga County, Alabama 01 1 20400 1640 \n",
+ "4 Census Tract 205, Autauga County, Alabama 01 1 20500 4175 \n",
+ "\n",
+ " T8_est2 T8_est3 ... T8_moe131 T8_moe132 T8_moe133 summed summed_se \\\n",
+ "0 570 50 ... 12 12 12 80 31.721807 \n",
+ "1 465 65 ... 12 12 12 138 45.531874 \n",
+ "2 840 60 ... 12 12 12 170 53.722921 \n",
+ "3 1260 15 ... 12 12 12 145 46.288510 \n",
+ "4 2320 175 ... 17 17 17 595 147.221693 \n",
+ "\n",
+ " hbrd_score se rse mean_state_se hbrd_rank \n",
+ "0 0.104575 0.041032 39.237314 0.036604 46.298077 \n",
+ "1 0.191667 0.061614 32.146659 0.036604 83.269231 \n",
+ "2 0.131274 0.040927 31.176999 0.036604 63.653846 \n",
+ "3 0.088415 0.027822 31.467397 0.036604 34.615385 \n",
+ "4 0.142515 0.034760 24.390193 0.036604 68.221154 \n",
+ "\n",
+ "[5 rows x 280 columns]"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "housingburden.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(73056, 280)"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "housingburden.shape"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### As desired we see a uniform distribution for the percentile rank for burdened households"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Now we compute for a baseline comparison "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Owner occupied numerator fields\n",
+ "OWNER_OCCUPIED_NUMERATOR_FIELDS = [\n",
+ " # Column Name\n",
+ " # Line_Type\n",
+ " # Tenure\n",
+ " # Household income\n",
+ " # Cost burden\n",
+ " # Facilities\n",
+ " \"T8_est7\",\n",
+ " # Subtotal\n",
+ " # Owner occupied\n",
+ " # less than or equal to 30% of HAMFI\n",
+ " # greater than 30% but less than or equal to 50%\n",
+ " # All\n",
+ " \"T8_est10\",\n",
+ " # Subtotal\n",
+ " # Owner occupied\n",
+ " # less than or equal to 30% of HAMFI\n",
+ " # greater than 50%\n",
+ " # All\n",
+ " \"T8_est20\",\n",
+ " \n",
+ " # Subtotal\n",
+ " # Owner occupied\n",
+ " # greater than 30% but less than or equal to 50% of HAMFI\n",
+ " # greater than 30% but less than or equal to 50%\n",
+ " # All\n",
+ " \"T8_est23\",\n",
+ " # Subtotal\n",
+ " # Owner occupied\n",
+ " # greater than 30% but less than or equal to 50% of HAMFI\n",
+ " # greater than 50%\n",
+ " # All\n",
+ " \"T8_est33\",\n",
+ " # Subtotal\n",
+ " # Owner occupied\n",
+ " # greater than 50% but less than or equal to 80% of HAMFI\n",
+ " # greater than 30% but less than or equal to 50%\n",
+ " # All\n",
+ " \"T8_est36\",\n",
+ " # Subtotal\n",
+ " # Owner occupied\n",
+ " # greater than 50% but less than or equal to 80% of HAMFI\n",
+ " # greater than 50%\n",
+ " # All\n",
+ "]\n",
+ "\n",
+ "# These rows have the values where HAMFI was not computed, b/c of no or negative income.\n",
+ "OWNER_OCCUPIED_NOT_COMPUTED_FIELDS = [\n",
+ " # Column Name\n",
+ " # Line_Type\n",
+ " # Tenure\n",
+ " # Household income\n",
+ " # Cost burden\n",
+ " # Facilities\n",
+ " \"T8_est13\",\n",
+ " # Subtotal\n",
+ " # Owner occupied\n",
+ " # less than or equal to 30% of HAMFI\n",
+ " # not computed (no/negative income)\n",
+ " # All\n",
+ " \"T8_est26\",\n",
+ " # Subtotal\n",
+ " # Owner occupied\n",
+ " # greater than 30% but less than or equal to 50% of HAMFI\n",
+ " # not computed (no/negative income)\n",
+ " # All\n",
+ " \"T8_est39\",\n",
+ " # Subtotal\n",
+ " # Owner occupied\n",
+ " # greater than 50% but less than or equal to 80% of HAMFI\n",
+ " # not computed (no/negative income)\n",
+ " # All\n",
+ " \"T8_est52\",\n",
+ " # Subtotal\n",
+ " # Owner occupied\n",
+ " # greater than 80% but less than or equal to 100% of HAMFI\n",
+ " # not computed (no/negative income)\n",
+ " # All\n",
+ " \"T8_est65\",\n",
+ " # Subtotal\n",
+ " # Owner occupied\n",
+ " # greater than 100% of HAMFI\n",
+ " # not computed (no/negative income)\n",
+ " # All\n",
+ "]\n",
+ "\n",
+ "OWNER_OCCUPIED_POPULATION_FIELD = \"T8_est2\"\n",
+ "# Subtotal\n",
+ "# Owner occupied\n",
+ "# All\n",
+ "# All\n",
+ "# All\n",
+ "\n",
+ "OWNER_OCCUPIED_POPULATION_HAMFI_FIELD = \"T8_est3\"\n",
+ "# Subtotal\n",
+ "# Owner occupied \n",
+ "# All\n",
+ "# All\n",
+ "# All\n",
+ "\n",
+ "# Renter occupied numerator fields\n",
+ "RENTER_OCCUPIED_NUMERATOR_FIELDS = [\n",
+ " # Column Name\n",
+ " # Line_Type\n",
+ " # Tenure\n",
+ " # Household income\n",
+ " # Cost burden\n",
+ " # Facilities\n",
+ " \"T8_est73\",\n",
+ " # Subtotal\n",
+ " # Renter occupied\n",
+ " # less than or equal to 30% of HAMFI\n",
+ " # greater than 30% but less than or equal to 50%\n",
+ " # All\n",
+ " \"T8_est76\",\n",
+ " # Subtotal\n",
+ " # Renter occupied\n",
+ " # less than or equal to 30% of HAMFI\n",
+ " # greater than 50%\n",
+ " # All\n",
+ " \"T8_est86\",\n",
+ " # Subtotal\n",
+ " # Renter occupied\n",
+ " # greater than 30% but less than or equal to 50% of HAMFI\n",
+ " # greater than 30% but less than or equal to 50%\n",
+ " # All\n",
+ " \"T8_est89\",\n",
+ " # Subtotal\n",
+ " # Renter occupied\n",
+ " # greater than 30% but less than or equal to 50% of HAMFI\n",
+ " # greater than 50%\n",
+ " # All\n",
+ " \"T8_est99\",\n",
+ " # Subtotal\n",
+ " # Renter occupied\tgreater than 50% but less than or equal to 80% of HAMFI\n",
+ " # greater than 30% but less than or equal to 50%\n",
+ " # All\n",
+ " \"T8_est102\",\n",
+ " # Subtotal\n",
+ " # Renter occupied\n",
+ " # greater than 50% but less than or equal to 80% of HAMFI\n",
+ " # greater than 50%\n",
+ " # All\n",
+ "]\n",
+ "\n",
+ "# These rows have the values where HAMFI was not computed, b/c of no or negative income.\n",
+ "RENTER_OCCUPIED_NOT_COMPUTED_FIELDS = [\n",
+ " # Column Name\n",
+ " # Line_Type\n",
+ " # Tenure\n",
+ " # Household income\n",
+ " # Cost burden\n",
+ " # Facilities\n",
+ " \"T8_est79\",\n",
+ " # Subtotal\n",
+ " # Renter occupied\tless than or equal to 30% of HAMFI\n",
+ " # not computed (no/negative income)\n",
+ " # All\n",
+ " \"T8_est92\",\n",
+ " # Subtotal\n",
+ " # Renter occupied\tgreater than 30% but less than or equal to 50% of HAMFI\n",
+ " # not computed (no/negative income)\n",
+ " # All\n",
+ " \"T8_est105\",\n",
+ " # Subtotal\n",
+ " # Renter occupied\n",
+ " # greater than 50% but less than or equal to 80% of HAMFI\n",
+ " # not computed (no/negative income)\n",
+ " # All\n",
+ " \"T8_est118\",\n",
+ " # Subtotal\n",
+ " # Renter occupied\tgreater than 80% but less than or equal to 100% of HAMFI\n",
+ " # not computed (no/negative income)\n",
+ " # All\n",
+ " \"T8_est131\",\n",
+ " # Subtotal\n",
+ " # Renter occupied\n",
+ " # greater than 100% of HAMFI\n",
+ " # not computed (no/negative income)\n",
+ " # All\n",
+ "]\n",
+ "\n",
+ "# T8_est68\tSubtotalRenter occupied\tAll\tAll\tAll\n",
+ "RENTER_OCCUPIED_POPULATION_FIELD = \"T8_est68\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "housingburden[\"current_summed_methodology\"] = housingburden[\n",
+ " OWNER_OCCUPIED_NUMERATOR_FIELDS\n",
+ "].sum(axis=1) + housingburden[RENTER_OCCUPIED_NUMERATOR_FIELDS].sum(axis=1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "housingburden[\"current_methodology_denominator\"] = (\n",
+ " housingburden[OWNER_OCCUPIED_POPULATION_FIELD]\n",
+ " + housingburden[RENTER_OCCUPIED_POPULATION_FIELD]\n",
+ " - housingburden[OWNER_OCCUPIED_NOT_COMPUTED_FIELDS].sum(axis=1)\n",
+ " - housingburden[RENTER_OCCUPIED_NOT_COMPUTED_FIELDS].sum(axis=1)\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "housingburden[\"current_methodology_denominator_sans_not_computed\"] = (\n",
+ " housingburden[OWNER_OCCUPIED_POPULATION_FIELD]\n",
+ " + housingburden[RENTER_OCCUPIED_POPULATION_FIELD]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "housingburden[\"current_methodology_percent\"] = np.round(\n",
+ " (housingburden[\"current_summed_methodology\"] / housingburden[\"current_methodology_denominator\"] ), 2) * 100"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Now we construct the distribution of differences in the number of owned and rented burdened households\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Percentiles Comparison"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "final_df = housingburden[['FIPS_tract_id', 'state','hbrd_rank','hbrd_score', 'summed', \n",
+ "                          'current_summed_methodology', 'T8_est1', \n",
+ "                          \"current_methodology_denominator_sans_not_computed\",\n",
+ "                          'current_methodology_denominator', 'current_methodology_percent']].copy()  # independent frame: later column assignments no longer raise SettingWithCopyWarning"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "##### First notice here that **T8_est1** and **current_methodology_denominator** should represent the same or similar aggregates. In general, we can see that the current computation results in a differential that undercounts the total occupied and rental households."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/sarahluw/.pyenv/versions/3.6.2/envs/my-virtual-env-3.6.2/lib/python3.6/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " \n"
+ ]
+ }
+ ],
+ "source": [
+ "final_df[\"differences_aggregate_denominator\"] = (\n",
+ " final_df[\"current_methodology_denominator\"] - final_df[\"T8_est1\"] \n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/sarahluw/.pyenv/versions/3.6.2/envs/my-virtual-env-3.6.2/lib/python3.6/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " \n"
+ ]
+ }
+ ],
+ "source": [
+ "final_df[\"differences_aggregate_denominator_sans_not_computed\"] = (\n",
+ "    final_df[\"current_methodology_denominator_sans_not_computed\"] - final_df[\"T8_est1\"]  # was a copy of the previous cell's formula\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "
"
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Compute the correlation matrix\n",
+ "import seaborn as sns\n",
+ "corr = merged_df[[\"hbrd_rank\", \n",
+ " \"current_methodology_percentile_rank\"] + percent_cols].corr()\n",
+ "\n",
+ "# Generate a mask for the upper triangle\n",
+ "mask = np.triu(np.ones_like(corr, dtype=bool))\n",
+ "\n",
+ "# Set up the matplotlib figure\n",
+ "f, ax = plt.subplots(figsize=(15, 12))\n",
+ "\n",
+ "# Generate a custom diverging colormap\n",
+ "cmap = sns.diverging_palette(230, 20, as_cmap=True)\n",
+ "\n",
+ "# Draw the heatmap with the mask and correct aspect ratio\n",
+ "sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,\n",
+ " square=True, linewidths=.5, cbar_kws={\"shrink\": .5})"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In the previous, comparing both methodologies. \n",
+ "\n",
+ "1. "
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.2"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/data/data-pipeline/data_pipeline/ipython/hud_eda_se_12_27_reference.ipynb b/data/data-pipeline/data_pipeline/ipython/hud_eda_se_12_27_reference.ipynb
new file mode 100644
index 00000000..3efdbb67
--- /dev/null
+++ b/data/data-pipeline/data_pipeline/ipython/hud_eda_se_12_27_reference.ipynb
@@ -0,0 +1,865 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Indicator reviewed: \n",
+ "\n",
+ "Socioeconomic Factors Indicator reviewed\n",
+ "* [Extreme Housing Burden](#housingburden)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Packages"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import math\n",
+ "import numpy as np\n",
+ "import os\n",
+ "import pandas as pd\n",
+ "\n",
+ "import seaborn as sns\n",
+ "import matplotlib.pyplot as plt\n",
+ "import pandas as pd\n",
+ "%matplotlib inline"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Extreme Housing Burden \n",
+ "\n",
+ "The Extreme Housing Burden indicator represents the proportion of low-income households that have to spend more than half their income on rent. These households experience higher levels of stress, report lower health, and may delay medical treatment because of its high cost.\n",
+ "\n",
+ "The Extreme Housing Burden indicator measures the percent of households in a census tract that are:\n",
+ "\n",
+ "1. Making less than 80% of the Area Median Family Income as determined by the Department of Housing and Urban Development (HUD), and\n",
+ "2. Paying greater than 50% of their income to housing costs. \n",
+ "\n",
+ "This data is sourced from the 2014-2018 Comprehensive Housing Affordability Strategy dataset from the Department of Housing and Urban Development (HUD) using the census tract geographic summary level, and contains cost burdens for households by percent HUD-adjusted median family income (HAMFI) category. This data can be found [here](https://www.huduser.gov/portal/datasets/cp.html). \n",
+ "\n",
+ "Because CHAS data are based on American Community Survey (ACS) estimates, which come from a sample of the population, the estimates may be unreliable when based on a small sample or population size.\n",
+ "\n",
+ "The standard error and relative standard error were used to evaluate the reliability of each estimate using CalEnviroScreen’s methodology. \n",
+ "\n",
+ "Census tract estimates that met either of the following criteria were considered reliable and included in the analysis [(CalEnviroScreen, 2017, page 129)](https://oehha.ca.gov/media/downloads/calenviroscreen/report/ces3report.pdf):\n",
+ "\n",
+ "- Relative standard error less than 50 (meaning the standard error was less than half of the estimate), OR \n",
+ "- Standard error less than the mean standard error of all census tract estimates \n",
+ "\n",
+ "Formulas for calculating the standard error of sums, proportions, and ratios come from the [American Community Survey Office](https://www2.census.gov/programs-surveys/acs/tech_docs/accuracy/MultiyearACSAccuracyofData2013.pdf).\n",
+ "\n",
+ "Note that this code creates a score and rank by state, for every state."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The relevant variables in table 8 of the CHAS dataset are the following (CHAS data dictionary available [here](https://www.huduser.gov/portal/datasets/cp/CHAS-data-dictionary-14-18.xlsx)):\n",
+ "\n",
+ "| Name | Label |\n",
+ "|---------|-----------------------------------------------------|\n",
+ "|T8_est1 | Total Occupied housing units | \n",
+ "|T8_est10 | Owner occupied, less than or equal to 30% of HAMFI, cost burden greater than 50% |\n",
+ "|T8_est23 | Owner occupied, greater than 30% but less than or equal to 50% of HAMFI, cost burden greater than 50% |\n",
+ "|T8_est36 | Owner occupied, greater than 50% but less than or equal to 80% of HAMFI, cost burden greater than 50% |\n",
+ "|T8_est76 | Renter occupied, less than or equal to 30% of HAMFI, cost burden greater than 50% |\n",
+ "|T8_est89 | Renter occupied, greater than 30% but less than or equal to 50% of HAMFI, cost burden greater than 50% |\n",
+ "|T8_est102| Renter occupied, greater than 50% but less than or equal to 80% of HAMFI, cost burden greater than 50% |\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Below I also propose an alternate means for ranking census tracts\n",
+ "### These steps are outlined and commented below"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/sarahluw/.pyenv/versions/3.6.2/envs/my-virtual-env-3.6.2/lib/python3.6/site-packages/pandas/core/series.py:726: RuntimeWarning: invalid value encountered in sqrt\n",
+ " result = getattr(ufunc, method)(*inputs, **kwargs)\n",
+ "/Users/sarahluw/.pyenv/versions/3.6.2/envs/my-virtual-env-3.6.2/lib/python3.6/site-packages/pandas/core/indexing.py:670: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " iloc._setitem_with_indexer(indexer, value)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Read in the data from https://www.huduser.gov/portal/datasets/cp.html\n",
+ "housing = pd.read_csv(\"Table8.csv\", \n",
+ " encoding = \"ISO-8859-1\", \n",
+ " dtype = {'Tract_ID': object, 'st': object, 'geoid': object})\n",
+ "\n",
+ "# Remove rows for Puerto Rico (state FIPS code '72'); no other state or territory code is dropped here:\n",
+ "housing.drop(housing.loc[housing['st'] == '72'].index, inplace = True)\n",
+ "\n",
+ "# Combine owner and renter occupied low-income households that make less than 80% of HAMFI into one variable\n",
+ "housing['summed'] = (housing['T8_est10'] + \n",
+ " housing['T8_est23'] + \n",
+ " housing['T8_est36'] + \n",
+ " housing['T8_est76'] + \n",
+ " housing['T8_est89'] + \n",
+ " housing['T8_est102'])\n",
+ "\n",
+ "# Create a variable for the standard error of the summed variables\n",
+ "housing['summed_se'] = np.sqrt((housing['T8_moe10'] / 1.645)**2 + \n",
+ " (housing['T8_moe23'] / 1.645)**2 + \n",
+ " (housing['T8_moe36'] / 1.645)**2 + \n",
+ " (housing['T8_moe76'] / 1.645)**2 + \n",
+ " (housing['T8_moe89'] / 1.645)**2 + \n",
+ " (housing['T8_moe102'] / 1.645)**2)\n",
+ "\n",
+ "# Remove the first 7 digits in the FIPS Census Tract ID \n",
+ "housing['geoid'] = housing['geoid'].str[-11:]\n",
+ "\n",
+ "# Find the estimate of the proportion of the population that is heavily rent burdened\n",
+ "housing['hbrd_score'] = housing['summed'] / housing['T8_est1']\n",
+ "\n",
+ "# Change rates where the population is 0 to nan.\n",
+ "# Assign the result back instead of calling replace(..., inplace=True) on the selected column:\n",
+ "# that form is chained assignment and may not update `housing` itself.\n",
+ "housing['hbrd_score'] = housing['hbrd_score'].replace(np.inf, np.nan)\n",
+ "\n",
+ "# Create function for calculating the standard error, using the proportions standard error formula\n",
+ "# if the value under the radical is negative, use the ratio standard error formula\n",
+ "def se_prop(x, y, se_x, moe_y): \n",
+ " se_y = moe_y / 1.645\n",
+ " test = se_x**2 - (((x**2)/(y**2))*((se_y)**2))\n",
+ " se = np.where(test < 0,\n",
+ " (1/y) * np.sqrt(se_x**2 + (((x**2)/(y**2))*(se_y**2))), \n",
+ " (1/y) * np.sqrt(se_x**2 - (((x**2)/(y**2))*(se_y**2))))\n",
+ " return se\n",
+ "\n",
+ "housing['se'] = se_prop(housing['summed'], housing['T8_est1'], housing['summed_se'], housing['T8_moe1'])\n",
+ "\n",
+ "# Calculate the relative standard error (standard error as a percentage of the estimate)\n",
+ "housing['rse'] = housing['se'] / housing['hbrd_score']*100\n",
+ "\n",
+ "# Change infinite rse's where the housing burden is 0 to np.nan.\n",
+ "# Assign the result back rather than using replace(..., inplace=True) on the selected column,\n",
+ "# which is chained assignment and may not update `housing` itself.\n",
+ "housing['rse'] = housing['rse'].replace(np.inf, np.nan)\n",
+ "\n",
+ "# Calculate the mean standard error for each state.\n",
+ "# groupby/transform writes each state's mean back onto every row of that state without\n",
+ "# chained indexing (the previous per-state loop raised SettingWithCopyWarning).\n",
+ "# Like np.mean on a Series, the grouped mean ignores nan standard errors.\n",
+ "housing['mean_state_se'] = housing.groupby('st')['se'].transform('mean')\n",
+ "\n",
+ "# Find census tract estimates that meet BOTH of the following criteria and are thus considered unreliable estimates: \n",
+ "# RSE greater than or equal to 50 AND\n",
+ "# SE greater than or equal to the mean state SE of housing burdened low income households\n",
+ "# (i.e. an estimate is kept as reliable if its RSE is below 50 OR its SE is below the mean state SE).\n",
+ "# Convert these scores to nan\n",
+ "housing.loc[(housing['rse'] >= 50) & (housing['se'] >= housing['mean_state_se']), 'hbrd_score'] = np.nan\n",
+ "\n",
+ "# Rename columns\n",
+ "housing = housing.rename(columns = {'geoid' :'FIPS_tract_id',\n",
+ " 'st' : 'state'\n",
+ " })\n",
+ "\n",
+ "# Calculate percentile rank for census tracts with a score above 0, set percentile to 0 if score is 0, for each state\n",
+ "housing['hbrd_rank'] = housing[\n",
+ " housing['hbrd_score'] != 0][['hbrd_score',\n",
+ " 'state']].groupby('state').rank( \n",
+ " na_option = 'keep', \n",
+ " pct = True) * 100\n",
+ "\n",
+ "housing.loc[housing['hbrd_score'] == 0, 'hbrd_rank'] = 0\n",
+ "\n",
+ "# Create final housing burden df\n",
+ "housingburden = housing.copy()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
source
\n",
+ "
sumlevel
\n",
+ "
FIPS_tract_id
\n",
+ "
name
\n",
+ "
state
\n",
+ "
cnty
\n",
+ "
tract
\n",
+ "
T8_est1
\n",
+ "
T8_est2
\n",
+ "
T8_est3
\n",
+ "
...
\n",
+ "
T8_moe131
\n",
+ "
T8_moe132
\n",
+ "
T8_moe133
\n",
+ "
summed
\n",
+ "
summed_se
\n",
+ "
hbrd_score
\n",
+ "
se
\n",
+ "
rse
\n",
+ "
mean_state_se
\n",
+ "
hbrd_rank
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
0
\n",
+ "
2014thru2018
\n",
+ "
140
\n",
+ "
01001020100
\n",
+ "
Census Tract 201, Autauga County, Alabama
\n",
+ "
01
\n",
+ "
1
\n",
+ "
20100
\n",
+ "
765
\n",
+ "
570
\n",
+ "
50
\n",
+ "
...
\n",
+ "
12
\n",
+ "
12
\n",
+ "
12
\n",
+ "
80
\n",
+ "
31.721807
\n",
+ "
0.104575
\n",
+ "
0.041032
\n",
+ "
39.237314
\n",
+ "
0.036604
\n",
+ "
46.298077
\n",
+ "
\n",
+ "
\n",
+ "
1
\n",
+ "
2014thru2018
\n",
+ "
140
\n",
+ "
01001020200
\n",
+ "
Census Tract 202, Autauga County, Alabama
\n",
+ "
01
\n",
+ "
1
\n",
+ "
20200
\n",
+ "
720
\n",
+ "
465
\n",
+ "
65
\n",
+ "
...
\n",
+ "
12
\n",
+ "
12
\n",
+ "
12
\n",
+ "
138
\n",
+ "
45.531874
\n",
+ "
0.191667
\n",
+ "
0.061614
\n",
+ "
32.146659
\n",
+ "
0.036604
\n",
+ "
83.269231
\n",
+ "
\n",
+ "
\n",
+ "
2
\n",
+ "
2014thru2018
\n",
+ "
140
\n",
+ "
01001020300
\n",
+ "
Census Tract 203, Autauga County, Alabama
\n",
+ "
01
\n",
+ "
1
\n",
+ "
20300
\n",
+ "
1295
\n",
+ "
840
\n",
+ "
60
\n",
+ "
...
\n",
+ "
12
\n",
+ "
12
\n",
+ "
12
\n",
+ "
170
\n",
+ "
53.722921
\n",
+ "
0.131274
\n",
+ "
0.040927
\n",
+ "
31.176999
\n",
+ "
0.036604
\n",
+ "
63.653846
\n",
+ "
\n",
+ "
\n",
+ "
3
\n",
+ "
2014thru2018
\n",
+ "
140
\n",
+ "
01001020400
\n",
+ "
Census Tract 204, Autauga County, Alabama
\n",
+ "
01
\n",
+ "
1
\n",
+ "
20400
\n",
+ "
1640
\n",
+ "
1260
\n",
+ "
15
\n",
+ "
...
\n",
+ "
12
\n",
+ "
12
\n",
+ "
12
\n",
+ "
145
\n",
+ "
46.288510
\n",
+ "
0.088415
\n",
+ "
0.027822
\n",
+ "
31.467397
\n",
+ "
0.036604
\n",
+ "
34.615385
\n",
+ "
\n",
+ "
\n",
+ "
4
\n",
+ "
2014thru2018
\n",
+ "
140
\n",
+ "
01001020500
\n",
+ "
Census Tract 205, Autauga County, Alabama
\n",
+ "
01
\n",
+ "
1
\n",
+ "
20500
\n",
+ "
4175
\n",
+ "
2320
\n",
+ "
175
\n",
+ "
...
\n",
+ "
17
\n",
+ "
17
\n",
+ "
17
\n",
+ "
595
\n",
+ "
147.221693
\n",
+ "
0.142515
\n",
+ "
0.034760
\n",
+ "
24.390193
\n",
+ "
0.036604
\n",
+ "
68.221154
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 280 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " source sumlevel FIPS_tract_id \\\n",
+ "0 2014thru2018 140 01001020100 \n",
+ "1 2014thru2018 140 01001020200 \n",
+ "2 2014thru2018 140 01001020300 \n",
+ "3 2014thru2018 140 01001020400 \n",
+ "4 2014thru2018 140 01001020500 \n",
+ "\n",
+ " name state cnty tract T8_est1 \\\n",
+ "0 Census Tract 201, Autauga County, Alabama 01 1 20100 765 \n",
+ "1 Census Tract 202, Autauga County, Alabama 01 1 20200 720 \n",
+ "2 Census Tract 203, Autauga County, Alabama 01 1 20300 1295 \n",
+ "3 Census Tract 204, Autauga County, Alabama 01 1 20400 1640 \n",
+ "4 Census Tract 205, Autauga County, Alabama 01 1 20500 4175 \n",
+ "\n",
+ " T8_est2 T8_est3 ... T8_moe131 T8_moe132 T8_moe133 summed summed_se \\\n",
+ "0 570 50 ... 12 12 12 80 31.721807 \n",
+ "1 465 65 ... 12 12 12 138 45.531874 \n",
+ "2 840 60 ... 12 12 12 170 53.722921 \n",
+ "3 1260 15 ... 12 12 12 145 46.288510 \n",
+ "4 2320 175 ... 17 17 17 595 147.221693 \n",
+ "\n",
+ " hbrd_score se rse mean_state_se hbrd_rank \n",
+ "0 0.104575 0.041032 39.237314 0.036604 46.298077 \n",
+ "1 0.191667 0.061614 32.146659 0.036604 83.269231 \n",
+ "2 0.131274 0.040927 31.176999 0.036604 63.653846 \n",
+ "3 0.088415 0.027822 31.467397 0.036604 34.615385 \n",
+ "4 0.142515 0.034760 24.390193 0.036604 68.221154 \n",
+ "\n",
+ "[5 rows x 280 columns]"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "housingburden.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(73056, 280)"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "housingburden.shape"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### As desired we see a uniform distribution for the percentile rank for burdened households"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Now we compute for a baseline comparison "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Owner occupied numerator fields\n",
+ "OWNER_OCCUPIED_NUMERATOR_FIELDS = [\n",
+ " # Column Name\n",
+ " # Line_Type\n",
+ " # Tenure\n",
+ " # Household income\n",
+ " # Cost burden\n",
+ " # Facilities\n",
+ " \"T8_est7\",\n",
+ " # Subtotal\n",
+ " # Owner occupied\n",
+ " # less than or equal to 30% of HAMFI\n",
+ " # greater than 30% but less than or equal to 50%\n",
+ " # All\n",
+ " \"T8_est10\",\n",
+ " # Subtotal\n",
+ " # Owner occupied\n",
+ " # less than or equal to 30% of HAMFI\n",
+ " # greater than 50%\n",
+ " # All\n",
+ " \"T8_est20\",\n",
+ " \n",
+ " # Subtotal\n",
+ " # Owner occupied\n",
+ " # greater than 30% but less than or equal to 50% of HAMFI\n",
+ " # greater than 30% but less than or equal to 50%\n",
+ " # All\n",
+ " \"T8_est23\",\n",
+ " # Subtotal\n",
+ " # Owner occupied\n",
+ " # greater than 30% but less than or equal to 50% of HAMFI\n",
+ " # greater than 50%\n",
+ " # All\n",
+ " \"T8_est33\",\n",
+ " # Subtotal\n",
+ " # Owner occupied\n",
+ " # greater than 50% but less than or equal to 80% of HAMFI\n",
+ " # greater than 30% but less than or equal to 50%\n",
+ " # All\n",
+ " \"T8_est36\",\n",
+ " # Subtotal\n",
+ " # Owner occupied\n",
+ " # greater than 50% but less than or equal to 80% of HAMFI\n",
+ " # greater than 50%\n",
+ " # All\n",
+ "]\n",
+ "\n",
+ "# These rows have the values where HAMFI was not computed, b/c of no or negative income.\n",
+ "OWNER_OCCUPIED_NOT_COMPUTED_FIELDS = [\n",
+ " # Column Name\n",
+ " # Line_Type\n",
+ " # Tenure\n",
+ " # Household income\n",
+ " # Cost burden\n",
+ " # Facilities\n",
+ " \"T8_est13\",\n",
+ " # Subtotal\n",
+ " # Owner occupied\n",
+ " # less than or equal to 30% of HAMFI\n",
+ " # not computed (no/negative income)\n",
+ " # All\n",
+ " \"T8_est26\",\n",
+ " # Subtotal\n",
+ " # Owner occupied\n",
+ " # greater than 30% but less than or equal to 50% of HAMFI\n",
+ " # not computed (no/negative income)\n",
+ " # All\n",
+ " \"T8_est39\",\n",
+ " # Subtotal\n",
+ " # Owner occupied\n",
+ " # greater than 50% but less than or equal to 80% of HAMFI\n",
+ " # not computed (no/negative income)\n",
+ " # All\n",
+ " \"T8_est52\",\n",
+ " # Subtotal\n",
+ " # Owner occupied\n",
+ " # greater than 80% but less than or equal to 100% of HAMFI\n",
+ " # not computed (no/negative income)\n",
+ " # All\n",
+ " \"T8_est65\",\n",
+ " # Subtotal\n",
+ " # Owner occupied\n",
+ " # greater than 100% of HAMFI\n",
+ " # not computed (no/negative income)\n",
+ " # All\n",
+ "]\n",
+ "\n",
+ "OWNER_OCCUPIED_POPULATION_FIELD = \"T8_est2\"\n",
+ "# Subtotal\n",
+ "# Owner occupied\n",
+ "# All\n",
+ "# All\n",
+ "# All\n",
+ "\n",
+ "OWNER_OCCUPIED_POPULATION_HAMFI_FIELD = \"T8_est3\"\n",
+ "# Subtotal\n",
+ "# Owner occupied \n",
+ "# All\n",
+ "# All\n",
+ "# All\n",
+ "\n",
+ "# Renter occupied numerator fields\n",
+ "RENTER_OCCUPIED_NUMERATOR_FIELDS = [\n",
+ " # Column Name\n",
+ " # Line_Type\n",
+ " # Tenure\n",
+ " # Household income\n",
+ " # Cost burden\n",
+ " # Facilities\n",
+ " \"T8_est73\",\n",
+ " # Subtotal\n",
+ " # Renter occupied\n",
+ " # less than or equal to 30% of HAMFI\n",
+ " # greater than 30% but less than or equal to 50%\n",
+ " # All\n",
+ " \"T8_est76\",\n",
+ " # Subtotal\n",
+ " # Renter occupied\n",
+ " # less than or equal to 30% of HAMFI\n",
+ " # greater than 50%\n",
+ " # All\n",
+ " \"T8_est86\",\n",
+ " # Subtotal\n",
+ " # Renter occupied\n",
+ " # greater than 30% but less than or equal to 50% of HAMFI\n",
+ " # greater than 30% but less than or equal to 50%\n",
+ " # All\n",
+ " \"T8_est89\",\n",
+ " # Subtotal\n",
+ " # Renter occupied\n",
+ " # greater than 30% but less than or equal to 50% of HAMFI\n",
+ " # greater than 50%\n",
+ " # All\n",
+ " \"T8_est99\",\n",
+ " # Subtotal\n",
+ " # Renter occupied\tgreater than 50% but less than or equal to 80% of HAMFI\n",
+ " # greater than 30% but less than or equal to 50%\n",
+ " # All\n",
+ " \"T8_est102\",\n",
+ " # Subtotal\n",
+ " # Renter occupied\n",
+ " # greater than 50% but less than or equal to 80% of HAMFI\n",
+ " # greater than 50%\n",
+ " # All\n",
+ "]\n",
+ "\n",
+ "# These rows have the values where HAMFI was not computed, b/c of no or negative income.\n",
+ "RENTER_OCCUPIED_NOT_COMPUTED_FIELDS = [\n",
+ " # Column Name\n",
+ " # Line_Type\n",
+ " # Tenure\n",
+ " # Household income\n",
+ " # Cost burden\n",
+ " # Facilities\n",
+ " \"T8_est79\",\n",
+ " # Subtotal\n",
+ " # Renter occupied\tless than or equal to 30% of HAMFI\n",
+ " # not computed (no/negative income)\n",
+ " # All\n",
+ " \"T8_est92\",\n",
+ " # Subtotal\n",
+ " # Renter occupied\tgreater than 30% but less than or equal to 50% of HAMFI\n",
+ " # not computed (no/negative income)\n",
+ " # All\n",
+ " \"T8_est105\",\n",
+ " # Subtotal\n",
+ " # Renter occupied\n",
+ " # greater than 50% but less than or equal to 80% of HAMFI\n",
+ " # not computed (no/negative income)\n",
+ " # All\n",
+ " \"T8_est118\",\n",
+ " # Subtotal\n",
+ " # Renter occupied\tgreater than 80% but less than or equal to 100% of HAMFI\n",
+ " # not computed (no/negative income)\n",
+ " # All\n",
+ " \"T8_est131\",\n",
+ " # Subtotal\n",
+ " # Renter occupied\n",
+ " # greater than 100% of HAMFI\n",
+ " # not computed (no/negative income)\n",
+ " # All\n",
+ "]\n",
+ "\n",
+ "# T8_est68: Subtotal, Renter occupied, All, All, All\n",
+ "RENTER_OCCUPIED_POPULATION_FIELD = \"T8_est68\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "housingburden[\"current_summed_methodology\"] = housingburden[\n",
+ " OWNER_OCCUPIED_NUMERATOR_FIELDS\n",
+ "].sum(axis=1) + housingburden[RENTER_OCCUPIED_NUMERATOR_FIELDS].sum(axis=1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "housingburden[\"current_methodology_denominator\"] = (\n",
+ " housingburden[OWNER_OCCUPIED_POPULATION_FIELD]\n",
+ " + housingburden[RENTER_OCCUPIED_POPULATION_FIELD]\n",
+ " - housingburden[OWNER_OCCUPIED_NOT_COMPUTED_FIELDS].sum(axis=1)\n",
+ " - housingburden[RENTER_OCCUPIED_NOT_COMPUTED_FIELDS].sum(axis=1)\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "housingburden[\"current_methodology_denominator_sans_not_computed\"] = (\n",
+ " housingburden[OWNER_OCCUPIED_POPULATION_FIELD]\n",
+ " + housingburden[RENTER_OCCUPIED_POPULATION_FIELD]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "housingburden[\"current_methodology_percent\"] = np.round(\n",
+ " (housingburden[\"current_summed_methodology\"] / housingburden[\"current_methodology_denominator\"] ), 2) * 100"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Now we construct the distribution of differences in the number of owned and rented burdened households\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Percentiles Comparison"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "final_df = housingburden[['FIPS_tract_id', 'state','hbrd_rank','hbrd_score', 'summed', \n",
+ " 'current_summed_methodology', 'T8_est1', \n",
+ " \"current_methodology_denominator_sans_not_computed\",\n",
+ " 'current_methodology_denominator', 'current_methodology_percent']]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "##### First notice here that **T8_est1** and **current_methodology_denominator** should represent the same or similar aggregates. In general, we can see that the current computation results in a differential that undercounts the total occupied and rental households."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/sarahluw/.pyenv/versions/3.6.2/envs/my-virtual-env-3.6.2/lib/python3.6/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " \n"
+ ]
+ }
+ ],
+ "source": [
+ "final_df[\"differences_aggregate_denominator\"] = (\n",
+ " final_df[\"current_methodology_denominator\"] - final_df[\"T8_est1\"] \n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/sarahluw/.pyenv/versions/3.6.2/envs/my-virtual-env-3.6.2/lib/python3.6/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " \n"
+ ]
+ }
+ ],
+ "source": [
+ "final_df[\"differences_aggregate_denominator_sans_not_computed\"] = (\n",
+ " final_df[\"current_methodology_denominator\"] - final_df[\"T8_est1\"] \n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "
"
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "plt.figure(figsize=(12, 8))\n",
+ "plt.title('Distribution of differences between aggregate totals that normalizes tabulation of poverty households (with removal of not computed fields) ')\n",
+ "# Set x-axis label\n",
+ "plt.xlabel('Aggregate differences in total owner and renter occupied low-income households')\n",
+ "# Set y-axis label\n",
+ "plt.ylabel('Relative Frequency in Support')\n",
+ "\n",
+ "sns.histplot(final_df[\"differences_aggregate_denominator_sans_not_computed\"])"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.2"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/data/data-pipeline/data_pipeline/ipython/hud_notebook_tabulation_differences_12_27_2021.ipynb b/data/data-pipeline/data_pipeline/ipython/hud_notebook_tabulation_differences_12_27_2021.ipynb
new file mode 100644
index 00000000..2ce5a35b
--- /dev/null
+++ b/data/data-pipeline/data_pipeline/ipython/hud_notebook_tabulation_differences_12_27_2021.ipynb
@@ -0,0 +1,9194 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Extreme Housing Burden \n",
+ "\n",
+ "The Extreme Housing Burden indicator represents the proportion of low-income households that have to spend more than half their income on rent. These households experience higher levels of stress, report lower health, and may delay medical treatment because of its high cost.\n",
+ "\n",
+ "The Extreme Housing Burden indicator measures the percent of households in a census tract that are:\n",
+ "\n",
+ "1. Making less than 80% of the Area Median Family Income as determined by the Department of Housing and Urban Development (HUD), and\n",
+ "2. Paying greater than 50% of their income to housing costs. \n",
+ "\n",
+ "This data is sourced from the 2014-2018 Comprehensive Housing Affordability Strategy dataset from the Department of Housing and Urban Development (HUD) using the census tract geographic summary level, and contains cost burdens for households by percent HUD-adjusted median family income (HAMFI) category. This data can be found [here](https://www.huduser.gov/portal/datasets/cp.html). \n",
+ "\n",
+ "Because CHAS data is based on American Community Survey (ACS) estimates, which come from a sample of the population, they may be unreliable if based on a small sample or population size.\n",
+ "\n",
+ "The standard error and relative standard error were used to evaluate the reliability of each estimate using CalEnviroScreen’s methodology. \n",
+ "\n",
+ "Census tract estimates that met either of the following criteria were considered reliable and included in the analysis [(CalEnviroScreen, 2017, page 129)](https://oehha.ca.gov/media/downloads/calenviroscreen/report/ces3report.pdf ):\n",
+ "\n",
+ "- Relative standard error less than 50 (meaning the standard error was less than half of the estimate), OR \n",
+ "- Standard error less than the mean standard error of all census tract estimates \n",
+ "\n",
+ "Formulas for calculating the standard error of sums, proportions, and ratios come from the [American Community Survey Office](https://www2.census.gov/programs-surveys/acs/tech_docs/accuracy/MultiyearACSAccuracyofData2013.pdf).\n",
+ "\n",
+ "Note that this code creates a score and rank by state, for every state."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The relevant variables in table 8 of the CHAS dataset are the following (CHAS data dictionary available [here](https://www.huduser.gov/portal/datasets/cp/CHAS-data-dictionary-14-18.xlsx)):\n",
+ "\n",
+ "| Name | Label |\n",
+ "|---------|-----------------------------------------------------|\n",
+ "|T8_est1 | Total Occupied housing units | \n",
+ "|T8_est10 | Owner occupied less than or equal to 30% of HAMFI cost burden greater than 50% |\n",
+ "|T8_est23 |Owner occupied greater than 30% but less than or equal to 50% of HAMFI\tcost burden greater than 50%|\n",
+ "|T8_est36 |Owner occupied\tgreater than 50% but less than or equal to 80% of HAMFI\tcost burden greater than 50%|\n",
+ "|T8_est76 | Renter occupied less than or equal to 30% of HAMFI cost burden greater than 50% |\n",
+ "|T8_est89 |Renter occupied\tgreater than 30% but less than or equal to 50% of HAMFI\tcost burden greater than 50%|\n",
+ "|T8_est102|Renter occupied\tgreater than 50% but less than or equal to 80% of HAMFI\tcost burden greater than 50%|\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Below I also propose an alternate means for ranking census tracts\n",
+ "### These steps are outlined and commented below"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/sarahluw/.pyenv/versions/3.6.2/envs/my-virtual-env-3.6.2/lib/python3.6/site-packages/pandas/core/series.py:726: RuntimeWarning: invalid value encountered in sqrt\n",
+ " result = getattr(ufunc, method)(*inputs, **kwargs)\n",
+ "/Users/sarahluw/.pyenv/versions/3.6.2/envs/my-virtual-env-3.6.2/lib/python3.6/site-packages/pandas/core/indexing.py:670: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " iloc._setitem_with_indexer(indexer, value)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Read in the data from https://www.huduser.gov/portal/datasets/cp.html\n",
+ "housing = pd.read_csv(\"Table8.csv\", \n",
+ " encoding = \"ISO-8859-1\", \n",
+ " dtype = {'Tract_ID': object, 'st': object, 'geoid': object})\n",
+ "\n",
+ "# Remove Puerto Rico (state FIPS code 72), which is not included in this analysis:\n",
+ "housing.drop(housing.loc[housing['st'] == '72'].index, inplace = True)\n",
+ "\n",
+ "# Combine owner and renter occupied low-income households that make less than 80% of HAMFI into one variable\n",
+ "housing['summed'] = (housing['T8_est10'] + \n",
+ " housing['T8_est23'] + \n",
+ " housing['T8_est36'] + \n",
+ " housing['T8_est76'] + \n",
+ " housing['T8_est89'] + \n",
+ " housing['T8_est102'])\n",
+ "\n",
+ "# Create a variable for the standard error of the summed variables\n",
+ "housing['summed_se'] = np.sqrt((housing['T8_moe10'] / 1.645)**2 + \n",
+ " (housing['T8_moe23'] / 1.645)**2 + \n",
+ " (housing['T8_moe36'] / 1.645)**2 + \n",
+ " (housing['T8_moe76'] / 1.645)**2 + \n",
+ " (housing['T8_moe89'] / 1.645)**2 + \n",
+ " (housing['T8_moe102'] / 1.645)**2)\n",
+ "\n",
+ "# Remove the first 7 digits in the FIPS Census Tract ID \n",
+ "housing['geoid'] = housing['geoid'].str[-11:]\n",
+ "\n",
+ "# Find the estimate of the proportion of the population that is heavily rent burdened\n",
+ "housing['hbrd_score'] = housing['summed'] / housing['T8_est1']\n",
+ "\n",
+ "# Change rates where the population is 0 to nan\n",
+ "housing['hbrd_score'].replace(np.inf, np.nan, inplace = True)\n",
+ "\n",
+ "# Create function for calculating the standard error, using the proportions standard error formula\n",
+ "# if the value under the radical is negative, use the ratio standard error formula\n",
+ "def se_prop(x, y, se_x, moe_y): \n",
+ "    \"\"\"Return the standard error of the proportion x / y as a numpy array.\n",
+ "\n",
+ "    x, y  -- numerator and denominator estimates\n",
+ "    se_x  -- standard error of the numerator\n",
+ "    moe_y -- margin of error of the denominator (converted to a standard error by dividing by 1.645)\n",
+ "\n",
+ "    The proportion formula subtracts (x/y)^2 * se_y^2 under the radical; wherever that\n",
+ "    difference is negative, the ratio formula (which adds the term instead) is used.\n",
+ "    \"\"\"\n",
+ "    se_y = moe_y / 1.645\n",
+ "    ratio_term = ((x**2)/(y**2))*(se_y**2)\n",
+ "    radicand = se_x**2 - ratio_term\n",
+ "    # Pick the radicand *before* taking the root: np.where evaluates both of its branches,\n",
+ "    # so taking the sqrt inside it was applied to negative values and raised a RuntimeWarning.\n",
+ "    radicand = np.where(radicand < 0, se_x**2 + ratio_term, radicand)\n",
+ "    return np.asarray((1/y) * np.sqrt(radicand))\n",
+ "\n",
+ "housing['se'] = se_prop(housing['summed'], housing['T8_est1'], housing['summed_se'], housing['T8_moe1'])\n",
+ "\n",
+ "# Calculate the relative standard error\n",
+ "housing['rse'] = housing['se'] / housing['hbrd_score']*100\n",
+ "\n",
+ "# Change infinite rse's where the housing burden is 0 to np.nan\n",
+ "housing['rse'].replace(np.inf, np.nan, inplace = True)\n",
+ "\n",
+ "# Calculate the mean standard error for each state\n",
+ "# (a grouped transform writes the column in one step, avoiding the chained\n",
+ "# `housing[col].loc[...] = value` assignment that triggers SettingWithCopyWarning)\n",
+ "housing['mean_state_se'] = housing.groupby('st')['se'].transform('mean')\n",
+ "\n",
+ "# Find census tract estimates that meet both of the following criteria and are thus considered unreliable estimates: \n",
+ "# RSE of at least 50 AND\n",
+ "# SE of at least the mean state SE of housing burdened low income households\n",
+ "# (the SE, not the RSE, must be compared with the mean state SE: they are on the same scale)\n",
+ "# Convert these scores to nan\n",
+ "housing.loc[(housing['rse'] >= 50) & (housing['se'] >= housing['mean_state_se']), 'hbrd_score'] = np.nan\n",
+ "\n",
+ "# Rename columns\n",
+ "housing = housing.rename(columns = {'geoid' :'FIPS_tract_id',\n",
+ " 'st' : 'state'\n",
+ " })\n",
+ "\n",
+ "# Calculate percentile rank for census tracts with a score above 0, set percentile to 0 if score is 0, for each state\n",
+ "housing['hbrd_rank'] = housing[\n",
+ " housing['hbrd_score'] != 0][['hbrd_score',\n",
+ " 'state']].groupby('state').rank( \n",
+ " na_option = 'keep', \n",
+ " pct = True) * 100\n",
+ "\n",
+ "housing.loc[housing['hbrd_score'] == 0, 'hbrd_rank'] = 0\n",
+ "\n",
+ "# Create final housing burden df\n",
+ "housingburden = housing.copy()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
source
\n",
+ "
sumlevel
\n",
+ "
FIPS_tract_id
\n",
+ "
name
\n",
+ "
state
\n",
+ "
cnty
\n",
+ "
tract
\n",
+ "
T8_est1
\n",
+ "
T8_est2
\n",
+ "
T8_est3
\n",
+ "
...
\n",
+ "
T8_moe131
\n",
+ "
T8_moe132
\n",
+ "
T8_moe133
\n",
+ "
summed
\n",
+ "
summed_se
\n",
+ "
hbrd_score
\n",
+ "
se
\n",
+ "
rse
\n",
+ "
mean_state_se
\n",
+ "
hbrd_rank
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
0
\n",
+ "
2014thru2018
\n",
+ "
140
\n",
+ "
01001020100
\n",
+ "
Census Tract 201, Autauga County, Alabama
\n",
+ "
01
\n",
+ "
1
\n",
+ "
20100
\n",
+ "
765
\n",
+ "
570
\n",
+ "
50
\n",
+ "
...
\n",
+ "
12
\n",
+ "
12
\n",
+ "
12
\n",
+ "
80
\n",
+ "
31.721807
\n",
+ "
0.104575
\n",
+ "
0.041032
\n",
+ "
39.237314
\n",
+ "
0.036604
\n",
+ "
46.298077
\n",
+ "
\n",
+ "
\n",
+ "
1
\n",
+ "
2014thru2018
\n",
+ "
140
\n",
+ "
01001020200
\n",
+ "
Census Tract 202, Autauga County, Alabama
\n",
+ "
01
\n",
+ "
1
\n",
+ "
20200
\n",
+ "
720
\n",
+ "
465
\n",
+ "
65
\n",
+ "
...
\n",
+ "
12
\n",
+ "
12
\n",
+ "
12
\n",
+ "
138
\n",
+ "
45.531874
\n",
+ "
0.191667
\n",
+ "
0.061614
\n",
+ "
32.146659
\n",
+ "
0.036604
\n",
+ "
83.269231
\n",
+ "
\n",
+ "
\n",
+ "
2
\n",
+ "
2014thru2018
\n",
+ "
140
\n",
+ "
01001020300
\n",
+ "
Census Tract 203, Autauga County, Alabama
\n",
+ "
01
\n",
+ "
1
\n",
+ "
20300
\n",
+ "
1295
\n",
+ "
840
\n",
+ "
60
\n",
+ "
...
\n",
+ "
12
\n",
+ "
12
\n",
+ "
12
\n",
+ "
170
\n",
+ "
53.722921
\n",
+ "
0.131274
\n",
+ "
0.040927
\n",
+ "
31.176999
\n",
+ "
0.036604
\n",
+ "
63.653846
\n",
+ "
\n",
+ "
\n",
+ "
3
\n",
+ "
2014thru2018
\n",
+ "
140
\n",
+ "
01001020400
\n",
+ "
Census Tract 204, Autauga County, Alabama
\n",
+ "
01
\n",
+ "
1
\n",
+ "
20400
\n",
+ "
1640
\n",
+ "
1260
\n",
+ "
15
\n",
+ "
...
\n",
+ "
12
\n",
+ "
12
\n",
+ "
12
\n",
+ "
145
\n",
+ "
46.288510
\n",
+ "
0.088415
\n",
+ "
0.027822
\n",
+ "
31.467397
\n",
+ "
0.036604
\n",
+ "
34.615385
\n",
+ "
\n",
+ "
\n",
+ "
4
\n",
+ "
2014thru2018
\n",
+ "
140
\n",
+ "
01001020500
\n",
+ "
Census Tract 205, Autauga County, Alabama
\n",
+ "
01
\n",
+ "
1
\n",
+ "
20500
\n",
+ "
4175
\n",
+ "
2320
\n",
+ "
175
\n",
+ "
...
\n",
+ "
17
\n",
+ "
17
\n",
+ "
17
\n",
+ "
595
\n",
+ "
147.221693
\n",
+ "
0.142515
\n",
+ "
0.034760
\n",
+ "
24.390193
\n",
+ "
0.036604
\n",
+ "
68.221154
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 280 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " source sumlevel FIPS_tract_id \\\n",
+ "0 2014thru2018 140 01001020100 \n",
+ "1 2014thru2018 140 01001020200 \n",
+ "2 2014thru2018 140 01001020300 \n",
+ "3 2014thru2018 140 01001020400 \n",
+ "4 2014thru2018 140 01001020500 \n",
+ "\n",
+ " name state cnty tract T8_est1 \\\n",
+ "0 Census Tract 201, Autauga County, Alabama 01 1 20100 765 \n",
+ "1 Census Tract 202, Autauga County, Alabama 01 1 20200 720 \n",
+ "2 Census Tract 203, Autauga County, Alabama 01 1 20300 1295 \n",
+ "3 Census Tract 204, Autauga County, Alabama 01 1 20400 1640 \n",
+ "4 Census Tract 205, Autauga County, Alabama 01 1 20500 4175 \n",
+ "\n",
+ " T8_est2 T8_est3 ... T8_moe131 T8_moe132 T8_moe133 summed summed_se \\\n",
+ "0 570 50 ... 12 12 12 80 31.721807 \n",
+ "1 465 65 ... 12 12 12 138 45.531874 \n",
+ "2 840 60 ... 12 12 12 170 53.722921 \n",
+ "3 1260 15 ... 12 12 12 145 46.288510 \n",
+ "4 2320 175 ... 17 17 17 595 147.221693 \n",
+ "\n",
+ " hbrd_score se rse mean_state_se hbrd_rank \n",
+ "0 0.104575 0.041032 39.237314 0.036604 46.298077 \n",
+ "1 0.191667 0.061614 32.146659 0.036604 83.269231 \n",
+ "2 0.131274 0.040927 31.176999 0.036604 63.653846 \n",
+ "3 0.088415 0.027822 31.467397 0.036604 34.615385 \n",
+ "4 0.142515 0.034760 24.390193 0.036604 68.221154 \n",
+ "\n",
+ "[5 rows x 280 columns]"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "housingburden.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(73056, 280)"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "housingburden.shape"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### As desired we see a uniform distribution for the percentile rank for burdened households"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Now we compute for a baseline comparison "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Owner occupied numerator fields\n",
+ "OWNER_OCCUPIED_NUMERATOR_FIELDS = [\n",
+ " # Column Name\n",
+ " # Line_Type\n",
+ " # Tenure\n",
+ " # Household income\n",
+ " # Cost burden\n",
+ " # Facilities\n",
+ " \"T8_est7\",\n",
+ " # Subtotal\n",
+ " # Owner occupied\n",
+ " # less than or equal to 30% of HAMFI\n",
+ " # greater than 30% but less than or equal to 50%\n",
+ " # All\n",
+ " \"T8_est10\",\n",
+ " # Subtotal\n",
+ " # Owner occupied\n",
+ " # less than or equal to 30% of HAMFI\n",
+ " # greater than 50%\n",
+ " # All\n",
+ " \"T8_est20\",\n",
+ " \n",
+ " # Subtotal\n",
+ " # Owner occupied\n",
+ " # greater than 30% but less than or equal to 50% of HAMFI\n",
+ " # greater than 30% but less than or equal to 50%\n",
+ " # All\n",
+ " \"T8_est23\",\n",
+ " # Subtotal\n",
+ " # Owner occupied\n",
+ " # greater than 30% but less than or equal to 50% of HAMFI\n",
+ " # greater than 50%\n",
+ " # All\n",
+ " \"T8_est33\",\n",
+ " # Subtotal\n",
+ " # Owner occupied\n",
+ " # greater than 50% but less than or equal to 80% of HAMFI\n",
+ " # greater than 30% but less than or equal to 50%\n",
+ " # All\n",
+ " \"T8_est36\",\n",
+ " # Subtotal\n",
+ " # Owner occupied\n",
+ " # greater than 50% but less than or equal to 80% of HAMFI\n",
+ " # greater than 50%\n",
+ " # All\n",
+ "]\n",
+ "\n",
+ "# These rows have the values where HAMFI was not computed, b/c of no or negative income.\n",
+ "OWNER_OCCUPIED_NOT_COMPUTED_FIELDS = [\n",
+ " # Column Name\n",
+ " # Line_Type\n",
+ " # Tenure\n",
+ " # Household income\n",
+ " # Cost burden\n",
+ " # Facilities\n",
+ " \"T8_est13\",\n",
+ " # Subtotal\n",
+ " # Owner occupied\n",
+ " # less than or equal to 30% of HAMFI\n",
+ " # not computed (no/negative income)\n",
+ " # All\n",
+ " \"T8_est26\",\n",
+ " # Subtotal\n",
+ " # Owner occupied\n",
+ " # greater than 30% but less than or equal to 50% of HAMFI\n",
+ " # not computed (no/negative income)\n",
+ " # All\n",
+ " \"T8_est39\",\n",
+ " # Subtotal\n",
+ " # Owner occupied\n",
+ " # greater than 50% but less than or equal to 80% of HAMFI\n",
+ " # not computed (no/negative income)\n",
+ " # All\n",
+ " \"T8_est52\",\n",
+ " # Subtotal\n",
+ " # Owner occupied\n",
+ " # greater than 80% but less than or equal to 100% of HAMFI\n",
+ " # not computed (no/negative income)\n",
+ " # All\n",
+ " \"T8_est65\",\n",
+ " # Subtotal\n",
+ " # Owner occupied\n",
+ " # greater than 100% of HAMFI\n",
+ " # not computed (no/negative income)\n",
+ " # All\n",
+ "]\n",
+ "\n",
+ "OWNER_OCCUPIED_POPULATION_FIELD = \"T8_est2\"\n",
+ "# Subtotal\n",
+ "# Owner occupied\n",
+ "# All\n",
+ "# All\n",
+ "# All\n",
+ "\n",
+ "OWNER_OCCUPIED_POPULATION_HAMFI_FIELD = \"T8_est3\"\n",
+ "# Subtotal\n",
+ "# Owner occupied \n",
+ "# All\n",
+ "# All\n",
+ "# All\n",
+ "\n",
+ "# Renter occupied numerator fields\n",
+ "RENTER_OCCUPIED_NUMERATOR_FIELDS = [\n",
+ " # Column Name\n",
+ " # Line_Type\n",
+ " # Tenure\n",
+ " # Household income\n",
+ " # Cost burden\n",
+ " # Facilities\n",
+ " \"T8_est73\",\n",
+ " # Subtotal\n",
+ " # Renter occupied\n",
+ " # less than or equal to 30% of HAMFI\n",
+ " # greater than 30% but less than or equal to 50%\n",
+ " # All\n",
+ " \"T8_est76\",\n",
+ " # Subtotal\n",
+ " # Renter occupied\n",
+ " # less than or equal to 30% of HAMFI\n",
+ " # greater than 50%\n",
+ " # All\n",
+ " \"T8_est86\",\n",
+ " # Subtotal\n",
+ " # Renter occupied\n",
+ " # greater than 30% but less than or equal to 50% of HAMFI\n",
+ " # greater than 30% but less than or equal to 50%\n",
+ " # All\n",
+ " \"T8_est89\",\n",
+ " # Subtotal\n",
+ " # Renter occupied\n",
+ " # greater than 30% but less than or equal to 50% of HAMFI\n",
+ " # greater than 50%\n",
+ " # All\n",
+ " \"T8_est99\",\n",
+ " # Subtotal\n",
+ " # Renter occupied\tgreater than 50% but less than or equal to 80% of HAMFI\n",
+ " # greater than 30% but less than or equal to 50%\n",
+ " # All\n",
+ " \"T8_est102\",\n",
+ " # Subtotal\n",
+ " # Renter occupied\n",
+ " # greater than 50% but less than or equal to 80% of HAMFI\n",
+ " # greater than 50%\n",
+ " # All\n",
+ "]\n",
+ "\n",
+ "# These rows have the values where HAMFI was not computed, b/c of no or negative income.\n",
+ "RENTER_OCCUPIED_NOT_COMPUTED_FIELDS = [\n",
+ " # Column Name\n",
+ " # Line_Type\n",
+ " # Tenure\n",
+ " # Household income\n",
+ " # Cost burden\n",
+ " # Facilities\n",
+ " \"T8_est79\",\n",
+ " # Subtotal\n",
+ " # Renter occupied\tless than or equal to 30% of HAMFI\n",
+ " # not computed (no/negative income)\n",
+ " # All\n",
+ " \"T8_est92\",\n",
+ " # Subtotal\n",
+ " # Renter occupied\tgreater than 30% but less than or equal to 50% of HAMFI\n",
+ " # not computed (no/negative income)\n",
+ " # All\n",
+ " \"T8_est105\",\n",
+ " # Subtotal\n",
+ " # Renter occupied\n",
+ " # greater than 50% but less than or equal to 80% of HAMFI\n",
+ " # not computed (no/negative income)\n",
+ " # All\n",
+ " \"T8_est118\",\n",
+ " # Subtotal\n",
+ " # Renter occupied\tgreater than 80% but less than or equal to 100% of HAMFI\n",
+ " # not computed (no/negative income)\n",
+ " # All\n",
+ " \"T8_est131\",\n",
+ " # Subtotal\n",
+ " # Renter occupied\n",
+ " # greater than 100% of HAMFI\n",
+ " # not computed (no/negative income)\n",
+ " # All\n",
+ "]\n",
+ "\n",
+ "# T8_est68\tSubtotalRenter occupied\tAll\tAll\tAll\n",
+ "RENTER_OCCUPIED_POPULATION_FIELD = \"T8_est68\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "housingburden[\"current_summed_methodology\"] = housingburden[\n",
+ " OWNER_OCCUPIED_NUMERATOR_FIELDS\n",
+ "].sum(axis=1) + housingburden[RENTER_OCCUPIED_NUMERATOR_FIELDS].sum(axis=1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "housingburden[\"current_methodology_denominator\"] = (\n",
+ " housingburden[OWNER_OCCUPIED_POPULATION_FIELD]\n",
+ " + housingburden[RENTER_OCCUPIED_POPULATION_FIELD]\n",
+ " - housingburden[OWNER_OCCUPIED_NOT_COMPUTED_FIELDS].sum(axis=1)\n",
+ " - housingburden[RENTER_OCCUPIED_NOT_COMPUTED_FIELDS].sum(axis=1)\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "housingburden[\"current_methodology_denominator_sans_not_computed\"] = (\n",
+ " housingburden[OWNER_OCCUPIED_POPULATION_FIELD]\n",
+ " + housingburden[RENTER_OCCUPIED_POPULATION_FIELD]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "housingburden[\"current_methodology_percent\"] = np.round(\n",
+ " (housingburden[\"current_summed_methodology\"] / housingburden[\"current_methodology_denominator\"] ), 2) * 100"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Now we construct the distribution of differences in the number of owned and rented burdened households\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Percentiles Comparison"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Take an explicit copy so that the columns added to final_df in later cells do not\n",
+ "# write into a slice of housingburden (which raises SettingWithCopyWarning).\n",
+ "final_df = housingburden[['FIPS_tract_id', 'state','hbrd_rank','hbrd_score', 'summed', \n",
+ " 'current_summed_methodology', 'T8_est1', \n",
+ " \"current_methodology_denominator_sans_not_computed\",\n",
+ " 'current_methodology_denominator', 'current_methodology_percent']].copy()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "##### First notice here that **T8_est1** and **current_methodology_denominator** should represent the same or similar aggregates. In general, we can see that the current computation results in a differential that undercounts the total owner-occupied and renter-occupied households."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/sarahluw/.pyenv/versions/3.6.2/envs/my-virtual-env-3.6.2/lib/python3.6/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " \n"
+ ]
+ }
+ ],
+ "source": [
+ "final_df[\"differences_aggregate_denominator\"] = (\n",
+ " final_df[\"current_methodology_denominator\"] - final_df[\"T8_est1\"] \n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/sarahluw/.pyenv/versions/3.6.2/envs/my-virtual-env-3.6.2/lib/python3.6/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " \n"
+ ]
+ }
+ ],
+ "source": [
+ "# Compare the denominator that does NOT subtract the not-computed fields against T8_est1\n",
+ "# (the previous cell already covers current_methodology_denominator).\n",
+ "final_df[\"differences_aggregate_denominator_sans_not_computed\"] = (\n",
+ " final_df[\"current_methodology_denominator_sans_not_computed\"] - final_df[\"T8_est1\"] \n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "