From a07bf752b06f460c6d963fa8b8c138956c96a982 Mon Sep 17 00:00:00 2001 From: Saran Ahluwalia Date: Tue, 18 Jan 2022 13:08:27 -0500 Subject: [PATCH] Notebook investigating NHPD as a source for providing contemporary foreclosure data (#1012) Co-authored-by: Saran Ahluwalia --- .../nhpd_eda.ipynb | 648 ++++++++++++++++++ 1 file changed, 648 insertions(+) create mode 100644 data/data-pipeline/data_pipeline/ipython/home_foreclosures_analysis_nhpd/nhpd_eda.ipynb diff --git a/data/data-pipeline/data_pipeline/ipython/home_foreclosures_analysis_nhpd/nhpd_eda.ipynb b/data/data-pipeline/data_pipeline/ipython/home_foreclosures_analysis_nhpd/nhpd_eda.ipynb new file mode 100644 index 00000000..a2870aff --- /dev/null +++ b/data/data-pipeline/data_pipeline/ipython/home_foreclosures_analysis_nhpd/nhpd_eda.ipynb @@ -0,0 +1,648 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Problem statement and Goal\n", + "NHPD: For all counties, aggregate the National Housing Preservation Database at the census tract level using whatever variables seem most interesting or relevant, and combine it with the “processed” datasets. What does the overall low-income housing stock look like in areas with high eviction rates? Are any of these features statistically related to the incidence of evictions in these counties? 
Furthermore, are there any insights common to two or more counties, or is the “state” of low-income housing unique to each county?\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "data_set = pd.read_excel('nhpd_data.xlsx', engine='openpyxl')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### General inference from the data dictionary\n", + "- The data set 'Active and Inconclusive Properties.xlsx' appears to be the `Data Extract`, as opposed to the `Data Grid`, as each row here represents a property. Here, subsidy information is present alongside property information instead of it being expanded from the property record.\n", + "- There is a one-to-one mapping between properties and addresses.\n", + "- Most of the phased properties are separated by address locations. \n", + "- In the rare cases where phased properties are combined into one record, we might need to separate them by address location. This might not be needed, however: funding is tracked at the property level, so unless we need specifics that sub-categorize the property, there is little use in maintaining these records as separate entities. \n", + "- The keywords 'Development', 'Project' and 'Property' have been used interchangeably to mean 'cluster of buildings' tracked by the same identification number. \n", + " - If we wish to run some NLP algorithms on the descriptions, we may need to replace these words with 'Property'. If not, the vectorized versions of these words may not be close to each other. \n", + " - Even if we consider state-of-the-art algorithms like Word2Vec, we may not get vectors close to one another because 'Development', 'Project' and 'Property' have different semantic meanings."
+ ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NHPDPropertyIDPropertyNamePropertyAddressCityStateZipCBSACodeCBSATypeCountyCountyCode...NumberActiveMRNumberInconclusiveMRNumberInactiveMRMr_1_StatusMr_1_ProgramNameMr_1_AssistedUnitsMr_2_StatusMr_2_ProgramNameMr_2_AssistedUnitsOldNHPDPropertyID
01000000IVY ESTATES6729 Zeigler BlvdMobileAL36608-425333660.0Metropolitan Statistical AreaMobile1097.0...000NaNNaNNaNNaNNaNNaNNaN
11000001RENDU TERRACE WEST7400 Old Shell RdMobileAL36608-454933660.0Metropolitan Statistical AreaMobile1097.0...000NaNNaNNaNNaNNaNNaNNaN
21000002TWB RESIDENTIAL OPPORTUNITIES II93 Canal RdPort Jefferson StationNY11776-302435620.0MetropolitanSuffolk36103.0...000NaNNaNNaNNaNNaNNaNNaN
31000003THE DAISY HOUSE615 Clarissa StRochesterNY14608-248540380.0MetropolitanMonroe36055.0...000NaNNaNNaNNaNNaNNaNNaN
41000004MAIN AVENUE APARTMENTS105 E Walnut StSylacaugaAL35150-301245180.0Micropolitan Statistical AreaTalladega1121.0...000NaNNaNNaNNaNNaNNaNNaN
\n", + "

5 rows × 252 columns

\n", + "
" + ], + "text/plain": [ + " NHPDPropertyID PropertyName PropertyAddress \\\n", + "0 1000000 IVY ESTATES 6729 Zeigler Blvd \n", + "1 1000001 RENDU TERRACE WEST 7400 Old Shell Rd \n", + "2 1000002 TWB RESIDENTIAL OPPORTUNITIES II 93 Canal Rd \n", + "3 1000003 THE DAISY HOUSE 615 Clarissa St \n", + "4 1000004 MAIN AVENUE APARTMENTS 105 E Walnut St \n", + "\n", + " City State Zip CBSACode \\\n", + "0 Mobile AL 36608-4253 33660.0 \n", + "1 Mobile AL 36608-4549 33660.0 \n", + "2 Port Jefferson Station NY 11776-3024 35620.0 \n", + "3 Rochester NY 14608-2485 40380.0 \n", + "4 Sylacauga AL 35150-3012 45180.0 \n", + "\n", + " CBSAType County CountyCode ... NumberActiveMR \\\n", + "0 Metropolitan Statistical Area Mobile 1097.0 ... 0 \n", + "1 Metropolitan Statistical Area Mobile 1097.0 ... 0 \n", + "2 Metropolitan Suffolk 36103.0 ... 0 \n", + "3 Metropolitan Monroe 36055.0 ... 0 \n", + "4 Micropolitan Statistical Area Talladega 1121.0 ... 0 \n", + "\n", + " NumberInconclusiveMR NumberInactiveMR Mr_1_Status Mr_1_ProgramName \\\n", + "0 0 0 NaN NaN \n", + "1 0 0 NaN NaN \n", + "2 0 0 NaN NaN \n", + "3 0 0 NaN NaN \n", + "4 0 0 NaN NaN \n", + "\n", + " Mr_1_AssistedUnits Mr_2_Status Mr_2_ProgramName Mr_2_AssistedUnits \\\n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN \n", + "\n", + " OldNHPDPropertyID \n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 NaN \n", + "4 NaN \n", + "\n", + "[5 rows x 252 columns]" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "shape = data_set.shape\n", + "data_set.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "No. of data points: 82287\n", + "No. of features: 252\n", + "Different classes, : ['Active' 'Inconclusive']\n" + ] + } + ], + "source": [ + "print(\"No. 
of data points:\", shape[0])\n", + "print(\"No. of features: \", shape[1])\n", + "print(\"Different property statuses, :\", data_set.loc[:,\"PropertyStatus\"].unique())" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + "
NHPDPropertyIDCBSACodeCountyCodeCensusTractLatitudeLongitudeActiveSubsidiesTotalInconclusiveSubsidiesTotalInactiveSubsidiesTotalUnits...NumberInconclusivePBVNumberInactivePBVPbv_1_AssistedUnitsPbv_2_AssistedUnitsNumberActiveMRNumberInconclusiveMRNumberInactiveMRMr_1_AssistedUnitsMr_2_AssistedUnitsOldNHPDPropertyID
count8.228700e+0472919.00000082229.0000008.222400e+0482287.00000082287.00000082287.00000082287.00000082287.00000082287.000000...82287.082287.02784.000000173.00000082287.00000082287.082287.0500.0000009.00000058370.000000
mean1.074656e+0630447.40128128953.5667592.895217e+1038.483402-90.2280691.3863550.0774850.36909866.711145...0.00.043.46300335.0115610.0062340.00.034.20800026.33333352459.141083
std4.021620e+0411096.93589615256.9678351.525466e+104.97547115.6367170.8958680.2874680.73484196.200003...0.00.041.70326329.0622800.0817410.00.025.84004919.68502033880.224028
min1.000000e+0610100.0000001001.0000001.001020e+0913.495030-166.7224780.0000000.0000000.0000001.000000...0.00.011.00000011.0000000.0000000.00.011.00000012.0000004.000000
25%1.039279e+0619740.00000017053.0000001.705396e+1034.983064-96.3807641.0000000.0000000.00000018.000000...0.00.017.00000014.0000000.0000000.00.016.00000013.00000024226.250000
50%1.073499e+0632580.00000029095.0000002.909501e+1039.312214-86.4909461.0000000.0000000.00000040.000000...0.00.029.00000024.0000000.0000000.00.025.00000019.00000049850.500000
75%1.108144e+0639300.00000041015.0000004.101395e+1041.799999-79.0522502.0000000.0000001.00000082.000000...0.00.053.00000046.0000000.0000000.00.043.00000022.00000078774.750000
max1.163400e+0699999.00000069120.0000005.604595e+1065.160556145.751129106.00000013.00000024.0000005881.000000...0.00.0449.000000191.0000005.0000000.00.0187.00000062.000000127185.000000
\n", + "

8 rows × 114 columns

\n", + "
" + ], + "text/plain": [ + " NHPDPropertyID CBSACode CountyCode CensusTract Latitude \\\n", + "count 8.228700e+04 72919.000000 82229.000000 8.222400e+04 82287.000000 \n", + "mean 1.074656e+06 30447.401281 28953.566759 2.895217e+10 38.483402 \n", + "std 4.021620e+04 11096.935896 15256.967835 1.525466e+10 4.975471 \n", + "min 1.000000e+06 10100.000000 1001.000000 1.001020e+09 13.495030 \n", + "25% 1.039279e+06 19740.000000 17053.000000 1.705396e+10 34.983064 \n", + "50% 1.073499e+06 32580.000000 29095.000000 2.909501e+10 39.312214 \n", + "75% 1.108144e+06 39300.000000 41015.000000 4.101395e+10 41.799999 \n", + "max 1.163400e+06 99999.000000 69120.000000 5.604595e+10 65.160556 \n", + "\n", + " Longitude ActiveSubsidies TotalInconclusiveSubsidies \\\n", + "count 82287.000000 82287.000000 82287.000000 \n", + "mean -90.228069 1.386355 0.077485 \n", + "std 15.636717 0.895868 0.287468 \n", + "min -166.722478 0.000000 0.000000 \n", + "25% -96.380764 1.000000 0.000000 \n", + "50% -86.490946 1.000000 0.000000 \n", + "75% -79.052250 2.000000 0.000000 \n", + "max 145.751129 106.000000 13.000000 \n", + "\n", + " TotalInactiveSubsidies TotalUnits ... NumberInconclusivePBV \\\n", + "count 82287.000000 82287.000000 ... 82287.0 \n", + "mean 0.369098 66.711145 ... 0.0 \n", + "std 0.734841 96.200003 ... 0.0 \n", + "min 0.000000 1.000000 ... 0.0 \n", + "25% 0.000000 18.000000 ... 0.0 \n", + "50% 0.000000 40.000000 ... 0.0 \n", + "75% 1.000000 82.000000 ... 0.0 \n", + "max 24.000000 5881.000000 ... 
0.0 \n", + "\n", + " NumberInactivePBV Pbv_1_AssistedUnits Pbv_2_AssistedUnits \\\n", + "count 82287.0 2784.000000 173.000000 \n", + "mean 0.0 43.463003 35.011561 \n", + "std 0.0 41.703263 29.062280 \n", + "min 0.0 11.000000 11.000000 \n", + "25% 0.0 17.000000 14.000000 \n", + "50% 0.0 29.000000 24.000000 \n", + "75% 0.0 53.000000 46.000000 \n", + "max 0.0 449.000000 191.000000 \n", + "\n", + " NumberActiveMR NumberInconclusiveMR NumberInactiveMR \\\n", + "count 82287.000000 82287.0 82287.0 \n", + "mean 0.006234 0.0 0.0 \n", + "std 0.081741 0.0 0.0 \n", + "min 0.000000 0.0 0.0 \n", + "25% 0.000000 0.0 0.0 \n", + "50% 0.000000 0.0 0.0 \n", + "75% 0.000000 0.0 0.0 \n", + "max 5.000000 0.0 0.0 \n", + "\n", + " Mr_1_AssistedUnits Mr_2_AssistedUnits OldNHPDPropertyID \n", + "count 500.000000 9.000000 58370.000000 \n", + "mean 34.208000 26.333333 52459.141083 \n", + "std 25.840049 19.685020 33880.224028 \n", + "min 11.000000 12.000000 4.000000 \n", + "25% 16.000000 13.000000 24226.250000 \n", + "50% 25.000000 19.000000 49850.500000 \n", + "75% 43.000000 22.000000 78774.750000 \n", + "max 187.000000 62.000000 127185.000000 \n", + "\n", + "[8 rows x 114 columns]" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_set.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Status [WIP]\n", + "- Find out how housing and subsidies work in the US.\n", + "- Map each census tract to its geographical boundaries.\n", + "- List all of the different subsidies.\n", + "- Find out metrics per subsidy and compare various statistical plots."
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}