Editing etl script

This commit is contained in:
VincentLaUSDS 2021-09-22 17:30:37 -04:00
parent 28600d3e03
commit 2fe45fb43f
2 changed files with 26 additions and 28 deletions

View file

@ -12,7 +12,6 @@ class EJScreenAreasOfConcernETL(ExtractTransformLoad):
# TO DO: Load from actual source; the issue is that this dataset is not public for now
self.LOCAL_CSV_PATH = self.DATA_PATH / "local"
-self.GEOID_CBG_FIELD_NAME = "GEOID10_CBG"
self.df: pd.DataFrame
def extract(self) -> None:
@ -22,7 +21,7 @@ class EJScreenAreasOfConcernETL(ExtractTransformLoad):
filepath_or_buffer=self.LOCAL_CSV_PATH
/ "ejscreen_areas_of_concerns_indicators.csv",
dtype={
-self.GEOID_CBG_FIELD_NAME: "string",
+self.GEOID_FIELD_NAME: "string",
},
low_memory=False,
)

View file

@ -74,7 +74,6 @@
"GEOID_FIELD_NAME = \"GEOID10\"\n",
"GEOID_TRACT_FIELD_NAME = \"GEOID10_TRACT\"\n",
"GEOID_STATE_FIELD_NAME = \"GEOID10_STATE\"\n",
-"GEOID_CBG_FIELD_NAME = \"GEOID10_CBG\"\n",
"COUNTRY_FIELD_NAME = \"Country\"\n",
"CENSUS_BLOCK_GROUP_POPULATION_FIELD = \"Total population\"\n",
"\n",
@ -96,7 +95,7 @@
},
{
"cell_type": "markdown",
-"id": "67cf1993",
+"id": "186c15bf",
"metadata": {},
"source": [
"### National"
@ -105,7 +104,7 @@
{
"cell_type": "code",
"execution_count": 3,
-"id": "c6d89263",
+"id": "4843efbd",
"metadata": {},
"outputs": [],
"source": [
@ -123,7 +122,7 @@
{
"cell_type": "code",
"execution_count": 4,
-"id": "3110606b",
+"id": "0a146972",
"metadata": {},
"outputs": [
{
@ -152,9 +151,9 @@
" )\n",
" df['EXCEED_COUNT'] = pd.to_numeric(df['EXCEED_COUNT'])\n",
"\n",
-" df.rename(columns={'ID': GEOID_CBG_FIELD_NAME}, inplace=True)\n",
+" df.rename(columns={'ID': GEOID_FIELD_NAME}, inplace=True)\n",
" df['percentile'] = percentile\n",
-" df = df[[GEOID_CBG_FIELD_NAME, 'percentile', 'EXCEED_COUNT']]\n",
+" df = df[[GEOID_FIELD_NAME, 'percentile', 'EXCEED_COUNT']]\n",
" dfs.append(df)\n",
" \n",
"df = pd.concat(dfs)"
@ -163,7 +162,7 @@
{
"cell_type": "code",
"execution_count": 5,
-"id": "7f3bd094",
+"id": "65622cbd",
"metadata": {},
"outputs": [
{
@ -248,11 +247,11 @@
{
"cell_type": "code",
"execution_count": 6,
-"id": "f85dc722",
+"id": "75e2d572",
"metadata": {},
"outputs": [],
"source": [
-"df_reshaped_nat = df.pivot(index=GEOID_CBG_FIELD_NAME, columns='percentile', values='EXCEED_COUNT')\n",
+"df_reshaped_nat = df.pivot(index=GEOID_FIELD_NAME, columns='percentile', values='EXCEED_COUNT')\n",
"df_reshaped_nat.columns = \\\n",
" ['EJSCREEN Areas of Concern, National, {}th percentile'.format(p) for p in df_reshaped_nat.columns]\n",
"df_reshaped_nat.fillna(0, inplace=True)\n",
@ -487,7 +486,7 @@
{
"cell_type": "code",
"execution_count": 8,
-"id": "1a1c0ef9",
+"id": "7eedff74",
"metadata": {},
"outputs": [
{
@ -784,7 +783,7 @@
{
"cell_type": "code",
"execution_count": 9,
-"id": "ecb5a9da",
+"id": "428b94f3",
"metadata": {},
"outputs": [
{
@ -977,7 +976,7 @@
},
{
"cell_type": "markdown",
-"id": "eca4ebf1",
+"id": "7bc0f71c",
"metadata": {},
"source": [
"### State"
@ -986,7 +985,7 @@
{
"cell_type": "code",
"execution_count": 10,
-"id": "d8d21a3d",
+"id": "2de68aa5",
"metadata": {},
"outputs": [],
"source": [
@ -1004,7 +1003,7 @@
{
"cell_type": "code",
"execution_count": 11,
-"id": "7c4f0a6d",
+"id": "fccb416e",
"metadata": {},
"outputs": [
{
@ -1033,9 +1032,9 @@
" )\n",
" df['EXCEED_COUNT'] = pd.to_numeric(df['EXCEED_COUNT'])\n",
"\n",
-" df.rename(columns={'ID': GEOID_CBG_FIELD_NAME}, inplace=True)\n",
+" df.rename(columns={'ID': GEOID_FIELD_NAME}, inplace=True)\n",
" df['percentile'] = percentile\n",
-" df = df[[GEOID_CBG_FIELD_NAME, 'percentile', 'EXCEED_COUNT']]\n",
+" df = df[[GEOID_FIELD_NAME, 'percentile', 'EXCEED_COUNT']]\n",
" dfs.append(df)\n",
" \n",
"df = pd.concat(dfs)"
@ -1044,7 +1043,7 @@
{
"cell_type": "code",
"execution_count": 12,
-"id": "0c40712a",
+"id": "8300e454",
"metadata": {},
"outputs": [
{
@ -1129,11 +1128,11 @@
{
"cell_type": "code",
"execution_count": 13,
-"id": "acce11ae",
+"id": "5be30b4f",
"metadata": {},
"outputs": [],
"source": [
-"df_reshaped_sta = df.pivot(index=GEOID_CBG_FIELD_NAME, columns='percentile', values='EXCEED_COUNT')\n",
+"df_reshaped_sta = df.pivot(index=GEOID_FIELD_NAME, columns='percentile', values='EXCEED_COUNT')\n",
"df_reshaped_sta.columns = ['EJSCREEN Areas of Concern, State, {}th percentile'.format(p) for p in df_reshaped_sta.columns]\n",
"df_reshaped_sta.fillna(0, inplace=True)\n",
"\n",
@ -1145,7 +1144,7 @@
{
"cell_type": "code",
"execution_count": 14,
-"id": "5160b8cd",
+"id": "9206132b",
"metadata": {},
"outputs": [
{
@ -1367,7 +1366,7 @@
{
"cell_type": "code",
"execution_count": 15,
-"id": "533b81a9",
+"id": "b551a4df",
"metadata": {},
"outputs": [
{
@ -1664,7 +1663,7 @@
{
"cell_type": "code",
"execution_count": 16,
-"id": "e1cc26bc",
+"id": "c3cb5696",
"metadata": {},
"outputs": [
{
@ -1858,19 +1857,19 @@
{
"cell_type": "code",
"execution_count": 17,
-"id": "ee632477",
+"id": "099cca8c",
"metadata": {},
"outputs": [],
"source": [
"df_reshaped = df_reshaped_nat.merge(\n",
" df_reshaped_sta,\n",
-" on=GEOID_CBG_FIELD_NAME)"
+" on=GEOID_FIELD_NAME)"
]
},
{
"cell_type": "code",
"execution_count": 18,
-"id": "36905865",
+"id": "23097787",
"metadata": {},
"outputs": [
{
@ -2259,7 +2258,7 @@
},
{
"cell_type": "markdown",
-"id": "4260b002",
+"id": "403dfbc6",
"metadata": {},
"source": [
"# Next Steps / Questions\n",