From c2081937b3f4845aaec7d02a4afe420aa1401169 Mon Sep 17 00:00:00 2001 From: VincentLaUSDS Date: Wed, 22 Sep 2021 17:30:37 -0400 Subject: [PATCH] Editing etl script --- .../sources/ejscreen_areas_of_concern/etl.py | 3 +- .../data_pipeline/ipython/ejscreen_load.ipynb | 51 +++++++++---------- 2 files changed, 26 insertions(+), 28 deletions(-) diff --git a/data/data-pipeline/data_pipeline/etl/sources/ejscreen_areas_of_concern/etl.py b/data/data-pipeline/data_pipeline/etl/sources/ejscreen_areas_of_concern/etl.py index dfb987f3..56477d6f 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/ejscreen_areas_of_concern/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/ejscreen_areas_of_concern/etl.py @@ -12,7 +12,6 @@ class EJScreenAreasOfConcernETL(ExtractTransformLoad): # TO DO: Load from actual source; the issue is that this dataset is not public for now self.LOCAL_CSV_PATH = self.DATA_PATH / "local" - self.GEOID_CBG_FIELD_NAME = "GEOID10_CBG" self.df: pd.DataFrame def extract(self) -> None: @@ -22,7 +21,7 @@ class EJScreenAreasOfConcernETL(ExtractTransformLoad): filepath_or_buffer=self.LOCAL_CSV_PATH / "ejscreen_areas_of_concerns_indicators.csv", dtype={ - self.GEOID_CBG_FIELD_NAME: "string", + self.GEOID_FIELD_NAME: "string", }, low_memory=False, ) diff --git a/data/data-pipeline/data_pipeline/ipython/ejscreen_load.ipynb b/data/data-pipeline/data_pipeline/ipython/ejscreen_load.ipynb index c9a31f77..574c3d61 100644 --- a/data/data-pipeline/data_pipeline/ipython/ejscreen_load.ipynb +++ b/data/data-pipeline/data_pipeline/ipython/ejscreen_load.ipynb @@ -74,7 +74,6 @@ "GEOID_FIELD_NAME = \"GEOID10\"\n", "GEOID_TRACT_FIELD_NAME = \"GEOID10_TRACT\"\n", "GEOID_STATE_FIELD_NAME = \"GEOID10_STATE\"\n", - "GEOID_CBG_FIELD_NAME = \"GEOID10_CBG\"\n", "COUNTRY_FIELD_NAME = \"Country\"\n", "CENSUS_BLOCK_GROUP_POPULATION_FIELD = \"Total population\"\n", "\n", @@ -96,7 +95,7 @@ }, { "cell_type": "markdown", - "id": "67cf1993", + "id": "186c15bf", "metadata": {}, "source": [ "### National" @@ -105,7 +104,7 @@ { "cell_type": "code", "execution_count": 3, - "id": "c6d89263", + "id": "4843efbd", "metadata": {}, "outputs": [], "source": [ @@ -123,7 +122,7 @@ { "cell_type": "code", "execution_count": 4, - "id": "3110606b", + "id": "0a146972", "metadata": {}, "outputs": [ { @@ -152,9 +151,9 @@ " )\n", " df['EXCEED_COUNT'] = pd.to_numeric(df['EXCEED_COUNT'])\n", "\n", - " df.rename(columns={'ID': GEOID_CBG_FIELD_NAME}, inplace=True)\n", + " df.rename(columns={'ID': GEOID_FIELD_NAME}, inplace=True)\n", " df['percentile'] = percentile\n", - " df = df[[GEOID_CBG_FIELD_NAME, 'percentile', 'EXCEED_COUNT']]\n", + " df = df[[GEOID_FIELD_NAME, 'percentile', 'EXCEED_COUNT']]\n", " dfs.append(df)\n", " \n", "df = pd.concat(dfs)" @@ -163,7 +162,7 @@ { "cell_type": "code", "execution_count": 5, - "id": "7f3bd094", + "id": "65622cbd", "metadata": {}, "outputs": [ { @@ -248,11 +247,11 @@ { "cell_type": "code", "execution_count": 6, - "id": "f85dc722", + "id": "75e2d572", "metadata": {}, "outputs": [], "source": [ - "df_reshaped_nat = df.pivot(index=GEOID_CBG_FIELD_NAME, columns='percentile', values='EXCEED_COUNT')\n", + "df_reshaped_nat = df.pivot(index=GEOID_FIELD_NAME, columns='percentile', values='EXCEED_COUNT')\n", "df_reshaped_nat.columns = \\\n", " ['EJSCREEN Areas of Concern, National, {}th percentile'.format(p) for p in df_reshaped_nat.columns]\n", "df_reshaped_nat.fillna(0, inplace=True)\n", @@ -487,7 +486,7 @@ { "cell_type": "code", "execution_count": 8, - "id": "1a1c0ef9", + "id": "7eedff74", "metadata": {}, "outputs": [ { @@ -784,7 +783,7 @@ { "cell_type": "code", "execution_count": 9, - "id": "ecb5a9da", + "id": "428b94f3", "metadata": {}, "outputs": [ { @@ -977,7 +976,7 @@ }, { "cell_type": "markdown", - "id": "eca4ebf1", + "id": "7bc0f71c", "metadata": {}, "source": [ "### State" @@ -986,7 +985,7 @@ { "cell_type": "code", "execution_count": 10, - "id": "d8d21a3d", + "id": "2de68aa5", "metadata": {}, "outputs": [], "source": [ @@ -1004,7 +1003,7 @@ { "cell_type": "code", "execution_count": 11, - "id": "7c4f0a6d", + "id": "fccb416e", "metadata": {}, "outputs": [ { @@ -1033,9 +1032,9 @@ " )\n", " df['EXCEED_COUNT'] = pd.to_numeric(df['EXCEED_COUNT'])\n", "\n", - " df.rename(columns={'ID': GEOID_CBG_FIELD_NAME}, inplace=True)\n", + " df.rename(columns={'ID': GEOID_FIELD_NAME}, inplace=True)\n", " df['percentile'] = percentile\n", - " df = df[[GEOID_CBG_FIELD_NAME, 'percentile', 'EXCEED_COUNT']]\n", + " df = df[[GEOID_FIELD_NAME, 'percentile', 'EXCEED_COUNT']]\n", " dfs.append(df)\n", " \n", "df = pd.concat(dfs)" @@ -1044,7 +1043,7 @@ { "cell_type": "code", "execution_count": 12, - "id": "0c40712a", + "id": "8300e454", "metadata": {}, "outputs": [ { @@ -1129,11 +1128,11 @@ { "cell_type": "code", "execution_count": 13, - "id": "acce11ae", + "id": "5be30b4f", "metadata": {}, "outputs": [], "source": [ - "df_reshaped_sta = df.pivot(index=GEOID_CBG_FIELD_NAME, columns='percentile', values='EXCEED_COUNT')\n", + "df_reshaped_sta = df.pivot(index=GEOID_FIELD_NAME, columns='percentile', values='EXCEED_COUNT')\n", "df_reshaped_sta.columns = ['EJSCREEN Areas of Concern, State, {}th percentile'.format(p) for p in df_reshaped_sta.columns]\n", "df_reshaped_sta.fillna(0, inplace=True)\n", "\n", @@ -1145,7 +1144,7 @@ { "cell_type": "code", "execution_count": 14, - "id": "5160b8cd", + "id": "9206132b", "metadata": {}, "outputs": [ { @@ -1367,7 +1366,7 @@ { "cell_type": "code", "execution_count": 15, - "id": "533b81a9", + "id": "b551a4df", "metadata": {}, "outputs": [ { @@ -1664,7 +1663,7 @@ { "cell_type": "code", "execution_count": 16, - "id": "e1cc26bc", + "id": "c3cb5696", "metadata": {}, "outputs": [ { @@ -1858,19 +1857,19 @@ { "cell_type": "code", "execution_count": 17, - "id": "ee632477", + "id": "099cca8c", "metadata": {}, "outputs": [], "source": [ "df_reshaped = df_reshaped_nat.merge(\n", " df_reshaped_sta,\n", - " on=GEOID_CBG_FIELD_NAME)" + " on=GEOID_FIELD_NAME)" ] }, { "cell_type": "code", "execution_count": 18, - "id": "36905865", + "id": "23097787", "metadata": {}, "outputs": [ { @@ -2259,7 +2258,7 @@ }, { "cell_type": "markdown", - "id": "4260b002", + "id": "403dfbc6", "metadata": {}, "source": [ "# Next Steps / Questions\n",