Editing etl script

2025-02-23 10:04:18 -08:00 · 2021-09-22 17:30:37 -04:00 · 2021-09-22 17:30:37 -04:00 · 2fe45fb43f
commit 2fe45fb43f
parent 28600d3e03
2 changed files with 26 additions and 28 deletions
--- a/data/data-pipeline/data_pipeline/etl/sources/ejscreen_areas_of_concern/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/ejscreen_areas_of_concern/etl.py
@@ -12,7 +12,6 @@ class EJScreenAreasOfConcernETL(ExtractTransformLoad):

        # TO DO: Load from actual source; the issue is that this dataset is not public for now
        self.LOCAL_CSV_PATH = self.DATA_PATH / "local"
-        self.GEOID_CBG_FIELD_NAME = "GEOID10_CBG"
        self.df: pd.DataFrame

    def extract(self) -> None:
@@ -22,7 +21,7 @@ class EJScreenAreasOfConcernETL(ExtractTransformLoad):
            filepath_or_buffer=self.LOCAL_CSV_PATH
            / "ejscreen_areas_of_concerns_indicators.csv",
            dtype={
-                self.GEOID_CBG_FIELD_NAME: "string",
+                self.GEOID_FIELD_NAME: "string",
            },
            low_memory=False,
        )
--- a/data/data-pipeline/data_pipeline/ipython/ejscreen_load.ipynb
+++ b/data/data-pipeline/data_pipeline/ipython/ejscreen_load.ipynb
@@ -74,7 +74,6 @@
    "GEOID_FIELD_NAME = \"GEOID10\"\n",
    "GEOID_TRACT_FIELD_NAME = \"GEOID10_TRACT\"\n",
    "GEOID_STATE_FIELD_NAME = \"GEOID10_STATE\"\n",
-    "GEOID_CBG_FIELD_NAME = \"GEOID10_CBG\"\n",
    "COUNTRY_FIELD_NAME = \"Country\"\n",
    "CENSUS_BLOCK_GROUP_POPULATION_FIELD = \"Total population\"\n",
    "\n",
@@ -96,7 +95,7 @@
  },
  {
   "cell_type": "markdown",
-   "id": "67cf1993",
+   "id": "186c15bf",
   "metadata": {},
   "source": [
    "### National"
@@ -105,7 +104,7 @@
  {
   "cell_type": "code",
   "execution_count": 3,
-   "id": "c6d89263",
+   "id": "4843efbd",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -123,7 +122,7 @@
  {
   "cell_type": "code",
   "execution_count": 4,
-   "id": "3110606b",
+   "id": "0a146972",
   "metadata": {},
   "outputs": [
    {
@@ -152,9 +151,9 @@
    "    )\n",
    "    df['EXCEED_COUNT'] = pd.to_numeric(df['EXCEED_COUNT'])\n",
    "\n",
-    "    df.rename(columns={'ID': GEOID_CBG_FIELD_NAME}, inplace=True)\n",
+    "    df.rename(columns={'ID': GEOID_FIELD_NAME}, inplace=True)\n",
    "    df['percentile'] = percentile\n",
-    "    df = df[[GEOID_CBG_FIELD_NAME, 'percentile', 'EXCEED_COUNT']]\n",
+    "    df = df[[GEOID_FIELD_NAME, 'percentile', 'EXCEED_COUNT']]\n",
    "    dfs.append(df)\n",
    "    \n",
    "df = pd.concat(dfs)"
@@ -163,7 +162,7 @@
  {
   "cell_type": "code",
   "execution_count": 5,
-   "id": "7f3bd094",
+   "id": "65622cbd",
   "metadata": {},
   "outputs": [
    {
@@ -248,11 +247,11 @@
  {
   "cell_type": "code",
   "execution_count": 6,
-   "id": "f85dc722",
+   "id": "75e2d572",
   "metadata": {},
   "outputs": [],
   "source": [
-    "df_reshaped_nat = df.pivot(index=GEOID_CBG_FIELD_NAME, columns='percentile', values='EXCEED_COUNT')\n",
+    "df_reshaped_nat = df.pivot(index=GEOID_FIELD_NAME, columns='percentile', values='EXCEED_COUNT')\n",
    "df_reshaped_nat.columns = \\\n",
    "    ['EJSCREEN Areas of Concern, National, {}th percentile'.format(p) for p in df_reshaped_nat.columns]\n",
    "df_reshaped_nat.fillna(0, inplace=True)\n",
@@ -487,7 +486,7 @@
  {
   "cell_type": "code",
   "execution_count": 8,
-   "id": "1a1c0ef9",
+   "id": "7eedff74",
   "metadata": {},
   "outputs": [
    {
@@ -784,7 +783,7 @@
  {
   "cell_type": "code",
   "execution_count": 9,
-   "id": "ecb5a9da",
+   "id": "428b94f3",
   "metadata": {},
   "outputs": [
    {
@@ -977,7 +976,7 @@
  },
  {
   "cell_type": "markdown",
-   "id": "eca4ebf1",
+   "id": "7bc0f71c",
   "metadata": {},
   "source": [
    "### State"
@@ -986,7 +985,7 @@
  {
   "cell_type": "code",
   "execution_count": 10,
-   "id": "d8d21a3d",
+   "id": "2de68aa5",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -1004,7 +1003,7 @@
  {
   "cell_type": "code",
   "execution_count": 11,
-   "id": "7c4f0a6d",
+   "id": "fccb416e",
   "metadata": {},
   "outputs": [
    {
@@ -1033,9 +1032,9 @@
    "    )\n",
    "    df['EXCEED_COUNT'] = pd.to_numeric(df['EXCEED_COUNT'])\n",
    "\n",
-    "    df.rename(columns={'ID': GEOID_CBG_FIELD_NAME}, inplace=True)\n",
+    "    df.rename(columns={'ID': GEOID_FIELD_NAME}, inplace=True)\n",
    "    df['percentile'] = percentile\n",
-    "    df = df[[GEOID_CBG_FIELD_NAME, 'percentile', 'EXCEED_COUNT']]\n",
+    "    df = df[[GEOID_FIELD_NAME, 'percentile', 'EXCEED_COUNT']]\n",
    "    dfs.append(df)\n",
    "    \n",
    "df = pd.concat(dfs)"
@@ -1044,7 +1043,7 @@
  {
   "cell_type": "code",
   "execution_count": 12,
-   "id": "0c40712a",
+   "id": "8300e454",
   "metadata": {},
   "outputs": [
    {
@@ -1129,11 +1128,11 @@
  {
   "cell_type": "code",
   "execution_count": 13,
-   "id": "acce11ae",
+   "id": "5be30b4f",
   "metadata": {},
   "outputs": [],
   "source": [
-    "df_reshaped_sta = df.pivot(index=GEOID_CBG_FIELD_NAME, columns='percentile', values='EXCEED_COUNT')\n",
+    "df_reshaped_sta = df.pivot(index=GEOID_FIELD_NAME, columns='percentile', values='EXCEED_COUNT')\n",
    "df_reshaped_sta.columns = ['EJSCREEN Areas of Concern, State, {}th percentile'.format(p) for p in df_reshaped_sta.columns]\n",
    "df_reshaped_sta.fillna(0, inplace=True)\n",
    "\n",
@@ -1145,7 +1144,7 @@
  {
   "cell_type": "code",
   "execution_count": 14,
-   "id": "5160b8cd",
+   "id": "9206132b",
   "metadata": {},
   "outputs": [
    {
@@ -1367,7 +1366,7 @@
  {
   "cell_type": "code",
   "execution_count": 15,
-   "id": "533b81a9",
+   "id": "b551a4df",
   "metadata": {},
   "outputs": [
    {
@@ -1664,7 +1663,7 @@
  {
   "cell_type": "code",
   "execution_count": 16,
-   "id": "e1cc26bc",
+   "id": "c3cb5696",
   "metadata": {},
   "outputs": [
    {
@@ -1858,19 +1857,19 @@
  {
   "cell_type": "code",
   "execution_count": 17,
-   "id": "ee632477",
+   "id": "099cca8c",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_reshaped = df_reshaped_nat.merge(\n",
    "    df_reshaped_sta,\n",
-    "    on=GEOID_CBG_FIELD_NAME)"
+    "    on=GEOID_FIELD_NAME)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
-   "id": "36905865",
+   "id": "23097787",
   "metadata": {},
   "outputs": [
    {
@@ -2259,7 +2258,7 @@
  },
  {
   "cell_type": "markdown",
-   "id": "4260b002",
+   "id": "403dfbc6",
   "metadata": {},
   "source": [
    "# Next Steps / Questions\n",