Editing ETL script

This commit is contained in:
VincentLaUSDS 2021-09-22 17:30:37 -04:00
parent 28600d3e03
commit 2fe45fb43f
2 changed files with 26 additions and 28 deletions

View file

@ -12,7 +12,6 @@ class EJScreenAreasOfConcernETL(ExtractTransformLoad):
# TO DO: Load from actual source; the issue is that this dataset is not public for now # TO DO: Load from actual source; the issue is that this dataset is not public for now
self.LOCAL_CSV_PATH = self.DATA_PATH / "local" self.LOCAL_CSV_PATH = self.DATA_PATH / "local"
self.GEOID_CBG_FIELD_NAME = "GEOID10_CBG"
self.df: pd.DataFrame self.df: pd.DataFrame
def extract(self) -> None: def extract(self) -> None:
@ -22,7 +21,7 @@ class EJScreenAreasOfConcernETL(ExtractTransformLoad):
filepath_or_buffer=self.LOCAL_CSV_PATH filepath_or_buffer=self.LOCAL_CSV_PATH
/ "ejscreen_areas_of_concerns_indicators.csv", / "ejscreen_areas_of_concerns_indicators.csv",
dtype={ dtype={
self.GEOID_CBG_FIELD_NAME: "string", self.GEOID_FIELD_NAME: "string",
}, },
low_memory=False, low_memory=False,
) )

View file

@ -74,7 +74,6 @@
"GEOID_FIELD_NAME = \"GEOID10\"\n", "GEOID_FIELD_NAME = \"GEOID10\"\n",
"GEOID_TRACT_FIELD_NAME = \"GEOID10_TRACT\"\n", "GEOID_TRACT_FIELD_NAME = \"GEOID10_TRACT\"\n",
"GEOID_STATE_FIELD_NAME = \"GEOID10_STATE\"\n", "GEOID_STATE_FIELD_NAME = \"GEOID10_STATE\"\n",
"GEOID_CBG_FIELD_NAME = \"GEOID10_CBG\"\n",
"COUNTRY_FIELD_NAME = \"Country\"\n", "COUNTRY_FIELD_NAME = \"Country\"\n",
"CENSUS_BLOCK_GROUP_POPULATION_FIELD = \"Total population\"\n", "CENSUS_BLOCK_GROUP_POPULATION_FIELD = \"Total population\"\n",
"\n", "\n",
@ -96,7 +95,7 @@
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"id": "67cf1993", "id": "186c15bf",
"metadata": {}, "metadata": {},
"source": [ "source": [
"### National" "### National"
@ -105,7 +104,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 3, "execution_count": 3,
"id": "c6d89263", "id": "4843efbd",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -123,7 +122,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 4, "execution_count": 4,
"id": "3110606b", "id": "0a146972",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -152,9 +151,9 @@
" )\n", " )\n",
" df['EXCEED_COUNT'] = pd.to_numeric(df['EXCEED_COUNT'])\n", " df['EXCEED_COUNT'] = pd.to_numeric(df['EXCEED_COUNT'])\n",
"\n", "\n",
" df.rename(columns={'ID': GEOID_CBG_FIELD_NAME}, inplace=True)\n", " df.rename(columns={'ID': GEOID_FIELD_NAME}, inplace=True)\n",
" df['percentile'] = percentile\n", " df['percentile'] = percentile\n",
" df = df[[GEOID_CBG_FIELD_NAME, 'percentile', 'EXCEED_COUNT']]\n", " df = df[[GEOID_FIELD_NAME, 'percentile', 'EXCEED_COUNT']]\n",
" dfs.append(df)\n", " dfs.append(df)\n",
" \n", " \n",
"df = pd.concat(dfs)" "df = pd.concat(dfs)"
@ -163,7 +162,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 5, "execution_count": 5,
"id": "7f3bd094", "id": "65622cbd",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -248,11 +247,11 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 6, "execution_count": 6,
"id": "f85dc722", "id": "75e2d572",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"df_reshaped_nat = df.pivot(index=GEOID_CBG_FIELD_NAME, columns='percentile', values='EXCEED_COUNT')\n", "df_reshaped_nat = df.pivot(index=GEOID_FIELD_NAME, columns='percentile', values='EXCEED_COUNT')\n",
"df_reshaped_nat.columns = \\\n", "df_reshaped_nat.columns = \\\n",
" ['EJSCREEN Areas of Concern, National, {}th percentile'.format(p) for p in df_reshaped_nat.columns]\n", " ['EJSCREEN Areas of Concern, National, {}th percentile'.format(p) for p in df_reshaped_nat.columns]\n",
"df_reshaped_nat.fillna(0, inplace=True)\n", "df_reshaped_nat.fillna(0, inplace=True)\n",
@ -487,7 +486,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 8, "execution_count": 8,
"id": "1a1c0ef9", "id": "7eedff74",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -784,7 +783,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 9, "execution_count": 9,
"id": "ecb5a9da", "id": "428b94f3",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -977,7 +976,7 @@
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"id": "eca4ebf1", "id": "7bc0f71c",
"metadata": {}, "metadata": {},
"source": [ "source": [
"### State" "### State"
@ -986,7 +985,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 10, "execution_count": 10,
"id": "d8d21a3d", "id": "2de68aa5",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -1004,7 +1003,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 11, "execution_count": 11,
"id": "7c4f0a6d", "id": "fccb416e",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -1033,9 +1032,9 @@
" )\n", " )\n",
" df['EXCEED_COUNT'] = pd.to_numeric(df['EXCEED_COUNT'])\n", " df['EXCEED_COUNT'] = pd.to_numeric(df['EXCEED_COUNT'])\n",
"\n", "\n",
" df.rename(columns={'ID': GEOID_CBG_FIELD_NAME}, inplace=True)\n", " df.rename(columns={'ID': GEOID_FIELD_NAME}, inplace=True)\n",
" df['percentile'] = percentile\n", " df['percentile'] = percentile\n",
" df = df[[GEOID_CBG_FIELD_NAME, 'percentile', 'EXCEED_COUNT']]\n", " df = df[[GEOID_FIELD_NAME, 'percentile', 'EXCEED_COUNT']]\n",
" dfs.append(df)\n", " dfs.append(df)\n",
" \n", " \n",
"df = pd.concat(dfs)" "df = pd.concat(dfs)"
@ -1044,7 +1043,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 12, "execution_count": 12,
"id": "0c40712a", "id": "8300e454",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -1129,11 +1128,11 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 13, "execution_count": 13,
"id": "acce11ae", "id": "5be30b4f",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"df_reshaped_sta = df.pivot(index=GEOID_CBG_FIELD_NAME, columns='percentile', values='EXCEED_COUNT')\n", "df_reshaped_sta = df.pivot(index=GEOID_FIELD_NAME, columns='percentile', values='EXCEED_COUNT')\n",
"df_reshaped_sta.columns = ['EJSCREEN Areas of Concern, State, {}th percentile'.format(p) for p in df_reshaped_sta.columns]\n", "df_reshaped_sta.columns = ['EJSCREEN Areas of Concern, State, {}th percentile'.format(p) for p in df_reshaped_sta.columns]\n",
"df_reshaped_sta.fillna(0, inplace=True)\n", "df_reshaped_sta.fillna(0, inplace=True)\n",
"\n", "\n",
@ -1145,7 +1144,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 14, "execution_count": 14,
"id": "5160b8cd", "id": "9206132b",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -1367,7 +1366,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 15, "execution_count": 15,
"id": "533b81a9", "id": "b551a4df",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -1664,7 +1663,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 16, "execution_count": 16,
"id": "e1cc26bc", "id": "c3cb5696",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -1858,19 +1857,19 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 17, "execution_count": 17,
"id": "ee632477", "id": "099cca8c",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"df_reshaped = df_reshaped_nat.merge(\n", "df_reshaped = df_reshaped_nat.merge(\n",
" df_reshaped_sta,\n", " df_reshaped_sta,\n",
" on=GEOID_CBG_FIELD_NAME)" " on=GEOID_FIELD_NAME)"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 18, "execution_count": 18,
"id": "36905865", "id": "23097787",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -2259,7 +2258,7 @@
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"id": "4260b002", "id": "403dfbc6",
"metadata": {}, "metadata": {},
"source": [ "source": [
"# Next Steps / Questions\n", "# Next Steps / Questions\n",