Issue 970: reverse percentiles for AMI and life expectancy (#1018)

* switching to low

* fixing score-etl-post

* updating comments

* fixing comparison

* create separate field for clarity

* comment fix

* removing healthy food

* fixing bug in score post

* running black and adding comment

* Update pickles and add helpful notes to README

Co-authored-by: Shelby Switzer <shelby.switzer@cms.hhs.gov>
This commit is contained in:
Lucas Merrill Brown 2021-12-10 10:16:22 -05:00 committed by GitHub
commit 7fcecaee42
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
11 changed files with 144 additions and 100 deletions

View file

@ -8,7 +8,10 @@
"outputs": [],
"source": [
"import IPython\n",
"import os\n",
"import pandas as pd\n",
"import pathlib\n",
"import sys\n",
"\n",
"module_path = os.path.abspath(os.path.join(\"../..\"))\n",
"if module_path not in sys.path:\n",
@ -28,12 +31,8 @@
"outputs": [],
"source": [
"# Load\n",
"path_to_score_file_1 = (\n",
" DATA_DIR / \"score\" / \"csv\" / \"full\" / \"usa1.csv\"\n",
")\n",
"path_to_score_file_2 = (\n",
" DATA_DIR / \"score\" / \"csv\" / \"full\" / \"usa2.csv\"\n",
")\n",
"path_to_score_file_1 = DATA_DIR / \"compare_two_score_csvs/usa (pre 970).csv\"\n",
"path_to_score_file_2 = DATA_DIR / \"compare_two_score_csvs/usa (post 970).csv\"\n",
"\n",
"score_1_df = pd.read_csv(\n",
" path_to_score_file_1,\n",
@ -55,7 +54,7 @@
"metadata": {},
"outputs": [],
"source": [
"# List columns in one but not the other \n",
"# List columns in one but not the other\n",
"score_2_df.columns.difference(score_1_df.columns)"
]
},
@ -68,11 +67,16 @@
"source": [
"# List rows in one but not the other\n",
"\n",
"if len(score_2_df[ExtractTransformLoad.GEOID_TRACT_FIELD_NAME]) != len(score_1_df[ExtractTransformLoad.GEOID_TRACT_FIELD_NAME]):\n",
"if len(score_2_df[ExtractTransformLoad.GEOID_TRACT_FIELD_NAME]) != len(\n",
" score_1_df[ExtractTransformLoad.GEOID_TRACT_FIELD_NAME]\n",
"):\n",
" print(\"Different lengths!\")\n",
"\n",
"print(\"Difference in tract IDs:\")\n",
"print(set(score_2_df[ExtractTransformLoad.GEOID_TRACT_FIELD_NAME]) ^ set(score_1_df[ExtractTransformLoad.GEOID_TRACT_FIELD_NAME]))\n"
"print(\n",
" set(score_2_df[ExtractTransformLoad.GEOID_TRACT_FIELD_NAME])\n",
" ^ set(score_1_df[ExtractTransformLoad.GEOID_TRACT_FIELD_NAME])\n",
")"
]
},
{
@ -82,8 +86,13 @@
"metadata": {},
"outputs": [],
"source": [
"# Join \n",
"merged_df = score_1_df.merge(score_2_df, how=\"outer\", on=ExtractTransformLoad.GEOID_TRACT_FIELD_NAME, suffixes=('_1', '_2'))\n",
"# Join\n",
"merged_df = score_1_df.merge(\n",
" score_2_df,\n",
" how=\"outer\",\n",
" on=ExtractTransformLoad.GEOID_TRACT_FIELD_NAME,\n",
" suffixes=(\"_1\", \"_2\"),\n",
")\n",
"merged_df"
]
},
@ -94,14 +103,32 @@
"metadata": {},
"outputs": [],
"source": [
"# Check each duplicate column: \n",
"# Check each duplicate column:\n",
"# Remove the suffix \"_1\"\n",
"duplicate_columns = [x[:-2] for x in merged_df.columns if \"_1\" in x]\n",
"\n",
"for duplicate_column in duplicate_columns:\n",
" print(f\"Checking duplicate column {duplicate_column}\")\n",
" if not merged_df[f\"{duplicate_column}_1\"].equals(merged_df[f\"{duplicate_column}_2\"]):\n",
" print(merged_df[f\"{duplicate_column}_1\"].compare(merged_df[f\"{duplicate_column}_2\"]))\n",
" raise ValueError(f\"Error! Different values in {duplicate_column}\")"
"columns_to_exclude_from_duplicates_check = [\n",
" \"Total threshold criteria exceeded\"\n",
"]\n",
"\n",
"columns_to_check = [column for column in duplicate_columns if column not in columns_to_exclude_from_duplicates_check]\n",
"\n",
"any_errors_found = False\n",
"for column_to_check in columns_to_check:\n",
" print(f\"Checking duplicate column {column_to_check}\")\n",
" if not merged_df[f\"{column_to_check}_1\"].equals(\n",
" merged_df[f\"{column_to_check}_2\"]\n",
" ):\n",
" print(f\"Error! Different values in {column_to_check}\")\n",
" print(\n",
" merged_df[f\"{column_to_check}_1\"].compare(\n",
" merged_df[f\"{column_to_check}_2\"]\n",
" )\n",
" )\n",
" any_errors_found = True\n",
"\n",
"if any_errors_found:\n",
" raise ValueError(f\"Error! Different values in one or more columns.\")"
]
}
],