mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-07-28 02:51:17 -07:00
Issue 970: reverse percentiles for AMI and life expectancy (#1018)
* switching to low * fixing score-etl-post * updating comments * fixing comparison * create separate field for clarity * comment fix * removing healthy food * fixing bug in score post * running black and adding comment * Update pickles and add helpful notes to README Co-authored-by: Shelby Switzer <shelby.switzer@cms.hhs.gov>
This commit is contained in:
parent
24bac56d9e
commit
7fcecaee42
11 changed files with 144 additions and 100 deletions
|
@ -8,7 +8,10 @@
|
|||
"outputs": [],
|
||||
"source": [
|
||||
"import IPython\n",
|
||||
"import os\n",
|
||||
"import pandas as pd\n",
|
||||
"import pathlib\n",
|
||||
"import sys\n",
|
||||
"\n",
|
||||
"module_path = os.path.abspath(os.path.join(\"../..\"))\n",
|
||||
"if module_path not in sys.path:\n",
|
||||
|
@ -28,12 +31,8 @@
|
|||
"outputs": [],
|
||||
"source": [
|
||||
"# Load\n",
|
||||
"path_to_score_file_1 = (\n",
|
||||
" DATA_DIR / \"score\" / \"csv\" / \"full\" / \"usa1.csv\"\n",
|
||||
")\n",
|
||||
"path_to_score_file_2 = (\n",
|
||||
" DATA_DIR / \"score\" / \"csv\" / \"full\" / \"usa2.csv\"\n",
|
||||
")\n",
|
||||
"path_to_score_file_1 = DATA_DIR / \"compare_two_score_csvs/usa (pre 970).csv\"\n",
|
||||
"path_to_score_file_2 = DATA_DIR / \"compare_two_score_csvs/usa (post 970).csv\"\n",
|
||||
"\n",
|
||||
"score_1_df = pd.read_csv(\n",
|
||||
" path_to_score_file_1,\n",
|
||||
|
@ -55,7 +54,7 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# List columns in one but not the other \n",
|
||||
"# List columns in one but not the other\n",
|
||||
"score_2_df.columns.difference(score_1_df.columns)"
|
||||
]
|
||||
},
|
||||
|
@ -68,11 +67,16 @@
|
|||
"source": [
|
||||
"# List rows in one but not the other\n",
|
||||
"\n",
|
||||
"if len(score_2_df[ExtractTransformLoad.GEOID_TRACT_FIELD_NAME]) != len(score_1_df[ExtractTransformLoad.GEOID_TRACT_FIELD_NAME]):\n",
|
||||
"if len(score_2_df[ExtractTransformLoad.GEOID_TRACT_FIELD_NAME]) != len(\n",
|
||||
" score_1_df[ExtractTransformLoad.GEOID_TRACT_FIELD_NAME]\n",
|
||||
"):\n",
|
||||
" print(\"Different lengths!\")\n",
|
||||
"\n",
|
||||
"print(\"Difference in tract IDs:\")\n",
|
||||
"print(set(score_2_df[ExtractTransformLoad.GEOID_TRACT_FIELD_NAME]) ^ set(score_1_df[ExtractTransformLoad.GEOID_TRACT_FIELD_NAME]))\n"
|
||||
"print(\n",
|
||||
" set(score_2_df[ExtractTransformLoad.GEOID_TRACT_FIELD_NAME])\n",
|
||||
" ^ set(score_1_df[ExtractTransformLoad.GEOID_TRACT_FIELD_NAME])\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -82,8 +86,13 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Join \n",
|
||||
"merged_df = score_1_df.merge(score_2_df, how=\"outer\", on=ExtractTransformLoad.GEOID_TRACT_FIELD_NAME, suffixes=('_1', '_2'))\n",
|
||||
"# Join\n",
|
||||
"merged_df = score_1_df.merge(\n",
|
||||
" score_2_df,\n",
|
||||
" how=\"outer\",\n",
|
||||
" on=ExtractTransformLoad.GEOID_TRACT_FIELD_NAME,\n",
|
||||
" suffixes=(\"_1\", \"_2\"),\n",
|
||||
")\n",
|
||||
"merged_df"
|
||||
]
|
||||
},
|
||||
|
@ -94,14 +103,32 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Check each duplicate column: \n",
|
||||
"# Check each duplicate column:\n",
|
||||
"# Remove the suffix \"_1\"\n",
|
||||
"duplicate_columns = [x[:-2] for x in merged_df.columns if \"_1\" in x]\n",
|
||||
"\n",
|
||||
"for duplicate_column in duplicate_columns:\n",
|
||||
" print(f\"Checking duplicate column {duplicate_column}\")\n",
|
||||
" if not merged_df[f\"{duplicate_column}_1\"].equals(merged_df[f\"{duplicate_column}_2\"]):\n",
|
||||
" print(merged_df[f\"{duplicate_column}_1\"].compare(merged_df[f\"{duplicate_column}_2\"]))\n",
|
||||
" raise ValueError(f\"Error! Different values in {duplicate_column}\")"
|
||||
"columns_to_exclude_from_duplicates_check = [\n",
|
||||
" \"Total threshold criteria exceeded\"\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"columns_to_check = [column for column in duplicate_columns if column not in columns_to_exclude_from_duplicates_check]\n",
|
||||
"\n",
|
||||
"any_errors_found = False\n",
|
||||
"for column_to_check in columns_to_check:\n",
|
||||
" print(f\"Checking duplicate column {column_to_check}\")\n",
|
||||
" if not merged_df[f\"{column_to_check}_1\"].equals(\n",
|
||||
" merged_df[f\"{column_to_check}_2\"]\n",
|
||||
" ):\n",
|
||||
" print(f\"Error! Different values in {column_to_check}\")\n",
|
||||
" print(\n",
|
||||
" merged_df[f\"{column_to_check}_1\"].compare(\n",
|
||||
" merged_df[f\"{column_to_check}_2\"]\n",
|
||||
" )\n",
|
||||
" )\n",
|
||||
" any_errors_found = True\n",
|
||||
"\n",
|
||||
"if any_errors_found:\n",
|
||||
" raise ValueError(f\"Error! Different values in one or more columns.\")"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue