Issue 105: Configure and run black and other pre-commit hooks (clean branch) (#1962)

* Configure and run `black` and other pre-commit hooks

Co-authored-by: matt bowen <matthew.r.bowen@omb.eop.gov>
Lucas Merrill Brown authored 2022-10-04 18:08:47 -04:00; committed by GitHub
commit 6e6223cd5e
162 changed files with 716 additions and 602 deletions
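
As context for what the new hooks enforce, here is a minimal sketch (not part of this commit) that calls black's Python API on a snippet mirroring one of the notebook changes below; the sample string is an illustrative assumption, and the project runs black through pre-commit rather than through this API.

```python
# Minimal sketch: apply black programmatically to a code string.
import black

# Illustrative input only: it mirrors the notebook cell change further down,
# where black collapses a call that fits on one line.
messy = """utils.validate_new_data(
    file_path=COMPARATOR_FILE, score_col=COMPARATOR_COLUMN
)
"""

# format_str applies the given Mode (defaults: 88-character lines, double
# quotes, magic trailing commas) and returns the reformatted source.
print(black.format_str(messy, mode=black.Mode()))
```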

@@ -17,7 +17,7 @@ DEMOGRAPHIC_COLUMNS:
DEMOGRAPHIC_FILE: ../../data_pipeline/data/dataset/census_acs_2019/usa.csv
OUTPUT_DATA_PATH: output/donut_hole_dac
SCORE_FILE: ../../data_pipeline/data/score/csv/full/usa.csv
OTHER_COMPARATOR_COLUMNS:
OTHER_COMPARATOR_COLUMNS:
- donut_hole_dac
- P200_PFS
- HSEF

@@ -12,12 +12,12 @@ To see more: https://buildmedia.readthedocs.org/media/pdf/papermill/latest/paper
To run:
` $ python src/run_tract_comparison.py --template_notebook=TEMPLATE.ipynb --parameter_yaml=PARAMETERS.yaml`
"""
import os
import datetime
import argparse
import yaml
import datetime
import os
import papermill as pm
import yaml
def _read_param_file(param_file: str) -> dict:
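
The docstring above explains that this script drives a template notebook through papermill, passing parameters from a YAML file. A rough, hypothetical sketch of that pattern follows; the output path handling and parameter keys are assumptions, not the script's actual implementation, though the flag names come from the usage line above.

```python
# Hypothetical sketch of the papermill pattern described in the docstring;
# the real run_tract_comparison.py may wire this up differently.
import argparse
import datetime
import os

import papermill as pm
import yaml


def main() -> None:
    parser = argparse.ArgumentParser(description="Run a parameterized comparison notebook.")
    parser.add_argument("--template_notebook", required=True)
    parser.add_argument("--parameter_yaml", required=True)
    args = parser.parse_args()

    # Load the run's parameters (e.g. SCORE_FILE, OUTPUT_DATA_PATH, OTHER_COMPARATOR_COLUMNS).
    with open(args.parameter_yaml, encoding="utf-8") as f:
        params = yaml.safe_load(f)

    # Timestamp the executed notebook so repeated runs do not overwrite each other.
    stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    output_notebook = os.path.join("output", f"comparison-{stamp}.ipynb")

    # papermill injects `params` into the notebook's "parameters" cell and executes it.
    pm.execute_notebook(args.template_notebook, output_notebook, parameters=params)


if __name__ == "__main__":
    main()
```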

@@ -16,7 +16,7 @@
"import matplotlib.pyplot as plt\n",
"\n",
"from data_pipeline.score import field_names\n",
"from data_pipeline.comparison_tool.src import utils \n",
"from data_pipeline.comparison_tool.src import utils\n",
"\n",
"pd.options.display.float_format = \"{:,.3f}\".format\n",
"%load_ext lab_black"
@@ -128,9 +128,7 @@
"metadata": {},
"outputs": [],
"source": [
"utils.validate_new_data(\n",
" file_path=COMPARATOR_FILE, score_col=COMPARATOR_COLUMN\n",
")"
"utils.validate_new_data(file_path=COMPARATOR_FILE, score_col=COMPARATOR_COLUMN)"
]
},
{
@@ -148,20 +146,25 @@
"metadata": {},
"outputs": [],
"source": [
"comparator_cols = [COMPARATOR_COLUMN] + OTHER_COMPARATOR_COLUMNS if OTHER_COMPARATOR_COLUMNS else [COMPARATOR_COLUMN]\n",
"comparator_cols = (\n",
" [COMPARATOR_COLUMN] + OTHER_COMPARATOR_COLUMNS\n",
" if OTHER_COMPARATOR_COLUMNS\n",
" else [COMPARATOR_COLUMN]\n",
")\n",
"\n",
"#papermill_description=Loading_data\n",
"# papermill_description=Loading_data\n",
"joined_df = pd.concat(\n",
" [\n",
" utils.read_file(\n",
" file_path=SCORE_FILE,\n",
" columns=[TOTAL_POPULATION_COLUMN, SCORE_COLUMN] + ADDITIONAL_DEMO_COLUMNS,\n",
" columns=[TOTAL_POPULATION_COLUMN, SCORE_COLUMN]\n",
" + ADDITIONAL_DEMO_COLUMNS,\n",
" geoid=GEOID_COLUMN,\n",
" ),\n",
" utils.read_file(\n",
" file_path=COMPARATOR_FILE,\n",
" columns=comparator_cols,\n",
" geoid=GEOID_COLUMN\n",
" geoid=GEOID_COLUMN,\n",
" ),\n",
" utils.read_file(\n",
" file_path=DEMOGRAPHIC_FILE,\n",
@@ -196,13 +199,13 @@
"metadata": {},
"outputs": [],
"source": [
"#papermill_description=Summary_stats\n",
"# papermill_description=Summary_stats\n",
"population_df = utils.produce_summary_stats(\n",
" joined_df=joined_df,\n",
" comparator_column=COMPARATOR_COLUMN,\n",
" score_column=SCORE_COLUMN,\n",
" population_column=TOTAL_POPULATION_COLUMN,\n",
" geoid_column=GEOID_COLUMN\n",
" geoid_column=GEOID_COLUMN,\n",
")\n",
"population_df"
]
@@ -224,18 +227,18 @@
"metadata": {},
"outputs": [],
"source": [
"#papermill_description=Tract_stats\n",
"# papermill_description=Tract_stats\n",
"tract_level_by_identification_df = pd.concat(\n",
" [\n",
" utils.get_demo_series(\n",
" grouping_column=COMPARATOR_COLUMN,\n",
" joined_df=joined_df,\n",
" demo_columns=ADDITIONAL_DEMO_COLUMNS + DEMOGRAPHIC_COLUMNS\n",
" demo_columns=ADDITIONAL_DEMO_COLUMNS + DEMOGRAPHIC_COLUMNS,\n",
" ),\n",
" utils.get_demo_series(\n",
" grouping_column=SCORE_COLUMN,\n",
" joined_df=joined_df,\n",
" demo_columns=ADDITIONAL_DEMO_COLUMNS + DEMOGRAPHIC_COLUMNS\n",
" demo_columns=ADDITIONAL_DEMO_COLUMNS + DEMOGRAPHIC_COLUMNS,\n",
" ),\n",
" ],\n",
" axis=1,\n",
@@ -256,17 +259,25 @@
" y=\"Variable\",\n",
" x=\"Avg in tracts\",\n",
" hue=\"Definition\",\n",
" data=tract_level_by_identification_df.sort_values(by=COMPARATOR_COLUMN, ascending=False)\n",
" data=tract_level_by_identification_df.sort_values(\n",
" by=COMPARATOR_COLUMN, ascending=False\n",
" )\n",
" .stack()\n",
" .reset_index()\n",
" .rename(\n",
" columns={\"level_0\": \"Variable\", \"level_1\": \"Definition\", 0: \"Avg in tracts\"}\n",
" columns={\n",
" \"level_0\": \"Variable\",\n",
" \"level_1\": \"Definition\",\n",
" 0: \"Avg in tracts\",\n",
" }\n",
" ),\n",
" palette=\"Blues\",\n",
")\n",
"plt.xlim(0, 1)\n",
"plt.title(\"Tract level averages by identification strategy\")\n",
"plt.savefig(os.path.join(OUTPUT_DATA_PATH, \"tract_lvl_avg.jpg\"), bbox_inches='tight')"
"plt.savefig(\n",
" os.path.join(OUTPUT_DATA_PATH, \"tract_lvl_avg.jpg\"), bbox_inches=\"tight\"\n",
")"
]
},
{
@@ -276,13 +287,13 @@
"metadata": {},
"outputs": [],
"source": [
"#papermill_description=Tract_stats_grouped\n",
"# papermill_description=Tract_stats_grouped\n",
"tract_level_by_grouping_df = utils.get_tract_level_grouping(\n",
" joined_df=joined_df,\n",
" score_column=SCORE_COLUMN,\n",
" comparator_column=COMPARATOR_COLUMN,\n",
" demo_columns=ADDITIONAL_DEMO_COLUMNS + DEMOGRAPHIC_COLUMNS,\n",
" keep_missing_values=KEEP_MISSING_VALUES_FOR_SEGMENTATION\n",
" keep_missing_values=KEEP_MISSING_VALUES_FOR_SEGMENTATION,\n",
")\n",
"\n",
"tract_level_by_grouping_formatted_df = utils.format_multi_index_for_excel(\n",
@@ -315,7 +326,7 @@
"metadata": {},
"outputs": [],
"source": [
"#papermill_description=Population_stats\n",
"# papermill_description=Population_stats\n",
"population_weighted_stats_df = pd.concat(\n",
" [\n",
" utils.construct_weighted_statistics(\n",
@@ -363,7 +374,7 @@
"comparator_and_cejst_proportion_series, states = utils.get_final_summary_info(\n",
" population=population_df,\n",
" comparator_file=COMPARATOR_FILE,\n",
" geoid_col=GEOID_COLUMN\n",
" geoid_col=GEOID_COLUMN,\n",
")"
]
},
@@ -393,7 +404,7 @@
"metadata": {},
"outputs": [],
"source": [
"#papermill_description=Writing_excel\n",
"# papermill_description=Writing_excel\n",
"utils.write_single_comparison_excel(\n",
" output_excel=OUTPUT_EXCEL,\n",
" population_df=population_df,\n",
@@ -401,7 +412,7 @@
" population_weighted_stats_df=population_weighted_stats_df,\n",
" tract_level_by_grouping_formatted_df=tract_level_by_grouping_formatted_df,\n",
" comparator_and_cejst_proportion_series=comparator_and_cejst_proportion_series,\n",
" states_text=states_text\n",
" states_text=states_text,\n",
")"
]
}

@@ -1,9 +1,9 @@
import pathlib
import pandas as pd
import xlsxwriter
from data_pipeline.score import field_names
from data_pipeline.etl.sources.census.etl_utils import get_state_information
from data_pipeline.score import field_names
# Some excel parameters
DEFAULT_COLUMN_WIDTH = 18
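
DEFAULT_COLUMN_WIDTH suggests the module sets Excel column widths when it writes the comparison workbook. Below is a minimal, hypothetical illustration of that pattern with pandas and xlsxwriter; the file name, sheet name, and sample data are made up, and this is not the project's actual writer.

```python
# Hypothetical illustration of applying a default column width via xlsxwriter;
# the real utils module builds a richer, multi-sheet workbook.
import pandas as pd

DEFAULT_COLUMN_WIDTH = 18

df = pd.DataFrame({"GEOID10_TRACT": ["01001020100"], "P200_PFS": [0.65]})

with pd.ExcelWriter("example_comparison.xlsx", engine="xlsxwriter") as writer:
    df.to_excel(writer, sheet_name="Summary", index=False)
    worksheet = writer.sheets["Summary"]
    # set_column(first_col, last_col, width) widens the whole column range.
    worksheet.set_column(0, len(df.columns) - 1, DEFAULT_COLUMN_WIDTH)
```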