Imputing income using geographic neighbors (#1559)

Imputes the income field, with a light refactor. More refactoring and more tests are still needed (I spot-checked the results). The next ticket will check and address these gaps, but a lot of the "narwhal" architecture is already here.
This commit is contained in:
Emma Nechamkin 2022-04-27 15:59:10 -04:00 committed by Emma Nechamkin
commit f047ca9d83
16 changed files with 1245 additions and 81 deletions

View file

@ -5,6 +5,9 @@ from data_pipeline.config import settings
from data_pipeline.score import field_names
## Note: the descriptors are kept the same so that the map keeps porting the "right" fields.
# Base Paths
DATA_PATH = Path(settings.APP_ROOT) / "data"
TMP_PATH = DATA_PATH / "tmp"
@ -179,6 +182,8 @@ TILES_SCORE_COLUMNS = {
+ field_names.PERCENTILE_FIELD_SUFFIX: "P100_PFS",
field_names.POVERTY_LESS_THAN_200_FPL_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX: "P200_PFS",
field_names.POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX: "P200_I_PFS",
field_names.LEAD_PAINT_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX: "LPF_PFS",
field_names.NPL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX: "NPL_PFS",
@ -198,7 +203,8 @@ TILES_SCORE_COLUMNS = {
field_names.M_HOUSING: "M_HSG",
field_names.M_POLLUTION: "M_PLN",
field_names.M_HEALTH: "M_HLTH",
field_names.SCORE_M_COMMUNITIES: "SM_C",
# temporarily update this so that it's the Narwhal score that gets visualized on the map
field_names.SCORE_N_COMMUNITIES: "SM_C",
field_names.SCORE_M + field_names.PERCENTILE_FIELD_SUFFIX: "SM_PFS",
field_names.EXPECTED_POPULATION_LOSS_RATE_LOW_INCOME_LOW_HIGHER_ED_FIELD: "EPLRLI",
field_names.EXPECTED_AGRICULTURE_LOSS_RATE_LOW_INCOME_LOW_HIGHER_ED_FIELD: "EALRLI",
@ -283,7 +289,7 @@ TILES_SCORE_COLUMNS = {
## Low high school and low higher ed for t&wd
field_names.WORKFORCE_SOCIO_INDICATORS_EXCEEDED: "M_WKFC_EBSI",
## FPL 200 and low higher ed for all others
field_names.FPL_200_AND_COLLEGE_ATTENDANCE_SERIES: "M_EBSI",
field_names.FPL_200_SERIES: "M_EBSI",
}
# columns to round floats to 2 decimals
@ -311,6 +317,8 @@ TILES_SCORE_FLOAT_COLUMNS = [
+ field_names.PERCENTILE_FIELD_SUFFIX,
field_names.POVERTY_LESS_THAN_200_FPL_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX,
field_names.POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX,
field_names.LEAD_PAINT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.NPL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.RMP_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
@ -332,7 +340,6 @@ TILES_SCORE_FLOAT_COLUMNS = [
field_names.LOW_HS_EDUCATION_LOW_HIGHER_ED_FIELD,
field_names.ISLAND_AREAS_LOW_HS_EDUCATION_FIELD,
field_names.WASTEWATER_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.SCORE_M + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.COLLEGE_NON_ATTENDANCE_FIELD,
field_names.COLLEGE_ATTENDANCE_FIELD,
]

View file

@ -405,6 +405,7 @@ class ScoreETL(ExtractTransformLoad):
df[field_names.MEDIAN_INCOME_FIELD] / df[field_names.AMI_FIELD]
)
# QQ: why don't we just filter to the numeric columns by type?
numeric_columns = [
field_names.HOUSING_BURDEN_FIELD,
field_names.TOTAL_POP_FIELD,
@ -458,6 +459,7 @@ class ScoreETL(ExtractTransformLoad):
field_names.IMPENETRABLE_SURFACES_FIELD,
# We have to pass this boolean here in order to include it in ag value loss percentiles.
field_names.AGRICULTURAL_VALUE_BOOL_FIELD,
field_names.POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD,
]
non_numeric_columns = [

View file

@ -29,7 +29,7 @@ from . import constants
logger = get_module_logger(__name__)
# Define the DAC variable
DISADVANTAGED_COMMUNITIES_FIELD = field_names.SCORE_M_COMMUNITIES
DISADVANTAGED_COMMUNITIES_FIELD = field_names.SCORE_N_COMMUNITIES
class PostScoreETL(ExtractTransformLoad):