Mirror of https://github.com/DOI-DO/j40-cejst-2.git (synced 2025-02-23 10:04:18 -08:00)
Rescaling linguistic isolation (#1750)
Rescales linguistic isolation to drop Puerto Rico.
parent 2ab24c60fa
commit f6efdd4e14
1 changed file with 57 additions and 56 deletions
@@ -264,6 +264,7 @@ class ScoreETL(ExtractTransformLoad):
         df: pd.DataFrame,
         input_column_name: str,
         output_column_name_root: str,
+        drop_tracts: list = None,
         ascending: bool = True,
     ) -> pd.DataFrame:
         """Creates percentiles.
@@ -298,10 +299,15 @@ class ScoreETL(ExtractTransformLoad):
         something like "3rd grade reading proficiency" and `output_column_name_root`
         may be something like "Low 3rd grade reading proficiency".
         """
-        if (
-            output_column_name_root
-            != field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD
-        ):
+
+        # We have two potential options for assessing how to calculate percentiles.
+        # For the vast majority of columns, we simply calculate percentiles overall.
+        # However, for Linguistic Isolation and Agricultural Value Loss, there are conditions
+        # under which we drop tracts from consideration in the percentile. More details on
+        # those are below; for those columns, we provide a list of tracts to exclude.
+        # Because of the fancy transformations below, I have removed the urban / rural
+        # percentiles, which are now deprecated.
+        if not drop_tracts:
             # Create the "basic" percentile.
             df[
                 f"{output_column_name_root}"
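For reference, the "basic" branch above is a plain pandas percentile rank over every row. A minimal, self-contained sketch of that behavior (toy data and column names, not the project's field_names constants):

import pandas as pd

# Each value's percentile is its rank divided by the number of non-null rows.
df = pd.DataFrame({"proficiency": [10.0, 30.0, 20.0, 40.0]})
df["proficiency (percentile)"] = df["proficiency"].rank(pct=True, ascending=True)
print(df["proficiency (percentile)"].tolist())  # [0.25, 0.75, 0.5, 1.0]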
@@ -309,62 +315,23 @@ class ScoreETL(ExtractTransformLoad):
             ] = df[input_column_name].rank(pct=True, ascending=ascending)
 
         else:
-            # For agricultural loss, we are using whether there is value at all to determine percentile and then
-            # filling places where the value is False with 0
+            tmp_series = df[input_column_name].where(
+                ~df[field_names.GEOID_TRACT_FIELD].isin(drop_tracts),
+                np.nan,
+            )
+            logger.info(
+                f"Creating special case column for percentiles from {input_column_name}"
+            )
             df[
                 f"{output_column_name_root}"
                 f"{field_names.PERCENTILE_FIELD_SUFFIX}"
-            ] = (
-                df.where(
-                    df[field_names.AGRICULTURAL_VALUE_BOOL_FIELD].astype(float)
-                    == 1.0
-                )[input_column_name]
-                .rank(ascending=ascending, pct=True)
-                .fillna(
-                    df[field_names.AGRICULTURAL_VALUE_BOOL_FIELD].astype(float)
-                )
-            )
+            ] = tmp_series.rank(ascending=ascending, pct=True)
 
-        # Create the urban/rural percentiles.
-        urban_rural_percentile_fields_to_combine = []
-        for (urban_or_rural_string, urban_heuristic_bool) in [
-            ("urban", True),
-            ("rural", False),
-        ]:
-            # Create a field with only those values
-            this_category_only_value_field = (
-                f"{input_column_name} (value {urban_or_rural_string} only)"
-            )
-            df[this_category_only_value_field] = np.where(
-                df[field_names.URBAN_HEURISTIC_FIELD] == urban_heuristic_bool,
-                df[input_column_name],
-                None,
-            )
-
-            # Calculate the percentile for only this category
-            this_category_only_percentile_field = (
-                f"{output_column_name_root} "
-                f"(percentile {urban_or_rural_string} only)"
-            )
-            df[this_category_only_percentile_field] = df[
-                this_category_only_value_field
-            ].rank(
-                pct=True,
-                # Set ascending to the parameter value.
-                ascending=ascending,
-            )
-
-            # Add the field name to this list. Later, we'll combine this list.
-            urban_rural_percentile_fields_to_combine.append(
-                this_category_only_percentile_field
-            )
-
-        # Combine both urban and rural into one field:
-        df[
-            f"{output_column_name_root}{field_names.PERCENTILE_URBAN_RURAL_FIELD_SUFFIX}"
-        ] = df[urban_rural_percentile_fields_to_combine].mean(
-            axis=1, skipna=True
-        )
+            # Check that "drop tracts" were dropped (quicker than creating a fixture?)
+            assert df[df[field_names.GEOID_TRACT_FIELD].isin(drop_tracts)][
+                f"{output_column_name_root}"
+                f"{field_names.PERCENTILE_FIELD_SUFFIX}"
+            ].isna().sum() == len(drop_tracts), "Not all tracts were dropped"
 
         return df
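The new drop_tracts branch relies on pandas' rank() skipping NaN: masked tracts receive no rank and are excluded from the pct=True denominator, which is exactly what the assert verifies. A minimal sketch of the technique with toy tract IDs and illustrative column names (the real code uses field_names constants):

import numpy as np
import pandas as pd

df = pd.DataFrame(
    {
        "tract_id": ["01001020100", "01001020200", "72001956300"],
        "linguistic isolation": [0.10, 0.30, 0.20],
    }
)
drop_tracts = ["72001956300"]

# Mask dropped tracts to NaN so rank() ignores them entirely.
tmp_series = df["linguistic isolation"].where(
    ~df["tract_id"].isin(drop_tracts), np.nan
)
df["linguistic isolation (percentile)"] = tmp_series.rank(
    ascending=True, pct=True
)
print(df["linguistic isolation (percentile)"].tolist())  # [0.5, 1.0, nan]

# Mirrors the post-condition asserted in the commit: every dropped tract is NaN.
assert (
    df[df["tract_id"].isin(drop_tracts)]["linguistic isolation (percentile)"]
    .isna()
    .sum()
    == len(drop_tracts)
)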
@@ -524,13 +491,47 @@ class ScoreETL(ExtractTransformLoad):
         df_copy[numeric_columns] = df_copy[numeric_columns].apply(pd.to_numeric)
 
         # Convert all columns to numeric and do math
+        # Note that we have a few special conditions here, which we handle explicitly.
+        # For *Linguistic Isolation*, we do NOT want to include Puerto Rico in the percentile
+        # calculation. This is because linguistic isolation as a category doesn't make much
+        # sense in Puerto Rico, where Spanish is a recognized language. Thus, we construct
+        # a list of tracts to drop from the percentile calculation.
+        #
+        # For *Expected Agricultural Loss*, we only want to include in the percentile tracts
+        # in which there is some agricultural value. This helps us discern which tracts
+        # truly are at the 90th percentile, since many tracts have 0 value.
+
         for numeric_column in numeric_columns:
+            drop_tracts = []
+            if (
+                numeric_column
+                == field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD
+            ):
+                drop_tracts = df_copy[
+                    ~df_copy[field_names.AGRICULTURAL_VALUE_BOOL_FIELD]
+                    .astype(bool)
+                    .fillna(False)
+                ][field_names.GEOID_TRACT_FIELD].to_list()
+                logger.info(
+                    f"Dropping {len(drop_tracts)} tracts from Agricultural Value Loss"
+                )
+
+            elif numeric_column == field_names.LINGUISTIC_ISO_FIELD:
+                drop_tracts = df_copy[
+                    # 72 is the FIPS code for Puerto Rico
+                    df_copy[field_names.GEOID_TRACT_FIELD].str.startswith("72")
+                ][field_names.GEOID_TRACT_FIELD].to_list()
+                logger.info(
+                    f"Dropping {len(drop_tracts)} tracts from Linguistic Isolation"
+                )
+
             df_copy = self._add_percentiles_to_df(
                 df=df_copy,
                 input_column_name=numeric_column,
                 # For this use case, the input name and output name root are the same.
                 output_column_name_root=numeric_column,
                 ascending=True,
                 drop_tracts=drop_tracts,
             )
 
         # Min-max normalization:
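The two drop_tracts lists are plain GEOID selections: a boolean-column filter for tracts with no agricultural value, and a string-prefix filter for Puerto Rico (census tract GEOIDs begin with the two-digit state FIPS code, and 72 is Puerto Rico). A sketch under the same illustrative column names as above:

import pandas as pd

df_copy = pd.DataFrame(
    {
        "tract_id": ["01001020100", "72001956300", "72001956400"],
        "has agricultural value": [True, False, True],
    }
)

# Tracts with no agricultural value: excluded from the agricultural-loss percentile.
ag_drop_tracts = df_copy[
    ~df_copy["has agricultural value"].astype(bool).fillna(False)
]["tract_id"].to_list()
print(ag_drop_tracts)  # ['72001956300']

# Puerto Rico tracts (state FIPS prefix "72"): excluded from linguistic isolation.
pr_drop_tracts = df_copy[
    df_copy["tract_id"].str.startswith("72")
]["tract_id"].to_list()
print(pr_drop_tracts)  # ['72001956300', '72001956400']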