Cleaning up quick code (#1349)

Made some quick, mostly cosmetic changes and updates to the quick-launch code. This mostly entailed converting string literals to constants and tidying up some code.

Changes: PR AMI, updating agricultural loss, and dropping PR from some threshold counts.
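A minimal sketch of the "strings to constants" cleanup described above; the constant name and dataframe here are invented for illustration, not taken from this commit:

```python
import pandas as pd

# Hypothetical example of the cleanup: a string literal that was repeated
# at several call sites gets a single named constant. The name is made up.
PUERTO_RICO_ABBREVIATION = "PR"

df = pd.DataFrame({"State Abbreviation": ["PR", "AL", "CA"]})

# Before: df = df[df["State Abbreviation"] != "PR"]
# After, every call site references one authoritative value:
df = df[df["State Abbreviation"] != PUERTO_RICO_ABBREVIATION]
print(df)
```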
Emma Nechamkin 2022-03-02 16:50:04 -05:00 committed by GitHub
commit aea49cbb5a
6 changed files with 341 additions and 348 deletions


@@ -22,7 +22,6 @@ DATA_CENSUS_CSV_DIR = DATA_CENSUS_DIR / "csv"
 DATA_CENSUS_CSV_FILE_PATH = DATA_CENSUS_CSV_DIR / "us.csv"
 DATA_CENSUS_CSV_STATE_FILE_PATH = DATA_CENSUS_CSV_DIR / "fips_states_2010.csv"
-
 # Score paths
 DATA_SCORE_DIR = DATA_PATH / "score"
@@ -66,6 +65,9 @@ CENSUS_COUNTIES_COLUMNS = ["USPS", "GEOID", "NAME"]
 # Drop FIPS codes from map
 DROP_FIPS_CODES = ["66", "78"]
 
+# Drop FIPS codes from incrementing
+DROP_FIPS_FROM_NON_WTD_THRESHOLDS = "72"
+
 # Percent prefixes for rounding
 PERCENT_PREFIXES_SUFFIXES = [
     "Percent",

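The new DROP_FIPS_FROM_NON_WTD_THRESHOLDS constant holds a single state FIPS code (Puerto Rico's "72") rather than a list. A sketch of how such a constant might be applied when excluding PR from a non-weighted threshold count; the column names and filtering logic are assumptions for this sketch, not code from this commit:

```python
import pandas as pd

DROP_FIPS_FROM_NON_WTD_THRESHOLDS = "72"  # Puerto Rico's state FIPS code

# Hypothetical data: the first two characters of a tract GEOID are the
# state FIPS code.
df = pd.DataFrame(
    {
        "GEOID10_TRACT": ["72001950100", "01001020100", "06037101110"],
        "threshold_exceeded": [True, True, False],
    }
)

# Exclude PR tracts before counting exceeded thresholds.
mask = df["GEOID10_TRACT"].str[:2] != DROP_FIPS_FROM_NON_WTD_THRESHOLDS
non_weighted_threshold_count = df.loc[mask, "threshold_exceeded"].sum()
print(non_weighted_threshold_count)  # 1; the PR tract no longer increments the count
```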

@@ -298,33 +298,20 @@ class ScoreETL(ExtractTransformLoad):
             ] = df[input_column_name].rank(pct=True, ascending=ascending)
         else:
-            # For agricultural loss, we are using whether there is value at all to determine percentile
-            # This is not the most thoughtfully written code, but it works.
-
-            # Take only rows with agrivalue
-            tmp_df = df[df[field_names.AGRICULTURAL_VALUE_BOOL_FIELD] == 1][
-                [input_column_name, field_names.GEOID_TRACT_FIELD]
-            ].copy()
-
-            # Construct a percentile only among those tracts
-            tmp_df["temporary_ranking"] = tmp_df[input_column_name].transform(
-                lambda x: x.rank(pct=True, ascending=True)
-            )
-
-            # # Create a map for just those tracts and map it onto the df
-            temporary_ranking = tmp_df.set_index(field_names.GEOID_TRACT_FIELD)[
-                "temporary_ranking"
-            ].to_dict()
-
+            # For agricultural loss, we are using whether there is value at all to determine percentile and then
+            # filling places where the value is False with 0
             df[
                 f"{output_column_name_root}"
                 f"{field_names.PERCENTILE_FIELD_SUFFIX}"
-            ] = np.where(
-                df[field_names.AGRICULTURAL_VALUE_BOOL_FIELD].isna(),
-                np.nan,
-                df[field_names.GEOID_TRACT_FIELD]
-                .map(temporary_ranking)
-                .fillna(0),
+            ] = (
+                df.where(
+                    df[field_names.AGRICULTURAL_VALUE_BOOL_FIELD].astype(float)
+                    == 1.0
+                )[input_column_name]
+                .rank(ascending=ascending, pct=True)
+                .fillna(
+                    df[field_names.AGRICULTURAL_VALUE_BOOL_FIELD].astype(float)
+                )
             )
 
         # Create the urban/rural percentiles.
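The replacement collapses the temporary-ranking dictionary into a single masked rank. A standalone toy version of the same pattern, with simplified column names standing in for the field_names constants:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame(
    {
        "agrivalue": [10.0, 40.0, 20.0, 0.0, 30.0],
        "has_agrivalue": [1.0, 1.0, 1.0, 0.0, np.nan],
    }
)

# df.where() blanks out every row whose boolean flag is not 1.0, so rank()
# computes percentiles only among rows that actually have agricultural value.
# fillna() then writes 0.0 where the flag was 0.0 and leaves NaN where the
# flag itself was missing.
df["agrivalue_percentile"] = (
    df.where(df["has_agrivalue"] == 1.0)["agrivalue"]
    .rank(ascending=True, pct=True)
    .fillna(df["has_agrivalue"])
)
print(df)
# Rows with agrivalue get percentiles 1/3, 3/3, 2/3; the flag-0 row gets 0.0;
# the NaN-flag row stays NaN, matching the np.where() branch this replaces.
```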


@@ -264,7 +264,8 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
             low_memory=False,
         )
 
-        logger.info("Pulling PR info down.")
+        logger.info("Pulling PR tract list down.")
+        # This step is necessary because PR is not in geocorr at the level that gets joined
         pr_file = self.get_tmp_path() / "pr_tracts" / "pr_tracts.csv"
         download_file_from_url(
             file_url=self.PUERTO_RICO_S3_LINK, download_file_name=pr_file
@@ -273,11 +274,11 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
             filepath_or_buffer=self.get_tmp_path()
             / "pr_tracts"
             / "pr_tracts.csv",
-            # Skip second row, which has descriptions.
+            # The following need to remain as strings for all of their digits, not get converted to numbers.
            dtype={"GEOID10_TRACT": str},
             low_memory=False,
         )
+        self.pr_tracts["State Abbreviation"] = "PR"
 
         # Download MSA median incomes
         logger.info("Starting download of MSA median incomes.")
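The dtype override exists because tract GEOIDs are fixed-width strings that can begin with zero; parsed as integers, the leading digit is silently lost. A quick demonstration with an inline two-row CSV:

```python
import io
import pandas as pd

csv = io.StringIO("GEOID10_TRACT\n01001020100\n72001950100\n")

# Without the override, pandas infers an integer column and drops the
# leading zero from the first tract ID.
print(pd.read_csv(csv)["GEOID10_TRACT"].iloc[0])  # 1001020100

csv.seek(0)
# Forcing str preserves all eleven digits.
print(pd.read_csv(csv, dtype={"GEOID10_TRACT": str})["GEOID10_TRACT"].iloc[0])  # 01001020100
```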
@@ -298,12 +299,10 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
         msa_median_incomes_df = self._transform_msa_median_incomes()
         state_median_incomes_df = self._transform_state_median_incomes()
 
-        # Adds 945 PR tracts
-        geocorr_df_plus_pr = geocorr_df.merge(
-            self.pr_tracts, how="outer", indicator=True
-        )
+        # Adds 945 PR tracts to the geocorr dataframe
+        geocorr_df_plus_pr = geocorr_df.merge(self.pr_tracts, how="outer")
 
-        # Join tracts on MSA incomes (this is where we lose PR)
+        # Join tracts on MSA incomes
         merged_df = geocorr_df_plus_pr.merge(
             msa_median_incomes_df, on=self.MSA_ID_FIELD_NAME, how="left"
         )
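For reference, a toy sketch of what the two merges do; the literal column names, tract IDs, and income figures are invented stand-ins for the class attributes used above:

```python
import pandas as pd

geocorr_df = pd.DataFrame(
    {"GEOID10_TRACT": ["01001020100", "06037101110"], "MSA_ID": ["33860", "31080"]}
)
pr_tracts = pd.DataFrame({"GEOID10_TRACT": ["72001950100"]})
pr_tracts["State Abbreviation"] = "PR"

# The outer merge keeps rows from both frames, so the PR tracts are
# appended even though geocorr does not contain them.
geocorr_df_plus_pr = geocorr_df.merge(pr_tracts, how="outer")
print(len(geocorr_df_plus_pr))  # 3

# The subsequent left join on the MSA ID retains those PR rows (with NaN
# incomes) instead of dropping them, as an inner join would.
msa_median_incomes_df = pd.DataFrame(
    {"MSA_ID": ["33860", "31080"], "msa_median_income": [62000, 80000]}
)
merged_df = geocorr_df_plus_pr.merge(
    msa_median_incomes_df, on="MSA_ID", how="left"
)
print(merged_df)
```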