County Names for Score #188 (#347)

* starting PR

* completed feature

* checkpoint

* adding new fips and updating counties to 2010

* updated sources to 2010 - 2019

* more cleanup

* creating tiles score csv
This commit is contained in:
Jorge Escobar 2021-07-15 13:34:08 -04:00 committed by GitHub
commit 0316906a69
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 425 additions and 54 deletions

View file

@ -11,10 +11,14 @@ logger = get_module_logger(__name__)
class CensusACSETL(ExtractTransformLoad):
def __init__(self):
self.ACS_YEAR = 2019
self.OUTPUT_PATH = self.DATA_PATH / "dataset" / f"census_acs_{self.ACS_YEAR}"
self.OUTPUT_PATH = (
self.DATA_PATH / "dataset" / f"census_acs_{self.ACS_YEAR}"
)
self.UNEMPLOYED_FIELD_NAME = "Unemployed civilians (percent)"
self.LINGUISTIC_ISOLATION_FIELD_NAME = "Linguistic isolation (percent)"
self.LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME = "Linguistic isolation (total)"
self.LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME = (
"Linguistic isolation (total)"
)
self.LINGUISTIC_ISOLATION_FIELDS = [
"C16002_001E",
"C16002_004E",
@ -24,7 +28,9 @@ class CensusACSETL(ExtractTransformLoad):
]
self.df: pd.DataFrame
def _fips_from_censusdata_censusgeo(self, censusgeo: censusdata.censusgeo) -> str:
def _fips_from_censusdata_censusgeo(
self, censusgeo: censusdata.censusgeo
) -> str:
"""Create a FIPS code from the proprietary censusgeo index."""
fips = "".join([value for (key, value) in censusgeo.params()])
return fips
@ -32,7 +38,9 @@ class CensusACSETL(ExtractTransformLoad):
def extract(self) -> None:
dfs = []
for fips in get_state_fips_codes(self.DATA_PATH):
logger.info(f"Downloading data for state/territory with FIPS code {fips}")
logger.info(
f"Downloading data for state/territory with FIPS code {fips}"
)
dfs.append(
censusdata.download(
@ -61,7 +69,9 @@ class CensusACSETL(ExtractTransformLoad):
# Calculate percent unemployment.
# TODO: remove small-sample data that should be `None` instead of a high-variance fraction.
self.df[self.UNEMPLOYED_FIELD_NAME] = self.df.B23025_005E / self.df.B23025_003E
self.df[self.UNEMPLOYED_FIELD_NAME] = (
self.df.B23025_005E / self.df.B23025_003E
)
# Calculate linguistic isolation.
individual_limited_english_fields = [