Merge branch 'main' into esfoobar-usds/1062-implement-changes-export-files

This commit is contained in:
Jorge Escobar 2022-01-06 12:18:51 -05:00
commit d01bbc7dfa
32 changed files with 1463 additions and 562 deletions

View file

@ -3,6 +3,7 @@ import json
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.score.etl_utils import floor_series
from data_pipeline.utils import get_module_logger, zip_files
from data_pipeline.score import field_names
@ -207,13 +208,15 @@ class PostScoreETL(ExtractTransformLoad):
# filter the columns on full score
score_tiles = score_county_state_merged_df[tiles_score_column_titles]
# round decimals
decimals = pd.Series(
[constants.TILES_ROUND_NUM_DECIMALS]
* len(constants.TILES_SCORE_FLOAT_COLUMNS),
index=constants.TILES_SCORE_FLOAT_COLUMNS,
score_tiles[constants.TILES_SCORE_FLOAT_COLUMNS] = score_tiles[
constants.TILES_SCORE_FLOAT_COLUMNS
].apply(
func=lambda series: floor_series(
series=series,
number_of_decimals=constants.TILES_ROUND_NUM_DECIMALS,
),
axis=0,
)
score_tiles = score_tiles.round(decimals)
# create indexes
score_tiles = score_tiles.rename(

View file

@ -1,6 +1,9 @@
import os
import sys
from pathlib import Path
import numpy as np
import pandas as pd
from data_pipeline.config import settings
from data_pipeline.utils import (
@ -48,3 +51,60 @@ def check_score_data_source(
"No local score tiles data found. Please use '-d aws` to fetch from AWS"
)
sys.exit()
def floor_series(series: pd.Series, number_of_decimals: int) -> np.ndarray:
    """Floor all non-null numerical values to a fixed number of decimal places.

    Infinite values, NaN/None and the literal string "None" are treated as
    missing and come back as ``None``. The input series is left untouched:
    all cleaning happens on a copy.

    Args:
        series (pd.Series): Input pandas series.
        number_of_decimals (int): Number of decimal places to floor all
            numerical values to.

    Returns:
        np.ndarray: Array with one entry per element of ``series``, in the
            same order, holding the floored value, or ``None`` where the
            input value was missing or unacceptable.

    Raises:
        TypeError: If ``series`` is not a pandas Series.
        ValueError: If ``series`` is empty.
    """
    # Validate the input before doing any work.
    if not isinstance(series, pd.Series):
        raise TypeError(
            f"Argument series must be of type pandas series, not of type {type(series).__name__}."
        )

    if series.empty:
        raise ValueError("Empty series provided.")

    # Division and elementwise multiplication upstream can introduce
    # infinities and NaNs through numerical instability, and some inputs
    # carry the string "None". All of these are treated as missing values.
    unacceptable_values = [-np.inf, np.inf, "None", np.nan]

    # ``mask`` returns a new Series, so the caller's data is never mutated
    # (the previous in-place ``replace`` modified the argument). Coercing to
    # float afterwards turns every missing marker into NaN; see
    # https://pandas.pydata.org/pandas-docs/stable/user_guide/integer_na.html#nullable-integer-data-type
    cleaned = series.mask(series.isin(unacceptable_values)).astype(float)

    multiplication_factor = 10 ** number_of_decimals

    # Shift the decimal point right, floor, then shift it back.
    floored = np.floor(cleaned * multiplication_factor) / multiplication_factor

    # Missing values are reported as None; everything else keeps its
    # floored value.
    return np.where(cleaned.isnull(), None, floored)

View file

@ -0,0 +1,72 @@
import pandas as pd
import numpy as np
import pytest
from data_pipeline.etl.score.etl_utils import floor_series
def test_floor_series():
    """Check floor_series against known inputs and its two error paths."""
    # Input fixtures
    mixed_values = pd.Series(
        data=[None, 1, 0.324534, 1.2341], dtype="float64"
    )
    extreme_values = pd.Series(
        data=[
            -np.inf,
            np.inf,
            "None",
            -0.131321313123123,
            5.62322441e-15,
            1.2341123131313131312e12,
        ]
    )
    all_missing = pd.Series(data=[None, None, None, None, None])
    no_values = pd.Series(data=[], dtype="float64")
    # A plain list of randomly generated values: deliberately not a Series
    not_a_series = list(np.random.uniform(1, 1000000, size=15))

    # Each case is (input series, number of decimals, expected output)
    cases = [
        # two decimal places
        (mixed_values, 2, np.array([None, 1.0, 0.32, 1.23])),
        # three decimal places
        (mixed_values, 3, np.array([None, 1.00, 0.324, 1.234])),
        # one decimal place
        (mixed_values, 1, np.array([None, 1.0, 0.3, 1.2])),
        # nothing but missing values, arbitrary number of decimals
        (all_missing, 10, np.array([None, None, None, None, None])),
        # infinities, the "None" string and floating point extremes
        (
            extreme_values,
            1,
            np.array([None, None, None, -0.2, 0.0, 1234112313131.3]),
        ),
    ]

    # Compute every result first, then compare elementwise
    results = [
        floor_series(values, number_of_decimals=decimals)
        for values, decimals, _ in cases
    ]
    for (_, _, expected), actual in zip(cases, results):
        elementwise_match = np.equal(expected, actual)
        assert elementwise_match.all()

    # An empty series should raise a ValueError
    with pytest.raises(ValueError, match="Empty series provided."):
        floor_series(no_values, number_of_decimals=2)

    # A non-Series argument should raise a TypeError
    with pytest.raises(
        TypeError,
        match="Argument series must be of type pandas series, not of type list.",
    ):
        floor_series(not_a_series, number_of_decimals=3)

View file

@ -95,9 +95,13 @@ class EnergyDefinitionAlternativeDraft(ExtractTransformLoad):
)
# Convert to boolean:
self.df[field_names.ENERGY_RELATED_COMMUNITIES_DEFINITION_ALTERNATIVE] = \
self.df[field_names.ENERGY_RELATED_COMMUNITIES_DEFINITION_ALTERNATIVE
].astype('bool')
self.df[
field_names.ENERGY_RELATED_COMMUNITIES_DEFINITION_ALTERNATIVE
] = self.df[
field_names.ENERGY_RELATED_COMMUNITIES_DEFINITION_ALTERNATIVE
].astype(
"bool"
)
def validate(self) -> None:
logger.info("Validating data")

View file

@ -1,48 +1,39 @@
appnope==0.1.2; sys_platform == "darwin" and python_version >= "3.7" and platform_system == "Darwin"
argcomplete==1.12.3; python_version < "3.8.0" and python_version >= "3.7"
argon2-cffi==21.1.0; python_version >= "3.6"
astroid==2.8.0; python_version >= "3.6" and python_version < "4.0"
atomicwrites==1.4.0; python_version >= "3.6" and python_full_version < "3.0.0" and sys_platform == "win32" or sys_platform == "win32" and python_version >= "3.6" and python_full_version >= "3.4.0"
attrs==21.2.0; python_version >= "3.6" and python_full_version < "3.0.0" or python_full_version >= "3.5.0" and python_version >= "3.6"
astroid==2.8.3; python_version >= "3.6" and python_version < "4.0"
attrs==21.2.0; python_version >= "3.7" and python_full_version < "3.0.0" or python_full_version >= "3.5.0" and python_version >= "3.7"
backcall==0.2.0; python_version >= "3.7"
backports.entry-points-selectable==1.1.0; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.5.0" and python_version >= "2.7"
black==21.9b0; python_full_version >= "3.6.2"
bleach==4.1.0; python_version >= "3.7"
censusdata==1.15; python_version >= "2.7"
certifi==2021.5.30; python_version >= "3.7" and python_full_version < "3.0.0" or python_full_version >= "3.6.0" and python_version >= "3.7"
cffi==1.14.6; implementation_name == "pypy" and python_version >= "3.6"
charset-normalizer==2.0.6; python_full_version >= "3.6.0" and python_version >= "3"
certifi==2021.10.8; python_version >= "3.7" and python_full_version < "3.0.0" or python_full_version >= "3.6.0" and python_version >= "3.7"
cffi==1.15.0; implementation_name == "pypy" and python_version >= "3.7" and python_full_version >= "3.6.1"
charset-normalizer==2.0.7; python_full_version >= "3.6.0" and python_version >= "3"
click-plugins==1.1.1; python_version >= "3.6"
click==8.0.1; python_version >= "3.6"
click==8.0.3; python_version >= "3.6"
cligj==0.7.2; python_version >= "3.6" and python_full_version < "3.0.0" or python_full_version >= "3.3.0" and python_version < "4" and python_version >= "3.6"
colorama==0.4.4; platform_system == "Windows" and python_version >= "3.7" and python_full_version >= "3.6.2" and sys_platform == "win32" and python_version < "4.0" and (python_version >= "3.6" and python_full_version < "3.0.0" and sys_platform == "win32" or sys_platform == "win32" and python_version >= "3.6" and python_full_version >= "3.5.0") and (python_version >= "3.7" and python_full_version < "3.0.0" and sys_platform == "win32" or sys_platform == "win32" and python_version >= "3.7" and python_full_version >= "3.5.0")
configparser==5.0.2; python_version >= "3.6"
colorama==0.4.4; python_version >= "3.7" and python_full_version < "3.0.0" and platform_system == "Windows" and sys_platform == "win32" and python_version < "4.0" or platform_system == "Windows" and python_version >= "3.7" and python_full_version >= "3.5.0" and sys_platform == "win32" and python_version < "4.0"
cycler==0.10.0; python_version >= "3.7"
debugpy==1.4.3; python_version >= "3.7" and python_full_version < "3.0.0" or python_full_version >= "3.5.0" and python_version >= "3.7"
debugpy==1.5.1; python_version >= "3.7" and python_full_version < "3.0.0" or python_full_version >= "3.5.0" and python_version >= "3.7"
decorator==5.1.0; python_version >= "3.7"
defusedxml==0.7.1; python_version >= "3.7" and python_full_version < "3.0.0" or python_full_version >= "3.5.0" and python_version >= "3.7"
distlib==0.3.2; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.5.0"
dparse==0.5.1; python_version >= "3.5"
dynaconf==3.1.7; python_version >= "3.7"
entrypoints==0.3; python_full_version >= "3.6.1" and python_version >= "3.7"
et-xmlfile==1.1.0; python_version >= "3.6"
filelock==3.0.12; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.5.0"
fiona==1.8.20; python_version >= "3.6"
flake8==3.9.2; (python_version >= "2.7" and python_full_version < "3.0.0") or (python_full_version >= "3.5.0")
geopandas==0.9.0; python_version >= "3.6"
idna==3.2; python_version >= "3.5" and python_full_version < "3.0.0" or python_full_version >= "3.6.0" and python_version >= "3.5"
importlib-metadata==4.8.1; python_version == "3.7" and (python_version >= "3.6" and python_full_version < "3.0.0" and python_version < "3.8" or python_full_version >= "3.5.0" and python_version < "3.8" and python_version >= "3.6") and python_full_version >= "3.6.2"
iniconfig==1.1.1; python_version >= "3.6"
ipykernel==6.4.1; python_version >= "3.7"
idna==3.3; python_version >= "3.5" and python_full_version < "3.0.0" or python_full_version >= "3.6.0" and python_version >= "3.5"
importlib-metadata==4.8.1; python_version == "3.7"
ipdb==0.13.9; python_version >= "2.7"
ipykernel==6.4.2; python_version >= "3.7"
ipython-genutils==0.2.0; python_version >= "3.7"
ipython==7.27.0; python_version >= "3.7"
ipython==7.28.0; python_version >= "3.7"
ipywidgets==7.6.5
isort==5.9.3; python_full_version >= "3.6.1" and python_version < "4.0" and python_version >= "3.6"
jedi==0.18.0; python_version >= "3.7"
jellyfish==0.6.1
jinja2==3.0.1; python_version >= "3.7"
jsonschema==3.2.0; python_version >= "3.5"
jupyter-client==7.0.3; python_full_version >= "3.6.1" and python_version >= "3.7"
jinja2==3.0.2; python_version >= "3.7"
jsonschema==4.1.2; python_version >= "3.7"
jupyter-client==7.0.6; python_full_version >= "3.6.1" and python_version >= "3.7"
jupyter-console==6.4.0; python_version >= "3.6"
jupyter-contrib-core==0.3.3
jupyter-contrib-nbextensions==0.5.1
@ -55,79 +46,64 @@ jupyterlab-pygments==0.1.2; python_version >= "3.7"
jupyterlab-widgets==1.0.2; python_version >= "3.6"
kiwisolver==1.3.2; python_version >= "3.7"
lazy-object-proxy==1.6.0; python_version >= "3.6" and python_full_version < "3.0.0" and python_version < "4.0" or python_version >= "3.6" and python_version < "4.0" and python_full_version >= "3.6.0"
liccheck==0.6.2; python_version >= "2.7"
lxml==4.6.3; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.5.0"
lxml==4.6.5; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.5.0"
markupsafe==2.0.1; python_version >= "3.7"
matplotlib-inline==0.1.3; python_version >= "3.7"
matplotlib==3.4.3; python_version >= "3.7"
mccabe==0.6.1; python_version >= "3.6" and python_full_version < "3.0.0" and python_version < "4.0" or python_version >= "3.6" and python_version < "4.0" and python_full_version >= "3.5.0"
mccabe==0.6.1; python_version >= "3.6" and python_version < "4.0"
mistune==0.8.4; python_version >= "3.7"
munch==2.5.0; python_version >= "3.6"
mypy-extensions==0.4.3; python_full_version >= "3.6.2" and python_version >= "3.5"
mypy==0.910; python_version >= "3.5"
nbclient==0.5.4; python_full_version >= "3.6.1" and python_version >= "3.7"
nbconvert==6.1.0; python_version >= "3.7"
nbconvert==6.2.0; python_version >= "3.7"
nbformat==5.1.3; python_full_version >= "3.6.1" and python_version >= "3.7"
nest-asyncio==1.5.1; python_full_version >= "3.6.1" and python_version >= "3.7"
notebook==6.4.4; python_version >= "3.6"
notebook==6.4.5; python_version >= "3.6"
numpy==1.21.1; python_version >= "3.7"
openpyxl==3.0.7; python_version >= "3.6"
packaging==21.0; python_version >= "3.7" and python_full_version < "3.0.0" or python_full_version >= "3.5.0" and python_version >= "3.7"
pandas==1.3.3; python_full_version >= "3.7.1"
packaging==21.0; python_version >= "3.7"
pandas==1.3.4; python_full_version >= "3.7.1"
pandocfilters==1.5.0; python_version >= "3.7" and python_full_version < "3.0.0" or python_full_version >= "3.4.0" and python_version >= "3.7"
parso==0.8.2; python_version >= "3.7"
pathspec==0.9.0; python_full_version >= "3.6.2"
pexpect==4.8.0; sys_platform != "win32" and python_version >= "3.7"
pickleshare==0.7.5; python_version >= "3.7"
pillow==8.3.2; python_version >= "3.7"
platformdirs==2.3.0; python_version >= "3.6" and python_full_version >= "3.6.2" and python_version < "4.0" and (python_version >= "3.6" and python_full_version < "3.0.0" or python_full_version >= "3.5.0" and python_version >= "3.6")
pluggy==1.0.0; python_version >= "3.6" and python_full_version < "3.0.0" or python_full_version >= "3.5.0" and python_version >= "3.6"
pillow==8.4.0; python_version >= "3.7"
platformdirs==2.4.0; python_version >= "3.6" and python_version < "4.0"
prometheus-client==0.11.0; python_version >= "3.6" and python_full_version < "3.0.0" or python_full_version >= "3.4.0" and python_version >= "3.6"
prompt-toolkit==3.0.20; python_full_version >= "3.6.2" and python_version >= "3.7"
prompt-toolkit==3.0.21; python_full_version >= "3.6.2" and python_version >= "3.7"
ptyprocess==0.7.0; sys_platform != "win32" and python_version >= "3.7" and os_name != "nt"
py==1.10.0; python_version >= "3.6" and python_full_version < "3.0.0" and implementation_name == "pypy" or python_full_version >= "3.5.0" and python_version >= "3.6" and implementation_name == "pypy"
pycodestyle==2.7.0; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.5.0"
pycparser==2.20; python_version >= "3.6" and python_full_version < "3.0.0" or python_full_version >= "3.4.0" and python_version >= "3.6"
pyflakes==2.3.1; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.5.0"
py==1.10.0; implementation_name == "pypy" and python_version >= "3.7" and python_full_version >= "3.6.1"
pycparser==2.20; implementation_name == "pypy" and python_version >= "3.7" and python_full_version >= "3.6.1"
pygments==2.10.0; python_version >= "3.7"
pylint==2.11.1; python_version >= "3.6" and python_version < "4.0"
pypandoc==1.6.4
pyparsing==2.4.7; python_version >= "3.7" and python_full_version < "3.0.0" or python_full_version >= "3.3.0" and python_version >= "3.7"
pyproj==3.2.1; python_version >= "3.7"
pyrsistent==0.18.0; python_version >= "3.6"
pytest-mock==3.6.1; python_version >= "3.6"
pytest==6.2.5; python_version >= "3.6"
pyrsistent==0.18.0; python_version >= "3.7"
python-dateutil==2.8.2; python_full_version >= "3.7.1" and python_version >= "3.7"
pytz==2021.1; python_full_version >= "3.7.1" and python_version >= "2.7"
pywin32==301; sys_platform == "win32" and platform_python_implementation != "PyPy" and python_version >= "3.6"
pytz==2021.3; python_full_version >= "3.7.1" and python_version >= "3.6"
pywin32==302; sys_platform == "win32" and platform_python_implementation != "PyPy" and python_version >= "3.7"
pywinpty==1.1.4; os_name == "nt" and python_version >= "3.6"
pyyaml==5.4.1; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.6.0"
pyyaml==6.0; python_version >= "3.6"
pyzmq==22.3.0; python_full_version >= "3.6.1" and python_version >= "3.7"
qtconsole==5.1.1; python_version >= "3.6"
qtpy==1.11.1; python_version >= "3.6" and python_full_version < "3.0.0" or python_full_version >= "3.6.0" and python_version >= "3.6"
regex==2021.8.28; python_full_version >= "3.6.2"
qtpy==1.11.2; python_version >= "3.6" and python_full_version < "3.0.0" or python_full_version >= "3.6.0" and python_version >= "3.6"
requests==2.26.0; (python_version >= "2.7" and python_full_version < "3.0.0") or (python_full_version >= "3.6.0")
safety==1.10.3; python_version >= "3.5"
semantic-version==2.8.5; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.4.0" and python_version >= "2.7"
send2trash==1.8.0; python_version >= "3.6"
shapely==1.7.1; python_version >= "3.6"
six==1.16.0; python_version >= "3.7" and python_full_version < "3.0.0" or python_full_version >= "3.5.0" and python_version >= "3.7"
six==1.16.0; python_full_version >= "3.7.1" and python_version >= "3.7" and (python_version >= "3.6" and python_full_version < "3.0.0" or python_full_version >= "3.3.0" and python_version >= "3.6") and (python_version >= "3.7" and python_full_version < "3.0.0" or python_full_version >= "3.3.0" and python_version >= "3.7")
terminado==0.12.1; python_version >= "3.6"
testpath==0.5.0; python_version >= "3.7"
toml==0.10.2; python_version >= "3.6" and python_full_version < "3.0.0" and python_version < "4.0" or python_full_version >= "3.5.0" and python_version >= "3.6" and python_version < "4.0"
tomli==1.2.1; python_version >= "3.6" and python_full_version >= "3.6.2"
toml==0.10.2; python_version > "3.6" and python_full_version < "3.0.0" and python_version < "4.0" or python_full_version >= "3.3.0" and python_version > "3.6" and python_version < "4.0"
tornado==6.1; python_full_version >= "3.6.1" and python_version >= "3.7"
tox==3.24.4; (python_version >= "2.7" and python_full_version < "3.0.0") or (python_full_version >= "3.5.0")
tqdm==4.62.0; (python_version >= "2.7" and python_full_version < "3.0.0") or (python_full_version >= "3.4.0")
traitlets==5.1.0; python_full_version >= "3.6.1" and python_version >= "3.7"
types-requests==2.25.8
typing-extensions==3.10.0.2; python_version < "3.8" and python_version >= "3.6"
urllib3==1.26.6; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.6.0" and python_version < "4" and python_version >= "2.7"
typed-ast==1.4.3; implementation_name == "cpython" and python_version < "3.8" and python_version >= "3.6"
types-requests==2.25.11
typing-extensions==3.10.0.2; python_version >= "3.6" and python_version < "3.8"
urllib3==1.26.7; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.6.0" and python_version < "4" and python_version >= "2.7"
us==2.0.2
virtualenv==20.8.0; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.5.0"
wcwidth==0.2.5; python_full_version >= "3.6.2" and python_version >= "3.7"
webencodings==0.5.1; python_version >= "3.7"
widgetsnbextension==3.5.1
wrapt==1.12.1; python_version >= "3.6" and python_version < "4.0"
wrapt==1.13.2; python_version >= "3.6" and python_full_version < "3.0.0" and python_version < "4.0" or python_version >= "3.6" and python_version < "4.0" and python_full_version >= "3.5.0"
xlsxwriter==2.0.0
zipp==3.5.0; python_version < "3.8" and python_version >= "3.6"
zipp==3.6.0; python_version < "3.8" and python_version >= "3.6"

View file

@ -7,19 +7,32 @@ skip_missing_interpreters = true
[testenv:lint]
# lints python code in src and tests
basepython = python3.9
deps = -rrequirements.txt
# These are "external" because they are dev dependencies that are not in
# "requirements.txt" and therefore not installed in this env, but they will be
# available where this is being run. See:
# https://stackoverflow.com/questions/47642747/tox-warningtest-command-found-but-not-installed-in-testenv
allowlist_externals = black
flake8
pylint
commands = black data_pipeline
flake8 data_pipeline
pylint data_pipeline
# Ignore tests in this lint check because test dependencies are not installed here.
pylint data_pipeline --ignore tests
[testenv:checkdeps]
# checks the dependencies for security vulnerabilities and open source licenses
deps = -rrequirements.txt
# These are "external" because they are dev dependencies that are not in
# "requirements.txt" and therefore not installed in this env, but they will be
# available where this is being run. See:
# https://stackoverflow.com/questions/47642747/tox-warningtest-command-found-but-not-installed-in-testenv
allowlist_externals = safety
liccheck
commands = safety check
liccheck
[testenv:pytest]
# Run tests
deps = pytest
commands = pytest
commands = pytest