# j40-cejst-2/data/data-pipeline/pyproject.toml

[tool.poetry]
# Identity of the distribution.
name = "justice40-data-pipeline"
version = "0.1.0"
description = "ETL, Score and Map Generation of Justice 40 Tool"
license = "MIT"
readme = "README.md"

# Ownership and discovery metadata.
authors = ["Justice40 Engineering <j40-engineering@lists.usds.gov>"]
keywords = ["justice40", "environmental_justice", "python", "etl"]
homepage = "https://github.com/usds/justice40-tool/tree/main/data/data-pipeline"
repository = "https://github.com/usds/justice40-tool"

# Extra files listed for inclusion, and the import package that makes up
# the distribution.
include = ["LICENSE"]
packages = [{ include = "data_pipeline" }]
[tool.poetry.dependencies]
# Runtime dependencies of the data pipeline. Caret constraints ("^x.y") allow
# compatible upgrades; bare versions are exact pins.
CensusData = "^1.13"
# Pinned for now per https://github.com/psf/black/issues/2964
click = "8.0.4"
dynaconf = "^3.1.4"
geopandas = "^0.11.0"
ipdb = "^0.13.9"
ipython = "^7.31.1"
jupyter = "^1.0.0"
jupyter-contrib-nbextensions = "^0.5.1"
marshmallow-dataclass = "^8.5.3"
marshmallow-enum = "^1.5.1"
matplotlib = "^3.4.2"
numpy = "^1.22.1"
pandas = "^1.2.5"
# NOTE(review): pylint is also declared under dev-dependencies with a
# different constraint ("^2.9.6") - confirm which declaration is intended.
pylint = "^2.11.1"
pillow = "9.0.1"
python = "^3.8"
pypandoc = "^1.6.3"
PyYAML = "^6.0"
requests = "^2.25.1"
tqdm = "4.62.0"
types-requests = "^2.25.0"
us = "^2.0.2"
xlsxwriter = "^2.0.0"
pydantic = "^1.9.0"
Rtree = "^1.0.0"
[tool.poetry.dev-dependencies]
# Development-only dependencies: formatters, linters, test tooling, license
# and security checks, and notebook utilities.
# black's constraint targets a beta release ("21.6b0"), hence
# allow-prereleases.
black = { version = "^21.6b0", allow-prereleases = true }
flake8 = "^3.9.2"
liccheck = "^0.6.2"
mypy = "^0.910"
openpyxl = "^3.0.7"
pylint = "^2.9.6"
pytest = "^6.2.4"
safety = "^1.10.3"
tox = "^3.24.0"
pytest-mock = "^3.6.1"
tox-poetry = "^0.4.1"
pandas-vet = "^0.2.2"
pytest-snapshot = "^0.8.1"
nb-black = "^1.0.7"
seaborn = "^0.11.2"
papermill = "^2.3.4"
jupyterlab = "^3.4.4"
[build-system]
# Build requirements first, then the backend entry point they provide.
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
[tool.pylint]
[tool.pylint."MESSAGE CONTROL"]
# Pylint message IDs switched off for the whole project. Each entry carries a
# short note naming the check it silences.
disable = [
    "C0114", # Disables module docstrings
    "R0903", # Disables too few public methods
    "C0103", # Disables name case styling
    "W0511", # Disables FIXME warning
    "W1203", # Disables f-string interpolation for logging warning
    # Errors temporarily ignored for further discussion
    "W0107", # Disables unnecessary pass
    "W0221", # Disables arguments differ
    "R0902", # Disables too many instance attributes
    "R0914", # Disables too many local variables
    "W0621", # Disables redefined outer name
    "C0302", # Disables too many lines in module
    "R1732", # Disables consider using "with"
    "R1720", # Disables unnecessary "else" after "raise"
    "C0206", # Disables consider iterating with ".items()"
    "C0200", # Disables consider using "enumerate" instead of "range" + "len"
    "W0612", # Disables unused variable
    "W0613", # Disables unused argument
    "C0116", # Disables missing function or method docstring
    "C0115", # Disables missing class docstring
    "R0915", # Disables too many statements (score generation transform)
    "W0231", # Disables super init not called
    "R0801", # Disables duplicate code. There are a couple places we have similar code and
    # unfortunately you can't disable this rule for individual lines or files, it's a
    # known bug. https://github.com/PyCQA/pylint/issues/214#
]
[tool.pylint.FORMAT]
# Line-length limit applied by pylint.
# NOTE(review): [tool.black] in this file sets line-length = 80, so the two
# tools use different limits - confirm this is intentional.
max-line-length = 150
[tool.pylint.typecheck]
# Treat members matching this pattern as dynamically generated;
# fixes E1101 for ETL.df
generated-members = "pandas.*"
[tool.pylint.SIMILARITIES]
# Settings for pylint's repetitive-code detection.
# Minimum run of similar lines.
min-similarity-lines = 4
# Content left out of the comparison.
ignore-imports = "yes"
ignore-docstrings = "yes"
ignore-comments = "yes"
[tool.black]
# Line length black formats code to.
line-length = 80
[tool.liccheck]
# Authorized and unauthorized licenses in LOWER CASE
authorized_licenses = [
    "bsd",
    "new bsd",
    "bsd license",
    "bsd 3-clause",
    "new bsd license",
    "simplified bsd",
    "apache",
    "apache 2.0",
    "apache license 2.0",
    "apache software license",
    "apache software",
    "gnu lgpl",
    "gnu lesser general public license v2 (lgplv2)",
    "gnu general public license v2 (gplv2)",
    "gnu library or lesser general public license (lgpl)",
    "lgpl with exceptions or zpl",
    "isc license",
    "isc license (iscl)",
    "mit",
    "mit license",
    "mozilla public license 2.0 (mpl 2.0)",
    "public domain",
    "python software foundation license",
    "python software foundation",
    "zpl 2.1",
    "gpl v3",
    "historical permission notice and disclaimer (hpnd)",
]