Changes to allow local runs

This commit is contained in:
Carlos Felix 2024-11-05 11:31:54 -05:00 committed by Carlos Felix
commit ff9e7b9aa2
11 changed files with 3231 additions and 1867 deletions

1
.github/CODEOWNERS vendored
View file

@ -1 +0,0 @@
* @vim-usds @travis-newby @sampowers-usds @mattbowen-usds

View file

@ -45,7 +45,7 @@ jobs:
# Initializes the CodeQL tools for scanning. # Initializes the CodeQL tools for scanning.
- name: Initialize CodeQL - name: Initialize CodeQL
uses: github/codeql-action/init@v1 uses: github/codeql-action/init@v2
with: with:
languages: ${{ matrix.language }} languages: ${{ matrix.language }}
# If you wish to specify custom queries, you can do so here or in a config file. # If you wish to specify custom queries, you can do so here or in a config file.
@ -56,7 +56,7 @@ jobs:
# Autobuild attempts to build any compiled languages (C/C++, C#, or Java). # Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
# If this step fails, then you should remove it and run the build manually (see below) # If this step fails, then you should remove it and run the build manually (see below)
- name: Autobuild - name: Autobuild
uses: github/codeql-action/autobuild@v1 uses: github/codeql-action/autobuild@v2
# Command-line programs to run using the OS shell. # Command-line programs to run using the OS shell.
# 📚 https://git.io/JvXDl # 📚 https://git.io/JvXDl
@ -70,4 +70,4 @@ jobs:
# make release # make release
- name: Perform CodeQL Analysis - name: Perform CodeQL Analysis
uses: github/codeql-action/analyze@v1 uses: github/codeql-action/analyze@v2

View file

@ -16,7 +16,7 @@ jobs:
strategy: strategy:
matrix: matrix:
# checks all of the versions allowed in pyproject.toml # checks all of the versions allowed in pyproject.toml
python-version: [3.8, 3.9] python-version: [3.10.15]
steps: steps:
# installs Python # installs Python
# one execution of the tests per version listed above # one execution of the tests per version listed above

View file

@ -0,0 +1,7 @@
import logging
import os

# Values (compared case-insensitively) that switch a boolean environment flag on.
_TRUTHY_VALUES = frozenset({"1", "true", "yes", "on"})


def _env_flag(name: str, default: bool) -> bool:
    """Return the boolean value of the environment variable *name*.

    Args:
        name: Name of the environment variable to read.
        default: Value returned when the variable is unset or blank.

    Returns:
        ``default`` when the variable is unset or blank; otherwise ``True``
        if the value is one of ``1``, ``true``, ``yes`` or ``on`` (in any
        letter case) and ``False`` for anything else.
    """
    raw_value = os.environ.get(name, "").strip().lower()
    if not raw_value:
        return default
    return raw_value in _TRUTHY_VALUES


LOG_LEVEL = logging.DEBUG
"""Log level for all loggers."""

# SECURITY: skipping certificate verification exposes downloads to
# man-in-the-middle tampering. The default stays True so local runs keep
# working unchanged, but any other environment can restore verification
# without a code change by exporting NO_SSL_VERIFY=false.
NO_SSL_VERIFY = _env_flag("NO_SSL_VERIFY", default=True)
"""Set to true to skip SSL verification when downloading files. Useful for local development."""

View file

@ -22,6 +22,7 @@ from pathlib import Path
from typing import List from typing import List
from dataclasses import dataclass from dataclasses import dataclass
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from data_pipeline.constants import NO_SSL_VERIFY
from data_pipeline.etl.downloader import Downloader from data_pipeline.etl.downloader import Downloader
from data_pipeline.etl.sources.census_acs.etl_utils import ( from data_pipeline.etl.sources.census_acs.etl_utils import (
@ -65,7 +66,7 @@ class FileDataSource(DataSource):
Downloader.download_file_from_url( Downloader.download_file_from_url(
file_url=self.source, file_url=self.source,
download_file_name=self.destination, download_file_name=self.destination,
verify=True, verify=not NO_SSL_VERIFY,
) )
def __str__(self): def __str__(self):
@ -85,7 +86,7 @@ class ZIPDataSource(DataSource):
Downloader.download_zip_file_from_url( Downloader.download_zip_file_from_url(
file_url=self.source, file_url=self.source,
unzipped_file_path=self.destination, unzipped_file_path=self.destination,
verify=True, verify=not NO_SSL_VERIFY,
) )
def __str__(self): def __str__(self):

View file

@ -6,7 +6,9 @@ import shutil
from pathlib import Path from pathlib import Path
from data_pipeline.config import settings from data_pipeline.config import settings
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
class Downloader: class Downloader:
"""A simple class to encapsulate the download capabilities of the application""" """A simple class to encapsulate the download capabilities of the application"""
@ -34,12 +36,13 @@ class Downloader:
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
download_file_name.parent.mkdir(parents=True, exist_ok=True) download_file_name.parent.mkdir(parents=True, exist_ok=True)
logger.debug(f"Downloading {file_url}")
response = requests.get( response = requests.get(
file_url, verify=verify, timeout=settings.REQUESTS_DEFAULT_TIMOUT file_url, verify=verify, timeout=settings.REQUESTS_DEFAULT_TIMOUT
) )
if response.status_code == 200: if response.status_code == 200:
file_contents = response.content file_contents = response.content
logger.debug("Downloaded.")
else: else:
raise Exception( raise Exception(
f"HTTP response {response.status_code} from url {file_url}. Info: {response.content}" f"HTTP response {response.status_code} from url {file_url}. Info: {response.content}"

View file

@ -215,6 +215,7 @@ class CensusETL(ExtractTransformLoad):
state_gdf = gpd.read_file(file_name) state_gdf = gpd.read_file(file_name)
usa_df = usa_df.append(state_gdf) usa_df = usa_df.append(state_gdf)
logger.debug("Converting to CRS")
usa_df = usa_df.to_crs( usa_df = usa_df.to_crs(
"+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs" "+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"
) )

View file

@ -13,6 +13,7 @@ import requests
import urllib3 import urllib3
import yaml import yaml
from data_pipeline.config import settings from data_pipeline.config import settings
from data_pipeline.constants import LOG_LEVEL
from data_pipeline.content.schemas.download_schemas import CodebookConfig from data_pipeline.content.schemas.download_schemas import CodebookConfig
from data_pipeline.content.schemas.download_schemas import CSVConfig from data_pipeline.content.schemas.download_schemas import CSVConfig
from data_pipeline.content.schemas.download_schemas import ExcelConfig from data_pipeline.content.schemas.download_schemas import ExcelConfig
@ -48,7 +49,7 @@ def get_module_logger(module_name: str) -> logging.Logger:
) )
handler.setFormatter(formatter) handler.setFormatter(formatter)
logger.addHandler(handler) logger.addHandler(handler)
logger.setLevel(logging.INFO) logger.setLevel(LOG_LEVEL)
logger.propagate = False # don't send log messages to the parent logger (to avoid duplicate log messages) logger.propagate = False # don't send log messages to the parent logger (to avoid duplicate log messages)
return logger return logger

File diff suppressed because it is too large Load diff

View file

@ -17,7 +17,7 @@ packages = [
[tool.poetry.dependencies] [tool.poetry.dependencies]
CensusData = "^1.13" CensusData = "^1.13"
certifi = "^2022.12.07" # explicit callout due to https://pyup.io/v/52365/f17/ certifi = ">= 2024.07.04" # Due to https://data.safetycli.com/v/72083/f17
click = "8.0.4" # pinning for now per https://github.com/psf/black/issues/2964 click = "8.0.4" # pinning for now per https://github.com/psf/black/issues/2964
dynaconf = "^3.1.4" dynaconf = "^3.1.4"
geopandas = "^0.11.0" geopandas = "^0.11.0"
@ -29,19 +29,20 @@ marshmallow-dataclass = "^8.5.3"
marshmallow-enum = "^1.5.1" marshmallow-enum = "^1.5.1"
matplotlib = "^3.4.2" matplotlib = "^3.4.2"
numpy = "^1.22.1" numpy = "^1.22.1"
pandas = "^1.2.5" pandas = "~1.4.3"
pylint = "^2.11.1" pylint = "^2.11.1"
pillow = "9.3.0" pillow = "9.3.0" # Newer versions break tile generation
python = "^3.8" python = "^3.10"
pypandoc = "^1.6.3" pypandoc = "^1.6.3"
PyYAML = "^6.0" PyYAML = "^6.0"
requests = "^2.25.1" requests = "^2.25.1"
tqdm = "4.62.0" tqdm = "^4.66.3"
types-requests = "^2.25.0" types-requests = "^2.25.0"
us = "^2.0.2" us = "^2.0.2"
xlsxwriter = "^2.0.0" xlsxwriter = "^2.0.0"
pydantic = "^1.9.0" pydantic = "^1.9.0"
Rtree = "^1.0.0" Rtree = "^1.0.0"
fiona = "~1.8.21"
[tool.poetry.dev-dependencies] [tool.poetry.dev-dependencies]
black = {version = "^21.6b0", allow-prereleases = true} black = {version = "^21.6b0", allow-prereleases = true}
@ -59,7 +60,7 @@ pandas-vet = "^0.2.2"
pytest-snapshot = "^0.8.1" pytest-snapshot = "^0.8.1"
seaborn = "^0.11.2" seaborn = "^0.11.2"
papermill = "^2.3.4" papermill = "^2.3.4"
jupyterlab = "3.4.4" jupyterlab = "^3.6.7"
[build-system] [build-system]
build-backend = "poetry.core.masonry.api" build-backend = "poetry.core.masonry.api"

View file

@ -1,7 +1,7 @@
[tox] [tox]
# required because we use pyproject.toml # required because we use pyproject.toml
isolated_build = true isolated_build = true
envlist = py38, py39, lint, checkdeps, pytest envlist = py310, lint, checkdeps, pytest
# only checks python versions installed locally # only checks python versions installed locally
skip_missing_interpreters = true skip_missing_interpreters = true
@ -16,7 +16,9 @@ commands = black data_pipeline
# checks the dependencies for security vulnerabilities and open source licenses # checks the dependencies for security vulnerabilities and open source licenses
allowlist_externals = bash allowlist_externals = bash
commands = pip install -U wheel commands = pip install -U wheel
safety check --ignore 51457 --ignore 44715 # known issue: https://github.com/pyupio/safety/issues/364 # known issue: https://github.com/pyupio/safety/issues/364
# jinja2 false positive for our use: https://data.safetycli.com/v/70612/f17
safety check --ignore 51457 --ignore 44715 --ignore 70612
bash scripts/run-liccheck.sh bash scripts/run-liccheck.sh
[testenv:pytest] [testenv:pytest]