Mirror of https://github.com/DOI-DO/j40-cejst-2.git (synced 2025-02-22 01:31:25 -08:00)

Merge pull request #47 from agilesix/cfelix/merge-v2-20250113
CEQ-J40 merge v2 code - 20250113

Commit bba35c1a15
52 changed files with 1467 additions and 1514 deletions
.github/workflows/deploy_backend_main.yml (20 changes)

@@ -1,5 +1,10 @@
name: Deploy Backend Main
on: workflow_dispatch
on:
  push:
    branches: [main]
    paths:
      - "data/**"
      - ".github/workflows/deploy_backend_main.yml"
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

@@ -45,21 +50,22 @@ jobs:
          aws-access-key-id: ${{ secrets.DATA_DEV_AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.DATA_DEV_AWS_SECRET_ACCESS_KEY }}
          aws-region: us-east-1
      - name: Install GDAL/ogr2ogr
        run: |
          sudo apt-get update
          sudo apt-get -y install gdal-bin
          ogrinfo --version
      - name: Cleanup Data
        run: |
          poetry run python3 -m data_pipeline.application data-cleanup
      - name: Cache Census Data
        id: cache-census
        uses: actions/cache@v4
        with:
          path: data/data-pipeline/data_pipeline/data/census
          key: data-census
      - name: Get Census Data
        if: steps.cache-census.outputs.cache-hit != 'true'
        run: |
          poetry run python3 -m data_pipeline.application census-data-download
      - name: Run ETL
        run: |
          poetry run python3 -m data_pipeline.application etl-run
          poetry run python3 -m data_pipeline.application etl-run --dataset tribal
      - name: Generate Score
        run: |
          poetry run python3 -m data_pipeline.application score-run
.github/workflows/pr_backend.yml (26 changes)
|
@ -6,7 +6,6 @@ concurrency:
|
|||
cancel-in-progress: true
|
||||
env:
|
||||
python-version: '3.10'
|
||||
CENSUS_API_KEY: ${{ secrets.CENSUS_API_KEY }}
|
||||
J40_VERSION_LABEL_STRING: ${{ vars.SCORE_VERSION }}
|
||||
jobs:
|
||||
# JOB to run change detection
|
||||
|
@ -51,7 +50,7 @@ jobs:
|
|||
uses: actions/cache@v4
|
||||
with:
|
||||
path: ~/.cache/pypoetry/virtualenvs
|
||||
key: env-${{ runner.os }}-${{ env.python-version }}-${{ hashFiles('**/poetry.lock') }}-${{ hashFiles('.github/workflows/deploy_backend_main.yml') }}
|
||||
key: cejst-poetry-env-${{ runner.os }}-${{ env.python-version }}-${{ hashFiles('**/poetry.lock') }}-${{ hashFiles('.github/workflows/deploy_backend_main.yml') }}
|
||||
- name: Install poetry
|
||||
uses: snok/install-poetry@v1
|
||||
- name: Install dependencies
|
||||
|
@ -64,7 +63,7 @@ jobs:
|
|||
- name: Run static code analysis
|
||||
run: poetry run pylint data_pipeline/
|
||||
- name: Check library safety
|
||||
run: poetry run safety check --ignore 51457 --ignore 44715 --ignore 70612
|
||||
run: poetry run safety check --ignore 51457 --ignore 44715 --ignore 70612 --ignore 74439
|
||||
- name: Run unit tests
|
||||
run: |
|
||||
poetry run pytest data_pipeline/
|
||||
|
@ -91,7 +90,7 @@ jobs:
|
|||
uses: actions/cache@v4
|
||||
with:
|
||||
path: ~/.cache/pypoetry/virtualenvs
|
||||
key: env-${{ runner.os }}-${{ env.python-version }}-${{ hashFiles('**/poetry.lock') }}-${{ hashFiles('.github/workflows/deploy_backend_main.yml') }}
|
||||
key: cejst-poetry-env-${{ runner.os }}-${{ env.python-version }}-${{ hashFiles('**/poetry.lock') }}-${{ hashFiles('.github/workflows/deploy_backend_main.yml') }}
|
||||
- name: Install poetry
|
||||
uses: snok/install-poetry@v1
|
||||
- name: Print Poetry settings
|
||||
|
@ -99,21 +98,22 @@ jobs:
|
|||
- name: Install dependencies
|
||||
run: poetry add s4cmd && poetry install
|
||||
if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
|
||||
- name: Install GDAL/ogr2ogr
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get -y install gdal-bin
|
||||
ogrinfo --version
|
||||
- name: Cleanup Data
|
||||
run: |
|
||||
poetry run python3 -m data_pipeline.application data-cleanup
|
||||
- name: Load cached ETL data
|
||||
id: cached-etl-data
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: |
|
||||
data/data-pipeline/data_pipeline/data/census
|
||||
data/data-pipeline/data_pipeline/data/dataset
|
||||
key: cejst-dataset-env-${{ runner.os }}-${{ env.python-version }}-${{ hashFiles('data/data-pipeline/data_pipeline/etl/**/*') }}-${{ hashFiles('data/data-pipeline/data_pipeline/utils.py') }}
|
||||
- name: Get Census Data
|
||||
if: steps.cached-etl-data.outputs.cache-hit != 'true'
|
||||
run: |
|
||||
poetry run python3 -m data_pipeline.application census-data-download
|
||||
- name: Run ETL
|
||||
if: steps.cached-etl-data.outputs.cache-hit != 'true'
|
||||
run: |
|
||||
poetry run python3 -m data_pipeline.application etl-run
|
||||
poetry run python3 -m data_pipeline.application etl-run --dataset tribal
|
||||
- name: Generate Score
|
||||
run: |
|
||||
poetry run python3 -m data_pipeline.application score-run
|
||||
|
|
.github/workflows/pr_frontend.yml (9 changes)

@@ -55,10 +55,11 @@ jobs:
        run: ls -la public
      - name: Lint
        run: npm run lint
      # Disabling for now due to jsonlint - TODO: put this back
      # - name: License Check
      #   run: npm run licenses
      - name: Test
      - name: License Check
        run: npm run licenses
      - name: Unit tests
        run: npm test
      - name: Spanish translation test
        run: npm run test:intl-translations
      # - name: Check for security vulnerabilities
      #   run: npm audit --production
@@ -21,8 +21,24 @@ $ PIPELINE_CMD="data_pipeline.application full-run" docker compose up
```

The above command will build and spin up three containers: A data pipeline container, a data server, and a web server.

The data pipeline container can run the entire data pipeline, or any individual step. Because running the entire pipeline is a time-consuming process, the application command has been turned into a variable so individual parts of the pipeline can be run by docker compose. Once the full-run has been completed, you can change the PIPELINE_CMD environment variable to any other valid parameter for future runs. For example setting `PIPELINE_CMD="full-run --help"` would show the options for the full-run command. This would be helpful if you didn't want to run the data pipeline but merely wanted to see front end changes.
The data pipeline container can run the entire data pipeline, or any individual step. Because running the entire pipeline is a time-consuming process, the application command has been turned into a variable so individual parts of the pipeline can be run by docker compose. Once the full-run has been completed, you can change the PIPELINE_CMD environment variable to any other valid parameter for future runs. For example setting `PIPELINE_CMD="data_pipeline.application full-run --help"` would show the options for the full-run command. This would be helpful if you didn't want to run the data pipeline but merely wanted to see front end changes.

The data server will make the files created by the data pipeline container available to the web server. The data pipeline container mounts the local repo directories to read and write files. The data server presents the local files to the webserver to render the map and downloadables.
The data server will make the files created by the data pipeline container available to the web server. The data pipeline container mounts the local repo directories to read and write files. The data server presents the local files to the webserver to render the map and downloadable files.

The web server will run the application website. After it initializes, you should be able to open the web server in your browser at [`http://localhost:8000`](http://localhost:8000). If the data pipeline container is set to run the full data pipeline, the website will not pick up the changes until the pipeline completes.

In order for docker to pick up code changes, the images will need to be rebuilt. If there are code changes in the data folder, the data pipeline image should be rebuilt. If there are code changes in the client folder, the web server image should be rebuilt. The data server image should never have to be rebuilt.

Command to rebuild the data pipeline image:

```sh
$ docker build ./data/data-pipeline -t 'j40_data_pipeline'
```

Command to rebuild the web server image:

```sh
$ docker build ./client -t 'j40_website'
```

Once one or both images are rebuilt, you can re-run the docker compose command.
@@ -28,7 +28,6 @@ GATSBY_FILE_DL_PATH_1_0_SHAPE_FILE_ZIP=downloadable/1.0-shapefile-codebook.zip
GATSBY_FILE_DL_PATH_1_0_DATA_DOC=downloadable/1.0-data-documentation.zip
GATSBY_FILE_DL_PATH_BETA_TRAINING_SLIDES_PPT=downloadable/technical-training-slides.pptx

GATSBY_FILE_DL_PATH_2_0_COMMUNITIES_LIST_XLS=downloadable/2.0-communities.xlsx
GATSBY_FILE_DL_PATH_2_0_COMMUNITIES_LIST_CSV=downloadable/2.0-communities.csv
GATSBY_FILE_DL_PATH_2_0_COMMUNITIES_LIST_PDF=downloadable/2.0-communities-list.pdf

@@ -44,9 +43,4 @@ GATSBY_FILE_DL_PATH_2_0_M_23_09_SIGNED_PDF=downloadable/M-23-09_Signed_CEQ_CPO_e
GATSBY_FILE_DL_PATH_TSD_ES_PDF=downloadable/cejst-technical-support-document.pdf
GATSBY_FILE_DL_PATH_HOW_TO_COMMUNITIES_PDF=downloadable/draft-communities-list.pdf

GATSBY_MAP_TILES_PATH=tiles

# If you want the map to render a MapBox base map (as opposed to the
# open source one from CartoDB), please create your own API TOKEN from
# your MapBox account and add the token here:
MAPBOX_STYLES_READ_TOKEN=''
GATSBY_MAP_TILES_PATH=tiles
@ -16,7 +16,7 @@
|
|||
"clean": "gatsby clean",
|
||||
"cy:open": "CYPRESS_REMOTE_DEBUGGING_PORT=9222 cypress open",
|
||||
"cy:run": "cypress run",
|
||||
"licenses": "license-checker --production --onlyAllow 'Apache-2.0;BSD;BSD-2-Clause;BSD-3-Clause;CC0-1.0;CC-BY-3.0;CC-BY-4.0;ISC;MIT;Public Domain;Unlicense;UNLICENSED;ODC-By-1.0;WTFPL;MPL-2.0'",
|
||||
"licenses": "license-checker --production --onlyAllow 'Apache-2.0;BSD;BSD-2-Clause;BSD-3-Clause;CC0-1.0;CC-BY-3.0;CC-BY-4.0;ISC;MIT;Public Domain;Unlicense;UNLICENSED;ODC-By-1.0;WTFPL;MPL-2.0' --excludePackages '@mapbox/jsonlint-lines-primitives@2.0.2'",
|
||||
"test": "jest",
|
||||
"test:e2e": "start-server-and-test develop http://localhost:8000 cy:open",
|
||||
"test:e2e:ci": "start-server-and-test develop http://localhost:8000 cy:run",
|
||||
|
@ -28,6 +28,7 @@
|
|||
"intl:removeNesting": "node src/intl/removeNesting.js",
|
||||
"intl:compile-en": "formatjs compile src/intl/en.json --ast --out-file compiled-lang/en.json",
|
||||
"test:intl-extraction": "node src/intl/testIntlExtraction",
|
||||
"test:intl-translations": "node src/intl/diffEnEs",
|
||||
"prepare": "cd .. && husky install client/.husky",
|
||||
"gc": "node .generate_component $1",
|
||||
"compile": "tsc"
|
||||
|
@ -78,7 +79,6 @@
|
|||
"ts-jest": "^27.1.3"
|
||||
},
|
||||
"dependencies": {
|
||||
"-": "^0.0.1",
|
||||
"@sentry/gatsby": "^7.7.0",
|
||||
"@trussworks/react-uswds": "^3.1.0",
|
||||
"@turf/bbox": "^6.5.0",
|
||||
|
|
|
@ -129,6 +129,18 @@ export const getTribalPercentValue = (tribalPercentRaw: number) => {
|
|||
const AreaDetail = ({properties}: IAreaDetailProps) => {
|
||||
const intl = useIntl();
|
||||
|
||||
/**
|
||||
* Set the indicators for a given category.
|
||||
* @param {string} id the category ID
|
||||
* @param {indicatorInfo[]} indicators the indicators to set for the category.
|
||||
* @throws Error if the category ID does not exist
|
||||
*/
|
||||
const setCategoryIndicators = (id: string, indicators: indicatorInfo[]) => {
|
||||
const cat = categories.find((category) => category.id === id);
|
||||
if (cat) cat.indicators = indicators;
|
||||
else throw new Error('Unknown side panel category ID ' + id);
|
||||
};
|
||||
|
||||
// console.log the properties of the census that is selected:
|
||||
console.log(
|
||||
"BE signals for tract (last one is the tract currently selected): ",
|
||||
|
@ -986,36 +998,14 @@ const AreaDetail = ({properties}: IAreaDetailProps) => {
|
|||
* This sidePanelState has 3 values; namely, Nation, Puerto Rico and Island Areas.
|
||||
*/
|
||||
if (sidePanelState === constants.SIDE_PANEL_STATE_VALUES.PUERTO_RICO) {
|
||||
// Allow all categories except health burdens:
|
||||
categories = categories.filter(
|
||||
(category) => category.id !== "health-burdens",
|
||||
);
|
||||
|
||||
// Re-define which burdens show up for each category:
|
||||
|
||||
// 'climate-change'
|
||||
categories[0].indicators = [flooding];
|
||||
|
||||
// 'clean-energy'
|
||||
categories[1].indicators = [energyCost];
|
||||
|
||||
// 'health-burdens'
|
||||
// not showing this category
|
||||
|
||||
// 'sustain-house'
|
||||
categories[2].indicators = [houseCost, lackPlumbing, leadPaint];
|
||||
|
||||
// 'leg-pollute'
|
||||
categories[3].indicators = [proxHaz, proxRMP, proxNPL];
|
||||
|
||||
// 'clean-transport'
|
||||
categories[4].indicators = [dieselPartMatter, trafficVolume];
|
||||
|
||||
// 'clean-water'
|
||||
// show all
|
||||
|
||||
// 'work-dev'
|
||||
categories[6].indicators = [lowMedInc, poverty, unemploy];
|
||||
setCategoryIndicators('climate-change', [flooding]);
|
||||
setCategoryIndicators('clean-energy', [energyCost]);
|
||||
setCategoryIndicators('sustain-house', [historicUnderinvest, houseCost, lackPlumbing, leadPaint]);
|
||||
setCategoryIndicators('leg-pollute', [proxHaz, proxRMP, proxNPL]);
|
||||
setCategoryIndicators('clean-transport', [dieselPartMatter, trafficVolume]);
|
||||
setCategoryIndicators('work-dev', [lowMedInc, poverty, unemploy]);
|
||||
}
|
||||
|
||||
if (sidePanelState === constants.SIDE_PANEL_STATE_VALUES.ISLAND_AREAS) {
|
||||
|
@ -1070,9 +1060,6 @@ const AreaDetail = ({properties}: IAreaDetailProps) => {
|
|||
<>
|
||||
{/* Indicators - filters then map */}
|
||||
{category.indicators
|
||||
.filter(
|
||||
indicatorFilter(EXPLORE_COPY.SIDE_PANEL_INDICATORS.HIST_UNDERINVEST),
|
||||
)
|
||||
.map((indicator: any, index: number) => {
|
||||
return <Indicator key={`ind${index}`} indicator={indicator} />;
|
||||
})}
|
||||
|
|
|
@ -4,6 +4,7 @@ import React from 'react';
|
|||
import {defineMessages} from 'react-intl';
|
||||
import * as COMMON_COPY from './common';
|
||||
import {VERSION_NUMBER, VERSIONS} from './methodology';
|
||||
import {TILE_BASE_URL} from '../constants';
|
||||
|
||||
export const PAGE_INTRO = defineMessages({
|
||||
PAGE_TILE: {
|
||||
|
@ -29,19 +30,26 @@ export const PAGE_INTRO = defineMessages({
|
|||
});
|
||||
|
||||
export const getDownloadFileUrl = (filePath: string | undefined, version: VERSIONS = VERSIONS.V2_0) => {
|
||||
const scorePath = version === VERSIONS.BETA ?
|
||||
process.env.GATSBY_BETA_SCORE_PATH :
|
||||
version === VERSIONS.V1_0 ?
|
||||
process.env.GATSBY_1_0_SCORE_PATH :
|
||||
process.env.GATSBY_2_0_SCORE_PATH;
|
||||
let scorePath;
|
||||
|
||||
if (process.env.DATA_SOURCE === 'local') {
|
||||
scorePath = process.env.GATSBY_DATA_PIPELINE_SCORE_PATH_LOCAL;
|
||||
} else {
|
||||
scorePath = version === VERSIONS.BETA ?
|
||||
process.env.GATSBY_BETA_SCORE_PATH :
|
||||
version === VERSIONS.V1_0 ?
|
||||
process.env.GATSBY_1_0_SCORE_PATH :
|
||||
process.env.GATSBY_2_0_SCORE_PATH;
|
||||
}
|
||||
|
||||
return [
|
||||
process.env.GATSBY_CDN_TILES_BASE_URL,
|
||||
TILE_BASE_URL,
|
||||
scorePath,
|
||||
filePath,
|
||||
].join('/');
|
||||
};
|
||||
|
||||
// Define meta data on dowload files
|
||||
// Define meta data on download files
|
||||
export const DOWNLOAD_FILES = {
|
||||
NARWAL: {
|
||||
COMMUNITIES_LIST_XLS: {
|
||||
|
|
|
@ -348,9 +348,9 @@ export const FAQ_ANSWERS = {
|
|||
}}
|
||||
/>,
|
||||
Q16_P1: <FormattedMessage
|
||||
id={ 'faqs.page.answers.Q16'}
|
||||
id={ 'faqs.page.answers.Q16_P1'}
|
||||
defaultMessage={ `CEQ launched a beta—or draft—version of the CEJST in February 2022 with support from the U.S. Digital Service (USDS), and in collaboration with other Federal agencies and departments. The CEJST was released in a beta version in order to seek <link1>feedback</link1> from Federal agencies, Tribal Nations, State and local governments, Members of Congress, environmental justice stakeholders, and the public. The 90 day public comment period <link2>closed</link2> on May 25, 2022. CEQ and the USDS hosted several <link3>public training</link3> sessions on the beta version of the CEJST. All of this feedback on the beta version of the CEJST helped to inform the release of version 1.0 of the CEJST.`}
|
||||
description={ 'Navigate to the FAQs page, this will be an answer, Q16'}
|
||||
description={ 'Navigate to the FAQs page, this will be an answer, Q16_P1'}
|
||||
values={{
|
||||
link1: linkFn('https://www.federalregister.gov/documents/2022/02/23/2022-03920/climate-and-economic-justice-screening-tool-beta-version', false, true),
|
||||
link2: linkFn('https://www.whitehouse.gov/ceq/news-updates/2022/04/21/ceq-extends-public-comment-period-on-beta-version-of-the-climate-and-economic-justice-screening-tool/', false, true),
|
||||
|
|
|
@ -13,12 +13,14 @@ const esKeys = Object.keys(esJson);
|
|||
|
||||
const missingKeys = enKeys.filter((key) => !esKeys.includes(key));
|
||||
const unusedKeys = esKeys.filter((key) => !enKeys.includes(key));
|
||||
let isError = false;
|
||||
|
||||
if (missingKeys.length > 0 || unusedKeys.length > 0) {
|
||||
console.log('\nMISSING: These keys need to be added to es.json:');
|
||||
console.log(missingKeys);
|
||||
console.log('\nUNUSED: These keys in es.json are not in en.json:');
|
||||
console.log(unusedKeys);
|
||||
isError = true;
|
||||
} else {
|
||||
console.log('SUCCESS: All keys match between en.json and es.json');
|
||||
}
|
||||
|
@ -30,6 +32,7 @@ if (untranslatedValues.length > 0) {
|
|||
console.log('\nIDENTICAL: These keys have identical text in both languages');
|
||||
console.log('(If any of these are intentionally identical, add them to identicalKeysEnEs.js):');
|
||||
console.log(untranslatedValues);
|
||||
isError = true;
|
||||
}
|
||||
|
||||
// Check for keys in identicalKeysEnEs that no longer exist in either translation file
|
||||
|
@ -39,4 +42,7 @@ const nonexistentIdenticalKeys = identicalKeysEnEs.filter(
|
|||
if (nonexistentIdenticalKeys.length > 0) {
|
||||
console.log('\nOUTDATED MATCH: These keys in identicalKeysEnEs.js no longer exist in translations:');
|
||||
console.log(nonexistentIdenticalKeys);
|
||||
isError = true;
|
||||
}
|
||||
|
||||
if (isError) process.exit(1);
|
||||
|
|
|
@ -1311,9 +1311,9 @@
|
|||
"defaultMessage": "The public can also email {general_email_address}",
|
||||
"description": "Navigate to the FAQs page, this will be an answer, Q15_P1_4"
|
||||
},
|
||||
"faqs.page.answers.Q16": {
|
||||
"faqs.page.answers.Q16_P1": {
|
||||
"defaultMessage": "CEQ launched a beta—or draft—version of the CEJST in February 2022 with support from the U.S. Digital Service (USDS), and in collaboration with other Federal agencies and departments. The CEJST was released in a beta version in order to seek <link1>feedback</link1> from Federal agencies, Tribal Nations, State and local governments, Members of Congress, environmental justice stakeholders, and the public. The 90 day public comment period <link2>closed</link2> on May 25, 2022. CEQ and the USDS hosted several <link3>public training</link3> sessions on the beta version of the CEJST. All of this feedback on the beta version of the CEJST helped to inform the release of version 1.0 of the CEJST.",
|
||||
"description": "Navigate to the FAQs page, this will be an answer, Q16"
|
||||
"description": "Navigate to the FAQs page, this will be an answer, Q16_P1"
|
||||
},
|
||||
"faqs.page.answers.Q16_P2": {
|
||||
"defaultMessage": "The 1.0 version was released in <link1>{version1Release}</link1>. The current version, version {currentVersion}, was released in {currentVersionRelease}.",
|
||||
|
|
|
@ -36,7 +36,7 @@
|
|||
"common.pages.footer.logo.title": "Consejo sobre la Calidad del Medio Ambiente",
|
||||
"common.pages.footer.moreinfoheader": "Más información",
|
||||
"common.pages.footer.privacy.text": "Política de privacidad",
|
||||
"common.pages.footer.whitehouselogoalt": "Whitehouse logo",
|
||||
"common.pages.footer.whitehouselogoalt": "Logo de la Casa Blanca",
|
||||
"common.pages.header.about": "Información básica",
|
||||
"common.pages.header.contact": "Contacto",
|
||||
"common.pages.header.downloads": "Descargas",
|
||||
|
@ -59,7 +59,7 @@
|
|||
"download.page.download.file.1": "<Link1>datos de la lista Comunidades</link1> (.xlsx {cldXlsFileSize})",
|
||||
"download.page.download.file.2": "<Link2>datos de la lista Comunidades</link2> (.csv {cldCsvFileSize})",
|
||||
"download.page.download.file.3": "<link3>Archivo de forma </link3> (libro de códigos incluido con el archivo de forma {shapeFileSize} descomprimido)",
|
||||
"download.page.download.file.4": "<link4es>documento de apoyo técnico</link4es> (.pdf {tsdFileSize})",
|
||||
"download.page.download.file.4": "<link4es>Documento de apoyo técnico</link4es> (.pdf {tsdFileSize})",
|
||||
"download.page.download.file.5": "<link5es>Instrucciones a las agencias federales sobre el uso del CEJST</link5es> (.pdf {instructionsEs})",
|
||||
"download.page.files.section.title": "Formatos de archivo de la versión {version}",
|
||||
"download.page.release.2_0.update.HEADER": "Actualización de la publicación {release} de la versión - {date}",
|
||||
|
@ -262,10 +262,9 @@
|
|||
"explore.map.page.side.panel.info.para.3": " . O un simple sí o no.",
|
||||
"explore.map.page.side.panel.info.para.3.part.1": "Las comunidades desfavorecidas viven en distritos censales que experimentan cargas. Se resaltan estos distritos censales ",
|
||||
"explore.map.page.side.panel.info.para.3.part.2": " en el mapa.",
|
||||
"explore.map.page.side.panel.info.para.4": ".",
|
||||
"explore.map.page.side.panel.info.para.4.part.1": "Los distritos censales identificados como desfavorecidos en la versión 1.0 de la herramienta se consideran desfavorecidas en esta versión de la herramienta. Se resaltan estos distritos censales ",
|
||||
"explore.map.page.side.panel.info.para.4.part.2": " en el mapa.",
|
||||
"explore.map.page.side.panel.info.para.5.part.1": "La herramienta clasifica la mayoría de las cargas mediante el uso de percentiles ",
|
||||
"explore.map.page.side.panel.info.para.5.part.2": ". Los percentiles muestran la carga que experimenta cada distrito censal en comparación con otros distritos censales.",
|
||||
"explore.map.page.side.panel.info.para.6.part1": "Se destacan las tierras dentro de los límites de las tribus reconocidas a nivel federal y las ubicaciones puntuales de los pueblos nativos de Alaska ",
|
||||
"explore.map.page.side.panel.info.para.6.part2": " en el mapa. Estas comunidades también se consideran desfavorecidas.",
|
||||
"explore.map.page.side.panel.is.community.of.focus": "¿Está identificada como desfavorecida?",
|
||||
|
@ -307,7 +306,6 @@
|
|||
"faqs.page.Q14": "¿Cómo se utilizaron las recomendaciones del Consejo Asesor de Justicia Ambiental de la Casa Blanca (WHEJAC, por sus siglas en inglés) para esta herramienta?",
|
||||
"faqs.page.Q15": "¿Puede el público dar comentarios sobre esta herramienta?",
|
||||
"faqs.page.Q16": "¿Cuándo se lanzó la herramienta?",
|
||||
"faqs.page.Q17": "¿Cuándo salió la versión oficial de la herramienta?",
|
||||
"faqs.page.Q19": "¿Qué archivos y documentación están disponibles en la herramienta?",
|
||||
"faqs.page.Q2": "¿Cómo identifica y define la herramienta a las comunidades?",
|
||||
"faqs.page.Q20": "¿Cómo funciona el archivo de forma de la herramienta?",
|
||||
|
@ -357,7 +355,8 @@
|
|||
"faqs.page.answers.Q6_P1": "Las diferentes áreas del mapa tendrán diferentes colores porque se identifican como desfavorecidas de diferentes maneras. Algunos distritos censales que contienen tierras dentro de los límites de tribus reconocidas a nivel federal también se consideran desfavorecidas porque cumplen los umbrales de carga para al menos una de las categorías de la herramienta. Cuando esto ocurre, las zonas aparecen más oscuras en el mapa de la herramienta.",
|
||||
"faqs.page.answers.Q6_P2": ": Distritos censales desfavorecidos (cumple la metodología del umbral O contiene tierras de las tribus)",
|
||||
"faqs.page.answers.Q6_P3": ": Distritos censales desfavorecidos y tierras dentro de los límites de tribus reconocidas a nivel federal (cumple la metodología de umbral Y contiene tierras de las tribus)",
|
||||
"faqs.page.answers.Q6_P4": "Cualquier zona que aparezca resaltada se considera desfavorecida, independientemente de que sea un tono claro u oscuro. La herramienta mostrará si todo un distrito censal se considera desfavorecido o sólo las partes que se encuentran dentro de los límites de las tribus reconocidas a nivel federal.",
|
||||
"faqs.page.answers.Q6_P4": ": Distritos censales desfavorecidas (con derechos adquiridos)",
|
||||
"faqs.page.answers.Q6_P5": "Cualquier zona que aparezca resaltada se considera desfavorecida, independientemente de que sea un tono claro u oscuro. La herramienta mostrará si todo un distrito censal se considera desfavorecido o sólo las partes que se encuentran dentro de los límites de las tribus reconocidas a nivel federal.",
|
||||
"faqs.page.answers.Q7": "Un distrito censal que no cumple con ninguno de los umbrales de carga de la herramienta no suele considerarse una comunidad desfavorecida. Sin embargo, si dicho distrito censal contiene tierras dentro de los límites de las tribus reconocidas a nivel federal, entonces las partes de dicho distrito que contienen las tierras de las tribus se consideran desfavorecidas. La herramienta mostrará este tipo de distrito censal como \"parcialmente desfavorecido\".",
|
||||
"faqs.page.coming.soon.text": "¡Próximamente!",
|
||||
"faqs.page.title.text": "Preguntas frecuentes",
|
||||
|
@ -571,7 +570,7 @@
|
|||
"privacy.use.info.heading": "Cómo Utilizamos su Información",
|
||||
"privacy.use.info.body1": "Utilizamos la información que recopilamos para:",
|
||||
"privacy.use.info.list.item1": "Operar, mantener y mejorar la herramienta CEJST;",
|
||||
"privacy.use.info.list.item2": "Analyze how the tool is being used to inform future improvements;",
|
||||
"privacy.use.info.list.item2": "Analizar cómo se utiliza la herramienta para informar mejoras futuras;",
|
||||
"privacy.use.info.list.item3": "Comunicarnos con usted acerca de la herramienta, si nos envía comentarios completando un formulario o cuestionario; y",
|
||||
"privacy.use.info.list.item4": "Cumplir con las obligaciones legales y hacer cumplir nuestras políticas.",
|
||||
"privacy.sharing.info.body1": "No vendemos ni alquilamos su información personal a terceros para sus propios fines de marketing.",
|
||||
|
|
|
@ -8,9 +8,8 @@
|
|||
* This file must be manually updated as needed.
|
||||
*/
|
||||
|
||||
export const identicalKeysEnEs = [
|
||||
const identicalKeysEnEs = [
|
||||
'common.pages.footer.findcontact.link',
|
||||
'common.pages.footer.whitehouselogoalt',
|
||||
'common.pages.tsd.url',
|
||||
'explore.map.page.map.layer.selector.tribal.short',
|
||||
'explore.map.page.map.territoryFocus.alaska.long',
|
||||
|
@ -30,5 +29,4 @@ export const identicalKeysEnEs = [
|
|||
'explore.map.page.side.panel.not.community.of.focus',
|
||||
'methodology.page.dataset.indicator.diabetes.title.text',
|
||||
];
|
||||
|
||||
export default identicalKeysEnEs;
|
||||
module.exports = {identicalKeysEnEs};
|
||||
|
|
data/data-pipeline/.vscode/launch.json (157 changes)
|
@ -4,27 +4,9 @@
|
|||
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
|
||||
"version": "0.2.0",
|
||||
"configurations": [
|
||||
{
|
||||
"name": "Score Run",
|
||||
"type": "python",
|
||||
"request": "launch",
|
||||
"module": "data_pipeline.application",
|
||||
"args": [
|
||||
"score-run"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Generate Score Post",
|
||||
"type": "python",
|
||||
"request": "launch",
|
||||
"module": "data_pipeline.application",
|
||||
"args": [
|
||||
"generate-score-post"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Data Cleanup",
|
||||
"type": "python",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"module": "data_pipeline.application",
|
||||
"args": [
|
||||
|
@ -33,7 +15,7 @@
|
|||
},
|
||||
{
|
||||
"name": "Census Cleanup",
|
||||
"type": "python",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"module": "data_pipeline.application",
|
||||
"args": [
|
||||
|
@ -42,73 +24,25 @@
|
|||
},
|
||||
{
|
||||
"name": "Download Census",
|
||||
"type": "python",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"module": "data_pipeline.application",
|
||||
"args": [
|
||||
"census-data-download"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Score Full Run",
|
||||
"type": "python",
|
||||
"request": "launch",
|
||||
"module": "data_pipeline.application",
|
||||
"args": [
|
||||
"score-full-run"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Combine Score and GeoJSON",
|
||||
"type": "python",
|
||||
"request": "launch",
|
||||
"module": "data_pipeline.application",
|
||||
"args": [
|
||||
"geo-score"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Generate Score Tiles",
|
||||
"type": "python",
|
||||
"request": "launch",
|
||||
"module": "data_pipeline.application",
|
||||
"args": [
|
||||
"generate-map-tiles"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Generate Tribal Tiles",
|
||||
"type": "python",
|
||||
"request": "launch",
|
||||
"module": "data_pipeline.application",
|
||||
"args": [
|
||||
"generate-map-tiles",
|
||||
"-t"
|
||||
"census-data-download", "-u"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "ETL Run",
|
||||
"type": "python",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"module": "data_pipeline.application",
|
||||
"args": [
|
||||
"etl-run"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "ETL Run NRI",
|
||||
"type": "python",
|
||||
"request": "launch",
|
||||
"module": "data_pipeline.application",
|
||||
"args": [
|
||||
"etl-run",
|
||||
"--dataset",
|
||||
"national_risk_index"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "ETL Run Tribal",
|
||||
"type": "python",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"module": "data_pipeline.application",
|
||||
"args": [
|
||||
|
@ -117,18 +51,91 @@
|
|||
"tribal"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Score Run",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"module": "data_pipeline.application",
|
||||
"args": [
|
||||
"score-run"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Combine Score and GeoJSON",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"module": "data_pipeline.application",
|
||||
"args": [
|
||||
"geo-score"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Generate Score Post",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"module": "data_pipeline.application",
|
||||
"args": [
|
||||
"generate-score-post"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Generate Score Tiles",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"module": "data_pipeline.application",
|
||||
"args": [
|
||||
"generate-map-tiles"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Generate Tribal Tiles",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"module": "data_pipeline.application",
|
||||
"args": [
|
||||
"generate-map-tiles",
|
||||
"-t"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Score Full Run",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"module": "data_pipeline.application",
|
||||
"args": [
|
||||
"score-full-run"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Data Full Run",
|
||||
"type": "python",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"module": "data_pipeline.application",
|
||||
"args": [
|
||||
"data-full-run",
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Comparator",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"module": "data_pipeline.comparator",
|
||||
"args": [
|
||||
"compare-score",
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Convert score to CSV",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"module": "data_pipeline.application",
|
||||
"args": [
|
||||
"convert-score",
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "poetry install",
|
||||
"type": "python",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"module": "poetry",
|
||||
"args": [
|
||||
|
@ -137,7 +144,7 @@
|
|||
},
|
||||
{
|
||||
"name": "poetry update",
|
||||
"type": "python",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"module": "poetry",
|
||||
"args": [
|
||||
|
|
|
@@ -58,7 +58,6 @@ The application requires the installation of three 3rd party tools.

| Tool            | Purpose              | Link                                                      |
| --------------- | -------------------- | --------------------------------------------------------- |
| GDAL            | Generate census data | [GDAL library](https://github.com/OSGeo/gdal)             |
| libspatialindex | Score generation     | [libspatialindex](https://libspatialindex.org/en/latest/) |
| tippecanoe      | Generate map tiles   | [Mapbox tippecanoe](https://github.com/mapbox/tippecanoe) |

@@ -66,7 +65,6 @@ The application requires the installation of three 3rd party tools.

Use Homebrew to install the three tools.

- GDAL: `brew install gdal`
- libspatialindex: `brew install spatialindex`
- tippecanoe: `brew install tippecanoe`
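If you want a quick sanity check that the three tools are installed, a small Python probe like the one below works. Note that libspatialindex is a C library, so it is checked here indirectly through the `rtree` wrapper, which is an assumption about how the score step consumes it.

```python
# Optional pre-flight check (not part of the repo): confirm the external tools are reachable.
import shutil

print("GDAL/ogr2ogr:", shutil.which("ogr2ogr") or "NOT FOUND")
print("tippecanoe:  ", shutil.which("tippecanoe") or "NOT FOUND")
try:
    import rtree  # Python wrapper around libspatialindex

    print("libspatialindex (via rtree):", rtree.__version__)
except ImportError as err:
    print("libspatialindex (via rtree): NOT FOUND -", err)
```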
@@ -48,7 +48,7 @@ The detailed steps for performing [local environment installation can be found i

## Running the Data Pipeline and Scoring Application

The Justice40 Data Pipeline and Scoring Application is a multistep process that,
The Justice40 Data Pipeline and Scoring Application is a multi-step process that,

1. Retrieves input data sources (extract), standardizes those input data sources' data into an intermediate format (transform), and saves the results to the file system (load). It performs those steps for each configured input data source (found at [`data_pipeline/etl/sources`](data_pipeline/etl/sources))
2. Calculates a score
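A minimal Python sketch of that sequence, assuming the helper functions shown later in this merge (`etl_runner`, `score_generate`, `score_post`, `score_geo`) are importable from `data_pipeline.etl.runner` (the module path is an assumption):

```python
# Illustrative only: the same ETL-then-score sequence driven from Python instead of the CLI.
from data_pipeline.etl.runner import etl_runner, score_generate, score_post, score_geo  # assumed path

etl_runner(use_cache=True)            # 1. ETL every configured input data source
etl_runner("tribal", use_cache=True)  #    the tribal layer runs as its own dataset
score_generate()                      # 2. calculate the score
score_post(data_source="local")       #    post-process the score outputs
score_geo(data_source="local")        #    join the score to census geometry for map tiles
```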
@@ -117,7 +117,7 @@ Begin the process of running the application in your local environment by downlo

To download census data, run the command `poetry run python3 data_pipeline/application.py census-data-download`.

If you have a high speed internet connection and don't want to generate the census data or install `GDAL` locally, you can download [a zip version of the Census file](https://justice40-data.s3.amazonaws.com/data-sources/census.zip). Unzip and move the contents inside the `data/data-pipeline/data_pipeline/data/census` folder.
If you have a high speed internet connection and don't want to generate the census data locally, you can download [a zip version of the Census file](https://justice40-data.s3.amazonaws.com/data-sources/census.zip). Unzip and move the contents inside the `data/data-pipeline/data_pipeline/data/census` folder.
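A hedged convenience sketch for that download-and-unzip step (not from the repo; depending on the archive's internal layout you may need to move the extracted contents up one level):

```python
# Fetch the prebuilt census.zip linked above and unpack it into the expected data folder.
import zipfile
from pathlib import Path

import requests

url = "https://justice40-data.s3.amazonaws.com/data-sources/census.zip"
dest = Path("data/data-pipeline/data_pipeline/data/census")
dest.mkdir(parents=True, exist_ok=True)

archive = dest / "census.zip"
with requests.get(url, stream=True, timeout=300) as resp:
    resp.raise_for_status()
    with open(archive, "wb") as fh:
        for chunk in resp.iter_content(chunk_size=1 << 20):
            fh.write(chunk)

zipfile.ZipFile(archive).extractall(dest)
```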
#### Run the Application
@ -1,4 +1,7 @@
|
|||
import sys
|
||||
import os
|
||||
import pandas as pd
|
||||
from pathlib import Path
|
||||
from subprocess import call
|
||||
|
||||
import click
|
||||
|
@ -19,6 +22,7 @@ from data_pipeline.etl.sources.tribal.etl_utils import (
|
|||
reset_data_directories as tribal_reset,
|
||||
)
|
||||
from data_pipeline.tile.generate import generate_tiles
|
||||
from data_pipeline.etl.score import constants
|
||||
from data_pipeline.utils import check_first_run
|
||||
from data_pipeline.utils import data_folder_cleanup
|
||||
from data_pipeline.utils import downloadable_cleanup
|
||||
|
@ -29,8 +33,6 @@ from data_pipeline.utils import geo_score_folder_cleanup
|
|||
|
||||
logger = get_module_logger(__name__)
|
||||
|
||||
dataset_cli_help = "Grab the data from either 'local' for local access or 'aws' to retrieve from Justice40 S3 repository"
|
||||
|
||||
LOG_LINE_WIDTH = 60
|
||||
|
||||
use_cache_option = click.option(
|
||||
|
@ -38,7 +40,7 @@ use_cache_option = click.option(
|
|||
"--use-cache",
|
||||
is_flag=True,
|
||||
default=False,
|
||||
help="Check if data source has been downloaded already, and if it has, use the cached version of the data source.",
|
||||
help="When set, will check for cached data sources to use before downloading new ones.",
|
||||
)
|
||||
|
||||
dataset_option = click.option(
|
||||
|
@ -46,7 +48,7 @@ dataset_option = click.option(
|
|||
"--dataset",
|
||||
required=False,
|
||||
type=str,
|
||||
help=dataset_cli_help,
|
||||
help="Name of dataset to run. If not provided, all datasets will be run.",
|
||||
)
|
||||
|
||||
data_source_option = click.option(
|
||||
|
@ -55,7 +57,7 @@ data_source_option = click.option(
|
|||
default="local",
|
||||
required=False,
|
||||
type=str,
|
||||
help=dataset_cli_help,
|
||||
help="Grab the data from either 'local' for local access or 'aws' to retrieve from Justice40 S3 repository. Default is 'local'.",
|
||||
)
|
||||
|
||||
|
||||
|
@ -141,9 +143,14 @@ def pull_census_data(data_source: str):
|
|||
@cli.command(
|
||||
help="Run all ETL processes or a specific one",
|
||||
)
|
||||
@click.option(
|
||||
"--no-concurrency",
|
||||
is_flag=True,
|
||||
help="Run ETLs sequentially instead of concurrently.",
|
||||
)
|
||||
@dataset_option
|
||||
@use_cache_option
|
||||
def etl_run(dataset: str, use_cache: bool):
|
||||
def etl_run(dataset: str, use_cache: bool, no_concurrency: bool):
|
||||
"""Run a specific or all ETL processes
|
||||
|
||||
Args:
|
||||
|
@ -155,7 +162,7 @@ def etl_run(dataset: str, use_cache: bool):
|
|||
log_title("Run ETL")
|
||||
|
||||
log_info("Running dataset(s)")
|
||||
etl_runner(dataset, use_cache)
|
||||
etl_runner(dataset, use_cache, no_concurrency)
|
||||
|
||||
log_goodbye()
|
||||
|
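A hedged sketch of exercising the updated `etl-run` command, including the new `--no-concurrency` flag, in-process with click's test runner. That the click group is named `cli` in `data_pipeline.application` is an assumption inferred from the `python3 -m data_pipeline.application etl-run` invocations used elsewhere in this merge.

```python
from click.testing import CliRunner

from data_pipeline.application import cli  # assumed name of the click group

result = CliRunner().invoke(
    cli, ["etl-run", "--dataset", "tribal", "--use-cache", "--no-concurrency"]
)
assert result.exit_code == 0, result.output
```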
||||
|
@ -290,10 +297,10 @@ def generate_map_tiles(generate_tribal_layer):
|
|||
@data_source_option
|
||||
@use_cache_option
|
||||
def data_full_run(check: bool, data_source: str, use_cache: bool):
|
||||
"""CLI command to run ETL, score, JSON combine and generate tiles in one command
|
||||
"""CLI command to run ETL, score, JSON combine and generate tiles including tribal layer in one command
|
||||
|
||||
Args:
|
||||
check (bool): Run the full data run only if the first run sempahore file is not set (optional)
|
||||
check (bool): Run the full data run only if the first run semaphore file is not set (optional)
|
||||
data_source (str): Source for the census data (optional)
|
||||
Options:
|
||||
- local: fetch census and score data from the local data directory
|
||||
|
@ -327,25 +334,11 @@ def data_full_run(check: bool, data_source: str, use_cache: bool):
|
|||
temp_folder_cleanup()
|
||||
tribal_reset(data_path)
|
||||
|
||||
if data_source == "local":
|
||||
log_info("Downloading census data")
|
||||
etl_runner("census", use_cache)
|
||||
log_info("Downloading census data")
|
||||
etl_runner("census", use_cache)
|
||||
|
||||
log_info("Running all ETLs")
|
||||
etl_runner(use_cache=True)
|
||||
|
||||
log_info("Running tribal ETL")
|
||||
etl_runner("tribal", use_cache)
|
||||
|
||||
else:
|
||||
log_info("Downloading census data")
|
||||
etl_runner("census", use_cache=False)
|
||||
|
||||
log_info("Running all ETLs")
|
||||
etl_runner(use_cache=False)
|
||||
|
||||
log_info("Running tribal ETL")
|
||||
etl_runner("tribal", use_cache=False)
|
||||
log_info("Running all ETLs")
|
||||
etl_runner(use_cache)
|
||||
|
||||
log_info("Generating score")
|
||||
score_generate()
|
||||
|
@ -445,7 +438,7 @@ def clear_data_source_cache(dataset: str):
|
|||
)
|
||||
@click.pass_context
|
||||
def full_post_etl(ctx):
|
||||
"""Generate scoring and tiles"""
|
||||
"""Generate scoring and tiles including tribal layer"""
|
||||
ctx.invoke(score_run)
|
||||
ctx.invoke(generate_score_post, data_source=None)
|
||||
ctx.invoke(geo_score, data_source=None)
|
||||
|
@ -459,15 +452,46 @@ def full_post_etl(ctx):
|
|||
@use_cache_option
|
||||
@click.pass_context
|
||||
def full_run(ctx, use_cache):
|
||||
"""Run all downloads, ETLs, and generate scores and tiles"""
|
||||
"""Run all downloads, ETLs, and generate scores and tiles including tribal layer"""
|
||||
if not use_cache:
|
||||
ctx.invoke(data_cleanup)
|
||||
ctx.invoke(census_data_download, zip_compress=False, use_cache=use_cache)
|
||||
ctx.invoke(etl_run, dataset=None, use_cache=use_cache)
|
||||
ctx.invoke(etl_run, dataset="tribal", use_cache=use_cache)
|
||||
ctx.invoke(full_post_etl)
|
||||
|
||||
|
||||
@cli.command(
|
||||
help="Convert a Pickle or Parquet file to GeoJSON or CSV depending on the contents of the file.",
|
||||
)
|
||||
@click.option(
|
||||
"--source",
|
||||
"-s",
|
||||
type=click.Path(),
|
||||
# We don't require this option, otherwise the tool will not run when there is no score
|
||||
default=constants.DATA_SCORE_CSV_FULL_FILE_PATH,
|
||||
help="Path to the input file. Defaults to the default location of the local score file.",
|
||||
)
|
||||
@click.option(
|
||||
"--destination",
|
||||
"-d",
|
||||
type=click.Path(writable=True),
|
||||
default=Path(
|
||||
os.path.splitext(constants.DATA_SCORE_CSV_FULL_FILE_PATH)[0] + ".csv"
|
||||
),
|
||||
help="Path to the input file. Defaults to the source file with CSV extension.",
|
||||
)
|
||||
def convert_score(source: Path, destination: Path):
|
||||
"""Converts the score file to CSV."""
|
||||
if source.exists():
|
||||
score_df = pd.read_parquet(source)
|
||||
logger.info(f"Saving score as CSV to {destination}")
|
||||
score_df.to_csv(destination, index=False)
|
||||
logger.info("Done.")
|
||||
else:
|
||||
logger.error(f"Error: Unable to read {source}")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def log_title(title: str, subtitle: str = None):
|
||||
"""Logs a title in our fancy title format"""
|
||||
logger.info("-" * LOG_LINE_WIDTH)
|
||||
|
|
|
@ -51,12 +51,19 @@ def _read_from_file(file_path: Path):
|
|||
"Please generate the score and try again."
|
||||
)
|
||||
sys.exit(1)
|
||||
return pd.read_csv(
|
||||
file_path,
|
||||
index_col="GEOID10_TRACT",
|
||||
dtype={"GEOID10_TRACT": str},
|
||||
low_memory=False,
|
||||
).sort_index()
|
||||
df = pd.DataFrame()
|
||||
if file_path.suffix == ".parquet":
|
||||
df = pd.read_parquet(file_path)
|
||||
df.set_index("GEOID10_TRACT", inplace=True)
|
||||
else:
|
||||
df = pd.read_csv(
|
||||
file_path,
|
||||
index_col="GEOID10_TRACT",
|
||||
dtype={"GEOID10_TRACT": str},
|
||||
low_memory=False,
|
||||
)
|
||||
|
||||
return df.sort_index()
|
||||
|
||||
|
||||
def _add_tract_list(tract_list: list[str]):
|
||||
|
@ -67,7 +74,7 @@ def _add_tract_list(tract_list: list[str]):
|
|||
tract_list (list[str]): a list of tracts
|
||||
"""
|
||||
if len(tract_list) > 0:
|
||||
_add_text("Those tracts are:\n")
|
||||
_add_text(" Those tracts are:\n")
|
||||
# First extract the Census states/territories
|
||||
states_by_tract = []
|
||||
for tract in tract_list:
|
||||
|
@ -125,7 +132,7 @@ def _compare_score_results(prod_df: pd.DataFrame, local_df: pd.DataFrame):
|
|||
local_df (pd.DataFrame): the local score
|
||||
"""
|
||||
log_info("Comparing dataframe contents (production vs local)")
|
||||
_add_text("\n\n## Scores\n")
|
||||
_add_text("\n## Scores\n")
|
||||
|
||||
production_row_count = len(prod_df.index)
|
||||
local_row_count = len(local_df.index)
|
||||
|
@ -189,10 +196,10 @@ def _compare_score_results(prod_df: pd.DataFrame, local_df: pd.DataFrame):
|
|||
f" in the locally generated score representing {local_pct_of_population_represented:.1%} of the total population."
|
||||
)
|
||||
_add_text(
|
||||
" The number of tracts match!\n "
|
||||
" The number of tracts match!\n"
|
||||
if len(production_disadvantaged_tracts_set)
|
||||
== len(local_disadvantaged_tracts_set)
|
||||
else f" The difference is {abs(len(production_disadvantaged_tracts_set) - len(local_disadvantaged_tracts_set))} tract(s).\n "
|
||||
else f" The difference is {abs(len(production_disadvantaged_tracts_set) - len(local_disadvantaged_tracts_set))} tract(s).\n"
|
||||
)
|
||||
|
||||
removed_tracts = production_disadvantaged_tracts_set.difference(
|
||||
|
@ -213,17 +220,44 @@ def _compare_score_results(prod_df: pd.DataFrame, local_df: pd.DataFrame):
|
|||
)
|
||||
_add_tract_list(added_tracts)
|
||||
|
||||
# Grandfathered tracts from v1.0
|
||||
grandfathered_tracts = local_df.loc[
|
||||
local_df[field_names.GRANDFATHERED_N_COMMUNITIES_V1_0]
|
||||
].index
|
||||
if len(grandfathered_tracts) > 0:
|
||||
_add_text(
|
||||
f"* This includes {len(grandfathered_tracts)} grandfathered tract(s) from v1.0 scoring."
|
||||
)
|
||||
_add_tract_list(grandfathered_tracts)
|
||||
|
||||
def _check_grandfathered_tracts(
|
||||
prod_df: pd.DataFrame, local_df: pd.DataFrame, compare_to_version: str
|
||||
):
|
||||
"""
|
||||
Find grandfathered tracts for v1.0 comparisons.
|
||||
|
||||
Args:
|
||||
prod_df (pd.DataFrame): the production score
|
||||
local_df (pd.DataFrame): the local score
|
||||
compare_to_version (str): the compare to version
|
||||
"""
|
||||
|
||||
# Set the field we will check for grandfathering.
|
||||
# This allows us to add other fields for other versions.
|
||||
grandfathered_field = (
|
||||
field_names.GRANDFATHERED_N_COMMUNITIES_V1_0
|
||||
if compare_to_version.startswith("1")
|
||||
else None
|
||||
)
|
||||
|
||||
# If there is a grandfathered field then check for those tracts
|
||||
if grandfathered_field:
|
||||
log_info("Checking for grandfathered tracks")
|
||||
grandfathered_tracts = local_df.loc[
|
||||
local_df[field_names.GRANDFATHERED_N_COMMUNITIES_V1_0]
|
||||
].index
|
||||
if len(grandfathered_tracts) > 0:
|
||||
_add_text(
|
||||
f"\n* This includes {len(grandfathered_tracts)} grandfathered tract(s) from v1.0 scoring."
|
||||
)
|
||||
_add_tract_list(grandfathered_tracts)
|
||||
else:
|
||||
_add_text(
|
||||
"* There are NO grandfathered tracts from v1.0 scoring.\n"
|
||||
)
|
||||
else:
|
||||
_add_text("* There are NO grandfathered tracts from v1.0 scoring.\n")
|
||||
_add_text("\n* There is no grandfathered tract list for this version.")
|
||||
|
||||
|
||||
def _generate_delta(prod_df: pd.DataFrame, local_df: pd.DataFrame):
|
||||
|
@ -234,7 +268,7 @@ def _generate_delta(prod_df: pd.DataFrame, local_df: pd.DataFrame):
|
|||
prod_df (pd.DataFrame): the production score
|
||||
local_df (pd.DataFrame): the local score
|
||||
"""
|
||||
_add_text("\n## Delta\n")
|
||||
_add_text("\n\n## Delta\n")
|
||||
# First we make the columns on two dataframes to be the same to be able to compare
|
||||
local_score_df_columns = local_df.columns.array.tolist()
|
||||
production_score_df_columns = prod_df.columns.array.tolist()
|
||||
|
@ -287,7 +321,7 @@ def cli():
|
|||
@click.option(
|
||||
"-v",
|
||||
"--compare-to-version",
|
||||
default="1.0",
|
||||
default="2.0",
|
||||
required=False,
|
||||
type=str,
|
||||
help="Set the production score version to compare to",
|
||||
|
@ -359,8 +393,10 @@ def compare_score(
|
|||
|
||||
_compare_score_columns(production_score_df, local_score_df)
|
||||
_compare_score_results(production_score_df, local_score_df)
|
||||
_check_grandfathered_tracts(
|
||||
production_score_df, local_score_df, compare_to_version
|
||||
)
|
||||
_generate_delta(production_score_df, local_score_df)
|
||||
|
||||
result_doc = _get_result_doc()
|
||||
print(result_doc)
|
||||
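A hedged sketch of running the comparator against the v1.0 production score instead of the new 2.0 default; the `compare-score` command and `-v/--compare-to-version` option come from this diff, and the `cli` group name is taken from the hunk context above.

```python
from click.testing import CliRunner

from data_pipeline.comparator import cli

result = CliRunner().invoke(cli, ["compare-score", "-v", "1.0"])
print(result.output)  # the Markdown comparison report printed by compare_score
```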
|
||||
|
|
|
@ -12,7 +12,8 @@ settings = Dynaconf(
|
|||
# set root dir
|
||||
settings.APP_ROOT = pathlib.Path(data_pipeline.__file__).resolve().parent
|
||||
settings.DATA_PATH = settings.APP_ROOT / "data"
|
||||
settings.REQUESTS_DEFAULT_TIMOUT = 3600
|
||||
settings.REQUESTS_DEFAULT_TIMOUT = 300
|
||||
settings.REQUESTS_DEFAULT_RETRIES = 3
|
||||
# To set an environment use:
|
||||
# Linux/OSX: export ENV_FOR_DYNACONF=staging
|
||||
# Windows: set ENV_FOR_DYNACONF=staging
|
||||
|
|
|
@ -155,7 +155,13 @@ DATASET_LIST = [
|
|||
"class_name": "HistoricRedliningETL",
|
||||
"is_memory_intensive": False,
|
||||
},
|
||||
# This has to come after us.json exists
|
||||
{
|
||||
"name": "tribal",
|
||||
"module_dir": "tribal",
|
||||
"class_name": "TribalETL",
|
||||
"is_memory_intensive": False,
|
||||
},
|
||||
# This has to come after us_geo.parquet exists
|
||||
{
|
||||
"name": "census_acs",
|
||||
"module_dir": "census_acs",
|
||||
|
@ -196,10 +202,3 @@ CENSUS_INFO = {
|
|||
"class_name": "CensusETL",
|
||||
"is_memory_intensive": False,
|
||||
}
|
||||
|
||||
TRIBAL_INFO = {
|
||||
"name": "tribal",
|
||||
"module_dir": "tribal",
|
||||
"class_name": "TribalETL",
|
||||
"is_memory_intensive": False,
|
||||
}
|
||||
|
|
|
@ -12,13 +12,26 @@ from tenacity import retry, stop_after_attempt, wait_exponential
|
|||
logger = get_module_logger(__name__)
|
||||
|
||||
|
||||
def _log_retry_failure(retry_state):
|
||||
logger.warning(
|
||||
f"Failure downloading {retry_state.kwargs['file_url']}. Will retry."
|
||||
)
|
||||
|
||||
|
||||
class Downloader:
|
||||
"""A simple class to encapsulate the download capabilities of the application"""
|
||||
|
||||
num_retries = (
|
||||
settings.REQUEST_RETRIES
|
||||
if "REQUEST_RETRIES" in settings
|
||||
else settings.REQUESTS_DEFAULT_RETRIES
|
||||
)
|
||||
|
||||
@classmethod
|
||||
@retry(
|
||||
stop=stop_after_attempt(3),
|
||||
stop=stop_after_attempt(num_retries),
|
||||
wait=wait_exponential(multiplier=1, min=4, max=10),
|
||||
before_sleep=_log_retry_failure,
|
||||
)
|
||||
def download_file_from_url(
|
||||
cls,
|
||||
|
@ -43,9 +56,12 @@ class Downloader:
|
|||
|
||||
download_file_name.parent.mkdir(parents=True, exist_ok=True)
|
||||
logger.debug(f"Downloading {file_url}")
|
||||
response = requests.get(
|
||||
file_url, verify=verify, timeout=settings.REQUESTS_DEFAULT_TIMOUT
|
||||
timeout = (
|
||||
settings.REQUEST_TIMEOUT
|
||||
if "REQUEST_TIMEOUT" in settings
|
||||
else settings.REQUESTS_DEFAULT_TIMOUT
|
||||
)
|
||||
response = requests.get(file_url, verify=verify, timeout=timeout)
|
||||
if response.status_code == 200:
|
||||
file_contents = response.content
|
||||
logger.debug("Downloaded.")
|
||||
|
@ -64,8 +80,9 @@ class Downloader:
|
|||
|
||||
@classmethod
|
||||
@retry(
|
||||
stop=stop_after_attempt(3),
|
||||
stop=stop_after_attempt(num_retries),
|
||||
wait=wait_exponential(multiplier=1, min=4, max=10),
|
||||
before_sleep=_log_retry_failure,
|
||||
)
|
||||
def download_zip_file_from_url(
|
||||
cls,
|
||||
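A hedged usage sketch for the retrying downloader above: the keyword names are inferred from the method body shown in this diff, the import location is an assumption, and the URL reuses the census archive referenced in the README.

```python
from pathlib import Path

from data_pipeline.utils import Downloader  # assumed module location

Downloader.download_file_from_url(
    file_url="https://justice40-data.s3.amazonaws.com/data-sources/census.zip",
    download_file_name=Path("/tmp/census.zip"),
    verify=True,
)
```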
|
|
|
@ -1,6 +1,8 @@
|
|||
import concurrent.futures
|
||||
import importlib
|
||||
import time
|
||||
import typing
|
||||
import os
|
||||
|
||||
from functools import reduce
|
||||
|
||||
|
@ -26,9 +28,7 @@ def _get_datasets_to_run(dataset_to_run: str) -> typing.List[dict]:
|
|||
None
|
||||
"""
|
||||
dataset_list = constants.DATASET_LIST
|
||||
etls_to_search = (
|
||||
dataset_list + [constants.CENSUS_INFO] + [constants.TRIBAL_INFO]
|
||||
)
|
||||
etls_to_search = dataset_list + [constants.CENSUS_INFO]
|
||||
|
||||
if dataset_to_run:
|
||||
dataset_element = next(
|
||||
|
@ -58,6 +58,8 @@ def _get_dataset(dataset: dict) -> ExtractTransformLoad:
|
|||
def _run_one_dataset(dataset: dict, use_cache: bool = False) -> None:
|
||||
"""Runs one etl process."""
|
||||
|
||||
start_time = time.time()
|
||||
|
||||
logger.info(f"Running ETL for {dataset['name']}")
|
||||
etl_instance = _get_dataset(dataset)
|
||||
|
||||
|
@ -82,9 +84,16 @@ def _run_one_dataset(dataset: dict, use_cache: bool = False) -> None:
|
|||
etl_instance.cleanup()
|
||||
|
||||
logger.info(f"Finished ETL for dataset {dataset['name']}")
|
||||
logger.debug(
|
||||
f"Execution time for ETL for dataset {dataset['name']} was {time.time() - start_time}s"
|
||||
)
|
||||
|
||||
|
||||
def etl_runner(dataset_to_run: str = None, use_cache: bool = False) -> None:
|
||||
def etl_runner(
|
||||
dataset_to_run: str = None,
|
||||
use_cache: bool = False,
|
||||
no_concurrency: bool = False,
|
||||
) -> None:
|
||||
"""Runs all etl processes or a specific one
|
||||
|
||||
Args:
|
||||
|
@ -112,9 +121,12 @@ def etl_runner(dataset_to_run: str = None, use_cache: bool = False) -> None:
|
|||
dataset for dataset in dataset_list if dataset["is_memory_intensive"]
|
||||
]
|
||||
|
||||
max_workers = 1 if no_concurrency else os.cpu_count()
|
||||
if concurrent_datasets:
|
||||
logger.info("Running concurrent ETL jobs")
|
||||
with concurrent.futures.ThreadPoolExecutor() as executor:
|
||||
logger.info(f"Running concurrent ETL jobs on {max_workers} thread(s)")
|
||||
with concurrent.futures.ThreadPoolExecutor(
|
||||
max_workers=max_workers
|
||||
) as executor:
|
||||
futures = {
|
||||
executor.submit(
|
||||
_run_one_dataset, dataset=dataset, use_cache=use_cache
|
||||
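A stand-alone illustration (not the project's code) of the `max_workers` switch used above: one worker thread when concurrency is disabled, otherwise one per CPU.

```python
import concurrent.futures
import os


def run_all(tasks, no_concurrency: bool = False) -> None:
    """Run callables either sequentially (one worker) or on one thread per CPU."""
    max_workers = 1 if no_concurrency else os.cpu_count()
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(task): task for task in tasks}
        for future in concurrent.futures.as_completed(futures):
            future.result()  # surface any exception raised inside a task


run_all([lambda: print("etl job")], no_concurrency=True)
```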
|
@ -189,10 +201,14 @@ def score_generate() -> None:
|
|||
"""
|
||||
|
||||
# Score Gen
|
||||
start_time = time.time()
|
||||
score_gen = ScoreETL()
|
||||
score_gen.extract()
|
||||
score_gen.transform()
|
||||
score_gen.load()
|
||||
logger.debug(
|
||||
f"Execution time for Score Generation was {time.time() - start_time}s"
|
||||
)
|
||||
|
||||
|
||||
def score_post(data_source: str = "local") -> None:
|
||||
|
@ -208,11 +224,15 @@ def score_post(data_source: str = "local") -> None:
|
|||
None
|
||||
"""
|
||||
# Post Score Processing
|
||||
start_time = time.time()
|
||||
score_post = PostScoreETL(data_source=data_source)
|
||||
score_post.extract()
|
||||
score_post.transform()
|
||||
score_post.load()
|
||||
score_post.cleanup()
|
||||
logger.debug(
|
||||
f"Execution time for Score Post was {time.time() - start_time}s"
|
||||
)
|
||||
|
||||
|
||||
def score_geo(data_source: str = "local") -> None:
|
||||
|
@ -229,10 +249,14 @@ def score_geo(data_source: str = "local") -> None:
|
|||
"""
|
||||
|
||||
# Score Geo
|
||||
start_time = time.time()
|
||||
score_geo = GeoScoreETL(data_source=data_source)
|
||||
score_geo.extract()
|
||||
score_geo.transform()
|
||||
score_geo.load()
|
||||
logger.debug(
|
||||
f"Execution time for Score Geo was {time.time() - start_time}s"
|
||||
)
|
||||
|
||||
|
||||
def _find_dataset_index(dataset_list, key, value):
|
||||
|
|
|
@ -24,7 +24,7 @@ DATA_CENSUS_DIR = DATA_PATH / "census"
|
|||
DATA_CENSUS_CSV_DIR = DATA_CENSUS_DIR / "csv"
|
||||
DATA_CENSUS_CSV_FILE_PATH = DATA_CENSUS_CSV_DIR / "us.csv"
|
||||
DATA_CENSUS_CSV_STATE_FILE_PATH = DATA_CENSUS_CSV_DIR / "fips_states_2010.csv"
|
||||
DATA_CENSUS_GEOJSON_FILE_PATH = DATA_CENSUS_DIR / "geojson" / "us.json"
|
||||
DATA_CENSUS_GEOJSON_FILE_PATH = DATA_CENSUS_DIR / "geojson" / "us_geo.parquet"
|
||||
|
||||
# Score paths
|
||||
DATA_SCORE_DIR = DATA_PATH / "score"
|
||||
|
@ -32,7 +32,7 @@ DATA_SCORE_DIR = DATA_PATH / "score"
|
|||
## Score CSV Paths
|
||||
DATA_SCORE_CSV_DIR = DATA_SCORE_DIR / "csv"
|
||||
DATA_SCORE_CSV_FULL_DIR = DATA_SCORE_CSV_DIR / "full"
|
||||
DATA_SCORE_CSV_FULL_FILE_PATH = DATA_SCORE_CSV_FULL_DIR / "usa.csv"
|
||||
DATA_SCORE_CSV_FULL_FILE_PATH = DATA_SCORE_CSV_FULL_DIR / "usa_score.parquet"
|
||||
FULL_SCORE_CSV_FULL_PLUS_COUNTIES_FILE_PATH = (
|
||||
DATA_SCORE_CSV_FULL_DIR / "usa_counties.csv"
|
||||
)
|
||||
|
@ -52,7 +52,7 @@ DATA_TILES_SEARCH_DIR = DATA_SCORE_DIR / "search"
|
|||
|
||||
# Downloadable paths
|
||||
if not os.environ.get("J40_VERSION_LABEL_STRING"):
|
||||
version_str = "beta"
|
||||
version_str = "2.0"
|
||||
else:
|
||||
version_str = os.environ.get("J40_VERSION_LABEL_STRING")
|
||||
|
||||
|
|
|
@ -727,4 +727,4 @@ class ScoreETL(ExtractTransformLoad):
|
|||
def load(self) -> None:
|
||||
constants.DATA_SCORE_CSV_FULL_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
self.df.to_csv(constants.DATA_SCORE_CSV_FULL_FILE_PATH, index=False)
|
||||
self.df.to_parquet(constants.DATA_SCORE_CSV_FULL_FILE_PATH, index=False)
|
||||
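With the full score now written as Parquet (the constants change in this merge points `DATA_SCORE_CSV_FULL_FILE_PATH` at `usa_score.parquet`), downstream consumers read it back with `pandas.read_parquet` rather than `read_csv`. A minimal sketch:

```python
import pandas as pd

from data_pipeline.etl.score import constants

# DATA_SCORE_CSV_FULL_FILE_PATH now points at .../score/csv/full/usa_score.parquet
score_df = pd.read_parquet(constants.DATA_SCORE_CSV_FULL_FILE_PATH)
print(score_df.shape)
```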
|
|
|
@@ -37,9 +37,7 @@ class GeoScoreETL(ExtractTransformLoad):
        self.SCORE_CSV_PATH = self.DATA_PATH / "score" / "csv"
        self.TILE_SCORE_CSV = self.SCORE_CSV_PATH / "tiles" / "usa.csv"

        self.CENSUS_USA_GEOJSON = (
            self.DATA_PATH / "census" / "geojson" / "us.json"
        )
        self.CENSUS_USA_GEOJSON = constants.DATA_CENSUS_GEOJSON_FILE_PATH

        # Import the shortened name for Score N to be used on tiles.
        # We should no longer be using PFS

@@ -87,16 +85,14 @@ class GeoScoreETL(ExtractTransformLoad):
            score_data_source=self.DATA_SOURCE,
        )

        logger.info("Reading US GeoJSON (~6 minutes)")
        full_geojson_usa_df = gpd.read_file(
        logger.info("Reading US GeoJSON")
        full_geojson_usa_df = gpd.read_parquet(
            self.CENSUS_USA_GEOJSON,
            dtype={self.GEOID_FIELD_NAME: "string"},
            usecols=[
            columns=[
                self.GEOID_FIELD_NAME,
                self.GEOMETRY_FIELD_NAME,
                self.LAND_FIELD_NAME,
            ],
            low_memory=False,
        )

        # We only want to keep tracts to visualize that have non-0 land

@@ -104,7 +100,7 @@ class GeoScoreETL(ExtractTransformLoad):
            full_geojson_usa_df[self.LAND_FIELD_NAME] > 0
        ]

        logger.info("Reading score CSV")
        logger.info("Reading tile score CSV")
        self.score_usa_df = pd.read_csv(
            self.TILE_SCORE_CSV,
            dtype={
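geopandas.read_parquet takes a columns= list rather than read_file's usecols/dtype keywords, and pruning to the three needed columns is presumably what lets the log message drop its "~6 minutes" warning. A rough sketch of the pattern with stand-in file and field names (the geometry column must be included in the selection):

import geopandas as gpd

usa = gpd.read_parquet(
    "data/census/geojson/us_geo.parquet",        # stand-in for the real constant
    columns=["GEOID10", "geometry", "ALAND10"],  # id, geometry, land-area fields
)
usa = usa[usa["ALAND10"] > 0]  # keep only tracts with non-zero land area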
@@ -94,12 +94,8 @@ class PostScoreETL(ExtractTransformLoad):
        )

    def _extract_score(self, score_path: Path) -> pd.DataFrame:
        logger.debug("Reading Score CSV")
        df = pd.read_csv(
            score_path,
            dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
            low_memory=False,
        )
        logger.debug("Reading Score")
        df = pd.read_parquet(score_path)

        # Convert total population to an int
        df["Total population"] = df["Total population"].astype(

@@ -116,8 +112,7 @@ class PostScoreETL(ExtractTransformLoad):
            gpd.GeoDataFrame: the census geo json data
        """
        logger.debug("Reading Census GeoJSON")
        with open(geo_path, "r", encoding="utf-8") as file:
            data = gpd.read_file(file)
        data = gpd.read_parquet(geo_path)
        return data

    def extract(self, use_cached_data_sources: bool = False) -> None:

@@ -517,7 +512,6 @@ class PostScoreETL(ExtractTransformLoad):
        num_cols = len(excel_df.columns)
        worksheet.set_column(0, num_cols - 1, num_excel_cols_width)

        writer.save()
        return excel_csv_config

    def _load_tile_csv(
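The deleted writer.save() call is consistent with newer pandas, where ExcelWriter.save() was removed in favor of close(); how the project actually manages the writer is not visible in this hunk. A generic sketch of the context-manager form with xlsxwriter (illustrative data and widths):

import pandas as pd

excel_df = pd.DataFrame({"GEOID10_TRACT": ["01001020100"], "Score": [0.5]})

with pd.ExcelWriter("downloadable.xlsx", engine="xlsxwriter") as writer:
    excel_df.to_excel(writer, index=False, sheet_name="Data")
    worksheet = writer.sheets["Data"]
    worksheet.set_column(0, len(excel_df.columns) - 1, 18)  # width in characters
# exiting the block closes (and saves) the workbook, so no writer.save() is needed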
@@ -70,7 +70,7 @@ def state_data_initial(sample_data_dir):

@pytest.fixture()
def score_data_initial(sample_data_dir):
    return sample_data_dir / "score_data_initial.csv"
    return sample_data_dir / "score_data_initial.parquet"


@pytest.fixture()

@@ -104,8 +104,8 @@ def states_transformed_expected():

@pytest.fixture()
def score_transformed_expected():
    return pd.read_pickle(
        pytest.SNAPSHOT_DIR / "score_transformed_expected.pkl"
    return pd.read_parquet(
        pytest.SNAPSHOT_DIR / "score_transformed_expected.parquet"
    )


@@ -122,7 +122,7 @@ def national_tract_df():

@pytest.fixture()
def score_data_expected():
    return pd.read_pickle(pytest.SNAPSHOT_DIR / "score_data_expected.pkl")
    return pd.read_parquet(pytest.SNAPSHOT_DIR / "score_data_expected.parquet")


@pytest.fixture()

@@ -144,8 +144,8 @@ def create_tile_data_expected():

@pytest.fixture()
def downloadable_data_expected():
    return pd.read_pickle(
        pytest.SNAPSHOT_DIR / "downloadable_data_expected.pkl"
    return pd.read_parquet(
        pytest.SNAPSHOT_DIR / "downloadable_data_expected.parquet"
    )
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -33,8 +33,7 @@ def test_extract_states(etl, state_data_initial):

def test_extract_score(etl, score_data_initial):
    extracted = etl._extract_score(score_data_initial)
    string_cols = ["GEOID10_TRACT"]
    assert all(ptypes.is_string_dtype(extracted[col]) for col in string_cols)
    assert len(extracted) > 0


# Transform Tests

@@ -107,6 +106,7 @@ def test_create_downloadable_data(
    pdt.assert_frame_equal(
        output_downloadable_df_actual,
        downloadable_data_expected,
        check_dtype=False,
    )
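check_dtype=False keeps this comparison value-based, which matters once the expected frame comes from a Parquet snapshot and the freshly computed frame may carry slightly different (for example nullable) dtypes. A toy illustration, separate from the project's fixtures:

import pandas as pd
import pandas.testing as pdt

actual = pd.DataFrame({"Total population": pd.array([100, 200], dtype="Int64")})
expected = pd.DataFrame({"Total population": [100, 200]})  # plain int64

pdt.assert_frame_equal(actual, expected, check_dtype=False)  # passes on values
# pdt.assert_frame_equal(actual, expected)  # would raise on the dtype mismatch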
@@ -1,10 +1,9 @@
import csv
import json
import subprocess
from enum import Enum
from pathlib import Path

import geopandas as gpd
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
from data_pipeline.utils import get_module_logger

@@ -26,8 +25,8 @@ class CensusETL(ExtractTransformLoad):
    CSV_BASE_PATH = ExtractTransformLoad.DATA_PATH / "census" / "csv"
    GEOJSON_PATH = ExtractTransformLoad.DATA_PATH / "census" / "geojson"
    NATIONAL_TRACT_CSV_PATH = CSV_BASE_PATH / "us.csv"
    NATIONAL_TRACT_JSON_PATH = GEOJSON_BASE_PATH / "us.json"
    GEOID_TRACT_FIELD_NAME: str = "GEOID10_TRACT"
    NATIONAL_TRACT_JSON_PATH = GEOJSON_BASE_PATH / "us_geo.parquet"
    GEOID_TRACT_FIELD_NAME: str = "GEOID10"

    def __init__(self):

@@ -59,7 +58,7 @@ class CensusETL(ExtractTransformLoad):
                / f"tl_2010_{fips_code}_tract10.shp"
            )
        elif file_type == GeoFileType.GEOJSON:
            file_path = Path(self.GEOJSON_BASE_PATH / f"{fips_code}.json")
            file_path = Path(self.GEOJSON_BASE_PATH / f"{fips_code}.parquet")
        elif file_type == GeoFileType.CSV:
            file_path = Path(self.CSV_BASE_PATH / f"{fips_code}.csv")
        return file_path

@@ -93,14 +92,8 @@ class CensusETL(ExtractTransformLoad):
        )

        if not geojson_file_path.is_file():
            cmd = [
                "ogr2ogr",
                "-f",
                "GeoJSON",
                str(geojson_file_path),
                str(shp_file_path),
            ]
            subprocess.run(cmd, check=True)
            gdf = gpd.read_file(shp_file_path)
            gdf.to_parquet(geojson_file_path)

    def _generate_tract_table(self) -> None:
        """Generate Tract CSV table for pandas, load in memory

@@ -110,20 +103,15 @@
        """
        logger.debug("Transforming tracts")

        for file in self.GEOJSON_BASE_PATH.iterdir():
            if file.suffix == ".json":
                logger.debug(f"Adding GEOID10 for file {file.name}")
                with open(self.GEOJSON_BASE_PATH / file, encoding="utf-8") as f:
                    geojson = json.load(f)
                for feature in geojson["features"]:
                    tractid10 = feature["properties"]["GEOID10"]
                    self.TRACT_NATIONAL.append(str(tractid10))
                    tractid10_state_id = tractid10[:2]
                    if not self.TRACT_PER_STATE.get(tractid10_state_id):
                        self.TRACT_PER_STATE[tractid10_state_id] = []
                    self.TRACT_PER_STATE[tractid10_state_id].append(
                        tractid10
                    )
        files = list(self.GEOJSON_BASE_PATH.glob("[0-9]*.parquet"))
        files.sort()
        for file in files:
            logger.debug(f"Adding GEOID10 for file {file.name}")
            state_df = gpd.read_parquet(file)
            tract_list = state_df["GEOID10"].to_list()
            self.TRACT_NATIONAL.extend(tract_list)
            tractid10_state_id = state_df["STATEFP10"][0]
            self.TRACT_PER_STATE[tractid10_state_id] = tract_list

    def transform(self) -> None:
        """Download all census shape files from the Census FTP and extract the geojson

@@ -210,18 +198,24 @@ class CensusETL(ExtractTransformLoad):

        usa_df = gpd.GeoDataFrame()

        for file_name in self.GEOJSON_BASE_PATH.rglob("*.json"):
        # Read state only files and append them into a MEGA US GPD
        files = list(self.GEOJSON_BASE_PATH.glob("[0-9]*.parquet"))
        files.sort()
        for file_name in files:
            logger.debug(f"Adding national GeoJSON file {file_name.name}")
            state_gdf = gpd.read_file(file_name)
            usa_df = usa_df.append(state_gdf)
            state_gdf = gpd.read_parquet(file_name)
            usa_df = pd.concat([usa_df, state_gdf], ignore_index=True)

        assert len(usa_df.columns) > 0
        logger.debug("Converting to CRS")
        usa_df = usa_df.to_crs(
            "+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"
        )
        usa_df = usa_df.to_crs("EPSG:4326")

        logger.debug("Saving national GeoJSON file")
        usa_df.to_file(self.NATIONAL_TRACT_JSON_PATH, driver="GeoJSON")
        # Convert tract ID to a string
        usa_df[self.GEOID_TRACT_FIELD_NAME] = usa_df[
            self.GEOID_TRACT_FIELD_NAME
        ].astype(str, errors="ignore")
        usa_df.to_parquet(self.NATIONAL_TRACT_JSON_PATH)

    def load(self) -> None:
        """Create state CSVs, National CSV, and National GeoJSON
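The net effect of the CensusETL changes: each state shapefile is converted straight to GeoParquet with geopandas (no ogr2ogr subprocess), and the national file is built with pd.concat instead of the removed GeoDataFrame.append, then reprojected via the EPSG code. A compressed sketch of that flow with illustrative paths, not the project's constants:

from pathlib import Path
import geopandas as gpd
import pandas as pd

geojson_dir = Path("data/census/geojson")  # stand-in for GEOJSON_BASE_PATH

# Per state: shapefile -> GeoParquet, replacing the ogr2ogr subprocess call.
state_shp = Path("data/census/shp/01/tl_2010_01_tract10.shp")
gpd.read_file(state_shp).to_parquet(geojson_dir / "01.parquet")

# National: concatenate the state files, reproject, write a single Parquet.
states = [gpd.read_parquet(p) for p in sorted(geojson_dir.glob("[0-9]*.parquet"))]
usa = gpd.GeoDataFrame(pd.concat(states, ignore_index=True))
usa = usa.to_crs("EPSG:4326")  # same WGS84 lon/lat as the old proj4 string
usa.to_parquet(geojson_dir / "us_geo.parquet")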
@@ -104,7 +104,7 @@ def check_census_data_source(
        )
    else:
        # check if census data is found locally
        if not os.path.isfile(census_data_path / "geojson" / "us.json"):
        if not os.path.isfile(census_data_path / "geojson" / "us_geo.parquet"):
            logger.error(
                "No local census data found. Please use '-s aws` to fetch from AWS"
            )
@@ -507,7 +507,7 @@ class CensusACSETL(ExtractTransformLoad):
        # geojson file for all of the US, this will read it off of S3
        logger.debug("Reading in geojson for the country")
        if not os.path.exists(
            self.DATA_PATH / "census" / "geojson" / "us.json"
            self.DATA_PATH / "census" / "geojson" / "us_geo.parquet"
        ):
            logger.debug("Fetching Census data from AWS S3")
            unzip_file_from_url(

@@ -515,9 +515,8 @@
                self.DATA_PATH / "tmp",
                self.DATA_PATH,
            )

        self.geo_df = gpd.read_file(
            self.DATA_PATH / "census" / "geojson" / "us.json",
        self.geo_df = gpd.read_parquet(
            self.DATA_PATH / "census" / "geojson" / "us_geo.parquet",
        )

    def transform(self) -> None:
@@ -33,7 +33,7 @@ class CensusDecennialETL(ExtractTransformLoad):
        / f"census_decennial_{DECENNIAL_YEAR}"
    )
    CENSUS_GEOJSON_PATH = (
        ExtractTransformLoad.DATA_PATH / "census" / "geojson" / "us.json"
        ExtractTransformLoad.DATA_PATH / "census" / "geojson" / "us_geo.parquet"
    )

    def __get_api_url(

@@ -148,7 +148,7 @@ class CensusDecennialETL(ExtractTransformLoad):
        """Impute income for both income measures."""
        # Merges Census geojson to imput values from.
        logger.debug(f"Reading GeoJSON from {geojson_path}")
        geo_df = gpd.read_file(geojson_path)
        geo_df = gpd.read_parquet(geojson_path)
        self.df_all = CensusACSETL.merge_geojson(
            df=self.df_all,
            usa_geo_df=geo_df,
@@ -26,10 +26,7 @@ def get_tract_geojson(
        census_etl.extract()
        census_etl.transform()
        census_etl.load()
    tract_data = gpd.read_file(
        GEOJSON_PATH,
        include_fields=["GEOID10"],
    )
    tract_data = gpd.read_parquet(GEOJSON_PATH)
    tract_data = tract_data.rename(
        columns={"GEOID10": "GEOID10_TRACT"}, errors="raise"
    )
@@ -7,10 +7,13 @@ from data_pipeline.score.field_names import GEOID_TRACT_FIELD

@pytest.fixture(scope="session")
def final_score_df():
    return pd.read_csv(
        settings.APP_ROOT / "data" / "score" / "csv" / "full" / "usa.csv",
        dtype={GEOID_TRACT_FIELD: str},
        low_memory=False,
    return pd.read_parquet(
        settings.APP_ROOT
        / "data"
        / "score"
        / "csv"
        / "full"
        / "usa_score.parquet",
    )


@@ -173,7 +176,7 @@ def geocorr_urban_rural_df():
@pytest.fixture()
def census_decennial_df():
    census_decennial_csv = (
        constants.DATA_PATH / "dataset" / "census_decennial_2010" / "usa.csv"
        constants.DATA_PATH / "dataset" / "census_decennial_2020" / "usa.csv"
    )
    return pd.read_csv(
        census_decennial_csv,
File diff suppressed because one or more lines are too long
Binary file not shown.
@@ -17,7 +17,7 @@ from data_pipeline.score.utils import (
@contextmanager
def patch_calculate_tract_adjacency_scores():
    # Use fixtures for tract data.
    tract_data_path = Path(__file__).parent / "data" / "us.geojson"
    tract_data_path = Path(__file__).parent / "data" / "us_geo.parquet"

    get_tract_geojson_mock = partial(
        get_tract_geojson, _tract_data_path=tract_data_path
File diff suppressed because one or more lines are too long
Binary file not shown.
@@ -68,7 +68,7 @@ def transformed_data_fixture(
    """Load the test data and call the ETL transform"""
    dec = CensusDecennialETL()
    dec.df_all = extracted_data_fixture
    dec.transform(imputed_path_fixture / "census-us-territory-geojson.json")
    dec.transform(imputed_path_fixture / "census-us-territory-geojson.parquet")
    return dec.df_all
@@ -147,9 +147,12 @@ def download_file_from_url(
    if not os.path.isdir(download_file_name.parent):
        os.mkdir(download_file_name.parent)

    response = requests.get(
        file_url, verify=verify, timeout=settings.REQUESTS_DEFAULT_TIMOUT
    timeout = (
        settings.REQUEST_TIMEOUT
        if "REQUEST_TIMEOUT" in settings
        else settings.REQUESTS_DEFAULT_TIMOUT
    )
    response = requests.get(file_url, verify=verify, timeout=timeout)
    if response.status_code == 200:
        file_contents = response.content
    else:
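The new code prefers a REQUEST_TIMEOUT setting (added to settings.toml below) and falls back to the older REQUESTS_DEFAULT_TIMOUT when it is absent. With a Dynaconf-style settings object the same fallback could also be written with .get(); a sketch under that assumption, not the project's actual settings module:

import requests
from dynaconf import Dynaconf

settings = Dynaconf(settings_files=["settings.toml"])  # illustrative settings object

# Prefer REQUEST_TIMEOUT, fall back to the legacy constant, then a literal default.
timeout = settings.get("REQUEST_TIMEOUT", settings.get("REQUESTS_DEFAULT_TIMOUT", 60))
response = requests.get("https://example.com/file.zip", timeout=timeout)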
data/data-pipeline/poetry.lock (generated, 2063 lines changed)
File diff suppressed because it is too large
@@ -1,6 +1,6 @@
[tool.poetry]
name = "justice40-data-pipeline"
version = "1.0.1"
version = "2.0"
description = "ETL, Score and Map Generation of Justice 40 Tool"
authors = ["Justice40 Engineering <j40-engineering@lists.usds.gov>"]
keywords = ["justice40", "environmental_justice", "python", "etl"]

@@ -42,6 +42,7 @@ pydantic = "^1.9.0"
Rtree = "^1.0.0"
fiona = "~1.8.21"
tenacity = ">=5.0.2"
pyarrow = "^18.1.0"

[tool.poetry.group.dev.dependencies]
black = "^21"
@@ -1,7 +1,9 @@
[default]
AWS_JUSTICE40_DATASOURCES_URL = "https://justice40-data.s3.amazonaws.com/data-sources"
AWS_JUSTICE40_DATAPIPELINE_URL = "https://justice40-data.s3.amazonaws.com/data-pipeline"
AWS_JUSTICE40_DATAPIPELINE_URL = "https://justice40-data.s3.amazonaws.com/data-versions/2.0"
DATASOURCE_RETRIEVAL_FROM_AWS = true
REQUEST_TIMEOUT = 120
REQUEST_RETRIES = 2

[development]