diff --git a/data/data-pipeline/data_pipeline/etl/sources/geocorr/etl.py b/data/data-pipeline/data_pipeline/etl/sources/geocorr/etl.py index 06233b44..f62a5960 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/geocorr/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/geocorr/etl.py @@ -15,6 +15,12 @@ class GeoCorrETL(ExtractTransformLoad): self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "geocorr" # Need to change hyperlink to S3 + + # Note that this CSV was generated by this notebook: + # https://github.com/usds/justice40-tool/blob/main/data/data-pipeline/data_pipeline/ipython/urban_vs_rural.ipynb + # The source data for this notebook was downloaded from GeoCorr; + # the instructions for generating the source data are here: + # https://github.com/usds/justice40-tool/issues/355#issuecomment-920241787 self.GEOCORR_PLACES_URL = "https://justice40-data.s3.amazonaws.com/data-sources/geocorr_urban_rural.csv.zip" self.GEOCORR_GEOID_FIELD_NAME = "GEOID10_TRACT" self.URBAN_HEURISTIC_FIELD_NAME = "Urban Heuristic Flag" diff --git a/data/data-pipeline/data_pipeline/ipython/urban_vs_rural.ipynb b/data/data-pipeline/data_pipeline/ipython/urban_vs_rural.ipynb index 0785a189..1059229a 100644 --- a/data/data-pipeline/data_pipeline/ipython/urban_vs_rural.ipynb +++ b/data/data-pipeline/data_pipeline/ipython/urban_vs_rural.ipynb @@ -100,6 +100,8 @@ "metadata": {}, "outputs": [], "source": [ + "# CSV was manually generated\n", + "# Instructions for how to generate the CSV from Geocorr are here: https://github.com/usds/justice40-tool/issues/355#issuecomment-920241787\n", "geocorr_urban_rural_map = pd.read_csv(\n", " os.path.join(GEOCORR_DATA_DIR, \"geocorr2014_2125804280.csv\"),\n", " encoding=\"ISO-8859-1\",\n",