From 5b2548f56d6473cbcaa2e982135597b974b8093d Mon Sep 17 00:00:00 2001 From: Claire Hemmerly Date: Mon, 9 May 2022 11:34:19 -0500 Subject: [PATCH 01/20] first upload of nitrogen plume data --- .../ocn_027_rw0_nitrogen_plumes_plumes.py | 179 ++++++++++++++++++ 1 file changed, 179 insertions(+) create mode 100644 ocn_027_rw0_nitrogen_plumes/ocn_027_rw0_nitrogen_plumes_plumes.py diff --git a/ocn_027_rw0_nitrogen_plumes/ocn_027_rw0_nitrogen_plumes_plumes.py b/ocn_027_rw0_nitrogen_plumes/ocn_027_rw0_nitrogen_plumes_plumes.py new file mode 100644 index 00000000..cb9ec6d2 --- /dev/null +++ b/ocn_027_rw0_nitrogen_plumes/ocn_027_rw0_nitrogen_plumes_plumes.py @@ -0,0 +1,179 @@ +import os +import sys +import dotenv +utils_path = os.path.join(os.path.abspath(os.getenv('PROCESSING_DIR')),'utils') +if utils_path not in sys.path: + sys.path.append(utils_path) +import util_files +import util_cloud +import zipfile +from zipfile import ZipFile +import ee +from google.cloud import storage +import logging +import urllib +from collections import OrderedDict + +# Set up logging +# Get the top-level logger object +logger = logging.getLogger() +for handler in logger.handlers: logger.removeHandler(handler) +logger.setLevel(logging.INFO) +# make it print to the console. +console = logging.StreamHandler() +logger.addHandler(console) +logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') + +# name of asset on GEE where you want to upload data +# this should be an asset name that is not currently in use +dataset_name = 'ocn_027_rw0_nitrogen_plumes' + +# create a new sub-directory within your specified dir called 'data' +# within this directory, create files to store raw and processed data +data_dir = util_files.prep_dirs(dataset_name) + +# create a dictionary to store information about the dataset +data_dict = OrderedDict() + +data_dict= { + 'url': 'https://knb.ecoinformatics.org/knb/d1/mn/v2/object/urn%3Auuid%3Aefef18ef-416e-4d4d-9190-f17485c02c15', + 'tifs': ['global_effluent_2015_open_N.tif', 'global_effluent_2015_septic_N.tif', 'global_effluent_2015_treated_N.tif', 'global_effluent_2015_tot_N.tiff'], + 'raw_data_file':[], + 'processed_data_file': [], + 'sds': [ + 'classification', + ], + 'missing_data': [], + 'pyramiding_policy': 'MEAN', + 'band_ids': ['classification'] +} + +''' +Download data and save to your data directory - this may take a few minutes +''' +logger.info('Downloading raw data') + +# download the data from the source +raw_data_file = os.path.join(data_dir, 'Global_N_Coastal_Plumes_tifs.zip') +urllib.request.urlretrieve(data_dict['url'], raw_data_file) + +# unzip source data +raw_data_file_unzipped = raw_data_file.split('.')[0] +zip_ref = ZipFile(raw_data_file, 'r') +zip_ref.extractall(raw_data_file_unzipped) +zip_ref.close() + +# set name of raw data files +for tif in data_dict['tifs']: + data_dict['raw_data_file'].append(os.path.join(data_dir,tif)) + +''' +Process data +''' +# Project and compress each tif +for i in range(len(data_dict['tifs'])): + # set a new file name to represent processed data + plume_type = ['open', 'septic', 'treated', 'total'] + data_dict['processed_data_file'].append(os.path.join(data_dir,dataset_name + '_' + plume_type[i] +'.tif')) + + logger.info('Processing data for ' + data_dict['processed_data_file'][i]) + + # project the data into WGS84 (espg 4326) using the command line terminal + cmd = 'gdalwarp -of GTiff -t_srs EPSG:4326 {} {}' + # format to command line and run + posix_cmd = shlex.split(cmd.format(data_dict['raw_data_file'][i], 
data_dict['processed_data_file'][i]), posix=True) + completed_process= subprocess.check_call(posix_cmd) + logging.debug(str(completed_process)) + +''' +Upload processed data to Google Earth Engine +''' + +# set up Google Cloud Storage project and bucket objects +gcsClient = storage.Client(os.environ.get("CLOUDSDK_CORE_PROJECT")) +gcsBucket = gcsClient.bucket(os.environ.get("GEE_STAGING_BUCKET")) + +# initialize ee and eeUtil modules for uploading to Google Earth Engine +auth = ee.ServiceAccountCredentials(os.getenv('GEE_SERVICE_ACCOUNT'), os.getenv('GOOGLE_APPLICATION_CREDENTIALS')) +ee.Initialize(auth) + +# set pyramiding policy for GEE upload +pyramiding_policy = data_dict['pyramiding_policy'] #check + +# Create an image collection where we will put the processed data files in GEE +image_collection = f'projects/resource-watch-gee/{dataset_name}' +#ee.data.createAsset({'type': 'ImageCollection'}, image_collection) + +# set image collection's privacy to public +acl = {"all_users_can_read": True} +ee.data.setAssetAcl(image_collection, acl) +print('Privacy set to public.') + +# list the bands in each image +band_ids = data_dict['band_ids'] + +task_id = [] + +# Upload processed data files to GEE + +# if upload is timing out, uncomment the following lines +# storage.blob._DEFAULT_CHUNKSIZE = 10 * 1024* 1024 # 10 MB +# storage.blob._MAX_MULTIPART_SIZE = 10 * 1024* 1024 # 10 MB + +#loop though the processed data files to upload to Google Cloud Storage and Google Earth Engine +for i in range(len(data_dict['tifs'])): + logger.info('Uploading '+ data_dict['processed_data_file'][i]+' to Google Cloud Storage.') + # upload files to Google Cloud Storage + gcs_uri= util_cloud.gcs_upload(data_dict['raw_data_file'][i], dataset_name, gcs_bucket=gcsBucket) + + logger.info('Uploading '+ data_dict['processed_data_file'][i]+ ' Google Earth Engine.') + # generate an asset name for the current file by using the filename (minus the file type extension) + file_name=data_dict['processed_data_file'][i].split('.')[0].split('/')[1] + asset_name = f'projects/resource-watch-gee/{dataset_name}/{file_name}' + + # create the band manifest for this asset + tileset_id= data_dict['tifs'][i].split('.')[0] + mf_bands = [{'id': band_id, 'tileset_band_index': band_ids.index(band_id), 'tileset_id': tileset_id,'pyramidingPolicy': pyramiding_policy} for band_id in band_ids] + + # create complete manifest for asset upload + manifest = util_cloud.gee_manifest_complete(asset_name, gcs_uri[0], mf_bands) + + # upload the file from Google Cloud Storage to Google Earth Engine + task = util_cloud.gee_ingest(manifest) + print(asset_name + ' uploaded to GEE') + task_id.append(task) + + # remove files from Google Cloud Storage + util_cloud.gcs_remove(gcs_uri, gcs_bucket=gcsBucket) + logger.info('Files deleted from Google Cloud Storage.') + +''' +Upload original data and processed data to Amazon S3 storage +''' +# initialize AWS variables +aws_bucket = 'wri-public-data' +s3_prefix = 'resourcewatch/raster/' + +# Copy the raw data into a zipped file to upload to S3 + +print('Uploading original data to S3.') +# Copy the raw data into a zipped file to upload to S3 +raw_data_dir = os.path.join(data_dir, dataset_name+'.zip') +with ZipFile(raw_data_dir,'w') as zip: + raw_data_files = data_dict['raw_data_file'] + for raw_data_file in raw_data_files: + zip.write(raw_data_file, os.path.basename(raw_data_file)) + +# Upload raw data file to S3 +uploaded = util_cloud.aws_upload(raw_data_dir, aws_bucket, s3_prefix + os.path.basename(raw_data_dir)) + 
+logger.info('Uploading processed data to S3.') +# Copy the processed data into a zipped file to upload to S3 +processed_data_dir = os.path.join(data_dir, dataset_name+'_edit.zip') +with ZipFile(processed_data_dir,'w') as zip: + processed_data_files = data_dict['processed_data_file'] + for processed_data_file in processed_data_files: + zip.write(processed_data_file, os.path.basename(processed_data_file),compress_type= zipfile.ZIP_DEFLATED) + +# Upload processed data file to S3 +uploaded = util_cloud.aws_upload(processed_data_dir, aws_bucket, s3_prefix + os.path.basename(processed_data_dir)) From f2c7aa9425fbebe6689150028e03c4253f0fb0f0 Mon Sep 17 00:00:00 2001 From: clairehemmerly <103712609+clairehemmerly@users.noreply.github.com> Date: Mon, 9 May 2022 11:37:10 -0500 Subject: [PATCH 02/20] Rename ocn_027_rw0_nitrogen_plumes_plumes.py to ocn_027_rw0_nitrogen_plumes_processing.py --- ...plumes_plumes.py => ocn_027_rw0_nitrogen_plumes_processing.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename ocn_027_rw0_nitrogen_plumes/{ocn_027_rw0_nitrogen_plumes_plumes.py => ocn_027_rw0_nitrogen_plumes_processing.py} (100%) diff --git a/ocn_027_rw0_nitrogen_plumes/ocn_027_rw0_nitrogen_plumes_plumes.py b/ocn_027_rw0_nitrogen_plumes/ocn_027_rw0_nitrogen_plumes_processing.py similarity index 100% rename from ocn_027_rw0_nitrogen_plumes/ocn_027_rw0_nitrogen_plumes_plumes.py rename to ocn_027_rw0_nitrogen_plumes/ocn_027_rw0_nitrogen_plumes_processing.py From 89dd7711d92eb2585efefccbf5ece17d08e6e323 Mon Sep 17 00:00:00 2001 From: Claire Hemmerly Date: Wed, 11 May 2022 12:04:09 -0500 Subject: [PATCH 03/20] debugging --- .../ocn_027_rw0_nitrogen_plumes_processing.py | 188 ++++++++++++++++++ 1 file changed, 188 insertions(+) create mode 100644 ocn_027_rw0_nitrogen_plumes/ocn_027_rw0_nitrogen_plumes_processing.py diff --git a/ocn_027_rw0_nitrogen_plumes/ocn_027_rw0_nitrogen_plumes_processing.py b/ocn_027_rw0_nitrogen_plumes/ocn_027_rw0_nitrogen_plumes_processing.py new file mode 100644 index 00000000..89efc1bb --- /dev/null +++ b/ocn_027_rw0_nitrogen_plumes/ocn_027_rw0_nitrogen_plumes_processing.py @@ -0,0 +1,188 @@ +import os +import sys +import dotenv +utils_path = os.path.join(os.path.abspath(os.getenv('PROCESSING_DIR')),'utils') +if utils_path not in sys.path: + sys.path.append(utils_path) +import util_files +import util_cloud +import zipfile +from zipfile import ZipFile +import ee +from google.cloud import storage +import logging +import urllib +from collections import OrderedDict +import shlex +import subprocess + +# Set up logging +# Get the top-level logger object +logger = logging.getLogger() +for handler in logger.handlers: logger.removeHandler(handler) +logger.setLevel(logging.INFO) +# make it print to the console. 
+console = logging.StreamHandler() +logger.addHandler(console) +logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') + +# name of asset on GEE where you want to upload data +# this should be an asset name that is not currently in use +dataset_name = 'ocn_027_rw0_nitrogen_plumes' + +# create a new sub-directory within your specified dir called 'data' +# within this directory, create files to store raw and processed data +data_dir = util_files.prep_dirs(dataset_name) + +# create a dictionary to store information about the dataset +data_dict = OrderedDict() + +data_dict= { + 'url': 'https://knb.ecoinformatics.org/knb/d1/mn/v2/object/urn%3Auuid%3Aefef18ef-416e-4d4d-9190-f17485c02c15', + 'unzipped folder': 'Global_N_Coastal_Plumes_tifs', + 'tifs': ['global_effluent_2015_open_N.tif', 'global_effluent_2015_septic_N.tif', 'global_effluent_2015_treated_N.tif', 'global_effluent_2015_tot_N.tiff'], + 'raw_data_file':[], + 'processed_data_file': [], + 'sds': [ + 'classification', + ], + 'pyramiding_policy': 'MEAN', + 'band_ids': ['classification'] +} + +''' +Download data and save to your data directory - this may take a few minutes +''' +logger.info('Downloading raw data') + +#download the data from the source +raw_data_file = os.path.join(data_dir, 'Global_N_Coastal_Plumes_tifs.zip') +urllib.request.urlretrieve(data_dict['url'], raw_data_file) + +# unzip source data +raw_data_file_unzipped = raw_data_file.split('.')[0] +zip_ref = ZipFile(raw_data_file, 'r') +zip_ref.extractall(raw_data_file_unzipped) +zip_ref.close() + + + +# set name of raw data files +for tif in data_dict['tifs']: + data_dict['raw_data_file'].append(os.path.join(data_dir, data_dict['unzipped folder'], tif)) + + +''' +Process data +''' +# Project and compress each tif +for i in range(len(data_dict['tifs'])): + # set a new file name to represent processed data + plume_type = ['open', 'septic', 'treated', 'total'] + data_dict['processed_data_file'].append(os.path.join(data_dir,dataset_name + '_' + plume_type[i] +'.tif')) + + logger.info('Processing data for ' + data_dict['processed_data_file'][i]) + + raw_data_path = os.path.join(os.getenv('PROCESSING_DIR'), dataset_name, data_dict['raw_data_file'][i]) + logger.info(raw_data_path) + + # project the data into WGS84 (espg 4326) using the command line terminal + cmd = 'gdalwarp -of GTiff -t_srs EPSG:4326 {} {}' + # format to command line and run + posix_cmd = shlex.split(cmd.format(raw_data_path, 'out.tif'), posix=True) + logger.info(posix_cmd) + completed_process= subprocess.check_output(posix_cmd) + logging.debug(str(completed_process)) + +''' +Upload processed data to Google Earth Engine +''' + +# set up Google Cloud Storage project and bucket objects +gcsClient = storage.Client(os.environ.get("CLOUDSDK_CORE_PROJECT")) +gcsBucket = gcsClient.bucket(os.environ.get("GEE_STAGING_BUCKET")) + +# initialize ee and eeUtil modules for uploading to Google Earth Engine +auth = ee.ServiceAccountCredentials(os.getenv('GEE_SERVICE_ACCOUNT'), os.getenv('GOOGLE_APPLICATION_CREDENTIALS')) +ee.Initialize(auth) + +# set pyramiding policy for GEE upload +pyramiding_policy = data_dict['pyramiding_policy'] #check + +# Create an image collection where we will put the processed data files in GEE +image_collection = f'projects/resource-watch-gee/{dataset_name}' +#ee.data.createAsset({'type': 'ImageCollection'}, image_collection) + +# set image collection's privacy to public +acl = {"all_users_can_read": True} +ee.data.setAssetAcl(image_collection, acl) +print('Privacy set to 
public.') + +# list the bands in each image +band_ids = data_dict['band_ids'] + +task_id = [] + +# Upload processed data files to GEE + +# if upload is timing out, uncomment the following lines +# storage.blob._DEFAULT_CHUNKSIZE = 10 * 1024* 1024 # 10 MB +# storage.blob._MAX_MULTIPART_SIZE = 10 * 1024* 1024 # 10 MB + +#loop though the processed data files to upload to Google Cloud Storage and Google Earth Engine +for i in range(len(data_dict['tifs'])): + logger.info('Uploading '+ data_dict['processed_data_file'][i]+' to Google Cloud Storage.') + # upload files to Google Cloud Storage + gcs_uri= util_cloud.gcs_upload(data_dict['raw_data_file'][i], dataset_name, gcs_bucket=gcsBucket) + + logger.info('Uploading '+ data_dict['processed_data_file'][i]+ ' Google Earth Engine.') + # generate an asset name for the current file by using the filename (minus the file type extension) + file_name=data_dict['processed_data_file'][i].split('.')[0].split('/')[1] + asset_name = f'projects/resource-watch-gee/{dataset_name}/{file_name}' + + # create the band manifest for this asset + tileset_id= data_dict['tifs'][i].split('.')[0] + mf_bands = [{'id': band_id, 'tileset_band_index': band_ids.index(band_id), 'tileset_id': tileset_id,'pyramidingPolicy': pyramiding_policy} for band_id in band_ids] + + # create complete manifest for asset upload + manifest = util_cloud.gee_manifest_complete(asset_name, gcs_uri[0], mf_bands) + + # upload the file from Google Cloud Storage to Google Earth Engine + task = util_cloud.gee_ingest(manifest) + print(asset_name + ' uploaded to GEE') + task_id.append(task) + + # remove files from Google Cloud Storage + util_cloud.gcs_remove(gcs_uri, gcs_bucket=gcsBucket) + logger.info('Files deleted from Google Cloud Storage.') + +''' +Upload original data and processed data to Amazon S3 storage +''' +# initialize AWS variables +aws_bucket = 'wri-public-data' +s3_prefix = 'resourcewatch/raster/' + +# Copy the raw data into a zipped file to upload to S3 + +print('Uploading original data to S3.') +# Copy the raw data into a zipped file to upload to S3 +raw_data_dir = os.path.join(data_dir, dataset_name+'.zip') +with ZipFile(raw_data_dir,'w') as zip: + raw_data_files = data_dict['raw_data_file'] + for raw_data_file in raw_data_files: + zip.write(raw_data_file, os.path.basename(raw_data_file)) + +# Upload raw data file to S3 +uploaded = util_cloud.aws_upload(raw_data_dir, aws_bucket, s3_prefix + os.path.basename(raw_data_dir)) + +logger.info('Uploading processed data to S3.') +# Copy the processed data into a zipped file to upload to S3 +processed_data_dir = os.path.join(data_dir, dataset_name+'_edit.zip') +with ZipFile(processed_data_dir,'w') as zip: + processed_data_files = data_dict['processed_data_file'] + for processed_data_file in processed_data_files: + zip.write(processed_data_file, os.path.basename(processed_data_file),compress_type= zipfile.ZIP_DEFLATED) + +# Upload processed data file to S3 +uploaded = util_cloud.aws_upload(processed_data_dir, aws_bucket, s3_prefix + os.path.basename(processed_data_dir)) From a0d497ed2e4b38bb3c31db3f00c002e209b34208 Mon Sep 17 00:00:00 2001 From: Claire Hemmerly Date: Fri, 20 May 2022 08:28:57 -0500 Subject: [PATCH 04/20] updated minor fixes to plumes preprocessing --- .../ocn_027_rw0_nitrogen_plumes_processing.py | 33 +++++++++---------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/ocn_027_rw0_nitrogen_plumes/ocn_027_rw0_nitrogen_plumes_processing.py b/ocn_027_rw0_nitrogen_plumes/ocn_027_rw0_nitrogen_plumes_processing.py index 
89efc1bb..5cac604e 100644 --- a/ocn_027_rw0_nitrogen_plumes/ocn_027_rw0_nitrogen_plumes_processing.py +++ b/ocn_027_rw0_nitrogen_plumes/ocn_027_rw0_nitrogen_plumes_processing.py @@ -1,6 +1,5 @@ import os import sys -import dotenv utils_path = os.path.join(os.path.abspath(os.getenv('PROCESSING_DIR')),'utils') if utils_path not in sys.path: sys.path.append(utils_path) @@ -11,7 +10,7 @@ import ee from google.cloud import storage import logging -import urllib +#import urllib from collections import OrderedDict import shlex import subprocess @@ -40,14 +39,14 @@ data_dict= { 'url': 'https://knb.ecoinformatics.org/knb/d1/mn/v2/object/urn%3Auuid%3Aefef18ef-416e-4d4d-9190-f17485c02c15', 'unzipped folder': 'Global_N_Coastal_Plumes_tifs', - 'tifs': ['global_effluent_2015_open_N.tif', 'global_effluent_2015_septic_N.tif', 'global_effluent_2015_treated_N.tif', 'global_effluent_2015_tot_N.tiff'], + 'tifs': ['global_effluent_2015_open_N.tif', 'global_effluent_2015_septic_N.tif', 'global_effluent_2015_treated_N.tif', 'global_effluent_2015_tot_N.tif'], 'raw_data_file':[], 'processed_data_file': [], 'sds': [ 'classification', ], 'pyramiding_policy': 'MEAN', - 'band_ids': ['classification'] + 'band_ids': ['b1'] } ''' @@ -65,8 +64,6 @@ zip_ref.extractall(raw_data_file_unzipped) zip_ref.close() - - # set name of raw data files for tif in data_dict['tifs']: data_dict['raw_data_file'].append(os.path.join(data_dir, data_dict['unzipped folder'], tif)) @@ -87,12 +84,13 @@ logger.info(raw_data_path) # project the data into WGS84 (espg 4326) using the command line terminal - cmd = 'gdalwarp -of GTiff -t_srs EPSG:4326 {} {}' + cmd = 'gdalwarp {} {}' # format to command line and run - posix_cmd = shlex.split(cmd.format(raw_data_path, 'out.tif'), posix=True) + posix_cmd = shlex.split(cmd.format(raw_data_path, data_dict['processed_data_file'][i]), posix=True) logger.info(posix_cmd) - completed_process= subprocess.check_output(posix_cmd) - logging.debug(str(completed_process)) + #completed_process= subprocess.check_output(posix_cmd) + completed_process= subprocess.call(posix_cmd) + #logging.debug(str(completed_process)) ''' Upload processed data to Google Earth Engine @@ -111,7 +109,7 @@ # Create an image collection where we will put the processed data files in GEE image_collection = f'projects/resource-watch-gee/{dataset_name}' -#ee.data.createAsset({'type': 'ImageCollection'}, image_collection) +ee.data.createAsset({'type': 'ImageCollection'}, image_collection) # set image collection's privacy to public acl = {"all_users_can_read": True} @@ -130,10 +128,11 @@ # storage.blob._MAX_MULTIPART_SIZE = 10 * 1024* 1024 # 10 MB #loop though the processed data files to upload to Google Cloud Storage and Google Earth Engine + for i in range(len(data_dict['tifs'])): logger.info('Uploading '+ data_dict['processed_data_file'][i]+' to Google Cloud Storage.') # upload files to Google Cloud Storage - gcs_uri= util_cloud.gcs_upload(data_dict['raw_data_file'][i], dataset_name, gcs_bucket=gcsBucket) + gcs_uri= util_cloud.gcs_upload(data_dict['processed_data_file'][i], dataset_name, gcs_bucket=gcsBucket) logger.info('Uploading '+ data_dict['processed_data_file'][i]+ ' Google Earth Engine.') # generate an asset name for the current file by using the filename (minus the file type extension) @@ -141,9 +140,9 @@ asset_name = f'projects/resource-watch-gee/{dataset_name}/{file_name}' # create the band manifest for this asset - tileset_id= data_dict['tifs'][i].split('.')[0] - mf_bands = [{'id': band_id, 'tileset_band_index': 
band_ids.index(band_id), 'tileset_id': tileset_id,'pyramidingPolicy': pyramiding_policy} for band_id in band_ids] - + #tileset_id= data_dict['processed_data_file'][i].split('.')[0] + mf_bands = [{'id': band_id, 'tileset_band_index': band_ids.index(band_id), 'tileset_id': file_name,'pyramidingPolicy': pyramiding_policy} for band_id in band_ids] + # create complete manifest for asset upload manifest = util_cloud.gee_manifest_complete(asset_name, gcs_uri[0], mf_bands) @@ -151,9 +150,9 @@ task = util_cloud.gee_ingest(manifest) print(asset_name + ' uploaded to GEE') task_id.append(task) - + # remove files from Google Cloud Storage - util_cloud.gcs_remove(gcs_uri, gcs_bucket=gcsBucket) + util_cloud.gcs_remove(gcs_uri[0], gcs_bucket=gcsBucket) logger.info('Files deleted from Google Cloud Storage.') ''' From 0332613715d7b386874069f6503198f4d3f3ceca Mon Sep 17 00:00:00 2001 From: Claire Hemmerly Date: Mon, 6 Jun 2022 15:52:37 -0500 Subject: [PATCH 05/20] updated preprocessing file --- ...ocn_027a_rw0_nitrogen_plumes_processing.py | 187 ++++++++++++++++++ 1 file changed, 187 insertions(+) create mode 100644 ocn_027a_rw0_nitrogen_plumes/ocn_027a_rw0_nitrogen_plumes_processing.py diff --git a/ocn_027a_rw0_nitrogen_plumes/ocn_027a_rw0_nitrogen_plumes_processing.py b/ocn_027a_rw0_nitrogen_plumes/ocn_027a_rw0_nitrogen_plumes_processing.py new file mode 100644 index 00000000..2d672a4a --- /dev/null +++ b/ocn_027a_rw0_nitrogen_plumes/ocn_027a_rw0_nitrogen_plumes_processing.py @@ -0,0 +1,187 @@ +import os +import sys +utils_path = os.path.join(os.path.abspath(os.getenv('PROCESSING_DIR')),'utils') +if utils_path not in sys.path: + sys.path.append(utils_path) +import util_files +import util_cloud +import zipfile +from zipfile import ZipFile +import ee +from google.cloud import storage +import logging +#import urllib +from collections import OrderedDict +import shlex +import subprocess + +# Set up logging +# Get the top-level logger object +logger = logging.getLogger() +for handler in logger.handlers: logger.removeHandler(handler) +logger.setLevel(logging.INFO) +# make it print to the console. 
+console = logging.StreamHandler() +logger.addHandler(console) +logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') + +# name of asset on GEE where you want to upload data +# this should be an asset name that is not currently in use +dataset_name = 'ocn_027a_rw0_nitrogen_plumes' + +# create a new sub-directory within your specified dir called 'data' +# within this directory, create files to store raw and processed data +data_dir = util_files.prep_dirs(dataset_name) + +# create a dictionary to store information about the dataset +data_dict = OrderedDict() + +data_dict= { + 'url': 'https://knb.ecoinformatics.org/knb/d1/mn/v2/object/urn%3Auuid%3Aefef18ef-416e-4d4d-9190-f17485c02c15', + 'unzipped folder': 'Global_N_Coastal_Plumes_tifs', + 'tifs': ['global_effluent_2015_open_N.tif', 'global_effluent_2015_septic_N.tif', 'global_effluent_2015_treated_N.tif', 'global_effluent_2015_tot_N.tif'], + 'raw_data_file':[], + 'processed_data_file': [], + 'sds': [ + 'classification', + ], + 'pyramiding_policy': 'MEAN', + 'band_ids': ['b1'] +} + +''' +Download data and save to your data directory - this may take a few minutes +''' +logger.info('Downloading raw data') + +#download the data from the source +raw_data_file = os.path.join(data_dir, 'Global_N_Coastal_Plumes_tifs.zip') +urllib.request.urlretrieve(data_dict['url'], raw_data_file) + +# unzip source data +raw_data_file_unzipped = raw_data_file.split('.')[0] +zip_ref = ZipFile(raw_data_file, 'r') +zip_ref.extractall(raw_data_file_unzipped) +zip_ref.close() + +# set name of raw data files +for tif in data_dict['tifs']: + data_dict['raw_data_file'].append(os.path.join(data_dir, data_dict['unzipped folder'], tif)) + + +''' +Process data +''' +# Project and compress each tif +for i in range(len(data_dict['tifs'])): + # set a new file name to represent processed data + plume_type = ['open', 'septic', 'treated', 'total'] + data_dict['processed_data_file'].append(os.path.join(data_dir,dataset_name + '_' + plume_type[i] +'.tif')) + + logger.info('Processing data for ' + data_dict['processed_data_file'][i]) + + raw_data_path = os.path.join(os.getenv('PROCESSING_DIR'), dataset_name, data_dict['raw_data_file'][i]) + logger.info(raw_data_path) + + # project the data into WGS84 (espg 4326) using the command line terminal + cmd = 'gdalwarp {} {}' + # format to command line and run + posix_cmd = shlex.split(cmd.format(raw_data_path, data_dict['processed_data_file'][i]), posix=True) + logger.info(posix_cmd) + #completed_process= subprocess.check_output(posix_cmd) + completed_process= subprocess.call(posix_cmd) + #logging.debug(str(completed_process)) + +''' +Upload processed data to Google Earth Engine +''' + +# set up Google Cloud Storage project and bucket objects +gcsClient = storage.Client(os.environ.get("CLOUDSDK_CORE_PROJECT")) +gcsBucket = gcsClient.bucket(os.environ.get("GEE_STAGING_BUCKET")) + +# initialize ee and eeUtil modules for uploading to Google Earth Engine +auth = ee.ServiceAccountCredentials(os.getenv('GEE_SERVICE_ACCOUNT'), os.getenv('GOOGLE_APPLICATION_CREDENTIALS')) +ee.Initialize(auth) + +# set pyramiding policy for GEE upload +pyramiding_policy = data_dict['pyramiding_policy'] #check + +# Create an image collection where we will put the processed data files in GEE +image_collection = f'projects/resource-watch-gee/{dataset_name}' +ee.data.createAsset({'type': 'ImageCollection'}, image_collection) + +# set image collection's privacy to public +acl = {"all_users_can_read": True} +ee.data.setAssetAcl(image_collection, 
acl) +print('Privacy set to public.') + +# list the bands in each image +band_ids = data_dict['band_ids'] + +task_id = [] + +# Upload processed data files to GEE + +# if upload is timing out, uncomment the following lines +# storage.blob._DEFAULT_CHUNKSIZE = 10 * 1024* 1024 # 10 MB +# storage.blob._MAX_MULTIPART_SIZE = 10 * 1024* 1024 # 10 MB + +#loop though the processed data files to upload to Google Cloud Storage and Google Earth Engine + +for i in range(len(data_dict['tifs'])): + logger.info('Uploading '+ data_dict['processed_data_file'][i]+' to Google Cloud Storage.') + # upload files to Google Cloud Storage + gcs_uri= util_cloud.gcs_upload(data_dict['processed_data_file'][i], dataset_name, gcs_bucket=gcsBucket) + + logger.info('Uploading '+ data_dict['processed_data_file'][i]+ ' Google Earth Engine.') + # generate an asset name for the current file by using the filename (minus the file type extension) + file_name=data_dict['processed_data_file'][i].split('.')[0].split('/')[1] + asset_name = f'projects/resource-watch-gee/{dataset_name}/{file_name}' + + # create the band manifest for this asset + #tileset_id= data_dict['processed_data_file'][i].split('.')[0] + mf_bands = [{'id': band_id, 'tileset_band_index': band_ids.index(band_id), 'tileset_id': file_name,'pyramidingPolicy': pyramiding_policy} for band_id in band_ids] + + # create complete manifest for asset upload + manifest = util_cloud.gee_manifest_complete(asset_name, gcs_uri[0], mf_bands) + + # upload the file from Google Cloud Storage to Google Earth Engine + task = util_cloud.gee_ingest(manifest) + print(asset_name + ' uploaded to GEE') + task_id.append(task) + + # remove files from Google Cloud Storage + util_cloud.gcs_remove(gcs_uri[0], gcs_bucket=gcsBucket) + logger.info('Files deleted from Google Cloud Storage.') + +''' +Upload original data and processed data to Amazon S3 storage +''' +# initialize AWS variables +aws_bucket = 'wri-public-data' +s3_prefix = 'resourcewatch/raster/' + +# Copy the raw data into a zipped file to upload to S3 + +print('Uploading original data to S3.') +# Copy the raw data into a zipped file to upload to S3 +raw_data_dir = os.path.join(data_dir, dataset_name+'.zip') +with ZipFile(raw_data_dir,'w') as zip: + raw_data_files = data_dict['raw_data_file'] + for raw_data_file in raw_data_files: + zip.write(raw_data_file, os.path.basename(raw_data_file)) + +# Upload raw data file to S3 +uploaded = util_cloud.aws_upload(raw_data_dir, aws_bucket, s3_prefix + os.path.basename(raw_data_dir)) + +logger.info('Uploading processed data to S3.') +# Copy the processed data into a zipped file to upload to S3 +processed_data_dir = os.path.join(data_dir, dataset_name+'_edit.zip') +with ZipFile(processed_data_dir,'w') as zip: + processed_data_files = data_dict['processed_data_file'] + for processed_data_file in processed_data_files: + zip.write(processed_data_file, os.path.basename(processed_data_file),compress_type= zipfile.ZIP_DEFLATED) + +# Upload processed data file to S3 +uploaded = util_cloud.aws_upload(processed_data_dir, aws_bucket, s3_prefix + os.path.basename(processed_data_dir)) From 736150299811eafc66a613edef70d104904d9761 Mon Sep 17 00:00:00 2001 From: Claire Hemmerly Date: Mon, 6 Jun 2022 16:14:29 -0500 Subject: [PATCH 06/20] deleting old version --- .../ocn_027_rw0_nitrogen_plumes_processing.py | 187 ------------------ 1 file changed, 187 deletions(-) delete mode 100644 ocn_027_rw0_nitrogen_plumes/ocn_027_rw0_nitrogen_plumes_processing.py diff --git 
a/ocn_027_rw0_nitrogen_plumes/ocn_027_rw0_nitrogen_plumes_processing.py b/ocn_027_rw0_nitrogen_plumes/ocn_027_rw0_nitrogen_plumes_processing.py deleted file mode 100644 index 5cac604e..00000000 --- a/ocn_027_rw0_nitrogen_plumes/ocn_027_rw0_nitrogen_plumes_processing.py +++ /dev/null @@ -1,187 +0,0 @@ -import os -import sys -utils_path = os.path.join(os.path.abspath(os.getenv('PROCESSING_DIR')),'utils') -if utils_path not in sys.path: - sys.path.append(utils_path) -import util_files -import util_cloud -import zipfile -from zipfile import ZipFile -import ee -from google.cloud import storage -import logging -#import urllib -from collections import OrderedDict -import shlex -import subprocess - -# Set up logging -# Get the top-level logger object -logger = logging.getLogger() -for handler in logger.handlers: logger.removeHandler(handler) -logger.setLevel(logging.INFO) -# make it print to the console. -console = logging.StreamHandler() -logger.addHandler(console) -logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') - -# name of asset on GEE where you want to upload data -# this should be an asset name that is not currently in use -dataset_name = 'ocn_027_rw0_nitrogen_plumes' - -# create a new sub-directory within your specified dir called 'data' -# within this directory, create files to store raw and processed data -data_dir = util_files.prep_dirs(dataset_name) - -# create a dictionary to store information about the dataset -data_dict = OrderedDict() - -data_dict= { - 'url': 'https://knb.ecoinformatics.org/knb/d1/mn/v2/object/urn%3Auuid%3Aefef18ef-416e-4d4d-9190-f17485c02c15', - 'unzipped folder': 'Global_N_Coastal_Plumes_tifs', - 'tifs': ['global_effluent_2015_open_N.tif', 'global_effluent_2015_septic_N.tif', 'global_effluent_2015_treated_N.tif', 'global_effluent_2015_tot_N.tif'], - 'raw_data_file':[], - 'processed_data_file': [], - 'sds': [ - 'classification', - ], - 'pyramiding_policy': 'MEAN', - 'band_ids': ['b1'] -} - -''' -Download data and save to your data directory - this may take a few minutes -''' -logger.info('Downloading raw data') - -#download the data from the source -raw_data_file = os.path.join(data_dir, 'Global_N_Coastal_Plumes_tifs.zip') -urllib.request.urlretrieve(data_dict['url'], raw_data_file) - -# unzip source data -raw_data_file_unzipped = raw_data_file.split('.')[0] -zip_ref = ZipFile(raw_data_file, 'r') -zip_ref.extractall(raw_data_file_unzipped) -zip_ref.close() - -# set name of raw data files -for tif in data_dict['tifs']: - data_dict['raw_data_file'].append(os.path.join(data_dir, data_dict['unzipped folder'], tif)) - - -''' -Process data -''' -# Project and compress each tif -for i in range(len(data_dict['tifs'])): - # set a new file name to represent processed data - plume_type = ['open', 'septic', 'treated', 'total'] - data_dict['processed_data_file'].append(os.path.join(data_dir,dataset_name + '_' + plume_type[i] +'.tif')) - - logger.info('Processing data for ' + data_dict['processed_data_file'][i]) - - raw_data_path = os.path.join(os.getenv('PROCESSING_DIR'), dataset_name, data_dict['raw_data_file'][i]) - logger.info(raw_data_path) - - # project the data into WGS84 (espg 4326) using the command line terminal - cmd = 'gdalwarp {} {}' - # format to command line and run - posix_cmd = shlex.split(cmd.format(raw_data_path, data_dict['processed_data_file'][i]), posix=True) - logger.info(posix_cmd) - #completed_process= subprocess.check_output(posix_cmd) - completed_process= subprocess.call(posix_cmd) - 
#logging.debug(str(completed_process)) - -''' -Upload processed data to Google Earth Engine -''' - -# set up Google Cloud Storage project and bucket objects -gcsClient = storage.Client(os.environ.get("CLOUDSDK_CORE_PROJECT")) -gcsBucket = gcsClient.bucket(os.environ.get("GEE_STAGING_BUCKET")) - -# initialize ee and eeUtil modules for uploading to Google Earth Engine -auth = ee.ServiceAccountCredentials(os.getenv('GEE_SERVICE_ACCOUNT'), os.getenv('GOOGLE_APPLICATION_CREDENTIALS')) -ee.Initialize(auth) - -# set pyramiding policy for GEE upload -pyramiding_policy = data_dict['pyramiding_policy'] #check - -# Create an image collection where we will put the processed data files in GEE -image_collection = f'projects/resource-watch-gee/{dataset_name}' -ee.data.createAsset({'type': 'ImageCollection'}, image_collection) - -# set image collection's privacy to public -acl = {"all_users_can_read": True} -ee.data.setAssetAcl(image_collection, acl) -print('Privacy set to public.') - -# list the bands in each image -band_ids = data_dict['band_ids'] - -task_id = [] - -# Upload processed data files to GEE - -# if upload is timing out, uncomment the following lines -# storage.blob._DEFAULT_CHUNKSIZE = 10 * 1024* 1024 # 10 MB -# storage.blob._MAX_MULTIPART_SIZE = 10 * 1024* 1024 # 10 MB - -#loop though the processed data files to upload to Google Cloud Storage and Google Earth Engine - -for i in range(len(data_dict['tifs'])): - logger.info('Uploading '+ data_dict['processed_data_file'][i]+' to Google Cloud Storage.') - # upload files to Google Cloud Storage - gcs_uri= util_cloud.gcs_upload(data_dict['processed_data_file'][i], dataset_name, gcs_bucket=gcsBucket) - - logger.info('Uploading '+ data_dict['processed_data_file'][i]+ ' Google Earth Engine.') - # generate an asset name for the current file by using the filename (minus the file type extension) - file_name=data_dict['processed_data_file'][i].split('.')[0].split('/')[1] - asset_name = f'projects/resource-watch-gee/{dataset_name}/{file_name}' - - # create the band manifest for this asset - #tileset_id= data_dict['processed_data_file'][i].split('.')[0] - mf_bands = [{'id': band_id, 'tileset_band_index': band_ids.index(band_id), 'tileset_id': file_name,'pyramidingPolicy': pyramiding_policy} for band_id in band_ids] - - # create complete manifest for asset upload - manifest = util_cloud.gee_manifest_complete(asset_name, gcs_uri[0], mf_bands) - - # upload the file from Google Cloud Storage to Google Earth Engine - task = util_cloud.gee_ingest(manifest) - print(asset_name + ' uploaded to GEE') - task_id.append(task) - - # remove files from Google Cloud Storage - util_cloud.gcs_remove(gcs_uri[0], gcs_bucket=gcsBucket) - logger.info('Files deleted from Google Cloud Storage.') - -''' -Upload original data and processed data to Amazon S3 storage -''' -# initialize AWS variables -aws_bucket = 'wri-public-data' -s3_prefix = 'resourcewatch/raster/' - -# Copy the raw data into a zipped file to upload to S3 - -print('Uploading original data to S3.') -# Copy the raw data into a zipped file to upload to S3 -raw_data_dir = os.path.join(data_dir, dataset_name+'.zip') -with ZipFile(raw_data_dir,'w') as zip: - raw_data_files = data_dict['raw_data_file'] - for raw_data_file in raw_data_files: - zip.write(raw_data_file, os.path.basename(raw_data_file)) - -# Upload raw data file to S3 -uploaded = util_cloud.aws_upload(raw_data_dir, aws_bucket, s3_prefix + os.path.basename(raw_data_dir)) - -logger.info('Uploading processed data to S3.') -# Copy the processed data into a zipped 
file to upload to S3 -processed_data_dir = os.path.join(data_dir, dataset_name+'_edit.zip') -with ZipFile(processed_data_dir,'w') as zip: - processed_data_files = data_dict['processed_data_file'] - for processed_data_file in processed_data_files: - zip.write(processed_data_file, os.path.basename(processed_data_file),compress_type= zipfile.ZIP_DEFLATED) - -# Upload processed data file to S3 -uploaded = util_cloud.aws_upload(processed_data_dir, aws_bucket, s3_prefix + os.path.basename(processed_data_dir)) From a160987c20f1c5640e23bf9921e05c0b6e79d327 Mon Sep 17 00:00:00 2001 From: clairehemmerly <103712609+clairehemmerly@users.noreply.github.com> Date: Tue, 7 Jun 2022 09:44:30 -0500 Subject: [PATCH 07/20] Create README.md --- ocn_027a_rw0_nitrogen_plumes/README.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 ocn_027a_rw0_nitrogen_plumes/README.md diff --git a/ocn_027a_rw0_nitrogen_plumes/README.md b/ocn_027a_rw0_nitrogen_plumes/README.md new file mode 100644 index 00000000..2f2cb092 --- /dev/null +++ b/ocn_027a_rw0_nitrogen_plumes/README.md @@ -0,0 +1,14 @@ +## Wastewater Plumes in Coastal Areas Dataset Pre-processing +This file describes the data pre-processing that was done to [the Global Inputs and Impacts from of Human Sewage in Coastal Ecosystems]([{learn more link}](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0258898)) for [display on Resource Watch]({link to dataset's metadata page on Resource Watch}). + +This dataset is provided as a series of GeoTIFF files from the data provider to the Resource Watch data team. + +To display these data on Resource Watch, each GeoTIFF was translated into the appropriate projection for web display and uploaded to Google Earth Engine. + +Please see the [Python script]({link to Python script on Github}) for more details on this processing. + +You can view the processed {Resource Watch public title} dataset [on Resource Watch]({link to dataset's metadata page on Resource Watch}). + +You can also download the original dataset [directly through Resource Watch]({s3 link if available}), or [from the source website]({download from source link}). + +###### Note: This dataset processing was done by [Claire Hemmerly]({link to WRI bio page}), and QC'd by [{name}]({link to WRI bio page}). From 6075223cb54f928685f1c28bf4444689b5a7328f Mon Sep 17 00:00:00 2001 From: clairehemmerly <103712609+clairehemmerly@users.noreply.github.com> Date: Tue, 7 Jun 2022 09:45:54 -0500 Subject: [PATCH 08/20] Update README.md --- ocn_027a_rw0_nitrogen_plumes/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocn_027a_rw0_nitrogen_plumes/README.md b/ocn_027a_rw0_nitrogen_plumes/README.md index 2f2cb092..6deb176f 100644 --- a/ocn_027a_rw0_nitrogen_plumes/README.md +++ b/ocn_027a_rw0_nitrogen_plumes/README.md @@ -1,5 +1,5 @@ ## Wastewater Plumes in Coastal Areas Dataset Pre-processing -This file describes the data pre-processing that was done to [the Global Inputs and Impacts from of Human Sewage in Coastal Ecosystems]([{learn more link}](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0258898)) for [display on Resource Watch]({link to dataset's metadata page on Resource Watch}). +This file describes the data pre-processing that was done to [the Global Inputs and Impacts from of Human Sewage in Coastal Ecosystems](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0258898) for [display on Resource Watch]({link to dataset's metadata page on Resource Watch}). 
This dataset is provided as a series of GeoTIFF files from the data provider to the Resource Watch data team. From e4553b78f7d45a3839f2d96c30d6056b1ee5d90d Mon Sep 17 00:00:00 2001 From: clairehemmerly <103712609+clairehemmerly@users.noreply.github.com> Date: Tue, 7 Jun 2022 10:09:26 -0500 Subject: [PATCH 09/20] Update README.md --- ocn_027a_rw0_nitrogen_plumes/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ocn_027a_rw0_nitrogen_plumes/README.md b/ocn_027a_rw0_nitrogen_plumes/README.md index 6deb176f..b5971075 100644 --- a/ocn_027a_rw0_nitrogen_plumes/README.md +++ b/ocn_027a_rw0_nitrogen_plumes/README.md @@ -5,10 +5,10 @@ This dataset is provided as a series of GeoTIFF files from the data provider to To display these data on Resource Watch, each GeoTIFF was translated into the appropriate projection for web display and uploaded to Google Earth Engine. -Please see the [Python script]({link to Python script on Github}) for more details on this processing. +Please see the [Python script]([{link to Python script on Github}](https://github.com/resource-watch/data-pre-processing/blob/master/ocn_027a_rw0_nitrogen_plumes/ocn_027a_rw0_nitrogen_plumes_processing.py)) for more details on this processing. You can view the processed {Resource Watch public title} dataset [on Resource Watch]({link to dataset's metadata page on Resource Watch}). -You can also download the original dataset [directly through Resource Watch]({s3 link if available}), or [from the source website]({download from source link}). +You can also download the original dataset [directly through Resource Watch]({s3 link if available}), or [from the source website](https://knb.ecoinformatics.org/view/urn%3Auuid%3Ac7bdc77e-6c7d-46b6-8bfc-a66491119d07). -###### Note: This dataset processing was done by [Claire Hemmerly]({link to WRI bio page}), and QC'd by [{name}]({link to WRI bio page}). +###### Note: This dataset processing was done by [Claire Hemmerly], and QC'd by [Chris Rowe](https://www.wri.org/profile/chris-rowe). From c1ee8d91300c5d9cb49c550bef6f75ec93c9d35a Mon Sep 17 00:00:00 2001 From: clairehemmerly <103712609+clairehemmerly@users.noreply.github.com> Date: Tue, 7 Jun 2022 11:36:18 -0500 Subject: [PATCH 10/20] Update README.md --- ocn_027a_rw0_nitrogen_plumes/README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ocn_027a_rw0_nitrogen_plumes/README.md b/ocn_027a_rw0_nitrogen_plumes/README.md index b5971075..486fe77c 100644 --- a/ocn_027a_rw0_nitrogen_plumes/README.md +++ b/ocn_027a_rw0_nitrogen_plumes/README.md @@ -1,14 +1,14 @@ ## Wastewater Plumes in Coastal Areas Dataset Pre-processing -This file describes the data pre-processing that was done to [the Global Inputs and Impacts from of Human Sewage in Coastal Ecosystems](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0258898) for [display on Resource Watch]({link to dataset's metadata page on Resource Watch}). +This file describes the data pre-processing that was done to the [Global Inputs and Impacts from of Human Sewage in Coastal Ecosystems](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0258898) for [display on Resource Watch](https://resourcewatch.org/data/explore/11804f04-d9c7-47b9-8d27-27ce6ed6c042). This dataset is provided as a series of GeoTIFF files from the data provider to the Resource Watch data team. To display these data on Resource Watch, each GeoTIFF was translated into the appropriate projection for web display and uploaded to Google Earth Engine. 
-Please see the [Python script]([{link to Python script on Github}](https://github.com/resource-watch/data-pre-processing/blob/master/ocn_027a_rw0_nitrogen_plumes/ocn_027a_rw0_nitrogen_plumes_processing.py)) for more details on this processing. +Please see the [Python script](https://github.com/resource-watch/data-pre-processing/blob/master/ocn_027a_rw0_nitrogen_plumes/ocn_027a_rw0_nitrogen_plumes_processing.py)) for more details on this processing. -You can view the processed {Resource Watch public title} dataset [on Resource Watch]({link to dataset's metadata page on Resource Watch}). +You can view the processed Wastewater Plumes in Coastal Areas dataset [on Resource Watch](https://resourcewatch.org/data/explore/11804f04-d9c7-47b9-8d27-27ce6ed6c042). -You can also download the original dataset [directly through Resource Watch]({s3 link if available}), or [from the source website](https://knb.ecoinformatics.org/view/urn%3Auuid%3Ac7bdc77e-6c7d-46b6-8bfc-a66491119d07). +You can also download the original dataset [directly through Resource Watch](http://wri-public-data.s3.amazonaws.com/resourcewatch/raster/ocn_027_rw0_nitrogen_plumes.zip), or [from the source website](https://knb.ecoinformatics.org/view/urn%3Auuid%3Ac7bdc77e-6c7d-46b6-8bfc-a66491119d07). -###### Note: This dataset processing was done by [Claire Hemmerly], and QC'd by [Chris Rowe](https://www.wri.org/profile/chris-rowe). +###### Note: This dataset processing was done by Claire Hemmerly, and QC'd by [Chris Rowe](https://www.wri.org/profile/chris-rowe). From 1503240903f1849806df0b42544dbb8820546bfe Mon Sep 17 00:00:00 2001 From: clairehemmerly <103712609+clairehemmerly@users.noreply.github.com> Date: Tue, 7 Jun 2022 11:40:06 -0500 Subject: [PATCH 11/20] Update README.md --- ocn_027a_rw0_nitrogen_plumes/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocn_027a_rw0_nitrogen_plumes/README.md b/ocn_027a_rw0_nitrogen_plumes/README.md index 486fe77c..ad77dacd 100644 --- a/ocn_027a_rw0_nitrogen_plumes/README.md +++ b/ocn_027a_rw0_nitrogen_plumes/README.md @@ -5,7 +5,7 @@ This dataset is provided as a series of GeoTIFF files from the data provider to To display these data on Resource Watch, each GeoTIFF was translated into the appropriate projection for web display and uploaded to Google Earth Engine. -Please see the [Python script](https://github.com/resource-watch/data-pre-processing/blob/master/ocn_027a_rw0_nitrogen_plumes/ocn_027a_rw0_nitrogen_plumes_processing.py)) for more details on this processing. +Please see the [Python script](https://github.com/resource-watch/data-pre-processing/blob/master/ocn_027a_rw0_nitrogen_plumes/ocn_027a_rw0_nitrogen_plumes_processing.py) for more details on this processing. You can view the processed Wastewater Plumes in Coastal Areas dataset [on Resource Watch](https://resourcewatch.org/data/explore/11804f04-d9c7-47b9-8d27-27ce6ed6c042). 
From dc542170969b3f03d60a8c05ba9ff9e57f791539 Mon Sep 17 00:00:00 2001 From: Claire Hemmerly Date: Tue, 14 Jun 2022 11:58:10 -0500 Subject: [PATCH 12/20] watershed and pourpoint preprocessing files --- ...7b_rw0_wastewater_pourpoints_processing.py | 112 ++++++++++++++++++ ...7c_rw0_wastewater_watersheds_processing.py | 112 ++++++++++++++++++ 2 files changed, 224 insertions(+) create mode 100644 ocn_027b_rw0_wastewater_pourpoints/ocn_027b_rw0_wastewater_pourpoints_processing.py create mode 100644 ocn_027c_rw0_wastewater_watersheds/ocn_027c_rw0_wastewater_watersheds_processing.py diff --git a/ocn_027b_rw0_wastewater_pourpoints/ocn_027b_rw0_wastewater_pourpoints_processing.py b/ocn_027b_rw0_wastewater_pourpoints/ocn_027b_rw0_wastewater_pourpoints_processing.py new file mode 100644 index 00000000..171f688b --- /dev/null +++ b/ocn_027b_rw0_wastewater_pourpoints/ocn_027b_rw0_wastewater_pourpoints_processing.py @@ -0,0 +1,112 @@ +import geopandas as gpd +import os +import pyproj +from shapely.geometry import Point +import urllib.request +import sys + +utils_path = os.path.join(os.path.abspath(os.getenv('PROCESSING_DIR')),'utils') +if utils_path not in sys.path: + sys.path.append(utils_path) + +from cartoframes.auth import set_default_credentials +from cartoframes import to_carto, update_privacy_table +import util_files +import util_cloud +from zipfile import ZipFile +import logging + + +# Set up logging +# Get the top-level logger object +logger = logging.getLogger() +for handler in logger.handlers: + logger.removeHandler(handler) +logger.setLevel(logging.INFO) +# make it print to the console. +console = logging.StreamHandler() +logger.addHandler(console) +logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') + +# name of table on Carto where you want to upload data +# this should be a table name that is not currently in use +dataset_name = 'ocn_027b_rw0_wastewater_pourpoints' + +logger.info('Executing script for dataset: ' + dataset_name) +# create a new sub-directory within your specified dir called 'data' +# within this directory, create files to store raw and processed data +data_dir = util_files.prep_dirs(dataset_name) + +''' +Download data and save to your data directory +''' +logger.info('Downloading raw data') +# insert the url used to download the data from the source website +url = 'https://knb.ecoinformatics.org/knb/d1/mn/v2/object/urn%3Auuid%3Aaf8d0bd6-dc0c-4149-a3cd-93b5aed71f7c' + +# download the data from the source +raw_data_file = os.path.join(data_dir, 'N_PourPoint_And_Watershed.zip') +urllib.request.urlretrieve(url, raw_data_file) + +# unzip source data +raw_data_file_unzipped = raw_data_file.split('.')[0] +zip_ref = ZipFile(raw_data_file, 'r') +zip_ref.extractall(raw_data_file_unzipped) +zip_ref.close() + + +''' +Process data +''' + +# load in the polygon shapefile +shapes = os.path.join(raw_data_file_unzipped, 'effluent_N_pourpoints_all.shp') +gdf = gpd.read_file(shapes) + +# convert the data type of columns to integer +for col in gdf.columns[1:9]: + gdf[col] = gdf[col].fillna(0).astype('int') + +# conver geometrey from esri 54009 to epsg 4326 for display on carto +transformer = pyproj.Transformer.from_crs('esri:54009', 'epsg:4326') +lon, lat = transformer.transform(gdf['geometry'].x, gdf['geometry'].y) +gdf['geometry'] = [Point(xy) for xy in zip(lat, lon)] +gdf['geometry'] = gdf['geometry'].set_crs(epsg=4326) + + +''' +Upload processed data to Carto +''' + +logger.info('Uploading processed data to Carto.') + +# authenticate carto account 
+CARTO_USER = os.getenv('CARTO_WRI_RW_USER') +CARTO_KEY = os.getenv('CARTO_WRI_RW_KEY') +set_default_credentials(username=CARTO_USER, base_url="https://{user}.carto.com/".format(user=CARTO_USER),api_key=CARTO_KEY) + +# upload data frame to Carto +to_carto(gdf, dataset_name + '_edit', if_exists='replace') + +# set privacy to 'link' so table is accessible but not published +update_privacy_table(dataset_name + '_edit', 'link') + + +''' +Upload original data and processed data to Amazon S3 storage +''' +# initialize AWS variables +aws_bucket = 'wri-public-data' +s3_prefix = 'resourcewatch/' + +logger.info('Uploading original data to S3.') + +# Copy the raw data into a zipped file to upload to S3 +raw_data_dir = os.path.join(data_dir, dataset_name+'.zip') +with ZipFile(raw_data_dir,'w') as zip: + zip.write(raw_data_file, os.path.basename(raw_data_file)) + +# Upload raw data file to S3 +uploaded = util_cloud.aws_upload(raw_data_dir, aws_bucket, s3_prefix+os.path.basename(raw_data_dir)) + + diff --git a/ocn_027c_rw0_wastewater_watersheds/ocn_027c_rw0_wastewater_watersheds_processing.py b/ocn_027c_rw0_wastewater_watersheds/ocn_027c_rw0_wastewater_watersheds_processing.py new file mode 100644 index 00000000..e242b7d1 --- /dev/null +++ b/ocn_027c_rw0_wastewater_watersheds/ocn_027c_rw0_wastewater_watersheds_processing.py @@ -0,0 +1,112 @@ +import geopandas as gpd +import os +import pyproj +from shapely.ops import transform +import urllib.request +import sys + +utils_path = os.path.join(os.path.abspath(os.getenv('PROCESSING_DIR')),'utils') +if utils_path not in sys.path: + sys.path.append(utils_path) + +from cartoframes.auth import set_default_credentials +from cartoframes import to_carto, update_privacy_table +import util_files +import util_cloud +from zipfile import ZipFile +import logging + + +# Set up logging +# Get the top-level logger object +logger = logging.getLogger() +for handler in logger.handlers: + logger.removeHandler(handler) +logger.setLevel(logging.INFO) +# make it print to the console. 
+console = logging.StreamHandler() +logger.addHandler(console) +logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') + +# name of table on Carto where you want to upload data +# this should be a table name that is not currently in use +dataset_name = 'ocn_027c_rw0_wastewater_watersheds' + +logger.info('Executing script for dataset: ' + dataset_name) +# create a new sub-directory within your specified dir called 'data' +# within this directory, create files to store raw and processed data +data_dir = util_files.prep_dirs(dataset_name) + +''' +Download data and save to your data directory +''' +logger.info('Downloading raw data') +# insert the url used to download the data from the source website +url = 'https://knb.ecoinformatics.org/knb/d1/mn/v2/object/urn%3Auuid%3Aaf8d0bd6-dc0c-4149-a3cd-93b5aed71f7c' + +# download the data from the source +raw_data_file = os.path.join(data_dir, 'N_PourPoint_And_Watershed.zip') +urllib.request.urlretrieve(url, raw_data_file) + +# unzip source data +raw_data_file_unzipped = raw_data_file.split('.')[0] +zip_ref = ZipFile(raw_data_file, 'r') +zip_ref.extractall(raw_data_file_unzipped) +zip_ref.close() + + +''' +Process data +''' + +# load in the polygon shapefile +shapes = os.path.join(raw_data_file_unzipped, 'effluent_N_watersheds_all.shp') +gdf = gpd.read_file(shapes) + +# convert the data type of columns to integer +for col in gdf.columns[1:9]: + gdf[col] = gdf[col].fillna(0).astype('int') + +# convert geometrey from esri 54009 to epsg 4326 for display on carto +transformer = pyproj.Transformer.from_crs('esri:54009', 'epsg:4326', always_xy=True).transform +for i in range(len(gdf['geometry'])): + polygon = gdf['geometry'][i] + gdf['geometry'][i] = transform(transformer, polygon) + + + +''' +Upload processed data to Carto +''' + +logger.info('Uploading processed data to Carto.') + +# authenticate carto account +CARTO_USER = os.getenv('CARTO_WRI_RW_USER') +CARTO_KEY = os.getenv('CARTO_WRI_RW_KEY') +set_default_credentials(username=CARTO_USER, base_url="https://{user}.carto.com/".format(user=CARTO_USER),api_key=CARTO_KEY) + +# upload data frame to Carto +to_carto(gdf, dataset_name + '_edit', if_exists='replace') + +# set privacy to 'link' so table is accessible but not published +update_privacy_table(dataset_name + '_edit', 'link') + + +''' +Upload original data and processed data to Amazon S3 storage +''' +# initialize AWS variables +aws_bucket = 'wri-public-data' +s3_prefix = 'resourcewatch/' + +logger.info('Uploading original data to S3.') + +# Copy the raw data into a zipped file to upload to S3 +raw_data_dir = os.path.join(data_dir, dataset_name+'.zip') +with ZipFile(raw_data_dir,'w') as zip: + zip.write(raw_data_file, os.path.basename(raw_data_file)) + +# Upload raw data file to S3 +uploaded = util_cloud.aws_upload(raw_data_dir, aws_bucket, s3_prefix+os.path.basename(raw_data_dir)) + From b1108d3f10efa6774f925db3a5cde37b2e0bc6c8 Mon Sep 17 00:00:00 2001 From: Claire Hemmerly Date: Mon, 27 Jun 2022 10:29:12 -0500 Subject: [PATCH 13/20] fixed typo --- .../ocn_027b_rw0_wastewater_pourpoints_processing.py | 2 +- .../ocn_027c_rw0_wastewater_watersheds_processing.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ocn_027b_rw0_wastewater_pourpoints/ocn_027b_rw0_wastewater_pourpoints_processing.py b/ocn_027b_rw0_wastewater_pourpoints/ocn_027b_rw0_wastewater_pourpoints_processing.py index 171f688b..285185a7 100644 --- a/ocn_027b_rw0_wastewater_pourpoints/ocn_027b_rw0_wastewater_pourpoints_processing.py +++ 
b/ocn_027b_rw0_wastewater_pourpoints/ocn_027b_rw0_wastewater_pourpoints_processing.py @@ -67,7 +67,7 @@ for col in gdf.columns[1:9]: gdf[col] = gdf[col].fillna(0).astype('int') -# conver geometrey from esri 54009 to epsg 4326 for display on carto +# convert geometry from esri 54009 to epsg 4326 for display on carto transformer = pyproj.Transformer.from_crs('esri:54009', 'epsg:4326') lon, lat = transformer.transform(gdf['geometry'].x, gdf['geometry'].y) gdf['geometry'] = [Point(xy) for xy in zip(lat, lon)] diff --git a/ocn_027c_rw0_wastewater_watersheds/ocn_027c_rw0_wastewater_watersheds_processing.py b/ocn_027c_rw0_wastewater_watersheds/ocn_027c_rw0_wastewater_watersheds_processing.py index e242b7d1..5a2afc51 100644 --- a/ocn_027c_rw0_wastewater_watersheds/ocn_027c_rw0_wastewater_watersheds_processing.py +++ b/ocn_027c_rw0_wastewater_watersheds/ocn_027c_rw0_wastewater_watersheds_processing.py @@ -67,7 +67,7 @@ for col in gdf.columns[1:9]: gdf[col] = gdf[col].fillna(0).astype('int') -# convert geometrey from esri 54009 to epsg 4326 for display on carto +# convert geometry from esri 54009 to epsg 4326 for display on carto transformer = pyproj.Transformer.from_crs('esri:54009', 'epsg:4326', always_xy=True).transform for i in range(len(gdf['geometry'])): polygon = gdf['geometry'][i] From 9311d6dcba71775e2f0a9d957bf9d925af89a9c5 Mon Sep 17 00:00:00 2001 From: clairehemmerly <103712609+clairehemmerly@users.noreply.github.com> Date: Mon, 27 Jun 2022 10:31:18 -0500 Subject: [PATCH 14/20] Create README.md --- ocn_027b_rw0_wastewater_pourpoints/README.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 ocn_027b_rw0_wastewater_pourpoints/README.md diff --git a/ocn_027b_rw0_wastewater_pourpoints/README.md b/ocn_027b_rw0_wastewater_pourpoints/README.md new file mode 100644 index 00000000..3d6f37ce --- /dev/null +++ b/ocn_027b_rw0_wastewater_pourpoints/README.md @@ -0,0 +1,19 @@ +## Wastewater Inputs to Coastal Areas Dataset Pre-processing +This file describes the data pre-processing that was done to the [Global Inputs and Impacts from of Human Sewage in Coastal Ecosystems](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0258898) for [display on Resource Watch](https://resourcewatch.org/data/explore/11804f04-d9c7-47b9-8d27-27ce6ed6c042). + +The source provided this dataset as a shapefile containing point data. + +Below, we describe the steps used to reformat the shapefile to upload it to Carto: + +1. Read in the table as a geopandas data frame. +2. Convert the data type of the columns to integer (geometry column excluded). +3. Transform the projection from ESRI 54009 to EPSG 4326. + + +Please see the [Python script](https://github.com/resource-watch/data-pre-processing/blob/master/ocn_027b_rw0_wastewater_pourpoints/ocn_027b_rw0_wastewater_pourpoints_processing.py) for more details on this processing. + +You can view the processed Wastewater Inputs to Coastal Areas dataset [on Resource Watch](https://resourcewatch.org/data/explore/5bf349ec-3b14-4021-a7d4-fc4b8104bd74). + +You can also download the original dataset [directly through Resource Watch](http://wri-public-data.s3.amazonaws.com/resourcewatch/raster/ocn_027b_rw0_wastewater_pourpoints.zip), or [from the source website](https://knb.ecoinformatics.org/view/urn%3Auuid%3Ac7bdc77e-6c7d-46b6-8bfc-a66491119d07). + +###### Note: This dataset processing was done by Claire Hemmerly, and QC'd by [Chris Rowe](https://www.wri.org/profile/chris-rowe). 
From a0e861f10b210aff07f1a1e91654e910c6eabe62 Mon Sep 17 00:00:00 2001 From: clairehemmerly <103712609+clairehemmerly@users.noreply.github.com> Date: Mon, 27 Jun 2022 10:31:54 -0500 Subject: [PATCH 15/20] Create README.md --- ocn_027c_rw0_wastewater_watersheds/README.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 ocn_027c_rw0_wastewater_watersheds/README.md diff --git a/ocn_027c_rw0_wastewater_watersheds/README.md b/ocn_027c_rw0_wastewater_watersheds/README.md new file mode 100644 index 00000000..f6a34a90 --- /dev/null +++ b/ocn_027c_rw0_wastewater_watersheds/README.md @@ -0,0 +1,19 @@ +## Watersheds that Transport Wastewater to Coastal Ocean Dataset Pre-processing +This file describes the data pre-processing that was done to the [Global Inputs and Impacts from of Human Sewage in Coastal Ecosystems](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0258898) for [display on Resource Watch](https://resourcewatch.org/data/explore/784732cc-8e7e-4dac-be51-d4506ff2ee04). + +The source provided this dataset as a shapefile containing polygon data. + +Below, we describe the steps used to reformat the shapefile to upload it to Carto: + +1. Read in the table as a geopandas data frame. +2. Convert the data type of the columns to integer (geometry column excluded). +3. Transform the projection from ESRI 54009 to EPSG 4326. + + +Please see the [Python script](https://github.com/resource-watch/data-pre-processing/blob/master/ocn_027c_rw0_wastewater_watersheds/ocn_027b_rw0_wastewater_watersheds_processing.py) for more details on this processing. + +You can view the processed Watersheds that Transport Wastewater to Coastal Ocean dataset [on Resource Watch](https://resourcewatch.org/data/explore/5bf349ec-3b14-4021-a7d4-fc4b8104bd74). + +You can also download the original dataset [directly through Resource Watch](http://wri-public-data.s3.amazonaws.com/resourcewatch/raster/ocn_027c_rw0_wastewater_watersheds.zip), or [from the source website](https://knb.ecoinformatics.org/view/urn%3Auuid%3Ac7bdc77e-6c7d-46b6-8bfc-a66491119d07). + +###### Note: This dataset processing was done by Claire Hemmerly, and QC'd by [Chris Rowe](https://www.wri.org/profile/chris-rowe). From 19453f9e0ca07b613ee5c77333c1fd8f05d0c2ef Mon Sep 17 00:00:00 2001 From: clairehemmerly <103712609+clairehemmerly@users.noreply.github.com> Date: Tue, 12 Jul 2022 13:39:04 -0500 Subject: [PATCH 16/20] Update README.md --- ocn_027a_rw0_nitrogen_plumes/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ocn_027a_rw0_nitrogen_plumes/README.md b/ocn_027a_rw0_nitrogen_plumes/README.md index ad77dacd..013c68e7 100644 --- a/ocn_027a_rw0_nitrogen_plumes/README.md +++ b/ocn_027a_rw0_nitrogen_plumes/README.md @@ -1,5 +1,5 @@ ## Wastewater Plumes in Coastal Areas Dataset Pre-processing -This file describes the data pre-processing that was done to the [Global Inputs and Impacts from of Human Sewage in Coastal Ecosystems](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0258898) for [display on Resource Watch](https://resourcewatch.org/data/explore/11804f04-d9c7-47b9-8d27-27ce6ed6c042). +This file describes the data pre-processing that was done to the [Global Inputs and Impacts from of Human Sewage in Coastal Ecosystems](https://knb.ecoinformatics.org/view/doi:10.5063/F76B09) for [display on Resource Watch](https://resourcewatch.org/data/explore/11804f04-d9c7-47b9-8d27-27ce6ed6c042). 
This dataset is provided as a series of GeoTIFF files from the data provider to the Resource Watch data team. @@ -9,6 +9,6 @@ Please see the [Python script](https://github.com/resource-watch/data-pre-proces You can view the processed Wastewater Plumes in Coastal Areas dataset [on Resource Watch](https://resourcewatch.org/data/explore/11804f04-d9c7-47b9-8d27-27ce6ed6c042). -You can also download the original dataset [directly through Resource Watch](http://wri-public-data.s3.amazonaws.com/resourcewatch/raster/ocn_027_rw0_nitrogen_plumes.zip), or [from the source website](https://knb.ecoinformatics.org/view/urn%3Auuid%3Ac7bdc77e-6c7d-46b6-8bfc-a66491119d07). +You can also download the original dataset [from the source website](https://knb.ecoinformatics.org/view/urn%3Auuid%3Ac7bdc77e-6c7d-46b6-8bfc-a66491119d07). -###### Note: This dataset processing was done by Claire Hemmerly, and QC'd by [Chris Rowe](https://www.wri.org/profile/chris-rowe). +###### Note: This dataset processing was done by [Claire Hemmerly](https://github.com/clairehemmerly), and QC'd by [Chris Rowe](https://www.wri.org/profile/chris-rowe). From 83daf030dfe02e12725753304fd8c875ea8f0785 Mon Sep 17 00:00:00 2001 From: clairehemmerly <103712609+clairehemmerly@users.noreply.github.com> Date: Tue, 12 Jul 2022 14:37:19 -0500 Subject: [PATCH 17/20] Update README.md --- ocn_027b_rw0_wastewater_pourpoints/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocn_027b_rw0_wastewater_pourpoints/README.md b/ocn_027b_rw0_wastewater_pourpoints/README.md index 3d6f37ce..99e721b4 100644 --- a/ocn_027b_rw0_wastewater_pourpoints/README.md +++ b/ocn_027b_rw0_wastewater_pourpoints/README.md @@ -14,6 +14,6 @@ Please see the [Python script](https://github.com/resource-watch/data-pre-proces You can view the processed Wastewater Inputs to Coastal Areas dataset [on Resource Watch](https://resourcewatch.org/data/explore/5bf349ec-3b14-4021-a7d4-fc4b8104bd74). -You can also download the original dataset [directly through Resource Watch](http://wri-public-data.s3.amazonaws.com/resourcewatch/raster/ocn_027b_rw0_wastewater_pourpoints.zip), or [from the source website](https://knb.ecoinformatics.org/view/urn%3Auuid%3Ac7bdc77e-6c7d-46b6-8bfc-a66491119d07). +You can also download the original dataset [directly through Resource Watch](https://wri-public-data.s3.amazonaws.com/resourcewatch/ocn_027b_rw0_wastewater_pourpoints.zip), or [from the source website](https://knb.ecoinformatics.org/view/urn%3Auuid%3Ac7bdc77e-6c7d-46b6-8bfc-a66491119d07). ###### Note: This dataset processing was done by Claire Hemmerly, and QC'd by [Chris Rowe](https://www.wri.org/profile/chris-rowe). From 78c4d305caa1569720e7a475d99e4fbdc3a8a663 Mon Sep 17 00:00:00 2001 From: clairehemmerly <103712609+clairehemmerly@users.noreply.github.com> Date: Tue, 12 Jul 2022 14:41:05 -0500 Subject: [PATCH 18/20] Update README.md --- ocn_027c_rw0_wastewater_watersheds/README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ocn_027c_rw0_wastewater_watersheds/README.md b/ocn_027c_rw0_wastewater_watersheds/README.md index f6a34a90..7700fa20 100644 --- a/ocn_027c_rw0_wastewater_watersheds/README.md +++ b/ocn_027c_rw0_wastewater_watersheds/README.md @@ -10,10 +10,11 @@ Below, we describe the steps used to reformat the shapefile to upload it to Cart 3. Transform the projection from ESRI 54009 to EPSG 4326. 
-Please see the [Python script](https://github.com/resource-watch/data-pre-processing/blob/master/ocn_027c_rw0_wastewater_watersheds/ocn_027b_rw0_wastewater_watersheds_processing.py) for more details on this processing. +Please see the [Python script](https://github.com/resource-watch/data-pre-processing/blob/master/ocn_027c_rw0_wastewater_watersheds/ocn_027c_rw0_wastewater_watersheds_processing.py) for more details on this processing. You can view the processed Watersheds that Transport Wastewater to Coastal Ocean dataset [on Resource Watch](https://resourcewatch.org/data/explore/5bf349ec-3b14-4021-a7d4-fc4b8104bd74). -You can also download the original dataset [directly through Resource Watch](http://wri-public-data.s3.amazonaws.com/resourcewatch/raster/ocn_027c_rw0_wastewater_watersheds.zip), or [from the source website](https://knb.ecoinformatics.org/view/urn%3Auuid%3Ac7bdc77e-6c7d-46b6-8bfc-a66491119d07). +You can also download the original dataset [directly through Resource Watch]([http://wri-public-data.s3.amazonaws.com/resourcewatch/raster/ocn_027c_rw0_wastewater_watersheds.zip](https://wri-public-data.s3.amazonaws.com/resourcewatch/ocn_027c_rw0_wastewater_watersheds.zip +)), or [from the source website](https://knb.ecoinformatics.org/view/urn%3Auuid%3Ac7bdc77e-6c7d-46b6-8bfc-a66491119d07). ###### Note: This dataset processing was done by Claire Hemmerly, and QC'd by [Chris Rowe](https://www.wri.org/profile/chris-rowe). From bbf6ec3a9538c1ca7ebb19e8a9bda46ef1da28c1 Mon Sep 17 00:00:00 2001 From: clairehemmerly <103712609+clairehemmerly@users.noreply.github.com> Date: Tue, 12 Jul 2022 14:42:24 -0500 Subject: [PATCH 19/20] Update README.md --- ocn_027c_rw0_wastewater_watersheds/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ocn_027c_rw0_wastewater_watersheds/README.md b/ocn_027c_rw0_wastewater_watersheds/README.md index 7700fa20..c529edc2 100644 --- a/ocn_027c_rw0_wastewater_watersheds/README.md +++ b/ocn_027c_rw0_wastewater_watersheds/README.md @@ -14,7 +14,7 @@ Please see the [Python script](https://github.com/resource-watch/data-pre-proces You can view the processed Watersheds that Transport Wastewater to Coastal Ocean dataset [on Resource Watch](https://resourcewatch.org/data/explore/5bf349ec-3b14-4021-a7d4-fc4b8104bd74). -You can also download the original dataset [directly through Resource Watch]([http://wri-public-data.s3.amazonaws.com/resourcewatch/raster/ocn_027c_rw0_wastewater_watersheds.zip](https://wri-public-data.s3.amazonaws.com/resourcewatch/ocn_027c_rw0_wastewater_watersheds.zip -)), or [from the source website](https://knb.ecoinformatics.org/view/urn%3Auuid%3Ac7bdc77e-6c7d-46b6-8bfc-a66491119d07). +You can also download the original dataset [directly through Resource Watch](https://wri-public-data.s3.amazonaws.com/resourcewatch/ocn_027c_rw0_wastewater_watersheds.zip +), or [from the source website](https://knb.ecoinformatics.org/view/urn%3Auuid%3Ac7bdc77e-6c7d-46b6-8bfc-a66491119d07). ###### Note: This dataset processing was done by Claire Hemmerly, and QC'd by [Chris Rowe](https://www.wri.org/profile/chris-rowe). 
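
The watersheds dataset covered by the README above holds polygon rather than point data, so the ocn_027c script reprojects each geometry vertex by vertex with pyproj and shapely.ops.transform. A minimal sketch of that step, assuming the GeoDataFrame gdf has already been read and its columns cast to integer as in the README steps (apply is used here as an equivalent of the script's per-row loop):

import pyproj
from shapely.ops import transform

# build a Mollweide (ESRI 54009) -> WGS84 (EPSG 4326) transformer; always_xy keeps (x, y) axis order
project = pyproj.Transformer.from_crs('esri:54009', 'epsg:4326', always_xy=True).transform

# reproject every polygon and record the new CRS on the GeoDataFrame
gdf['geometry'] = gdf['geometry'].apply(lambda geom: transform(project, geom))
gdf = gdf.set_crs(epsg=4326, allow_override=True)
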
From c0d3b367ec83e5b987e26403674333dca01e1fbc Mon Sep 17 00:00:00 2001 From: Claire Hemmerly Date: Wed, 13 Jul 2022 17:44:44 -0500 Subject: [PATCH 20/20] corrections based on pull request comments --- ...ocn_027a_rw0_nitrogen_plumes_processing.py | 4 +-- ...7b_rw0_wastewater_pourpoints_processing.py | 31 ++++++++++++++++-- ...7c_rw0_wastewater_watersheds_processing.py | 32 +++++++++++++++++-- 3 files changed, 61 insertions(+), 6 deletions(-) diff --git a/ocn_027a_rw0_nitrogen_plumes/ocn_027a_rw0_nitrogen_plumes_processing.py b/ocn_027a_rw0_nitrogen_plumes/ocn_027a_rw0_nitrogen_plumes_processing.py index 2d672a4a..a806b1e7 100644 --- a/ocn_027a_rw0_nitrogen_plumes/ocn_027a_rw0_nitrogen_plumes_processing.py +++ b/ocn_027a_rw0_nitrogen_plumes/ocn_027a_rw0_nitrogen_plumes_processing.py @@ -159,7 +159,7 @@ Upload original data and processed data to Amazon S3 storage ''' # initialize AWS variables -aws_bucket = 'wri-public-data' +aws_bucket = 'wri-projects' s3_prefix = 'resourcewatch/raster/' # Copy the raw data into a zipped file to upload to S3 @@ -170,7 +170,7 @@ with ZipFile(raw_data_dir,'w') as zip: raw_data_files = data_dict['raw_data_file'] for raw_data_file in raw_data_files: - zip.write(raw_data_file, os.path.basename(raw_data_file)) + zip.write(raw_data_file, os.path.basename(raw_data_file),compress_type= zipfile.ZIP_DEFLATED) # Upload raw data file to S3 uploaded = util_cloud.aws_upload(raw_data_dir, aws_bucket, s3_prefix + os.path.basename(raw_data_dir)) diff --git a/ocn_027b_rw0_wastewater_pourpoints/ocn_027b_rw0_wastewater_pourpoints_processing.py b/ocn_027b_rw0_wastewater_pourpoints/ocn_027b_rw0_wastewater_pourpoints_processing.py index 285185a7..49d75575 100644 --- a/ocn_027b_rw0_wastewater_pourpoints/ocn_027b_rw0_wastewater_pourpoints_processing.py +++ b/ocn_027b_rw0_wastewater_pourpoints/ocn_027b_rw0_wastewater_pourpoints_processing.py @@ -1,5 +1,6 @@ import geopandas as gpd import os +import glob import pyproj from shapely.geometry import Point import urllib.request @@ -60,8 +61,11 @@ ''' # load in the polygon shapefile -shapes = os.path.join(raw_data_file_unzipped, 'effluent_N_pourpoints_all.shp') -gdf = gpd.read_file(shapes) +shapefile = os.path.join(raw_data_file_unzipped, 'effluent_N_pourpoints_all.shp') +gdf = gpd.read_file(shapefile) + +# create a path to save the processed shapefile later +processed_data_file = os.path.join(data_dir, dataset_name+'_edit.shp') # convert the data type of columns to integer for col in gdf.columns[1:9]: @@ -73,6 +77,18 @@ gdf['geometry'] = [Point(xy) for xy in zip(lat, lon)] gdf['geometry'] = gdf['geometry'].set_crs(epsg=4326) +# create an index column to use as cartodb_id +gdf['cartodb_id'] = gdf.index + +# rename columns to match names in carto +gdf.columns = [x.lower().replace(' ', '_') for x in gdf.columns] + +# reorder the columns +gdf = gdf[['cartodb_id'] + list(gdf)[:-1]] + +# save processed dataset to shapefile +gdf.to_file(processed_data_file, driver='ESRI Shapefile') + ''' Upload processed data to Carto @@ -109,4 +125,15 @@ # Upload raw data file to S3 uploaded = util_cloud.aws_upload(raw_data_dir, aws_bucket, s3_prefix+os.path.basename(raw_data_dir)) +# Copy the processed data into a zipped file to upload to S3 +processed_data_dir = os.path.join(data_dir, dataset_name+'_edit.zip') +# find all the necessary components of the shapefiles +processed_data_files = glob.glob(os.path.join(data_dir, dataset_name + '_edit.*')) +with ZipFile(processed_data_dir,'w') as zip: + for file in processed_data_files: + zip.write(file, 
os.path.basename(file)) + +# Upload processed data file to S3 +uploaded = util_cloud.aws_upload(processed_data_dir, aws_bucket, s3_prefix+os.path.basename(processed_data_dir)) + diff --git a/ocn_027c_rw0_wastewater_watersheds/ocn_027c_rw0_wastewater_watersheds_processing.py b/ocn_027c_rw0_wastewater_watersheds/ocn_027c_rw0_wastewater_watersheds_processing.py index 5a2afc51..652e2449 100644 --- a/ocn_027c_rw0_wastewater_watersheds/ocn_027c_rw0_wastewater_watersheds_processing.py +++ b/ocn_027c_rw0_wastewater_watersheds/ocn_027c_rw0_wastewater_watersheds_processing.py @@ -1,5 +1,6 @@ import geopandas as gpd import os +import glob import pyproj from shapely.ops import transform import urllib.request @@ -60,8 +61,11 @@ ''' # load in the polygon shapefile -shapes = os.path.join(raw_data_file_unzipped, 'effluent_N_watersheds_all.shp') -gdf = gpd.read_file(shapes) +shapefile = os.path.join(raw_data_file_unzipped, 'effluent_N_watersheds_all.shp') +gdf = gpd.read_file(shapefile) + +# create a path to save the processed shapefile later +processed_data_file = os.path.join(data_dir, dataset_name+'_edit.shp') # convert the data type of columns to integer for col in gdf.columns[1:9]: @@ -73,6 +77,18 @@ polygon = gdf['geometry'][i] gdf['geometry'][i] = transform(transformer, polygon) +# create an index column to use as cartodb_id +gdf['cartodb_id'] = gdf.index + +# rename columns to match names in carto +gdf.columns = [x.lower().replace(' ', '_') for x in gdf.columns] + +# reorder the columns +gdf = gdf[['cartodb_id'] + list(gdf)[:-1]] + +# save processed dataset to shapefile +gdf.to_file(processed_data_file, driver='ESRI Shapefile') + ''' @@ -110,3 +126,15 @@ # Upload raw data file to S3 uploaded = util_cloud.aws_upload(raw_data_dir, aws_bucket, s3_prefix+os.path.basename(raw_data_dir)) +# Copy the processed data into a zipped file to upload to S3 +processed_data_dir = os.path.join(data_dir, dataset_name+'_edit.zip') +# find all the necessary components of the shapefiles +processed_data_files = glob.glob(os.path.join(data_dir, dataset_name + '_edit.*')) +with ZipFile(processed_data_dir,'w') as zip: + for file in processed_data_files: + zip.write(file, os.path.basename(file)) + +# Upload processed data file to S3 +uploaded = util_cloud.aws_upload(processed_data_dir, aws_bucket, s3_prefix+os.path.basename(processed_data_dir)) + +
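
After the final patch, each processed table also lives on Carto as '<dataset_name>_edit'. A hypothetical spot check, not part of the processing scripts, is to read the table back with cartoframes (assuming its read_carto helper and the same credential environment variables used above) and confirm the row count and renamed columns look right:

import os
from cartoframes import read_carto
from cartoframes.auth import set_default_credentials

# authenticate with the same environment variables the processing scripts use
set_default_credentials(username=os.getenv('CARTO_WRI_RW_USER'), api_key=os.getenv('CARTO_WRI_RW_KEY'))

# read the uploaded table back and inspect it
carto_gdf = read_carto('ocn_027c_rw0_wastewater_watersheds_edit')
print(len(carto_gdf), list(carto_gdf.columns))  # expect the processed row count and lower-case column names
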