You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@sdap.apache.org by sk...@apache.org on 2023/08/16 21:48:38 UTC

[incubator-sdap-nexus] branch SDAP-481a created (now 187c472)

This is an automated email from the ASF dual-hosted git repository.

skperez pushed a change to branch SDAP-481a
in repository https://gitbox.apache.org/repos/asf/incubator-sdap-nexus.git


      at 187c472  Add support for NetCDF compression

This branch includes the following new commits:

     new 187c472  Add support for NetCDF compression

The 1 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.



[incubator-sdap-nexus] 01/01: Add support for NetCDF compression

Posted by sk...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

skperez pushed a commit to branch SDAP-481a
in repository https://gitbox.apache.org/repos/asf/incubator-sdap-nexus.git

commit 187c4722688ba78f1445aa2178418640c02b04c6
Author: skorper <st...@gmail.com>
AuthorDate: Wed Aug 16 14:38:48 2023 -0700

    Add support for NetCDF compression
---
 CHANGELOG.md                                       |  1 +
 analysis/conda-requirements.txt                    |  2 +-
 .../webservice/algorithms/doms/BaseDomsHandler.py  | 40 +++++++++++++++++-----
 3 files changed, 33 insertions(+), 10 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index c857124..bc7581d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - SDAP-467: Added pagination to cdmsresults endpoint
 - SDAP-461: Added 4 remaining Saildrone insitu datasets.
 - SDAP-473: Added support for matchup job prioritization
+- SDAP-481: Added support for NetCDF compression
 ### Changed
 - SDAP-453: Updated results storage and retrieval to support output JSON from `/cdmsresults` that matches output from `/match_spark`.
   - **NOTE:** Deploying these changes to an existing SDAP deployment will require modifying the Cassandra database with stored results. There is a script to do so at `/tools/update-doms-data-schema/update.py`
diff --git a/analysis/conda-requirements.txt b/analysis/conda-requirements.txt
index e27bdea..c092350 100644
--- a/analysis/conda-requirements.txt
+++ b/analysis/conda-requirements.txt
@@ -14,7 +14,7 @@
 # limitations under the License.
 
 
-netcdf4==1.5.5.1
+netcdf4==1.6.4
 basemap==1.2.2
 scipy==1.6.0
 pyspark==3.2.1
diff --git a/analysis/webservice/algorithms/doms/BaseDomsHandler.py b/analysis/webservice/algorithms/doms/BaseDomsHandler.py
index 4140684..1ed398f 100644
--- a/analysis/webservice/algorithms/doms/BaseDomsHandler.py
+++ b/analysis/webservice/algorithms/doms/BaseDomsHandler.py
@@ -297,6 +297,8 @@ class DomsCSVFormatter:
 
 
 class DomsNetCDFFormatter:
+    compression = 'zlib'
+    comp_level = 5
     @staticmethod
     def create(executionId, results, params, details):
 
@@ -362,17 +364,20 @@ class DomsNetCDFFormatter:
 
         #Create Satellite group, variables, and attributes
         satelliteGroup = dataset.createGroup(satellite_group_name)
-        satelliteWriter = DomsNetCDFValueWriter(satelliteGroup, params["parameter"])
+        satelliteWriter = DomsNetCDFValueWriter(satelliteGroup, DomsNetCDFFormatter.compression, DomsNetCDFFormatter.comp_level)
 
         # Create InSitu group, variables, and attributes
         insituGroup = dataset.createGroup(insitu_group_name)
-        insituWriter = DomsNetCDFValueWriter(insituGroup, params["parameter"])
+        insituWriter = DomsNetCDFValueWriter(insituGroup, DomsNetCDFFormatter.compression, DomsNetCDFFormatter.comp_level)
 
         # Add data to Insitu and Satellite groups, generate array of match ID pairs
         matches = DomsNetCDFFormatter.__writeResults(results, satelliteWriter, insituWriter)
         dataset.createDimension("MatchedRecords", size=None)
         dataset.createDimension("MatchedGroups", size=2)
-        matchArray = dataset.createVariable("matchIDs", "f4", ("MatchedRecords", "MatchedGroups"))
+        matchArray = dataset.createVariable(
+            'matchIDs', 'f4', ('MatchedRecords', 'MatchedGroups'),
+            compression=DomsNetCDFFormatter.compression, complevel=DomsNetCDFFormatter.comp_level
+        )
         matchArray[:] = matches
 
         dataset.close()
@@ -441,7 +446,7 @@ class DomsNetCDFFormatter:
 
 
 class DomsNetCDFValueWriter:
-    def __init__(self, group, matchup_parameter):
+    def __init__(self, group, compression=None, comp_level=None):
         group.createDimension("dim", size=None)
         self.group = group
 
@@ -454,6 +459,9 @@ class DomsNetCDFValueWriter:
         self.secondary_group_name = "SecondaryData"
         self.data_map = defaultdict(list)
 
+        self.compression = compression
+        self.comp_level = comp_level
+
     def addData(self, result_item):
         """
         Populate DomsNetCDFValueWriter fields from matchup results dict
@@ -491,9 +499,18 @@ class DomsNetCDFValueWriter:
         #
         # Create variables, enrich with attributes, and add data
         #
-        lonVar = self.group.createVariable('lon', 'f4', ('dim',), fill_value=-32767.0)
-        latVar = self.group.createVariable('lat', 'f4', ('dim',), fill_value=-32767.0)
-        timeVar = self.group.createVariable('time', 'f4', ('dim',), fill_value=-32767.0)
+        lonVar = self.group.createVariable(
+            'lon', 'f4', ('dim',), fill_value=-32767.0,
+            compression=self.compression, complevel=self.comp_level
+        )
+        latVar = self.group.createVariable(
+            'lat', 'f4', ('dim',), fill_value=-32767.0,
+            compression=self.compression, complevel=self.comp_level
+        )
+        timeVar = self.group.createVariable(
+            'time', 'f4', ('dim',), fill_value=-32767.0,
+            compression=self.compression, complevel=self.comp_level
+        )
 
         self.__enrichLon(lonVar, min(self.lon), max(self.lon))
         self.__enrichLat(latVar, min(self.lat), max(self.lat))
@@ -505,7 +522,10 @@ class DomsNetCDFValueWriter:
 
         # Add depth variable, if present
         if self.depth and any(self.depth):
-            depthVar = self.group.createVariable('depth', 'f4', ('dim',), fill_value=-32767.0)
+            depthVar = self.group.createVariable(
+                'depth', 'f4', ('dim',), fill_value=-32767.0,
+                compression=self.compression, complevel=self.comp_level
+            )
             self.__enrichDepth(depthVar, self.__calcMin(self.depth), max(self.depth))
             depthVar[:] = self.depth
 
@@ -533,7 +553,9 @@ class DomsNetCDFValueWriter:
                 cf_name = variable[1]
 
                 data_variable = self.group.createVariable(
-                    cf_name if cf_name is not None and cf_name != '' else name, 'f4', ('dim',), fill_value=-32767.0)
+                    cf_name if cf_name is not None and cf_name != '' else name, 'f4', ('dim',),
+                    fill_value=-32767.0, compression=self.compression, complevel=self.comp_level
+                )
                 # Find min/max for data variables. It is possible for 'None' to
                 # be in this list, so filter those out when doing the calculation.
                 min_data = np.nanmin(variables[variable])