You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@sdap.apache.org by sk...@apache.org on 2023/08/16 21:48:39 UTC

[incubator-sdap-nexus] 01/01: Add support for netcdf compression

This is an automated email from the ASF dual-hosted git repository.

skperez pushed a commit to branch SDAP-481a
in repository https://gitbox.apache.org/repos/asf/incubator-sdap-nexus.git

commit 187c4722688ba78f1445aa2178418640c02b04c6
Author: skorper <st...@gmail.com>
AuthorDate: Wed Aug 16 14:38:48 2023 -0700

    Add support for netcdf compression
---
 CHANGELOG.md                                       |  1 +
 analysis/conda-requirements.txt                    |  2 +-
 .../webservice/algorithms/doms/BaseDomsHandler.py  | 40 +++++++++++++++++-----
 3 files changed, 33 insertions(+), 10 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index c857124..bc7581d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - SDAP-467: Added pagination to cdmsresults endpoint
 - SDAP-461: Added 4 remaining Saildrone insitu datasets.
 - SDAP-473: Added support for matchup job prioritization
+- SDAP-481: Added support for NetCDF compression
 ### Changed
 - SDAP-453: Updated results storage and retrieval to support output JSON from `/cdmsresults` that matches output from `/match_spark`.
   - **NOTE:** Deploying these changes to an existing SDAP deployment will require modifying the Cassandra database with stored results. There is a script to do so at `/tools/update-doms-data-schema/update.py`
diff --git a/analysis/conda-requirements.txt b/analysis/conda-requirements.txt
index e27bdea..c092350 100644
--- a/analysis/conda-requirements.txt
+++ b/analysis/conda-requirements.txt
@@ -14,7 +14,7 @@
 # limitations under the License.
 
 
-netcdf4==1.5.5.1
+netcdf4==1.6.4
 basemap==1.2.2
 scipy==1.6.0
 pyspark==3.2.1
diff --git a/analysis/webservice/algorithms/doms/BaseDomsHandler.py b/analysis/webservice/algorithms/doms/BaseDomsHandler.py
index 4140684..1ed398f 100644
--- a/analysis/webservice/algorithms/doms/BaseDomsHandler.py
+++ b/analysis/webservice/algorithms/doms/BaseDomsHandler.py
@@ -297,6 +297,8 @@ class DomsCSVFormatter:
 
 
 class DomsNetCDFFormatter:
+    compression = 'zlib'
+    comp_level = 5
     @staticmethod
     def create(executionId, results, params, details):
 
@@ -362,17 +364,20 @@ class DomsNetCDFFormatter:
 
         #Create Satellite group, variables, and attributes
         satelliteGroup = dataset.createGroup(satellite_group_name)
-        satelliteWriter = DomsNetCDFValueWriter(satelliteGroup, params["parameter"])
+        satelliteWriter = DomsNetCDFValueWriter(satelliteGroup, DomsNetCDFFormatter.compression, DomsNetCDFFormatter.comp_level)
 
         # Create InSitu group, variables, and attributes
         insituGroup = dataset.createGroup(insitu_group_name)
-        insituWriter = DomsNetCDFValueWriter(insituGroup, params["parameter"])
+        insituWriter = DomsNetCDFValueWriter(insituGroup, DomsNetCDFFormatter.compression, DomsNetCDFFormatter.comp_level)
 
         # Add data to Insitu and Satellite groups, generate array of match ID pairs
         matches = DomsNetCDFFormatter.__writeResults(results, satelliteWriter, insituWriter)
         dataset.createDimension("MatchedRecords", size=None)
         dataset.createDimension("MatchedGroups", size=2)
-        matchArray = dataset.createVariable("matchIDs", "f4", ("MatchedRecords", "MatchedGroups"))
+        matchArray = dataset.createVariable(
+            'matchIDs', 'f4', ('MatchedRecords', 'MatchedGroups'),
+            compression=DomsNetCDFFormatter.compression, complevel=DomsNetCDFFormatter.comp_level
+        )
         matchArray[:] = matches
 
         dataset.close()
@@ -441,7 +446,7 @@ class DomsNetCDFFormatter:
 
 
 class DomsNetCDFValueWriter:
-    def __init__(self, group, matchup_parameter):
+    def __init__(self, group, compression=None, comp_level=None):
         group.createDimension("dim", size=None)
         self.group = group
 
@@ -454,6 +459,9 @@ class DomsNetCDFValueWriter:
         self.secondary_group_name = "SecondaryData"
         self.data_map = defaultdict(list)
 
+        self.compression = compression
+        self.comp_level = comp_level
+
     def addData(self, result_item):
         """
         Populate DomsNetCDFValueWriter fields from matchup results dict
@@ -491,9 +499,18 @@ class DomsNetCDFValueWriter:
         #
         # Create variables, enrich with attributes, and add data
         #
-        lonVar = self.group.createVariable('lon', 'f4', ('dim',), fill_value=-32767.0)
-        latVar = self.group.createVariable('lat', 'f4', ('dim',), fill_value=-32767.0)
-        timeVar = self.group.createVariable('time', 'f4', ('dim',), fill_value=-32767.0)
+        lonVar = self.group.createVariable(
+            'lon', 'f4', ('dim',), fill_value=-32767.0,
+            compression=self.compression, complevel=self.comp_level
+        )
+        latVar = self.group.createVariable(
+            'lat', 'f4', ('dim',), fill_value=-32767.0,
+            compression=self.compression, complevel=self.comp_level
+        )
+        timeVar = self.group.createVariable(
+            'time', 'f4', ('dim',), fill_value=-32767.0,
+            compression=self.compression, complevel=self.comp_level
+        )
 
         self.__enrichLon(lonVar, min(self.lon), max(self.lon))
         self.__enrichLat(latVar, min(self.lat), max(self.lat))
@@ -505,7 +522,10 @@ class DomsNetCDFValueWriter:
 
         # Add depth variable, if present
         if self.depth and any(self.depth):
-            depthVar = self.group.createVariable('depth', 'f4', ('dim',), fill_value=-32767.0)
+            depthVar = self.group.createVariable(
+                'depth', 'f4', ('dim',), fill_value=-32767.0,
+                compression=self.compression, complevel=self.comp_level
+            )
             self.__enrichDepth(depthVar, self.__calcMin(self.depth), max(self.depth))
             depthVar[:] = self.depth
 
@@ -533,7 +553,9 @@ class DomsNetCDFValueWriter:
                 cf_name = variable[1]
 
                 data_variable = self.group.createVariable(
-                    cf_name if cf_name is not None and cf_name != '' else name, 'f4', ('dim',), fill_value=-32767.0)
+                    cf_name if cf_name is not None and cf_name != '' else name, 'f4', ('dim',),
+                    fill_value=-32767.0, compression=self.compression, complevel=self.comp_level
+                )
                 # Find min/max for data variables. It is possible for 'None' to
                 # be in this list, so filter those out when doing the calculation.
                 min_data = np.nanmin(variables[variable])