You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@climate.apache.org by hu...@apache.org on 2017/09/25 18:12:03 UTC
[1/3] climate git commit: CLIMATE-926 - Metadata Extractors
Repository: climate
Updated Branches:
refs/heads/master cf4fb57fd -> fce720570
CLIMATE-926 - Metadata Extractors
Project: http://git-wip-us.apache.org/repos/asf/climate/repo
Commit: http://git-wip-us.apache.org/repos/asf/climate/commit/8217d12f
Tree: http://git-wip-us.apache.org/repos/asf/climate/tree/8217d12f
Diff: http://git-wip-us.apache.org/repos/asf/climate/diff/8217d12f
Branch: refs/heads/master
Commit: 8217d12f06987d852f9294da94a5af243116e751
Parents: cf4fb57
Author: Alex Goodman <ag...@users.noreply.github.com>
Authored: Mon Sep 25 10:35:20 2017 -0700
Committer: Alex Goodman <ag...@users.noreply.github.com>
Committed: Mon Sep 25 10:35:20 2017 -0700
----------------------------------------------------------------------
RCMES/CORDEX/metadata_extractor.py | 222 ++++++++++++++++++++++++++++++++
1 file changed, 222 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/climate/blob/8217d12f/RCMES/CORDEX/metadata_extractor.py
----------------------------------------------------------------------
diff --git a/RCMES/CORDEX/metadata_extractor.py b/RCMES/CORDEX/metadata_extractor.py
new file mode 100644
index 0000000..7351cf4
--- /dev/null
+++ b/RCMES/CORDEX/metadata_extractor.py
@@ -0,0 +1,222 @@
+import glob
+import os
+
+
+class MetadataExtractor(object):
+ def __init__(self, *paths):
+ """Extracts metadata from data filenames.
+
+ Instances of MetadataExtractor are used to extract metadata from
+ filenames in bulk. Example usage:
+ >>> extractor = MetadataExtractor('/path/to/data')
+
+ Suppose the data in this directory had the following files:
+ pr_*.nc, uas_*.nc, vas_*.nc
+
+ All of the metadata lies in the data attribute:
+ >>> extractor.data
+ [{'filename': '/path/to/data/pr_*.nc', 'variable': 'pr'},
+ {'filename': '/path/to/data/vas_*.nc', 'variable': 'vas'},
+ {'filename': '/path/to/data/uas_*.nc', 'variable': 'uas'}]
+
+ Results can be narrowed down by specifying values for a field:
+ >>> extractor.query(variable='pr')
+ [{'filename': '/path/to/data/pr_*.nc', 'variable': 'pr'}]
+
+ Finally, metadata from two sets of extractors can be grouped together
+ based on common field name as follows:
+ >>> extractor.group(extractor2, 'variable')
+
+ This class should only be used as a starting point. We recommend using
+ the included obs4MIPSMetadataExtractor and CORDEXMetadataExtractor
+ subclasses or creating your own subclass for your use case.
+ """
+ self.paths = paths
+
+ @property
+ def data(self):
+ """
+ The extracted metadata for each file, with all fields listed in
+ the fields attribute included.
+ """
+ return self._data
+
+ @property
+ def paths(self):
+ """
+ Search paths containing the dataset files.
+ """
+ return self._paths
+
+ @paths.setter
+ def paths(self, paths):
+ """
+ Extracts the metadata from scratch when paths are reset.
+ """
+ self._paths = paths
+ self._extract()
+
+ @property
+ def fields(self):
+ """
+ The names of the fields in the filename, assuming the fully filtered
+ filename conforms to the following convention:
+ filename = <field[0]>_<field[1]>_..._<field[n]>.nc. Using fewer fields
+ than the filename defines is allowed.
+ """
+ fields = ['variable']
+ return fields
+
+ @property
+ def files(self):
+ """
+ List of files (as wildcard patterns ending in '*.nc') for each dataset.
+ """
+ files = []
+ for path in self.paths:
+ files.extend(glob.glob(os.path.join(path, '*.nc')))
+ return list(set(self.get_pattern(fname) for fname in files))
+
+ @property
+ def variables(self):
+ """
+ Get the list of variables included across all the datasets.
+ """
+ return self.get_field('variable')
+
+ def query(self, **kwargs):
+ """
+ Narrow down the list of files by field names.
+ """
+ fields = kwargs.keys()
+ if not set(fields).issubset(set(self.fields)):
+ raise ValueError("Invalid fields: {}. Must be subset of: {}"
+ .format(fields, self.fields))
+ data = self.data
+ for field, value in kwargs.items():
+ value = value if isinstance(value, list) else [value]
+ data = [meta for meta in data if meta[field] in value]
+ return data
+
+ def group(self, extractor, field):
+ """
+ Compare the data of this extractor with another extractor instance
+ and group each of their metadata together by given field.
+ """
+ # First we only want to consider values of field which are contained
+ # in both extractors
+ subset = self.get_field(field)
+ other_subset = extractor.get_field(field)
+ intersection = list(subset.intersection(other_subset))
+
+ # Next we will group the datasets in each extractor together by common
+ # field values
+ kwargs = {field: intersection}
+ results = self.query(**kwargs)
+
+ groups = []
+ for meta in results:
+ val = meta[field]
+ kwargs.update({field: val})
+ match = extractor.query(**kwargs)
+ groups.append((meta, match))
+
+ return groups
+
+ def get_field(self, field):
+ """
+ Returns only the selected field of the extracted data.
+ """
+ if field not in self.fields:
+ raise ValueError("Invalid field: {}. Must be one of: {}"
+ .format(field, self.fields))
+ sub = set(meta[field] for meta in self.data)
+ return sub
+
+ def filter_filename(self, fname):
+ """
+ Applies a filter to each individual filename contained in the files
+ attribute. This is useful if some files within a data set are known
+ to not follow conventions, so that they can be "fixed" to do so.
+ """
+ return os.path.basename(fname)
+
+ def get_pattern(self, fname):
+ """
+ Used to group multiple file datasets together via wildcard patterns.
+ The most common convention is to split files by time periods, which
+ are generally the last field in a filename.
+ """
+ base = fname.split('_')
+ pattern = '_'.join(base[:len(self.fields)] + ['*.nc'])
+ return pattern
+
+ def _extract(self):
+ """
+ Do the actual metadata extraction from the list of filenames given
+ by the files property. Additionally, filenames can also be filtered
+ via filter_filename() to remove unwanted characters from the extraction.
+ """
+ self._data = []
+ for fname in self.files:
+ meta = dict(filename=fname)
+
+ # Perform the actual metadata extraction
+ fname = self.filter_filename(fname)
+ meta.update(dict(zip(self.fields, fname.split('_')[:-1])))
+ self._data.append(meta)
+
+
+class obs4MIPSMetadataExtractor(MetadataExtractor):
+ @property
+ def instruments(self):
+ """
+ Get the list of instruments across all the datasets.
+ """
+ return self.get_field('instrument')
+
+ @property
+ def fields(self):
+ """
+ obs4MIPs fields
+ """
+ fields = ['variable', 'instrument', 'processing_level', 'version']
+ return fields
+
+ def filter_filename(self, fname):
+ """
+ CALIPSO files have odd naming conventions, so we will use
+ a modified version to conform to standard obs4MIPs conventions.
+ """
+ fname = os.path.basename(fname)
+ fname = fname.replace('_obs4MIPs_', '_')
+ fname = fname.replace('calipso', '')
+ fname = fname.replace('Lidarsr532', '')
+ return fname
+
+ def get_pattern(self, fname):
+ """
+ Overridden to deal with CALIPSO filenames
+ """
+ base = fname.split('_')
+ offset = -2 if len(base) != 5 else -1
+ pattern = '_'.join(base[:offset] + ['*.nc'])
+ return pattern
+
+
+class CORDEXMetadataExtractor(MetadataExtractor):
+ @property
+ def models(self):
+ """
+ Get the list of models accross all the datasets.
+ """
+ return self.get_field('models')
+
+ @property
+ def fields(self):
+ """
+ CORDEX fields
+ """
+ fields = ['variable', 'domain', 'driving_model', 'experiment',
+ 'ensemble', 'model', 'version', 'time_step']
+ return fields
[3/3] climate git commit: Merge branch 'CLIMATE-926'
Posted by hu...@apache.org.
Merge branch 'CLIMATE-926'
Project: http://git-wip-us.apache.org/repos/asf/climate/repo
Commit: http://git-wip-us.apache.org/repos/asf/climate/commit/fce72057
Tree: http://git-wip-us.apache.org/repos/asf/climate/tree/fce72057
Diff: http://git-wip-us.apache.org/repos/asf/climate/diff/fce72057
Branch: refs/heads/master
Commit: fce7205707c13021d1e224bd3b7ffccd2d9d35ad
Parents: cf4fb57 ec81037
Author: huikyole <hu...@argo.jpl.nasa.gov>
Authored: Mon Sep 25 11:11:45 2017 -0700
Committer: huikyole <hu...@argo.jpl.nasa.gov>
Committed: Mon Sep 25 11:11:45 2017 -0700
----------------------------------------------------------------------
RCMES/CORDEX/metadata_extractor.py | 222 ++++++++++++++++++++++++++++++++
1 file changed, 222 insertions(+)
----------------------------------------------------------------------
[2/3] climate git commit: Fix model attribute in
CORDEXMetadataExtractor
Posted by hu...@apache.org.
Fix model attribute in CORDEXMetadataExtractor
Project: http://git-wip-us.apache.org/repos/asf/climate/repo
Commit: http://git-wip-us.apache.org/repos/asf/climate/commit/ec81037a
Tree: http://git-wip-us.apache.org/repos/asf/climate/tree/ec81037a
Diff: http://git-wip-us.apache.org/repos/asf/climate/diff/ec81037a
Branch: refs/heads/master
Commit: ec81037a21dbf2cf1999422127bbe924e1072c4a
Parents: 8217d12
Author: Alex Goodman <ag...@users.noreply.github.com>
Authored: Mon Sep 25 10:41:07 2017 -0700
Committer: Alex Goodman <ag...@users.noreply.github.com>
Committed: Mon Sep 25 10:41:07 2017 -0700
----------------------------------------------------------------------
RCMES/CORDEX/metadata_extractor.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/climate/blob/ec81037a/RCMES/CORDEX/metadata_extractor.py
----------------------------------------------------------------------
diff --git a/RCMES/CORDEX/metadata_extractor.py b/RCMES/CORDEX/metadata_extractor.py
index 7351cf4..cb18946 100644
--- a/RCMES/CORDEX/metadata_extractor.py
+++ b/RCMES/CORDEX/metadata_extractor.py
@@ -210,7 +210,7 @@ class CORDEXMetadataExtractor(MetadataExtractor):
"""
Get the list of models accross all the datasets.
"""
- return self.get_field('models')
+ return self.get_field('model')
@property
def fields(self):