Posted to commits@climate.apache.org by hu...@apache.org on 2016/06/27 00:00:13 UTC
[1/2] climate git commit: CLIMATE-812 - Fix PEP8 violations in dataset processor
Repository: climate
Updated Branches:
refs/heads/master 54f211084 -> 3ce4e2094
CLIMATE-812 - Fix PEP8 violations in dataset processor
Project: http://git-wip-us.apache.org/repos/asf/climate/repo
Commit: http://git-wip-us.apache.org/repos/asf/climate/commit/9fa0e4c5
Tree: http://git-wip-us.apache.org/repos/asf/climate/tree/9fa0e4c5
Diff: http://git-wip-us.apache.org/repos/asf/climate/diff/9fa0e4c5
Branch: refs/heads/master
Commit: 9fa0e4c5a8ed0223e5ee2a9494214a92ad22013f
Parents: 705403a
Author: Ibrahim Jarif <ja...@gmail.com>
Authored: Sat Jun 18 15:21:50 2016 +0530
Committer: Ibrahim Jarif <ja...@gmail.com>
Committed: Sat Jun 18 15:21:50 2016 +0530
----------------------------------------------------------------------
ocw/dataset_processor.py | 762 ++++++++++++++++++++++++------------------
1 file changed, 439 insertions(+), 323 deletions(-)
----------------------------------------------------------------------
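For context (not part of the patch): the check this commit is meant to satisfy can be reproduced with the third-party pycodestyle package (published under the name "pep8" at the time of this commit). The snippet below is a minimal sketch under that assumption; it simply counts the violations remaining in ocw/dataset_processor.py when run from the repository root.

    # Hypothetical verification, not part of this commit.
    # Assumes `pip install pycodestyle` (or the older `pep8` package with the
    # same StyleGuide API) and that the relative path resolves.
    import pycodestyle

    style = pycodestyle.StyleGuide(max_line_length=79)
    report = style.check_files(['ocw/dataset_processor.py'])

    if report.total_errors:
        print('%d PEP8 violation(s) remain' % report.total_errors)
    else:
        print('ocw/dataset_processor.py is PEP8 clean')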
http://git-wip-us.apache.org/repos/asf/climate/blob/9fa0e4c5/ocw/dataset_processor.py
----------------------------------------------------------------------
diff --git a/ocw/dataset_processor.py b/ocw/dataset_processor.py
index bab78e8..9ab4e50 100755
--- a/ocw/dataset_processor.py
+++ b/ocw/dataset_processor.py
@@ -31,7 +31,9 @@ import logging
logger = logging.getLogger(__name__)
-def temporal_subset(month_start, month_end, target_dataset, average_each_year=False):
+
+def temporal_subset(month_start, month_end, target_dataset,
+ average_each_year=False):
""" Temporally subset data given month_index.
:param month_start: An integer for beginning month (Jan=1)
@@ -51,21 +53,16 @@ def temporal_subset(month_start, month_end, target_dataset, average_each_year=Fa
"""
if month_start > month_end:
- month_index = range(month_start,13)
- month_index.extend(range(1, month_end+1))
+ month_index = range(month_start, 13)
+ month_index.extend(range(1, month_end + 1))
else:
- month_index = range(month_start, month_end+1)
+ month_index = range(month_start, month_end + 1)
dates = target_dataset.times
months = np.array([d.month for d in dates])
time_index = []
for m_value in month_index:
time_index = np.append(time_index, np.where(months == m_value)[0])
- if m_value == month_index[0]:
- time_index_first = np.min(np.where(months == m_value)[0])
- if m_value == month_index[-1]:
- time_index_last = np.max(np.where(months == m_value)[0])
-
time_index = np.sort(time_index)
time_index = list(time_index)
@@ -73,25 +70,27 @@ def temporal_subset(month_start, month_end, target_dataset, average_each_year=Fa
new_dataset = ds.Dataset(target_dataset.lats,
target_dataset.lons,
target_dataset.times[time_index],
- target_dataset.values[time_index,:],
+ target_dataset.values[time_index, :],
variable=target_dataset.variable,
units=target_dataset.units,
name=target_dataset.name)
if average_each_year:
nmonth = len(month_index)
- ntime = new_dataset.times.size
- nyear = ntime/nmonth
- averaged_time = []
+ ntime = new_dataset.times.size
+ nyear = ntime / nmonth
+ averaged_time = []
ny, nx = target_dataset.values.shape[1:]
- averaged_values =ma.zeros([nyear, ny, nx])
+ averaged_values = ma.zeros([nyear, ny, nx])
for iyear in np.arange(nyear):
- # centered time index of the season between month_start and month_end in each year
- center_index = int(nmonth/2)+iyear*nmonth
+ # centered time index of the season between month_start and
+ # month_end in each year
+ center_index = int(nmonth / 2) + iyear * nmonth
if nmonth == 1:
center_index = iyear
- averaged_time.append(new_dataset.times[center_index])
- averaged_values[iyear,:] = ma.average(new_dataset.values[nmonth*iyear:nmonth*iyear+nmonth,:], axis=0)
+ averaged_time.append(new_dataset.times[center_index])
+ averaged_values[iyear, :] = ma.average(new_dataset.values[
+ nmonth * iyear: nmonth * iyear + nmonth, :], axis=0)
new_dataset = ds.Dataset(target_dataset.lats,
target_dataset.lons,
np.array(averaged_time),
@@ -99,59 +98,69 @@ def temporal_subset(month_start, month_end, target_dataset, average_each_year=Fa
variable=target_dataset.variable,
units=target_dataset.units,
name=target_dataset.name)
-
return new_dataset
-def temporal_rebin(target_dataset, temporal_resolution):
+
+def temporal_rebin(target_dataset, temporal_resolution):
""" Rebin a Dataset to a new temporal resolution
-
+
:param target_dataset: Dataset object that needs temporal rebinned
:type target_dataset: :class:`dataset.Dataset`
:param temporal_resolution: The new temporal resolution
- :type temporal_resolution: :mod:`string`
-
+ :type temporal_resolution: :mod:`string`
+
:returns: A new temporally rebinned Dataset
:rtype: :class:`dataset.Dataset`
"""
- # Decode the temporal resolution into a string format that
+ # Decode the temporal resolution into a string format that
# _rcmes_calc_average_on_new_time_unit_K() can understand
- binned_values, binned_dates = _rcmes_calc_average_on_new_time_unit(target_dataset.values, target_dataset.times, temporal_resolution)
+ binned_values, binned_dates = _rcmes_calc_average_on_new_time_unit(
+ target_dataset.values, target_dataset.times, temporal_resolution)
binned_dates = np.array(binned_dates)
- new_dataset = ds.Dataset(target_dataset.lats,
- target_dataset.lons,
- binned_dates,
+ new_dataset = ds.Dataset(target_dataset.lats,
+ target_dataset.lons,
+ binned_dates,
binned_values,
variable=target_dataset.variable,
units=target_dataset.units,
name=target_dataset.name,
origin=target_dataset.origin)
-
return new_dataset
+
def temporal_rebin_with_time_index(target_dataset, nt_average):
""" Rebin a Dataset to a new temporal resolution
:param target_dataset: Dataset object that needs temporal rebinned
:type target_dataset: :class:`dataset.Dataset`
- :param nt_average: Time resolution for the output datasets.
- It is the same as the number of time indicies to be averaged. (length of time dimension in the rebinned dataset) = (original time dimension length/nt_average)
+ :param nt_average: Time resolution for the output datasets.
+    It is the same as the number of time indices to be averaged.
+    (length of time dimension in the rebinned dataset) =
+ (original time dimension length/nt_average)
:type temporal_resolution: integer
:returns: A new temporally rebinned Dataset
:rtype: :class:`dataset.Dataset`
"""
nt = target_dataset.times.size
- if nt % nt_average !=0:
- print 'Warning: length of time dimension must be a multiple of nt_average'
+ if nt % nt_average != 0:
+ msg = ('Warning: length of time dimension must '
+ 'be a multiple of nt_average')
+ print msg
# nt2 is the length of time dimension in the rebinned dataset
- nt2 = nt/nt_average
- binned_dates = target_dataset.times[np.arange(nt2)*nt_average]
- binned_values = ma.zeros(np.insert(target_dataset.values.shape[1:],0,nt2))
+ nt2 = nt / nt_average
+ binned_dates = target_dataset.times[np.arange(nt2) * nt_average]
+ binned_values = ma.zeros(
+ np.insert(target_dataset.values.shape[1:], 0, nt2))
for it in np.arange(nt2):
- binned_values[it,:] = ma.average(target_dataset.values[nt_average*it:nt_average*it+nt_average,:], axis=0)
+ binned_values[it, :] = ma.average(
+ target_dataset.values[nt_average * it:
+ nt_average * it + nt_average,
+ :],
+ axis=0)
new_dataset = ds.Dataset(target_dataset.lats,
target_dataset.lons,
binned_dates,
@@ -162,7 +171,9 @@ def temporal_rebin_with_time_index(target_dataset, nt_average):
origin=target_dataset.origin)
return new_dataset
-def spatial_regrid(target_dataset, new_latitudes, new_longitudes, boundary_check=True):
+
+def spatial_regrid(target_dataset, new_latitudes, new_longitudes,
+ boundary_check=True):
""" Regrid a Dataset using the new latitudes and longitudes
:param target_dataset: Dataset object that needs spatially regridded
@@ -174,25 +185,26 @@ def spatial_regrid(target_dataset, new_latitudes, new_longitudes, boundary_check
:param new_longitudes: Array of longitudes
:type new_longitudes: :class:`numpy.ndarray`
- :param boundary_check: Check if the regriding domain's boundaries are outside target_dataset's domain
+    :param boundary_check: Check if the regridding domain's boundaries
+ are outside target_dataset's domain
:type boundary_check: :class:'bool'
:returns: A new spatially regridded Dataset
:rtype: :class:`dataset.Dataset`
"""
-
+
# Create grids of the given lats and lons for the underlying API
# NOTE: np.meshgrid() requires inputs (x, y) and returns data
# of shape(y|lat|rows, x|lon|columns). So we pass in lons, lats
# and get back data.shape(lats, lons)
- if target_dataset.lons.ndim ==1 and target_dataset.lats.ndim ==1:
+ if target_dataset.lons.ndim == 1 and target_dataset.lats.ndim == 1:
regular_grid = True
lons, lats = np.meshgrid(target_dataset.lons, target_dataset.lats)
else:
regular_grid = False
lons = target_dataset.lons
lats = target_dataset.lats
- if new_longitudes.ndim ==1 and new_latitudes.ndim ==1:
+ if new_longitudes.ndim == 1 and new_latitudes.ndim == 1:
new_lons, new_lats = np.meshgrid(new_longitudes, new_latitudes)
else:
new_lons = new_longitudes
@@ -202,91 +214,116 @@ def spatial_regrid(target_dataset, new_latitudes, new_longitudes, boundary_check
ny_new, nx_new = new_lats.shape
# Make masked array of shape (times, new_latitudes,new_longitudes)
- new_values = ma.zeros([len(target_dataset.times),
- ny_new, nx_new])
+ new_values = ma.zeros([len(target_dataset.times),
+ ny_new, nx_new])
# Make masked array of shape (times, new_latitudes,new_longitudes)
- new_values = ma.zeros([len(target_dataset.times),
+ new_values = ma.zeros([len(target_dataset.times),
ny_new, nx_new])
# Boundary vertices of target_dataset
vertices = []
- if regular_grid:
- vertices.append([lons[0,0], lats[0,0]])
- vertices.append([lons[-1,0], lats[-1,0]])
- vertices.append([lons[-1,-1], lats[-1,-1]])
- vertices.append([lons[0,-1], lats[0,-1]])
- else:
- for iy in np.arange(ny_old): # from south to north along the west boundary
- vertices.append([lons[iy,0], lats[iy,0]])
- for ix in np.arange(nx_old): # from west to east along the north boundary
+ if regular_grid:
+ vertices.append([lons[0, 0], lats[0, 0]])
+ vertices.append([lons[-1, 0], lats[-1, 0]])
+ vertices.append([lons[-1, -1], lats[-1, -1]])
+ vertices.append([lons[0, -1], lats[0, -1]])
+ else:
+ # from south to north along the west boundary
+ for iy in np.arange(ny_old):
+ vertices.append([lons[iy, 0], lats[iy, 0]])
+ # from west to east along the north boundary
+ for ix in np.arange(nx_old):
vertices.append([lons[-1, ix], lats[-1, ix]])
- for iy in np.arange(ny_old)[::-1]: # from north to south along the east boundary
+ # from north to south along the east boundary
+ for iy in np.arange(ny_old)[::-1]:
vertices.append([lons[iy, -1], lats[iy, -1]])
- for ix in np.arange(nx_old)[::-1]: # from east to west along the south boundary
+ # from east to west along the south boundary
+ for ix in np.arange(nx_old)[::-1]:
vertices.append([lons[0, ix], lats[0, ix]])
path = Path(vertices)
# Convert new_lats and new_lons to float indices
new_lons_indices = np.zeros(new_lons.shape)
new_lats_indices = np.zeros(new_lats.shape)
-
+
for iy in np.arange(ny_new):
for ix in np.arange(nx_new):
- if path.contains_point([new_lons[iy,ix], new_lats[iy,ix]]) or not boundary_check:
+ if path.contains_point([new_lons[iy, ix],
+ new_lats[iy, ix]]) or not boundary_check:
if regular_grid:
- new_lats_indices[iy,ix] = (ny_old -1.)*(new_lats[iy,ix] - lats.min())/(lats.max() - lats.min())
- new_lons_indices[iy,ix] = (nx_old -1.)*(new_lons[iy,ix] - lons.min())/(lons.max() - lons.min())
+ mn = lats.min()
+ mx = lats.max()
+                    new_lats_indices[iy, ix] = (
+                        ny_old - 1.) * (new_lats[iy, ix] - mn) / (mx - mn)
+ mn = lons.min()
+ mx = lons.max()
+ new_lons_indices[iy, ix] = (
+ nx_old - 1.) * (new_lons[iy, ix] - mn) / (mx - mn)
else:
- distance_from_original_grids = ((lons - new_lons[iy,ix])**2.+(lats - new_lats[iy,ix])**2.)**0.5
+ distance_from_original_grids = (
+ (lons - new_lons[iy, ix])**2. +
+ (lats - new_lats[iy, ix])**2.)**0.5
if np.min(distance_from_original_grids) == 0.:
- new_lats_indices[iy,ix], new_lons_indices[iy,ix] = np.where(distance_from_original_grids ==0)
+ new_lats_indices[iy, ix], new_lons_indices[
+ iy, ix] = np.where(
+ distance_from_original_grids == 0)
else:
- distance_rank = rankdata(distance_from_original_grids.flatten(), method='ordinal').reshape(lats.shape)
- iy1, ix1 = np.where(distance_rank == 1) # the nearest grid point's indices
- iy2, ix2 = np.where(distance_rank == 4) # point [iy2, ix] is diagonally across from [iy1, ix1]
- dist1 = distance_from_original_grids[iy1,ix1]
- dist2 = distance_from_original_grids[iy2,ix2]
- new_lats_indices[iy,ix] = (dist1*iy2 + dist2*iy1)/(dist1+dist2)
- new_lons_indices[iy,ix] = (dist1*ix2 + dist2*ix1)/(dist1+dist2)
+ distance_rank = rankdata(
+ distance_from_original_grids.flatten(),
+ method='ordinal').reshape(lats.shape)
+ # the nearest grid point's indices
+ iy1, ix1 = np.where(distance_rank == 1)
+                        # point [iy2, ix2] is diagonally across from [iy1, ix1]
+ iy2, ix2 = np.where(distance_rank == 4)
+ dist1 = distance_from_original_grids[iy1, ix1]
+ dist2 = distance_from_original_grids[iy2, ix2]
+ new_lats_indices[iy, ix] = (
+ dist1 * iy2 + dist2 * iy1) / (dist1 + dist2)
+ new_lons_indices[iy, ix] = (
+ dist1 * ix2 + dist2 * ix1) / (dist1 + dist2)
else:
- new_lats_indices[iy,ix] = -999.
- new_lats_indices[iy,ix] = -999.
+ new_lats_indices[iy, ix] = -999.
+                new_lons_indices[iy, ix] = -999.
new_lats_indices = ma.masked_less(new_lats_indices, 0.)
new_lons_indices = ma.masked_less(new_lons_indices, 0.)
-
+
# Regrid the data on each time slice
for i in range(len(target_dataset.times)):
if len(target_dataset.times) == 1:
values_original = ma.array(target_dataset.values)
- else:
+ else:
values_original = ma.array(target_dataset.values[i])
for shift in (-1, 1):
for axis in (0, 1):
q_shifted = np.roll(values_original, shift=shift, axis=axis)
idx = ~q_shifted.mask * values_original.mask
values_original.data[idx] = q_shifted[idx]
- new_values[i] = map_coordinates(values_original, [new_lats_indices.flatten(), new_lons_indices.flatten()], order=1).reshape(new_lats.shape)
+ new_values[i] = map_coordinates(values_original,
+ [new_lats_indices.flatten(),
+ new_lons_indices.flatten()],
+ order=1).reshape(new_lats.shape)
new_values[i] = ma.array(new_values[i], mask=new_lats_indices.mask)
- # Make a masking map using nearest neighbour interpolation -use this to determine locations with MDI and mask these
+ # Make a masking map using nearest neighbour interpolation -use this to
+ # determine locations with MDI and mask these
qmdi = np.zeros_like(values_original)
qmdi[values_original.mask == True] = 1.
qmdi[values_original.mask == False] = 0.
- qmdi_r = map_coordinates(qmdi, [new_lats_indices.flatten(), new_lons_indices.flatten()], order=1).reshape(new_lats.shape)
+ qmdi_r = map_coordinates(qmdi, [new_lats_indices.flatten(
+ ), new_lons_indices.flatten()], order=1).reshape(new_lats.shape)
mdimask = (qmdi_r != 0.0)
# Combine missing data mask, with outside domain mask define above.
new_values[i].mask = np.logical_or(mdimask, new_values[i].mask)
-
- # TODO:
- # This will call down to the _congrid() function and the lat and lon
+ # TODO:
+ # This will call down to the _congrid() function and the lat and lon
# axis will be adjusted with the time axis being held constant
-
+
# Create a new Dataset Object to return using new data
- regridded_dataset = ds.Dataset(new_latitudes,
- new_longitudes,
- target_dataset.times,
+ regridded_dataset = ds.Dataset(new_latitudes,
+ new_longitudes,
+ target_dataset.times,
new_values,
variable=target_dataset.variable,
units=target_dataset.units,
@@ -294,38 +331,41 @@ def spatial_regrid(target_dataset, new_latitudes, new_longitudes, boundary_check
origin=target_dataset.origin)
return regridded_dataset
+
def ensemble(datasets):
"""
Generate a single dataset which is the mean of the input datasets
An ensemble datasets combines input datasets assuming the all have
- similar shape, dimensions, and units.
-
+ similar shape, dimensions, and units.
+
:param datasets: Datasets to be used to compose the ensemble dataset from.
All Datasets must be the same shape.
:type datasets: :class:`list` of :class:`dataset.Dataset`
-
- :returns: New Dataset with a name of 'Dataset Ensemble'
+
+ :returns: New Dataset with a name of 'Dataset Ensemble'
:rtype: :class:`dataset.Dataset`
"""
_check_dataset_shapes(datasets)
dataset_values = [dataset.values for dataset in datasets]
ensemble_values = ma.mean(dataset_values, axis=0)
-
- # Build new dataset object from the input datasets and the ensemble values and return it
- ensemble_dataset = ds.Dataset(datasets[0].lats,
- datasets[0].lons,
+
+ # Build new dataset object from the input datasets and
+ # the ensemble values and return it
+ ensemble_dataset = ds.Dataset(datasets[0].lats,
+ datasets[0].lons,
datasets[0].times,
ensemble_values,
units=datasets[0].units,
name="Dataset Ensemble")
-
+
return ensemble_dataset
+
def subset(subregion, target_dataset, subregion_name=None):
'''Subset given dataset(s) with subregion information
- :param subregion: The Bounds with which to subset the target Dataset.
+ :param subregion: The Bounds with which to subset the target Dataset.
:type subregion: :class:`dataset.Bounds`
:param target_dataset: The Dataset object to subset.
@@ -341,34 +381,40 @@ def subset(subregion, target_dataset, subregion_name=None):
'''
if not subregion.start:
- subregion.start = target_dataset.times[0]
+ subregion.start = target_dataset.times[0]
subregion.end = target_dataset.times[-1]
-
+
# Ensure that the subregion information is well formed
_are_bounds_contained_by_dataset(subregion, target_dataset)
if not subregion_name:
subregion_name = target_dataset.name
- if target_dataset.lats.ndim ==2 and target_dataset.lons.ndim ==2:
- start_time_index = np.where(target_dataset.times == subregion.start)[0][0]
+ if target_dataset.lats.ndim == 2 and target_dataset.lons.ndim == 2:
+ start_time_index = np.where(
+ target_dataset.times == subregion.start)[0][0]
end_time_index = np.where(target_dataset.times == subregion.end)[0][0]
- target_dataset = temporal_slice(start_time_index, end_time_index, target_dataset)
+ target_dataset = temporal_slice(
+ start_time_index, end_time_index, target_dataset)
nt, ny, nx = target_dataset.values.shape
- y_index, x_index = np.where((target_dataset.lats >= subregion.lat_max) | (target_dataset.lats <= subregion.lat_min) |
- (target_dataset.lons >= subregion.lon_max) | (target_dataset.lons <= subregion.lon_min))
+ y_index, x_index = np.where(
+ (target_dataset.lats >= subregion.lat_max) | (
+ target_dataset.lats <= subregion.lat_min) |
+ (target_dataset.lons >= subregion.lon_max) | (
+ target_dataset.lons <= subregion.lon_min))
for it in np.arange(nt):
- target_dataset.values[it,y_index, x_index] = 1.e+20
+ target_dataset.values[it, y_index, x_index] = 1.e+20
target_dataset.values = ma.masked_equal(target_dataset.values, 1.e+20)
return target_dataset
- elif target_dataset.lats.ndim ==1 and target_dataset.lons.ndim ==1:
+ elif target_dataset.lats.ndim == 1 and target_dataset.lons.ndim == 1:
# Get subregion indices into subregion data
- dataset_slices = _get_subregion_slice_indices(subregion, target_dataset)
- # Slice the values array with our calculated slice indices
+ dataset_slices = _get_subregion_slice_indices(subregion,
+ target_dataset)
+ # Slice the values array with our calculated slice indices
if target_dataset.values.ndim == 2:
subset_values = ma.zeros([len(target_dataset.values[
- dataset_slices["lat_start"]:dataset_slices["lat_end"]]),
+ dataset_slices["lat_start"]:dataset_slices["lat_end"]]),
len(target_dataset.values[
dataset_slices["lon_start"]:dataset_slices["lon_end"]])])
@@ -380,26 +426,26 @@ def subset(subregion, target_dataset, subregion_name=None):
subset_values = ma.zeros([len(target_dataset.values[
dataset_slices["time_start"]:dataset_slices["time_end"]]),
len(target_dataset.values[
- dataset_slices["lat_start"]:dataset_slices["lat_end"]]),
+ dataset_slices["lat_start"]:dataset_slices["lat_end"]]),
len(target_dataset.values[
- dataset_slices["lon_start"]:dataset_slices["lon_end"]])])
-
+ dataset_slices["lon_start"]:dataset_slices["lon_end"]])])
+
subset_values = target_dataset.values[
dataset_slices["time_start"]:dataset_slices["time_end"] + 1,
dataset_slices["lat_start"]:dataset_slices["lat_end"] + 1,
dataset_slices["lon_start"]:dataset_slices["lon_end"] + 1]
-
+
# Build new dataset with subset information
return ds.Dataset(
# Slice the lats array with our calculated slice indices
- target_dataset.lats[dataset_slices["lat_start"]:
+ target_dataset.lats[dataset_slices["lat_start"]:
dataset_slices["lat_end"] + 1],
# Slice the lons array with our calculated slice indices
- target_dataset.lons[dataset_slices["lon_start"]:
+ target_dataset.lons[dataset_slices["lon_start"]:
dataset_slices["lon_end"] + 1],
# Slice the times array with our calculated slice indices
- target_dataset.times[dataset_slices["time_start"]:
- dataset_slices["time_end"]+ 1],
+ target_dataset.times[dataset_slices["time_start"]:
+ dataset_slices["time_end"] + 1],
# Slice the values array with our calculated slice indices
subset_values,
variable=target_dataset.variable,
@@ -408,6 +454,7 @@ def subset(subregion, target_dataset, subregion_name=None):
origin=target_dataset.origin
)
+
def temporal_slice(start_time_index, end_time_index, target_dataset):
'''Temporally slice given dataset(s) with subregion information. This does not
spatially subset the target_Dataset
@@ -430,17 +477,18 @@ def temporal_slice(start_time_index, end_time_index, target_dataset):
end_date = target_dataset.times[end_time_index]
timeStart = min(np.nonzero(target_dataset.times >= start_date)[0])
timeEnd = max(np.nonzero(target_dataset.times <= end_date)[0])
- target_dataset.times = target_dataset.times[timeStart:timeEnd+1]
- target_dataset.values = target_dataset.values[timeStart:timeEnd+1,:]
+ target_dataset.times = target_dataset.times[timeStart:timeEnd + 1]
+ target_dataset.values = target_dataset.values[timeStart:timeEnd + 1, :]
return target_dataset
+
def safe_subset(subregion, target_dataset, subregion_name=None):
'''Safely subset given dataset with subregion information
- A standard subset requires that the provided subregion be entirely contained
- within the datasets bounds. `safe_subset` returns the overlap of the
- subregion and dataset without returning an error.
+ A standard subset requires that the provided subregion be entirely
+ contained within the datasets bounds. `safe_subset` returns the
+ overlap of the subregion and dataset without returning an error.
:param subregion: The Bounds with which to subset the target Dataset.
:type subregion: :class:`dataset.Bounds`
@@ -470,7 +518,7 @@ def safe_subset(subregion, target_dataset, subregion_name=None):
if subregion.lon_max > lon_max:
subregion.lon_max = lon_max
- if subregion.start:
+ if subregion.start:
if subregion.start < start:
subregion.start = start
@@ -480,6 +528,7 @@ def safe_subset(subregion, target_dataset, subregion_name=None):
return subset(subregion, target_dataset, subregion_name)
+
def normalize_dataset_datetimes(dataset, timestep):
''' Normalize Dataset datetime values.
@@ -508,6 +557,7 @@ def normalize_dataset_datetimes(dataset, timestep):
origin=dataset.origin
)
+
def write_netcdf(dataset, path, compress=True):
''' Write a dataset to a NetCDF file.
@@ -525,9 +575,9 @@ def write_netcdf(dataset, path, compress=True):
time_len = len(dataset.times)
# Create attribute dimensions
- lat_dim = out_file.createDimension('lat', lat_len)
- lon_dim = out_file.createDimension('lon', lon_len)
- time_dim = out_file.createDimension('time', time_len)
+ out_file.createDimension('lat', lat_len)
+ out_file.createDimension('lon', lon_len)
+ out_file.createDimension('time', time_len)
# Create variables
lats = out_file.createVariable('lat', 'f8', ('lat',), zlib=compress)
@@ -536,9 +586,9 @@ def write_netcdf(dataset, path, compress=True):
var_name = dataset.variable if dataset.variable else 'var'
values = out_file.createVariable(var_name,
- 'f8',
- ('time', 'lat', 'lon'),
- zlib=compress)
+ 'f8',
+ ('time', 'lat', 'lon'),
+ zlib=compress)
# Set the time variable units
# We don't deal with hourly/minutely/anything-less-than-a-day data so
@@ -556,15 +606,21 @@ def write_netcdf(dataset, path, compress=True):
out_file.close()
-def write_netcdf_multiple_datasets_with_subregions(ref_dataset, ref_name,
- model_dataset_array, model_names,
- path,
- subregions = None, subregion_array = None,
- ref_subregion_mean = None, ref_subregion_std = None,
- model_subregion_mean = None, model_subregion_std = None):
- #Write multiple reference and model datasets and their subregional means and standard deivations in a NetCDF file.
- #:To be updated
+def write_netcdf_multiple_datasets_with_subregions(ref_dataset, ref_name,
+ model_dataset_array,
+ model_names,
+ path,
+ subregions=None,
+ subregion_array=None,
+ ref_subregion_mean=None,
+ ref_subregion_std=None,
+ model_subregion_mean=None,
+ model_subregion_std=None):
+ # Write multiple reference and model datasets and their subregional means
+    # and standard deviations in a NetCDF file.
+
+ # :To be updated
#
out_file = netCDF4.Dataset(path, 'w', format='NETCDF4')
@@ -577,22 +633,22 @@ def write_netcdf_multiple_datasets_with_subregions(ref_dataset, ref_name,
lon_ndim = dataset.lons.ndim
time_len = len(dataset.times)
- if not subregions == None:
+ if subregions is not None:
nsubregion = len(subregions)
-
+
# Create attribute dimensions
- lat_dim = out_file.createDimension('y', lat_len)
- lon_dim = out_file.createDimension('x', lon_len)
- time_dim = out_file.createDimension('time', time_len)
+ out_file.createDimension('y', lat_len)
+ out_file.createDimension('x', lon_len)
+ out_file.createDimension('time', time_len)
# Create variables and store the values
- if lat_ndim ==2:
- lats = out_file.createVariable('lat', 'f8', ('y','x'))
+ if lat_ndim == 2:
+ lats = out_file.createVariable('lat', 'f8', ('y', 'x'))
else:
lats = out_file.createVariable('lat', 'f8', ('y'))
lats[:] = dataset.lats
- if lon_ndim ==2:
- lons = out_file.createVariable('lon', 'f8', ('y','x'))
+ if lon_ndim == 2:
+ lons = out_file.createVariable('lon', 'f8', ('y', 'x'))
else:
lons = out_file.createVariable('lon', 'f8', ('x'))
lons[:] = dataset.lons
@@ -600,37 +656,49 @@ def write_netcdf_multiple_datasets_with_subregions(ref_dataset, ref_name,
times.units = "days since %s" % dataset.times[0]
times[:] = netCDF4.date2num(dataset.times, times.units)
- #mask_array = np.zeros([time_len, lat_len, lon_len])
- #for iobs in np.arange(nobs):
+ # mask_array = np.zeros([time_len, lat_len, lon_len])
+ # for iobs in np.arange(nobs):
# index = np.where(ref_dataset_array[iobs].values.mask[:] == True)
# mask_array[index] = 1
- out_file.createVariable(ref_name, 'f8', ('time','y','x'))
+ out_file.createVariable(ref_name, 'f8', ('time', 'y', 'x'))
out_file.variables[ref_name][:] = ref_dataset.values
out_file.variables[ref_name].units = ref_dataset.units
for imodel in np.arange(nmodel):
- out_file.createVariable(model_names[imodel], 'f8', ('time','y','x'))
- #out_file.variables[model_names[imodel]][:] = ma.array(model_dataset_array[imodel].values, mask = mask_array)
- out_file.variables[model_names[imodel]][:] = model_dataset_array[imodel].values
- out_file.variables[model_names[imodel]].units = model_dataset_array[imodel].units
-
- if not subregions == None:
- out_file.createVariable('subregion_array', 'i4', ('y','x'))
+ out_file.createVariable(model_names[imodel], 'f8', ('time', 'y', 'x'))
+ # out_file.variables[model_names[imodel]][:] = ma.array(
+ # model_dataset_array[imodel].values, mask = mask_array)
+ out_file.variables[model_names[imodel]][:] = model_dataset_array[
+ imodel].values
+ out_file.variables[model_names[imodel]].units = model_dataset_array[
+ imodel].units
+
+ if subregions is not None:
+ out_file.createVariable('subregion_array', 'i4', ('y', 'x'))
out_file.variables['subregion_array'][:] = subregion_array[:]
nsubregion = len(subregions)
out_file.createDimension('nsubregion', nsubregion)
out_file.createDimension('nobs', nobs)
out_file.createDimension('nmodel', nmodel)
- out_file.createVariable('obs_subregion_mean', 'f8', ('nobs','time','nsubregion'))
+ out_file.createVariable('obs_subregion_mean', 'f8', ('nobs',
+ 'time',
+ 'nsubregion'))
out_file.variables['obs_subregion_mean'][:] = ref_subregion_mean[:]
- out_file.createVariable('obs_subregion_std', 'f8', ('nobs','time','nsubregion'))
+ out_file.createVariable('obs_subregion_std', 'f8', ('nobs',
+ 'time',
+ 'nsubregion'))
out_file.variables['obs_subregion_std'][:] = ref_subregion_std[:]
- out_file.createVariable('model_subregion_mean', 'f8', ('nmodel','time','nsubregion'))
+ out_file.createVariable('model_subregion_mean', 'f8', ('nmodel',
+ 'time',
+ 'nsubregion'))
out_file.variables['model_subregion_mean'][:] = model_subregion_mean[:]
- out_file.createVariable('model_subregion_std', 'f8', ('nmodel','time','nsubregion'))
+ out_file.createVariable('model_subregion_std', 'f8', ('nmodel',
+ 'time',
+ 'nsubregion'))
out_file.variables['model_subregion_std'][:] = model_subregion_std[:]
out_file.close()
+
def water_flux_unit_conversion(dataset):
''' Convert water flux variables units as necessary
@@ -642,7 +710,7 @@ def water_flux_unit_conversion(dataset):
:returns: A Dataset with values converted to new units.
:rtype: :class:`dataset.Dataset`
'''
- water_flux_variables = ['pr', 'prec','evspsbl', 'mrro', 'swe']
+ water_flux_variables = ['pr', 'prec', 'evspsbl', 'mrro', 'swe']
variable = dataset.variable.lower()
if any(sub_string in variable for sub_string in water_flux_variables):
@@ -652,13 +720,14 @@ def water_flux_unit_conversion(dataset):
dataset.values = 1.e3 * dataset.values
dataset.units = 'km'
else:
- if any(unit in dataset_units
- for unit in ['kg m-2 s-1', 'mm s-1', 'mm/sec']):
+ if any(unit in dataset_units
+ for unit in ['kg m-2 s-1', 'mm s-1', 'mm/sec']):
dataset.values = 86400. * dataset.values
dataset.units = 'mm/day'
return dataset
+
def temperature_unit_conversion(dataset):
''' Convert temperature units as necessary
@@ -670,7 +739,7 @@ def temperature_unit_conversion(dataset):
:returns: The dataset with (potentially) updated units.
:rtype: :class:`dataset.Dataset`
'''
- temperature_variables = ['temp','tas','tasmax','taxmin','T','tg']
+ temperature_variables = ['temp', 'tas', 'tasmax', 'taxmin', 'T', 'tg']
variable = dataset.variable.lower()
if any(sub_string in variable for sub_string in temperature_variables):
@@ -681,10 +750,12 @@ def temperature_unit_conversion(dataset):
return dataset
+
def variable_unit_conversion(dataset):
''' Convert water flux or temperature variables units as necessary
-
- For water flux variables, convert full SI units water flux units to more common units.
+
+ For water flux variables, convert full SI units water flux units
+ to more common units.
For temperature, convert Celcius to Kelvin.
:param dataset: The dataset to convert.
@@ -696,9 +767,10 @@ def variable_unit_conversion(dataset):
dataset = water_flux_unit_conversion(dataset)
dataset = temperature_unit_conversion(dataset)
-
+
return dataset
+
def _rcmes_normalize_datetimes(datetimes, timestep):
""" Normalize Dataset datetime values.
@@ -718,22 +790,25 @@ def _rcmes_normalize_datetimes(datetimes, timestep):
# Clean the inputDatetime
inputDatetimeString = inputDatetime.strftime('%Y%m%d')
normalInputDatetimeString = inputDatetimeString[:6] + '01'
- inputDatetime = datetime.datetime.strptime(normalInputDatetimeString, '%Y%m%d')
+ inputDatetime = datetime.datetime.strptime(
+ normalInputDatetimeString, '%Y%m%d')
normalDatetimes.append(inputDatetime)
elif timestep.lower() == 'daily':
for inputDatetime in datetimes:
- if inputDatetime.hour != 0 or inputDatetime.minute != 0 or inputDatetime.second != 0:
+ if (inputDatetime.hour != 0 or inputDatetime.minute != 0 or
+ inputDatetime.second != 0):
datetimeString = inputDatetime.strftime('%Y%m%d%H%M%S')
normalDatetimeString = datetimeString[:8] + '000000'
- inputDatetime = datetime.datetime.strptime(normalDatetimeString, '%Y%m%d%H%M%S')
+ inputDatetime = datetime.datetime.strptime(
+ normalDatetimeString, '%Y%m%d%H%M%S')
normalDatetimes.append(inputDatetime)
-
return normalDatetimes
+
def mask_missing_data(dataset_array):
''' Check missing values in observation and model datasets.
If any of dataset in dataset_array has missing values at a grid point,
@@ -743,7 +818,7 @@ def mask_missing_data(dataset_array):
mask_array = np.zeros(dataset_array[0].values.shape)
for dataset in dataset_array:
index = np.where(dataset.values.mask == True)
- if index[0].size >0:
+ if index[0].size > 0:
mask_array[index] = 1
masked_array = []
for dataset in dataset_array:
@@ -755,7 +830,7 @@ def mask_missing_data(dataset_array):
def _rcmes_spatial_regrid(spatial_values, lat, lon, lat2, lon2, order=1):
'''
Spatial regrid from one set of lat,lon values onto a new set (lat2,lon2)
-
+
:param spatial_values: Values in a spatial grid that need to be regridded
:type spatial_values: 2d masked numpy array. shape (latitude, longitude)
:param lat: Grid of latitude values which map to the spatial values
@@ -770,57 +845,61 @@ def _rcmes_spatial_regrid(spatial_values, lat, lon, lat2, lon2, order=1):
:type order: [optional] Integer
:returns: 2d masked numpy array with shape(len(lat2), len(lon2))
- :rtype: (float, float)
+ :rtype: (float, float)
'''
nlat = spatial_values.shape[0]
nlon = spatial_values.shape[1]
- #print nlat, nlon, "lats, lons - incoming dataset"
+ # print nlat, nlon, "lats, lons - incoming dataset"
nlat2 = lat2.shape[0]
nlon2 = lon2.shape[1]
- #print nlat2, nlon2, "NEW lats, lons - for the new grid output"
+ # print nlat2, nlon2, "NEW lats, lons - for the new grid output"
- # To make our lives easier down the road, let's
+ # To make our lives easier down the road, let's
# turn these into arrays of x & y coords
loni = lon2.ravel()
lati = lat2.ravel()
- loni = loni.copy() # NB. it won't run unless you do this...
+ loni = loni.copy() # NB. it won't run unless you do this...
lati = lati.copy()
# Now, we'll set points outside the boundaries to lie along an edge
loni[loni > lon.max()] = lon.max()
loni[loni < lon.min()] = lon.min()
-
+
# To deal with the "hard" break, we'll have to treat y differently,
# so we're just setting the min here...
lati[lati > lat.max()] = lat.max()
lati[lati < lat.min()] = lat.min()
-
-
+
# We need to convert these to (float) indicies
# (xi should range from 0 to (nx - 1), etc)
loni = (nlon - 1) * (loni - lon.min()) / (lon.max() - lon.min())
-
+
# Deal with the "hard" break in the y-direction
lati = (nlat - 1) * (lati - lat.min()) / (lat.max() - lat.min())
-
+
"""
- TODO: Review this docstring and see if it still holds true.
+ TODO: Review this docstring and see if it still holds true.
NOTE: This function doesn't use MDI currently. These are legacy comments
-
+
Notes on dealing with MDI when regridding data.
Method adopted here:
- Use bilinear interpolation of data by default (but user can specify other order using order=... in call)
+ Use bilinear interpolation of data by default
+ (but user can specify other order using order=... in call)
Perform bilinear interpolation of data, and of mask.
- To be conservative, new grid point which contained some missing data on the old grid is set to missing data.
- -this is achieved by looking for any non-zero interpolated mask values.
- To avoid issues with bilinear interpolation producing strong gradients leading into the MDI,
- set values at MDI points to mean data value so little gradient visible = not ideal, but acceptable for now.
-
- Set values in MDI so that similar to surroundings so don't produce large gradients when interpolating
- Preserve MDI mask, by only changing data part of masked array object.
+ To be conservative, new grid point which contained some missing data on
+ the old grid is set to missing data.
+ -this is achieved by looking for any non-zero interpolated
+ mask values.
+ To avoid issues with bilinear interpolation producing strong gradients
+ leading into the MDI, set values at MDI points to mean data value so
+ little gradient visible = not ideal, but acceptable for now.
+
+ Set values in MDI so that similar to surroundings so don't produce large
+    gradients when interpolating. Preserve MDI mask, by only changing data
+    part of masked array object.
"""
for shift in (-1, 1):
for axis in (0, 1):
@@ -829,31 +908,37 @@ def _rcmes_spatial_regrid(spatial_values, lat, lon, lat2, lon2, order=1):
spatial_values.data[idx] = q_shifted[idx]
# Now we actually interpolate
- # map_coordinates does cubic interpolation by default,
+ # map_coordinates does cubic interpolation by default,
# use "order=1" to preform bilinear interpolation instead...
-
- regridded_values = map_coordinates(spatial_values, [lati, loni], order=order)
+
+ regridded_values = map_coordinates(spatial_values,
+ [lati, loni],
+ order=order)
regridded_values = regridded_values.reshape([nlat2, nlon2])
# Set values to missing data outside of original domain
- regridded_values = ma.masked_array(regridded_values, mask=np.logical_or(np.logical_or(lat2 >= lat.max(),
- lat2 <= lat.min()),
- np.logical_or(lon2 <= lon.min(),
- lon2 >= lon.max())))
-
- # Make second map using nearest neighbour interpolation -use this to determine locations with MDI and mask these
+ regridded_values = ma.masked_array(regridded_values,
+ mask=np.logical_or(
+ np.logical_or(lat2 >= lat.max(),
+ lat2 <= lat.min()),
+ np.logical_or(lon2 <= lon.min(),
+ lon2 >= lon.max())))
+
+ # Make second map using nearest neighbour interpolation -use this to
+ # determine locations with MDI and mask these
qmdi = np.zeros_like(spatial_values)
qmdi[spatial_values.mask == True] = 1.
qmdi[spatial_values.mask == False] = 0.
qmdi_r = map_coordinates(qmdi, [lati, loni], order=order)
qmdi_r = qmdi_r.reshape([nlat2, nlon2])
mdimask = (qmdi_r != 0.0)
-
+
# Combine missing data mask, with outside domain mask define above.
regridded_values.mask = np.logical_or(mdimask, regridded_values.mask)
return regridded_values
+
def _rcmes_create_mask_using_threshold(masked_array, threshold=0.5):
'''Mask an array if percent of values missing data is above a threshold.
@@ -871,15 +956,16 @@ def _rcmes_create_mask_using_threshold(masked_array, threshold=0.5):
:returns: A Numpy array describing the mask for masked_array.
'''
- # try, except used as some model files don't have a full mask, but a single bool
- # the except catches this situation and deals with it appropriately.
+    # try/except is used because some model files don't have a full mask but
+    # a single bool; the except catches this and deals with it appropriately.
try:
nT = masked_array.mask.shape[0]
# For each pixel, count how many times are masked.
nMasked = masked_array.mask[:, :, :].sum(axis=0)
- # Define new mask as when a pixel has over a defined threshold ratio of masked data
+ # Define new mask as when a pixel has over a defined threshold ratio of
+ # masked data
# e.g. if the threshold is 75%, and there are 10 times,
# then a pixel will be masked if more than 5 times are masked.
mymask = nMasked > (nT * threshold)
@@ -889,6 +975,7 @@ def _rcmes_create_mask_using_threshold(masked_array, threshold=0.5):
return mymask
+
def _rcmes_calc_average_on_new_time_unit(data, dates, unit):
""" Rebin 3d array and list of dates using the provided unit parameter
@@ -897,22 +984,25 @@ def _rcmes_calc_average_on_new_time_unit(data, dates, unit):
:param dates: List of dates that correspond to the given data values
:type dates: Python datetime objects
:param unit: Time unit to average the data into
- :type unit: String matching one of these values : full | annual | monthly | daily
+ :type unit: String matching one of these values :
+ full | annual | monthly | daily
:returns: meanstorem, newTimesList
- :rtype: 3D numpy masked array the same shape as the input array, list of python datetime objects
+ :rtype: 3D numpy masked array the same shape as the input array,
+ list of python datetime objects
"""
# Check if the user-selected temporal grid is valid. If not, EXIT
- acceptable = (unit=='full')|(unit=='annual')|(unit=='monthly')|(unit=='daily')
+ acceptable = ((unit == 'full') | (unit == 'annual') |
+ (unit == 'monthly') | (unit == 'daily'))
if not acceptable:
print 'Error: unknown unit type selected for time averaging: EXIT'
- return -1,-1,-1,-1
+        return -1, -1, -1, -1
nt, ny, nx = data.shape
if unit == 'full':
new_data = ma.mean(data, axis=0)
- new_date = [dates[dates.size/2]]
+ new_date = [dates[dates.size / 2]]
if unit == 'annual':
years = [d.year for d in dates]
years_sorted = np.unique(years)
@@ -921,26 +1011,28 @@ def _rcmes_calc_average_on_new_time_unit(data, dates, unit):
new_date = []
for year in years_sorted:
index = np.where(years == year)[0]
- new_data[it,:] = ma.mean(data[index,:], axis=0)
+ new_data[it, :] = ma.mean(data[index, :], axis=0)
new_date.append(datetime.datetime(year=year, month=7, day=2))
- it = it+1
+ it = it + 1
if unit == 'monthly':
years = [d.year for d in dates]
years_sorted = np.unique(years)
months = [d.month for d in dates]
months_sorted = np.unique(months)
-
- new_data = ma.zeros([years_sorted.size*months_sorted.size, ny, nx])
+
+ new_data = ma.zeros([years_sorted.size * months_sorted.size, ny, nx])
it = 0
new_date = []
for year in years_sorted:
for month in months_sorted:
index = np.where((years == year) & (months == month))[0]
- new_data[it,:] = ma.mean(data[index,:], axis=0)
- new_date.append(datetime.datetime(year=year, month=month, day=15))
- it = it+1
+ new_data[it, :] = ma.mean(data[index, :], axis=0)
+ new_date.append(datetime.datetime(year=year,
+ month=month,
+ day=15))
+ it = it + 1
if unit == 'daily':
- days = [d.year*10000.+d.month*100.+d.day for d in dates]
+ days = [d.year * 10000. + d.month * 100. + d.day for d in dates]
days_sorted = np.unique(days)
new_data = ma.zeros([days_sorted.size, ny, nx])
@@ -948,64 +1040,69 @@ def _rcmes_calc_average_on_new_time_unit(data, dates, unit):
new_date = []
for day in days_sorted:
index = np.where(days == day)[0]
- new_data[it,:] = ma.mean(data[index,:], axis=0)
+ new_data[it, :] = ma.mean(data[index, :], axis=0)
y = int(day / 10000)
m = int(day % 10000) / 100
d = int(day % 100)
new_date.append(datetime.datetime(year=y, month=m, day=d))
- it = it+1
-
+ it = it + 1
+
return new_data, np.array(new_date)
+
def _rcmes_calc_average_on_new_time_unit_K(data, dates, unit):
""" Rebin 3d array and list of dates using the provided unit parameter
-
- :param data: Input data that needs to be averaged
+
+ :param data: Input data that needs to be averaged
:type data: 3D masked numpy array of shape (times, lats, lons)
:param dates: List of dates that correspond to the given data values
:type dates: Python datetime objects
:param unit: Time unit to average the data into
- :type unit: String matching one of these values : full | annual | monthly | daily
-
+ :type unit: String matching one of these values :
+ full | annual | monthly | daily
+
:returns: meanstorem, newTimesList
- :rtype: 3D numpy masked array the same shape as the input array, list of python datetime objects
+ :rtype: 3D numpy masked array the same shape as the input array,
+ list of python datetime objects
"""
# Check if the user-selected temporal grid is valid. If not, EXIT
- acceptable = (unit=='full')|(unit=='annual')|(unit=='monthly')|(unit=='daily')
+ acceptable = ((unit == 'full') | (unit == 'annual') |
+ (unit == 'monthly') | (unit == 'daily'))
if not acceptable:
print 'Error: unknown unit type selected for time averaging: EXIT'
- return -1,-1,-1,-1
+        return -1, -1, -1, -1
# Calculate arrays of: annual timeseries: year (2007,2007),
# monthly time series: year-month (200701,200702),
- # daily timeseries: year-month-day (20070101,20070102)
+ # daily timeseries: year-month-day (20070101,20070102)
# depending on user-selected averaging period.
# Year list
- if unit=='annual':
+ if unit == 'annual':
timeunits = np.array([int(d.strftime("%Y")) for d in dates])
unique_times = np.unique(timeunits)
-
+
# YearMonth format list
- if unit=='monthly':
+ if unit == 'monthly':
timeunits = np.array([int(d.strftime("%Y%m")) for d in dates])
unique_times = np.unique(timeunits)
# YearMonthDay format list
- if unit=='daily':
+ if unit == 'daily':
timeunits = np.array([int(d.strftime("%Y%m%d")) for d in dates])
unique_times = np.unique(timeunits)
-
# TODO: add pentad setting using Julian days?
# Full list: a special case
if unit == 'full':
- # Calculating means data over the entire time range: i.e., annual-mean climatology
+ # Calculating means data over the entire time range:
+ # i.e., annual-mean climatology
timeunits = []
for i in np.arange(len(dates)):
- timeunits.append(999) # i.e. we just want the same value for all times.
+ # i.e. we just want the same value for all times.
+ timeunits.append(999)
timeunits = np.array(timeunits, dtype=int)
unique_times = np.unique(timeunits)
@@ -1013,52 +1110,56 @@ def _rcmes_calc_average_on_new_time_unit_K(data, dates, unit):
newTimesList = []
# Decide whether or not you need to do any time averaging.
- # i.e. if data are already on required time unit then just pass data through and
- # calculate and return representative datetimes.
+ # i.e. if data are already on required time unit then just
+ # pass data through and calculate and return representative
+ # datetimes.
processing_required = True
- if len(timeunits)==(len(unique_times)):
+ if len(timeunits) == (len(unique_times)):
processing_required = False
# 1D data arrays, i.e. time series
- if data.ndim==1:
+ if data.ndim == 1:
# Create array to store the resulting data
meanstore = np.zeros(len(unique_times))
-
+
# Calculate the means across each unique time unit
- i=0
+ i = 0
for myunit in unique_times:
if processing_required:
- datam=ma.masked_array(data,timeunits!=myunit)
+ datam = ma.masked_array(data, timeunits != myunit)
meanstore[i] = ma.average(datam)
-
+
# construct new times list
yyyy, mm, dd = _create_new_year_month_day(myunit, dates)
- newTimesList.append(datetime.datetime(yyyy,mm,dd,0,0,0,0))
- i = i+1
+ newTimesList.append(datetime.datetime(yyyy, mm, dd, 0, 0, 0, 0))
+ i = i + 1
# 3D data arrays
- if data.ndim==3:
+ if data.ndim == 3:
# Create array to store the resulting data
- meanstore = np.zeros([len(unique_times),data.shape[1],data.shape[2]])
-
+ meanstore = np.zeros([len(unique_times), data.shape[1], data.shape[2]])
+
# Calculate the means across each unique time unit
- i=0
+ i = 0
datamask_store = []
for myunit in unique_times:
if processing_required:
mask = np.zeros_like(data)
- mask[timeunits!=myunit,:,:] = 1.0
+ mask[timeunits != myunit, :, :] = 1.0
# Calculate missing data mask within each time unit...
datamask_at_this_timeunit = np.zeros_like(data)
- datamask_at_this_timeunit[:]= _rcmes_create_mask_using_threshold(data[timeunits==myunit,:,:],threshold=0.75)
+ datamask_at_this_timeunit[:] = _rcmes_create_mask_using_threshold(
+ data[timeunits == myunit, :, :], threshold=0.75)
# Store results for masking later
datamask_store.append(datamask_at_this_timeunit[0])
- # Calculate means for each pixel in this time unit, ignoring missing data (using masked array).
- datam = ma.masked_array(data,np.logical_or(mask,datamask_at_this_timeunit))
- meanstore[i,:,:] = ma.average(datam,axis=0)
+ # Calculate means for each pixel in this time unit, ignoring
+ # missing data (using masked array).
+ datam = ma.masked_array(data, np.logical_or(mask,
+ datamask_at_this_timeunit))
+ meanstore[i, :, :] = ma.average(datam, axis=0)
# construct new times list
yyyy, mm, dd = _create_new_year_month_day(myunit, dates)
- newTimesList.append(datetime.datetime(yyyy,mm,dd))
+ newTimesList.append(datetime.datetime(yyyy, mm, dd))
i += 1
if not processing_required:
@@ -1071,40 +1172,42 @@ def _rcmes_calc_average_on_new_time_unit_K(data, dates, unit):
return meanstorem, newTimesList
+
def _create_new_year_month_day(time_unit, dates):
smyunit = str(time_unit)
- if len(smyunit)==4: # YYYY
+ if len(smyunit) == 4: # YYYY
yyyy = int(smyunit[0:4])
mm = 1
dd = 1
- elif len(smyunit)==6: # YYYYMM
+ elif len(smyunit) == 6: # YYYYMM
yyyy = int(smyunit[0:4])
mm = int(smyunit[4:6])
dd = 1
- elif len(smyunit)==8: # YYYYMMDD
+ elif len(smyunit) == 8: # YYYYMMDD
yyyy = int(smyunit[0:4])
mm = int(smyunit[4:6])
dd = int(smyunit[6:8])
- elif len(smyunit)==3: # Full time range
- # Need to set an appropriate time representing the mid-point of the entire time span
- dt = dates[-1]-dates[0]
- halfway = dates[0]+(dt/2)
+ elif len(smyunit) == 3: # Full time range
+ # Need to set an appropriate time representing the mid-point of the
+ # entire time span
+ dt = dates[-1] - dates[0]
+ halfway = dates[0] + (dt / 2)
yyyy = int(halfway.year)
mm = int(halfway.month)
dd = int(halfway.day)
-
+
return (yyyy, mm, dd)
+
def _congrid(a, newdims, method='linear', centre=False, minusone=False):
'''
This function is from http://wiki.scipy.org/Cookbook/Rebinning - Example 3
It has been refactored and changed a bit, but the original functionality
has been preserved.
-
Arbitrary resampling of source array to new dimension sizes.
Currently only supports maintaining the same number of dimensions.
To use 1-D arrays, first promote them to shape (x,1).
-
+
Uses the same parameters and creates the same co-ordinate lookup points
as IDL''s congrid routine, which apparently originally came from a VAX/VMS
routine of the same name.
@@ -1126,60 +1229,60 @@ def _congrid(a, newdims, method='linear', centre=False, minusone=False):
True - inarray is resampled by(i-1)/(x-1) * (j-1)/(y-1)
This prevents extrapolation one element beyond bounds of input array.
'''
- if not a.dtype in [np.float64, np.float32]:
+ if a.dtype not in [np.float64, np.float32]:
a = np.cast[float](a)
- # this will merely take the True/False input and convert it to an array(1) or array(0)
+ # this will merely take the True/False input and convert it to an
+ # array(1) or array(0)
m1 = np.cast[int](minusone)
- # this also casts the True False input into a floating point number of 0.5 or 0.0
+ # this also casts the True False input into a floating point number
+ # of 0.5 or 0.0
ofs = np.cast[int](centre) * 0.5
- old = np.array( a.shape )
- ndims = len( a.shape )
- if len( newdims ) != ndims:
+ old = np.array(a.shape)
+ ndims = len(a.shape)
+ if len(newdims) != ndims:
print "[congrid] dimensions error. " \
"This routine currently only supports " \
"rebinning to the same number of dimensions."
return None
- newdims = np.asarray( newdims, dtype=float )
+ newdims = np.asarray(newdims, dtype=float)
dimlist = []
if method == 'neighbour':
newa = _congrid_neighbor(a, newdims, m1, ofs)
- elif method in ['nearest','linear']:
+ elif method in ['nearest', 'linear']:
# calculate new dims
- for i in range( ndims ):
- base = np.arange( newdims[i] )
- dimlist.append( (old[i] - m1) / (newdims[i] - m1) \
- * (base + ofs) - ofs )
+ for i in range(ndims):
+ base = np.arange(newdims[i])
+ dimlist.append((old[i] - m1) / (newdims[i] - m1) *
+ (base + ofs) - ofs)
# specify old dims
- olddims = [np.arange(i, dtype = np.float) for i in list( a.shape )]
+ olddims = [np.arange(i, dtype=np.float) for i in list(a.shape)]
# first interpolation - for ndims = any
- mint = scipy.interpolate.interp1d( olddims[-1], a, kind=method )
- newa = mint( dimlist[-1] )
+ mint = scipy.interpolate.interp1d(olddims[-1], a, kind=method)
+ newa = mint(dimlist[-1])
- trorder = [ndims - 1] + range( ndims - 1 )
- for i in range( ndims - 2, -1, -1 ):
- newa = newa.transpose( trorder )
+ trorder = [ndims - 1] + range(ndims - 1)
+ for i in range(ndims - 2, -1, -1):
+ newa = newa.transpose(trorder)
- mint = scipy.interpolate.interp1d( olddims[i], newa, kind=method )
- newa = mint( dimlist[i] )
+ mint = scipy.interpolate.interp1d(olddims[i], newa, kind=method)
+ newa = mint(dimlist[i])
if ndims > 1:
# need one more transpose to return to original dimensions
- newa = newa.transpose( trorder )
+ newa = newa.transpose(trorder)
return newa
elif method in ['spline']:
- oslices = [ slice(0, j) for j in old ]
- oldcoords = np.ogrid[oslices]
- nslices = [ slice(0, j) for j in list(newdims) ]
+ nslices = [slice(0, j) for j in list(newdims)]
newcoords = np.mgrid[nslices]
newcoords_dims = range(np.rank(newcoords))
- #make first index last
+ # make first index last
newcoords_dims.append(newcoords_dims.pop(0))
newcoords_tr = newcoords.transpose(newcoords_dims)
# makes a view that affects newcoords
@@ -1199,56 +1302,59 @@ def _congrid(a, newdims, method='linear', centre=False, minusone=False):
"and \'spline\' are supported."
return None
+
def _check_dataset_shapes(datasets):
""" If the datasets are not the same shape throw a ValueError Exception
-
+
:param datasets: OCW Datasets to check for a consistent shape
:type datasets: List of OCW Dataset Objects
-
+
:raises: ValueError
"""
dataset_shape = None
for dataset in datasets:
- if dataset_shape == None:
+ if dataset_shape is None:
dataset_shape = dataset.values.shape
else:
if dataset.values.shape != dataset_shape:
msg = "%s != %s" % (dataset.values.shape, dataset_shape)
- raise ValueError("Input datasets must be the same shape for an ensemble :: ", msg)
+ raise ValueError("Input datasets must be the same shape "
+ "for an ensemble :: ", msg)
else:
pass
def _congrid_neighbor(values, new_dims, minus_one, offset):
""" Use the nearest neighbor to create a new array
-
+
:param values: Array of values that need to be interpolated
:type values: Numpy ndarray
:param new_dims: Longitude resolution in degrees
:type new_dims: float
:param lat_resolution: Latitude resolution in degrees
:type lat_resolution: float
-
+
:returns: A new spatially regridded Dataset
:rtype: Open Climate Workbench Dataset Object
"""
- ndims = len( values.shape )
+ ndims = len(values.shape)
dimlist = []
- old_dims = np.array( values.shape )
- for i in range( ndims ):
+ old_dims = np.array(values.shape)
+ for i in range(ndims):
base = np.indices(new_dims)[i]
- dimlist.append( (old_dims[i] - minus_one) / (new_dims[i] - minus_one) \
- * (base + offset) - offset )
- cd = np.array( dimlist ).round().astype(int)
- new_values = values[list( cd )]
- return new_values
+ dimlist.append((old_dims[i] - minus_one) / (new_dims[i] - minus_one) *
+ (base + offset) - offset)
+ cd = np.array(dimlist).round().astype(int)
+ new_values = values[list(cd)]
+ return new_values
+
def _are_bounds_contained_by_dataset(bounds, dataset):
'''Check if a Dataset fully contains a bounds.
:param bounds: The Bounds object to check.
:type bounds: Bounds
- :param dataset: The Dataset that should be fully contain the
+ :param dataset: The Dataset that should be fully contain the
Bounds
:type dataset: Dataset
@@ -1259,29 +1365,40 @@ def _are_bounds_contained_by_dataset(bounds, dataset):
start, end = dataset.time_range()
errors = []
- # TODO: THIS IS TERRIBLY inefficent and we need to use a geometry lib instead in the future
- if not np.round(lat_min,3) <= np.round(bounds.lat_min,3) <= np.round(lat_max,3):
- error = "bounds.lat_min: %s is not between lat_min: %s and lat_max: %s" % (bounds.lat_min, lat_min, lat_max)
+ # TODO: THIS IS TERRIBLY inefficent and we need to use a geometry
+ # lib instead in the future
+ if not (np.round(lat_min, 3) <= np.round(bounds.lat_min, 3) <=
+ np.round(lat_max, 3)):
+ error = ("bounds.lat_min: %s is not between lat_min: %s and"
+ " lat_max: %s" % (bounds.lat_min, lat_min, lat_max))
errors.append(error)
- if not np.round(lat_min,3) <= np.round(bounds.lat_max,3) <= np.round(lat_max,3):
- error = "bounds.lat_max: %s is not between lat_min: %s and lat_max: %s" % (bounds.lat_max, lat_min, lat_max)
+ if not (np.round(lat_min, 3) <= np.round(bounds.lat_max, 3) <=
+ np.round(lat_max, 3)):
+ error = ("bounds.lat_max: %s is not between lat_min: %s and"
+                 " lat_max: %s" % (bounds.lat_max, lat_min, lat_max))
errors.append(error)
- if not np.round(lon_min,3) <= np.round(bounds.lon_min,3) <= np.round(lon_max,3):
- error = "bounds.lon_min: %s is not between lon_min: %s and lon_max: %s" % (bounds.lon_min, lon_min, lon_max)
+ if not (np.round(lon_min, 3) <= np.round(bounds.lon_min, 3) <=
+ np.round(lon_max, 3)):
+ error = ("bounds.lon_min: %s is not between lon_min: %s and"
+                 " lon_max: %s" % (bounds.lon_min, lon_min, lon_max))
errors.append(error)
- if not np.round(lon_min,3) <= np.round(bounds.lon_max,3) <= np.round(lon_max,3):
- error = "bounds.lon_max: %s is not between lon_min: %s and lon_max: %s" % (bounds.lon_max, lon_min, lon_max)
+ if not (np.round(lon_min, 3) <= np.round(bounds.lon_max, 3) <=
+ np.round(lon_max, 3)):
+ error = ("bounds.lon_max: %s is not between lon_min: %s and"
+                 " lon_max: %s" % (bounds.lon_max, lon_min, lon_max))
errors.append(error)
if not start <= bounds.start <= end:
- error = "bounds.start: %s is not between start: %s and end: %s" % (bounds.start, start, end)
+ error = ("bounds.start: %s is not between start: %s and end: %s" %
+ (bounds.start, start, end))
errors.append(error)
if not start <= bounds.end <= end:
- error = "bounds.end: %s is not between start: %s and end: %s" % (bounds.end, start, end)
+ error = ("bounds.end: %s is not between start: %s and end: %s" %
+ (bounds.end, start, end))
errors.append(error)
if len(errors) == 0:
@@ -1290,10 +1407,11 @@ def _are_bounds_contained_by_dataset(bounds, dataset):
error_message = '\n'.join(errors)
raise ValueError(error_message)
+
def _get_subregion_slice_indices(subregion, target_dataset):
'''Get the indices for slicing Dataset values to generate the subregion.
- :param subregion: The Bounds that specify the subset of the Dataset
+ :param subregion: The Bounds that specify the subset of the Dataset
that should be extracted.
:type subregion: Bounds
:param target_dataset: The Dataset to subset.
@@ -1307,16 +1425,14 @@ def _get_subregion_slice_indices(subregion, target_dataset):
lonStart = min(np.nonzero(target_dataset.lons >= subregion.lon_min)[0])
lonEnd = max(np.nonzero(target_dataset.lons <= subregion.lon_max)[0])
-
timeStart = min(np.nonzero(target_dataset.times >= subregion.start)[0])
timeEnd = max(np.nonzero(target_dataset.times <= subregion.end)[0])
return {
- "lat_start" : latStart,
- "lat_end" : latEnd,
- "lon_start" : lonStart,
- "lon_end" : lonEnd,
- "time_start" : timeStart,
- "time_end" : timeEnd
+ "lat_start": latStart,
+ "lat_end": latEnd,
+ "lon_start": lonStart,
+ "lon_end": lonEnd,
+ "time_start": timeStart,
+ "time_end": timeEnd
}
-
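The reformatting above is intended to be behaviour-preserving: public signatures such as temporal_subset(), temporal_rebin() and spatial_regrid() are unchanged, so existing callers keep working. The toy example below is illustrative only and not part of the patch; the synthetic grid, the variable name 'tas' and the import paths (ocw.dataset, ocw.dataset_processor) are assumptions based on how ds.Dataset(...) is called throughout the diff and should be checked against ocw/dataset.py.

    # Illustrative sketch only -- synthetic data, not taken from the patch.
    import datetime
    import numpy as np
    import numpy.ma as ma

    import ocw.dataset as ds
    import ocw.dataset_processor as dsp

    # A 12-month, 21 x 21 toy dataset (all zeros, nothing masked).
    lats = np.arange(-10., 11., 1.)
    lons = np.arange(20., 41., 1.)
    times = np.array([datetime.datetime(2000, m, 1) for m in range(1, 13)])
    values = ma.zeros((len(times), len(lats), len(lons)))

    monthly = ds.Dataset(lats, lons, times, values,
                         variable='tas', units='K', name='toy')

    # Collapse the 12 monthly fields into a single annual mean field.
    annual = dsp.temporal_rebin(monthly, 'annual')

    # Keep June-August only, averaged into one field per year.
    jja = dsp.temporal_subset(6, 8, monthly, average_each_year=True)

    # Regrid onto a coarser 2-degree grid strictly inside the original domain.
    coarse = dsp.spatial_regrid(monthly,
                                np.arange(-8., 9., 2.),
                                np.arange(22., 39., 2.))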
[2/2] climate git commit: PEP8 violations in dataset_processor.py have been fixed.
Posted by hu...@apache.org.
PEP8 violations in dataset_processor.py have been fixed.
Project: http://git-wip-us.apache.org/repos/asf/climate/repo
Commit: http://git-wip-us.apache.org/repos/asf/climate/commit/3ce4e209
Tree: http://git-wip-us.apache.org/repos/asf/climate/tree/3ce4e209
Diff: http://git-wip-us.apache.org/repos/asf/climate/diff/3ce4e209
Branch: refs/heads/master
Commit: 3ce4e20941076c0b0db490e3c8c96cde2807ee34
Parents: 54f2110 9fa0e4c
Author: huikyole <hu...@argo.jpl.nasa.gov>
Authored: Sun Jun 26 16:59:45 2016 -0700
Committer: huikyole <hu...@argo.jpl.nasa.gov>
Committed: Sun Jun 26 16:59:45 2016 -0700
----------------------------------------------------------------------
ocw/dataset_processor.py | 762 ++++++++++++++++++++++++------------------
1 file changed, 439 insertions(+), 323 deletions(-)
----------------------------------------------------------------------