You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@climate.apache.org by jo...@apache.org on 2014/05/09 04:03:50 UTC
[41/51] [abbrv] [partial] Adding Jinwon's custom RCMET
http://git-wip-us.apache.org/repos/asf/climate/blob/a6aa1cd2/src/main/python/rcmes/toolkit/.svn/text-base/process.py.svn-base
----------------------------------------------------------------------
diff --git a/src/main/python/rcmes/toolkit/.svn/text-base/process.py.svn-base b/src/main/python/rcmes/toolkit/.svn/text-base/process.py.svn-base
new file mode 100755
index 0000000..b5dde08
--- /dev/null
+++ b/src/main/python/rcmes/toolkit/.svn/text-base/process.py.svn-base
@@ -0,0 +1,1263 @@
+"""Collection of functions that process numpy arrays of varying shapes.
+Functions can range from subsetting to regridding"""
+
+# Python Standard Libary Imports
+import math
+import datetime
+import re
+import string
+import sys
+import time
+
+# 3rd Party Imports
+import Nio
+import numpy as np
+import numpy.ma as ma
+from scipy.ndimage import map_coordinates
+
+
+def extract_subregion_from_data_array(data, lats, lons, latmin, latmax, lonmin, lonmax):
+ '''
+ Extract a sub-region from a data array.
+
+ Example::
+ **e.g.** the user may load a global model file, but only want to examine data over North America
+ This function extracts a sub-domain from the original data.
+ The defined sub-region must be a regular lat/lon bounding box,
+ but the model data may be on a non-regular grid (e.g. rotated, or Guassian grid layout).
+ Data are kept on the original model grid and a rectangular (in model-space) region
+ is extracted which contains the rectangular (in lat/lon space) user supplied region.
+
+ INPUT::
+ data - 3d masked data array
+ lats - 2d array of latitudes corresponding to data array
+ lons - 2d array of longitudes corresponding to data array
+ latmin, latmax, lonmin, lonmax - bounding box of required region to extract
+
+ OUTPUT::
+ data2 - subset of original data array containing only data from required subregion
+ lats2 - subset of original latitude data array
+ lons2 - subset of original longitude data array
+
+ '''
+
+
+
+ # Mask model data array to find grid points inside the supplied bounding box
+ whlat = (lats > latmin) & (lats < latmax)
+ whlon = (lons > lonmin) & (lons < lonmax)
+ wh = whlat & whlon
+
+ # Find required matrix indices describing the limits of the regular lat/lon bounding box
+ jmax = np.where(wh == True)[0].max()
+ jmin = np.where(wh == True)[0].min()
+ imax = np.where(wh == True)[1].max()
+ imin = np.where(wh == True)[1].min()
+
+ # Cut out the sub-region from the data array
+ data2 = ma.zeros((data.shape[0], jmax - jmin, imax - imin))
+ data2 = data[:, jmin:jmax, imin:imax]
+
+ # Cut out sub-region from lats,lons arrays
+ lats2 = lats[jmin:jmax, imin:imax]
+ lons2 = lons[jmin:jmax, imin:imax]
+
+ return data2, lats2, lons2
+
+def calc_area_mean(data, lats, lons, mymask='not set'):
+ '''
+ Calculate Area Average of data in a masked array
+
+ INPUT::
+ data: a masked array of data (NB. only data from one time expected to be passed at once)
+ lats: 2d array of regularly gridded latitudes
+ lons: 2d array of regularly gridded longitudes
+ mymask: (optional) defines spatial region to do averaging over
+
+ OUTPUT::
+ area_mean: a value for the mean inside the area
+
+ '''
+
+
+
+ # If mask not passed in, then set maks to cover whole data domain
+ if(mymask == 'not set'):
+ mymask = np.empty(data.shape)
+ mymask[:] = False # NB. mask means (don't show), so False everywhere means use everything.
+
+ # Dimension check on lats, lons
+ # Sometimes these arrays are 3d, sometimes 2d, sometimes 1d
+ # This bit of code just converts to the required 2d array shape
+ if(len(lats.shape) == 3):
+ lats = lats[0, :, :]
+
+ if(len(lons.shape) == 3):
+ lons = lons[0, :, :]
+
+ if(np.logical_and(len(lats.shape) == 1, len(lons.shape) == 1)):
+ lons, lats = np.meshgrid(lons, lats)
+
+ # Calculate grid length (assuming regular lat/lon grid)
+ dlat = lats[1, 0] - lats[0, 0]
+ dlon = lons[0, 1] - lons[0, 0]
+
+ # Calculates weights for each grid box
+ myweights = calc_area_in_grid_box(lats, dlon, dlat)
+
+ # Create a new masked array covering just user selected area (defined in mymask)
+ # NB. this preserves missing data points in the observations data
+ subdata = ma.masked_array(data, mask=mymask)
+
+ if(myweights.shape != subdata.shape):
+ myweights.resize(subdata.shape)
+ myweights[1:, :] = myweights[0, :]
+
+ # Calculate weighted mean using ma.average (which takes weights)
+ area_mean = ma.average(subdata, weights=myweights)
+
+ return area_mean
+
+def calc_area_in_grid_box(latitude, dlat, dlon):
+ '''
+ Calculate area of regular lat-lon grid box
+
+ INPUT::
+ latitude: latitude of grid box (degrees)
+ dlat: grid length in latitude direction (degrees)
+ dlon: grid length in longitude direction (degrees)
+
+ OUTPUT::
+ A: area of the grid box
+
+ '''
+
+ R = 6371000 # radius of Earth in metres
+ C = 2 * math.pi * R
+
+ latitude = np.radians(latitude)
+
+ A = (dlon * (C / 360.)) * (dlat * (C / 360.) * np.cos(latitude))
+
+ return A
+
+def do_regrid(q, lat, lon, lat2, lon2, order=1, mdi= -999999999):
+ '''
+ Perform regridding from one set of lat,lon values onto a new set (lat2,lon2)
+
+ Input::
+ q - the variable to be regridded
+ lat,lon - original co-ordinates corresponding to q values
+ lat2,lon2 - new set of latitudes and longitudes that you want to regrid q onto
+ order - (optional) interpolation order 1=bi-linear, 3=cubic spline
+ mdi - (optional) fill value for missing data (used in creation of masked array)
+
+ Output::
+ q2 - q regridded onto the new set of lat2,lon2
+
+ '''
+
+ nlat = q.shape[0]
+ nlon = q.shape[1]
+
+ nlat2 = lat2.shape[0]
+ nlon2 = lon2.shape[1]
+
+ # To make our lives easier down the road, let's
+ # turn these into arrays of x & y coords
+ loni = lon2.ravel()
+ lati = lat2.ravel()
+
+ loni = loni.copy() # NB. it won't run unless you do this...
+ lati = lati.copy()
+
+ # Now, we'll set points outside the boundaries to lie along an edge
+ loni[loni > lon.max()] = lon.max()
+ loni[loni < lon.min()] = lon.min()
+
+ # To deal with the "hard" break, we'll have to treat y differently,
+ # so we're just setting the min here...
+ lati[lati > lat.max()] = lat.max()
+ lati[lati < lat.min()] = lat.min()
+
+
+ # We need to convert these to (float) indicies
+ # (xi should range from 0 to (nx - 1), etc)
+ loni = (nlon - 1) * (loni - lon.min()) / (lon.max() - lon.min())
+
+ # Deal with the "hard" break in the y-direction
+ lati = (nlat - 1) * (lati - lat.min()) / (lat.max() - lat.min())
+
+ # Notes on dealing with MDI when regridding data.
+ # Method adopted here:
+ # Use bilinear interpolation of data by default (but user can specify other order using order=... in call)
+ # Perform bilinear interpolation of data, and of mask.
+ # To be conservative, new grid point which contained some missing data on the old grid is set to missing data.
+ # -this is achieved by looking for any non-zero interpolated mask values.
+ # To avoid issues with bilinear interpolation producing strong gradients leading into the MDI,
+ # set values at MDI points to mean data value so little gradient visible = not ideal, but acceptable for now.
+
+ # Set values in MDI so that similar to surroundings so don't produce large gradients when interpolating
+ # Preserve MDI mask, by only changing data part of masked array object.
+ for shift in (-1, 1):
+ for axis in (0, 1):
+ q_shifted = np.roll(q, shift=shift, axis=axis)
+ idx = ~q_shifted.mask * q.mask
+ q.data[idx] = q_shifted[idx]
+
+ # Now we actually interpolate
+ # map_coordinates does cubic interpolation by default,
+ # use "order=1" to preform bilinear interpolation instead...
+ q2 = map_coordinates(q, [lati, loni], order=order)
+ q2 = q2.reshape([nlat2, nlon2])
+
+ # Set values to missing data outside of original domain
+ q2 = ma.masked_array(q2, mask=np.logical_or(np.logical_or(lat2 >= lat.max(),
+ lat2 <= lat.min()),
+ np.logical_or(lon2 <= lon.min(),
+ lon2 >= lon.max())))
+
+ # Make second map using nearest neighbour interpolation -use this to determine locations with MDI and mask these
+ qmdi = np.zeros_like(q)
+ qmdi[q.mask == True] = 1.
+ qmdi[q.mask == False] = 0.
+ qmdi_r = map_coordinates(qmdi, [lati, loni], order=order)
+ qmdi_r = qmdi_r.reshape([nlat2, nlon2])
+ mdimask = (qmdi_r != 0.0)
+
+ # Combine missing data mask, with outside domain mask define above.
+ q2.mask = np.logical_or(mdimask, q2.mask)
+
+ return q2
+
+def create_mask_using_threshold(masked_array, threshold=0.5):
+ '''
+ Routine to create a mask, depending on the proportion of times with missing data.
+ For each pixel, calculate proportion of times that are missing data,
+ **if** the proportion is above a specified threshold value,
+ then mark the pixel as missing data.
+
+ Input::
+ masked_array - a numpy masked array of data (assumes time on axis 0, and space on axes 1 and 2).
+ threshold - (optional) threshold proportion above which a pixel is marked as 'missing data'.
+ NB. default threshold = 50%
+
+ Output::
+ mymask - a numpy array describing the mask. NB. not a masked array, just the mask itself.
+
+ '''
+
+
+ # try, except used as some model files don't have a full mask, but a single bool
+ # the except catches this situation and deals with it appropriately.
+ try:
+ nT = masked_array.mask.shape[0]
+
+ # For each pixel, count how many times are masked.
+ nMasked = masked_array.mask[:, :, :].sum(axis=0)
+
+ # Define new mask as when a pixel has over a defined threshold ratio of masked data
+ # e.g. if the threshold is 75%, and there are 10 times,
+ # then a pixel will be masked if more than 5 times are masked.
+ mymask = nMasked > (nT * threshold)
+
+ except:
+ mymask = np.zeros_like(masked_array.data[0, :, :])
+
+ return mymask
+
+def calc_average_on_new_time_unit(data, dateList, unit='monthly'):
+ '''
+ Routine to calculate averages on longer time units than the data exists on.
+
+ Example::
+ If the data is 6-hourly, calculate daily, or monthly means.
+
+ Input::
+ data - data values
+ dateList - list of python datetime structures corresponding to data values
+ unit - string describing time unit to average onto.
+ *Valid values are: 'monthly', 'daily', 'pentad','annual','decadal'*
+
+ Output:
+ meanstorem - numpy masked array of data values meaned over required time period
+ newTimesList - a list of python datetime objects representing the data in the new averagin period
+ *NB.* currently set to beginning of averaging period,
+ **i.e. mean Jan 1st - Jan 31st -> represented as Jan 1st, 00Z.**
+ '''
+
+ # First catch unknown values of time unit passed in by user
+ acceptable = (unit == 'full') | (unit == 'annual') | (unit == 'monthly') | (unit == 'daily') | (unit == 'pentad')
+
+ if not acceptable:
+ print 'Error: unknown unit type selected for time averaging'
+ print ' Please check your code.'
+ return
+
+ # Calculate arrays of years (2007,2007),
+ # yearsmonths (200701,200702),
+ # or yearmonthdays (20070101,20070102)
+ # -depending on user selected averaging period.
+
+ # Year list
+ if unit == 'annual':
+ print 'Calculating annual mean data'
+ timeunits = []
+
+ for i in np.arange(len(dateList)):
+ timeunits.append(str(dateList[i].year))
+
+ timeunits = np.array(timeunits, dtype=int)
+
+ # YearMonth format list
+ if unit == 'monthly':
+ print 'Calculating monthly mean data'
+ timeunits = []
+
+ for i in np.arange(len(dateList)):
+ timeunits.append(str(dateList[i].year) + str("%02d" % dateList[i].month))
+
+ timeunits = np.array(timeunits, dtype=int)
+
+ # YearMonthDay format list
+ if unit == 'daily':
+ print 'Calculating daily mean data'
+ timeunits = []
+
+ for i in np.arange(len(dateList)):
+ timeunits.append(str(dateList[i].year) + str("%02d" % dateList[i].month) + \
+ str("%02d" % dateList[i].day))
+
+ timeunits = np.array(timeunits, dtype=int)
+
+
+ # TODO: add pentad setting using Julian days?
+
+
+ # Full list: a special case
+ if unit == 'full':
+ print 'Calculating means data over full time range'
+ timeunits = []
+
+ for i in np.arange(len(dateList)):
+ timeunits.append(999) # i.e. we just want the same value for all times.
+
+ timeunits = np.array(timeunits, dtype=int)
+
+
+
+ # empty list to store new times
+ newTimesList = []
+
+ # Decide whether or not you need to do any time averaging.
+ # i.e. if data are already on required time unit then just pass data through and
+ # calculate and return representative datetimes.
+ processing_required = True
+ if len(timeunits) == (len(np.unique(timeunits))):
+ processing_required = False
+
+ # 1D data arrays, i.e. time series
+ if data.ndim == 1:
+ # Create array to store the resulting data
+ meanstore = np.zeros(len(np.unique(timeunits)))
+
+ # Calculate the means across each unique time unit
+ i = 0
+ for myunit in np.unique(timeunits):
+ if processing_required:
+ datam = ma.masked_array(data, timeunits != myunit)
+ meanstore[i] = ma.average(datam)
+
+ # construct new times list
+ smyunit = str(myunit)
+ if len(smyunit) == 4: # YYYY
+ yyyy = int(myunit[0:4])
+ mm = 1
+ dd = 1
+ if len(smyunit) == 6: # YYYYMM
+ yyyy = int(smyunit[0:4])
+ mm = int(smyunit[4:6])
+ dd = 1
+ if len(smyunit) == 8: # YYYYMMDD
+ yyyy = int(smyunit[0:4])
+ mm = int(smyunit[4:6])
+ dd = int(smyunit[6:8])
+ if len(smyunit) == 3: # Full time range
+ # Need to set an appropriate time representing the mid-point of the entire time span
+ dt = dateList[-1] - dateList[0]
+ halfway = dateList[0] + (dt / 2)
+ yyyy = int(halfway.year)
+ mm = int(halfway.month)
+ dd = int(halfway.day)
+
+ newTimesList.append(datetime.datetime(yyyy, mm, dd, 0, 0, 0, 0))
+ i = i + 1
+
+ # 3D data arrays
+ if data.ndim == 3:
+
+ #datamask = create_mask_using_threshold(data,threshold=0.75)
+
+ # Create array to store the resulting data
+ meanstore = np.zeros([len(np.unique(timeunits)), data.shape[1], data.shape[2]])
+
+ # Calculate the means across each unique time unit
+ i = 0
+ datamask_store = []
+ for myunit in np.unique(timeunits):
+ if processing_required:
+
+ mask = np.zeros_like(data)
+ mask[timeunits != myunit, :, :] = 1.0
+
+ # Calculate missing data mask within each time unit...
+ datamask_at_this_timeunit = np.zeros_like(data)
+ datamask_at_this_timeunit[:] = create_mask_using_threshold(data[timeunits == myunit, :, :], threshold=0.75)
+ # Store results for masking later
+ datamask_store.append(datamask_at_this_timeunit[0])
+
+ # Calculate means for each pixel in this time unit, ignoring missing data (using masked array).
+ datam = ma.masked_array(data, np.logical_or(mask, datamask_at_this_timeunit))
+ meanstore[i, :, :] = ma.average(datam, axis=0)
+
+ # construct new times list
+ smyunit = str(myunit)
+ if len(smyunit) == 4: # YYYY
+ yyyy = int(smyunit[0:4])
+ mm = 1
+ dd = 1
+ if len(smyunit) == 6: # YYYYMM
+ yyyy = int(smyunit[0:4])
+ mm = int(smyunit[4:6])
+ dd = 1
+ if len(smyunit) == 8: # YYYYMMDD
+ yyyy = int(smyunit[0:4])
+ mm = int(smyunit[4:6])
+ dd = int(smyunit[6:8])
+ if len(smyunit) == 3: # Full time range
+ # Need to set an appropriate time representing the mid-point of the entire time span
+ dt = dateList[-1] - dateList[0]
+ halfway = dateList[0] + (dt / 2)
+ yyyy = int(halfway.year)
+ mm = int(halfway.month)
+ dd = int(halfway.day)
+
+ newTimesList.append(datetime.datetime(yyyy, mm, dd, 0, 0, 0, 0))
+
+ i += 1
+
+ if not processing_required:
+ meanstorem = data
+
+ if processing_required:
+ # Create masked array (using missing data mask defined above)
+ datamask_store = np.array(datamask_store)
+ meanstorem = ma.masked_array(meanstore, datamask_store)
+
+ return meanstorem, newTimesList
+
+def calc_running_accum_from_period_accum(data):
+ '''
+ Routine to calculate running total accumulations from individual period accumulations.
+ ::
+
+ e.g. 0,0,1,0,0,2,2,1,0,0
+ -> 0,0,1,1,1,3,5,6,6,6
+
+ Input::
+ data: numpy array with time in the first axis
+
+ Output::
+ running_acc: running accumulations
+
+ '''
+
+
+ running_acc = np.zeros_like(data)
+
+ if(len(data.shape) == 1):
+ running_acc[0] = data[0]
+
+ if(len(data.shape) > 1):
+ running_acc[0, :] = data[0, :]
+
+ for i in np.arange(data.shape[0] - 1):
+ if(len(data.shape) == 1):
+ running_acc[i + 1] = running_acc[i] + data[i + 1]
+
+ if(len(data.shape) > 1):
+ running_acc[i + 1, :] = running_acc[i, :] + data[i + 1, :]
+
+ return running_acc
+
+def ignore_boundaries(data, rim=10):
+ '''
+ Routine to mask the lateral boundary regions of model data to ignore them from calculations.
+
+ Input::
+ data - a masked array of model data
+ rim - (optional) number of grid points to ignore
+
+ Output::
+ data - data array with boundary region masked
+
+ '''
+
+ nx = data.shape[1]
+ ny = data.shape[0]
+
+ rimmask = np.zeros_like(data)
+ for j in np.arange(rim):
+ rimmask[j, 0:nx - 1] = 1.0
+
+ for j in ny - 1 - np.arange(rim):
+ rimmask[j, 0:nx - 1] = 1.0
+
+ for i in np.arange(rim):
+ rimmask[0:ny - 1, i] = 1.0
+
+ for i in nx - 1 - np.arange(rim):
+ rimmask[0:ny - 1, i] = 1.0
+
+ data = ma.masked_array(data, mask=rimmask)
+
+ return data
+
+def normalizeDatetimes(datetimes, timestep):
+ """
+ Input::
+ datetimes - list of datetime objects that need to be normalized
+ timestep - string of value ('daily' | 'monthly')
+ Output::
+ normalDatetimes - list of datetime objects that have been normalized
+
+ Normalization Rules::
+ Daily data will be forced to an hour value of 00:00:00
+ Monthly data will be forced to the first of the month at midnight
+ """
+ normalDatetimes = []
+ if timestep.lower() == 'monthly':
+ for inputDatetime in datetimes:
+ if inputDatetime.day != 1:
+ # Clean the inputDatetime
+ inputDatetimeString = inputDatetime.strftime('%Y%m%d')
+ normalInputDatetimeString = inputDatetimeString[:6] + '01'
+ inputDatetime = datetime.datetime.strptime(normalInputDatetimeString, '%Y%m%d')
+
+ normalDatetimes.append(inputDatetime)
+
+ elif timestep.lower() == 'daily':
+ for inputDatetime in datetimes:
+ if inputDatetime.hour != 0 or inputDatetime.minute != 0 or inputDatetime.second != 0:
+ datetimeString = inputDatetime.strftime('%Y%m%d%H%M%S')
+ normalDatetimeString = datetimeString[:8] + '000000'
+ inputDatetime = datetime.datetime.strptime(normalDatetimeString, '%Y%m%d%H%M%S')
+
+ normalDatetimes.append(inputDatetime)
+
+
+ return normalDatetimes
+
+def getModelTimes(modelFile, timeVarName):
+ '''
+ TODO: Do a better job handling dates here
+ Routine to convert from model times ('hours since 1900...', 'days since ...')
+ into a python datetime structure
+
+ Input::
+ modelFile - path to the model tile you want to extract the times list and modelTimeStep from
+ timeVarName - name of the time variable in the model file
+
+ Output::
+ times - list of python datetime objects describing model data times
+ modelTimeStep - 'hourly','daily','monthly','annual'
+ '''
+
+ f = Nio.open_file(modelFile)
+ xtimes = f.variables[timeVarName]
+ timeFormat = xtimes.attributes['units']
+
+ # search to check if 'since' appears in units
+ try:
+ sinceLoc = re.search('since', timeFormat).end()
+
+ except AttributeError:
+ print 'Error decoding model times: time variable attributes do not contain "since"'
+ raise
+
+ units = None
+ TIME_UNITS = ('minutes', 'hours', 'days', 'months', 'years')
+ # search for 'seconds','minutes','hours', 'days', 'months', 'years' so know units
+ for unit in TIME_UNITS:
+ if re.search(unit, timeFormat):
+ units = unit
+ break
+
+ # cut out base time (the bit following 'since')
+ base_time_string = string.lstrip(timeFormat[sinceLoc:])
+ # decode base time
+ base_time = decodeTimeFromString(base_time_string)
+ times = []
+
+ for xtime in xtimes[:]:
+
+ # Cast time as an int
+ xtime = int(xtime)
+
+ if units == 'minutes':
+ dt = datetime.timedelta(minutes=xtime)
+ new_time = base_time + dt
+ elif units == 'hours':
+ dt = datetime.timedelta(hours=xtime)
+ new_time = base_time + dt
+ elif units == 'days':
+ dt = datetime.timedelta(days=xtime)
+ new_time = base_time + dt
+ elif units == 'months':
+ # NB. adding months in python is complicated as month length varies and hence ambiguous.
+ # Perform date arithmatic manually
+ # Assumption: the base_date will usually be the first of the month
+ # NB. this method will fail if the base time is on the 29th or higher day of month
+ # -as can't have, e.g. Feb 31st.
+ new_month = int(base_time.month + xtime % 12)
+ new_year = int(math.floor(base_time.year + xtime / 12.))
+ new_time = datetime.datetime(new_year, new_month, base_time.day, base_time.hour, base_time.second, 0)
+ elif units == 'years':
+ dt = datetime.timedelta(years=xtime)
+ new_time = base_time + dt
+
+ times.append(new_time)
+
+ try:
+ timeStepLength = int(xtimes[1] - xtimes[0] + 1.e-12)
+ modelTimeStep = getModelTimeStep(units, timeStepLength)
+ except:
+ raise
+
+ times = normalizeDatetimes(times, modelTimeStep)
+
+ return times, modelTimeStep
+
+def getModelTimeStep(units, stepSize):
+ # Time units are now determined. Determine the time intervals of input data (mdlTimeStep)
+
+ if units == 'minutes':
+ if stepSize == 60:
+ modelTimeStep = 'hourly'
+ elif stepSize == 1440:
+ modelTimeStep = 'daily'
+ # 28 days through 31 days
+ elif 40320 <= stepSize <= 44640:
+ modelTimeStep = 'monthly'
+ # 365 days through 366 days
+ elif 525600 <= stepSize <= 527040:
+ modelTimeStep = 'annual'
+ else:
+ raise Exception('model data time step interval exceeds the max time interval (annual)', units, stepSize)
+
+ elif units == 'hours':
+ if stepSize == 1:
+ modelTimeStep = 'hourly'
+ elif stepSize == 24:
+ modelTimeStep = 'daily'
+ elif 672 <= stepSize <= 744:
+ modelTimeStep = 'monthly'
+ elif 8760 <= stepSize <= 8784:
+ modelTimeStep = 'annual'
+ else:
+ raise Exception('model data time step interval exceeds the max time interval (annual)', units, stepSize)
+
+ elif units == 'days':
+ if stepSize == 1:
+ modelTimeStep = 'daily'
+ elif 28 <= stepSize <= 31:
+ modelTimeStep = 'monthly'
+ elif 365 <= stepSize <= 366:
+ modelTimeStep = 'annual'
+ else:
+ raise Exception('model data time step interval exceeds the max time interval (annual)', units, stepSize)
+
+ elif units == 'months':
+ if stepSize == 1:
+ modelTimeStep = 'monthly'
+ elif stepSize == 12:
+ modelTimeStep = 'annual'
+ else:
+ raise Exception('model data time step interval exceeds the max time interval (annual)', units, stepSize)
+
+ elif units == 'years':
+ if stepSize == 1:
+ modelTimeStep = 'annual'
+ else:
+ raise Exception('model data time step interval exceeds the max time interval (annual)', units, stepSize)
+
+ else:
+ errorMessage = 'the time unit ', units, ' is not currently handled in this version.'
+ raise Exception(errorMessage)
+
+ return modelTimeStep
+
+def decodeTimeFromString(time_string):
+ '''
+ Decodes string into a python datetime object
+ *Method:* tries a bunch of different time format possibilities and hopefully one of them will hit.
+ ::
+
+ **Input:** time_string - a string that represents a date/time
+
+ **Output:** mytime - a python datetime object
+ '''
+
+ # This will deal with times that use decimal seconds
+ if '.' in time_string:
+ time_string = time_string.split('.')[0] + '0'
+ else:
+ pass
+
+ try:
+ mytime = time.strptime(time_string, '%Y-%m-%d %H:%M:%S')
+ mytime = datetime.datetime(*mytime[0:6])
+ return mytime
+
+ except ValueError:
+ pass
+
+ try:
+ mytime = time.strptime(time_string, '%Y/%m/%d %H:%M:%S')
+ mytime = datetime.datetime(*mytime[0:6])
+ return mytime
+
+ except ValueError:
+ pass
+
+ try:
+ mytime = time.strptime(time_string, '%Y%m%d %H:%M:%S')
+ mytime = datetime.datetime(*mytime[0:6])
+ return mytime
+
+ except ValueError:
+ pass
+
+ try:
+ mytime = time.strptime(time_string, '%Y:%m:%d %H:%M:%S')
+ mytime = datetime.datetime(*mytime[0:6])
+ return mytime
+
+ except ValueError:
+ pass
+
+ try:
+ mytime = time.strptime(time_string, '%Y%m%d%H%M%S')
+ mytime = datetime.datetime(*mytime[0:6])
+ return mytime
+
+ except ValueError:
+ pass
+
+ try:
+ mytime = time.strptime(time_string, '%Y-%m-%d %H:%M')
+ mytime = datetime.datetime(*mytime[0:6])
+ return mytime
+
+ except ValueError:
+ pass
+
+ try:
+ mytime = time.strptime(time_string, '%Y-%m-%d')
+ mytime = datetime.datetime(*mytime[0:6])
+ return mytime
+
+ except ValueError:
+ pass
+
+
+ print 'Error decoding time string: string does not match a predefined time format'
+ return 0
+
+def regrid_wrapper(regrid_choice, obsdata, obslats, obslons, wrfdata, wrflats, wrflons):
+ '''
+ Wrapper routine for regridding.
+
+ Either regrids model to obs grid, or obs to model grid, depending on user choice
+
+ Inputs::
+ regrid_choice - [0] = Regrid obs data onto model grid or
+ [1] = Regrid model data onto obs grid
+
+ obsdata,wrfdata - data arrays
+ obslats,obslons - observation lat,lon arrays
+ wrflats,wrflons - model lat,lon arrays
+
+ Output::
+ rdata,rdata2 - regridded data
+ lats,lons - latitudes and longitudes of regridded data
+
+ '''
+
+ # Regrid obs data onto model grid
+ if(regrid_choice == '0'):
+
+ ndims = len(obsdata.shape)
+ if(ndims == 3):
+ newshape = [obsdata.shape[0], wrfdata.shape[1], wrfdata.shape[2]]
+ nT = obsdata.shape[0]
+
+ if(ndims == 2):
+ newshape = [wrfdata.shape[0], wrfdata.shape[1]]
+ nT = 0
+
+ rdata = wrfdata
+ lats, lons = wrflats, wrflons
+
+ # Create a new masked array with the required dimensions
+ tmp = np.zeros(newshape)
+ rdata2 = ma.MaskedArray(tmp, mask=(tmp == 0))
+ tmp = 0 # free up some memory
+
+ rdata2[:] = 0.0
+
+ if(nT > 0):
+ for t in np.arange(nT):
+ rdata2[t, :, :] = do_regrid(obsdata[t, :, :], obslats[:, :], obslons[:, :], wrflats[:, :], wrflons[:, :])
+
+ if(nT == 0):
+ rdata2[:, :] = do_regrid(obsdata[:, :], obslats[:, :], obslons[:, :], wrflats[:, :], wrflons[:, :])
+
+
+ # Regrid model data onto obs grid
+ if(regrid_choice == '1'):
+ ndims = len(wrfdata.shape)
+ if(ndims == 3):
+ newshape = [wrfdata.shape[0], obsdata.shape[1], obsdata.shape[2]]
+ nT = obsdata.shape[0]
+
+ if(ndims == 2):
+ newshape = [obsdata.shape[0], obsdata.shape[1]]
+ nT = 0
+
+ rdata2 = obsdata
+ lats, lons = obslats, obslons
+
+ tmp = np.zeros(newshape)
+ rdata = ma.MaskedArray(tmp, mask=(tmp == 0))
+ tmp = 0 # free up some memory
+
+ rdata[:] = 0.0
+
+ if(nT > 0):
+ for t in np.arange(nT):
+ rdata[t, :, :] = do_regrid(wrfdata[t, :, :], wrflats[:, :], wrflons[:, :], obslats[:, :], obslons[:, :])
+
+ if(nT == 0):
+ rdata[:, :] = do_regrid(wrfdata[:, :], wrflats[:, :], wrflons[:, :], obslats[:, :], obslons[:, :])
+
+
+ return rdata, rdata2, lats, lons
+
+def extract_sub_time_selection(allTimes, subTimes, data):
+ '''
+ Routine to extract a sub-selection of times from a data array.
+
+ Example::
+ Suppose a data array has 30 time values for daily data for a whole month,
+ but you only want the data from the 5th-15th of the month.
+
+ Input::
+ allTimes - list of datetimes describing what times the data in the data array correspond to
+ subTimes - the times you want to extract data from
+ data - the data array
+
+ Output::
+ subdata - subselection of data array
+
+ '''
+ # Create new array to store the subselection
+ subdata = np.zeros([len(subTimes), data.shape[1], data.shape[2]])
+
+ # Loop over all Times and when it is a member of the required subselection, copy the data across
+ i = 0 # counter for allTimes
+ subi = 0 # counter for subTimes
+ for t in allTimes:
+ if(np.setmember1d(allTimes, subTimes)):
+ subdata[subi, :, :] = data[i, :, :]
+ subi += 1
+ i += 1
+
+ return subdata
+
+def calc_average_on_new_time_unit_K(data, dateList, unit):
+ '''
+ Routine to calculate averages on longer time units than the data exists on.
+ e.g. if the data is 6-hourly, calculate daily, or monthly means.
+
+ Input:
+ data - data values
+ dateList - list of python datetime structures corresponding to data values
+ unit - string describing time unit to average onto
+ e.g. 'monthly', 'daily', 'pentad','annual','decadal'
+ Output:
+ meanstorem - numpy masked array of data values meaned over required time period
+ newTimesList - a list of python datetime objects representing the data in the new averagin period
+ NB. currently set to beginning of averaging period,
+ i.e. mean Jan 1st - Jan 31st -> represented as Jan 1st, 00Z.
+ '''
+
+ # Check if the user-selected temporal grid is valid. If not, EXIT
+ acceptable = (unit=='full')|(unit=='annual')|(unit=='monthly')|(unit=='daily')|(unit=='pentad')
+ if not acceptable:
+ print 'Error: unknown unit type selected for time averaging: EXIT'
+ return -1,-1,-1,-1
+
+ # Calculate arrays of: annual timeseries: year (2007,2007),
+ # monthly time series: year-month (200701,200702),
+ # daily timeseries: year-month-day (20070101,20070102)
+ # depending on user-selected averaging period.
+
+ # Year list
+ if unit=='annual':
+ timeunits = []
+ for i in np.arange(len(dateList)):
+ timeunits.append(str(dateList[i].year))
+ timeunits = np.array(timeunits, dtype=int)
+
+ # YearMonth format list
+ if unit=='monthly':
+ timeunits = []
+ for i in np.arange(len(dateList)):
+ timeunits.append(str(dateList[i].year) + str("%02d" % dateList[i].month))
+ timeunits = np.array(timeunits,dtype=int)
+
+ # YearMonthDay format list
+ if unit=='daily':
+ timeunits = []
+ for i in np.arange(len(dateList)):
+ timeunits.append(str(dateList[i].year) + str("%02d" % dateList[i].month) + str("%02d" % dateList[i].day))
+ timeunits = np.array(timeunits,dtype=int)
+
+ # TODO: add pentad setting using Julian days?
+
+ # Full list: a special case
+ if unit == 'full':
+ comment='Calculating means data over the entire time range: i.e., annual-mean climatology'
+ timeunits = []
+ for i in np.arange(len(dateList)):
+ timeunits.append(999) # i.e. we just want the same value for all times.
+ timeunits = np.array(timeunits, dtype=int)
+
+ # empty list to store new times
+ newTimesList = []
+
+ # Decide whether or not you need to do any time averaging.
+ # i.e. if data are already on required time unit then just pass data through and
+ # calculate and return representative datetimes.
+ processing_required = True
+ if len(timeunits)==(len(np.unique(timeunits))):
+ processing_required = False
+
+ # 1D data arrays, i.e. time series
+ if data.ndim==1:
+ # Create array to store the resulting data
+ meanstore = np.zeros(len(np.unique(timeunits)))
+
+ # Calculate the means across each unique time unit
+ i=0
+ for myunit in np.unique(timeunits):
+ if processing_required:
+ datam=ma.masked_array(data,timeunits!=myunit)
+ meanstore[i] = ma.average(datam)
+
+ # construct new times list
+ smyunit = str(myunit)
+ if len(smyunit)==4: # YYYY
+ yyyy = int(myunit[0:4])
+ mm = 1
+ dd = 1
+ if len(smyunit)==6: # YYYYMM
+ yyyy = int(smyunit[0:4])
+ mm = int(smyunit[4:6])
+ dd = 1
+ if len(smyunit)==8: # YYYYMMDD
+ yyyy = int(smyunit[0:4])
+ mm = int(smyunit[4:6])
+ dd = int(smyunit[6:8])
+ if len(smyunit)==3: # Full time range
+ # Need to set an appropriate time representing the mid-point of the entire time span
+ dt = dateList[-1]-dateList[0]
+ halfway = dateList[0]+(dt/2)
+ yyyy = int(halfway.year)
+ mm = int(halfway.month)
+ dd = int(halfway.day)
+
+ newTimesList.append(datetime.datetime(yyyy,mm,dd,0,0,0,0))
+ i = i+1
+
+ # 3D data arrays
+ if data.ndim==3:
+ # datamask = create_mask_using_threshold(data,threshold=0.75)
+ # Create array to store the resulting data
+ meanstore = np.zeros([len(np.unique(timeunits)),data.shape[1],data.shape[2]])
+
+ # Calculate the means across each unique time unit
+ i=0
+ datamask_store = []
+ for myunit in np.unique(timeunits):
+ if processing_required:
+ mask = np.zeros_like(data)
+ mask[timeunits!=myunit,:,:] = 1.0
+ # Calculate missing data mask within each time unit...
+ datamask_at_this_timeunit = np.zeros_like(data)
+ datamask_at_this_timeunit[:]= create_mask_using_threshold(data[timeunits==myunit,:,:],threshold=0.75)
+ # Store results for masking later
+ datamask_store.append(datamask_at_this_timeunit[0])
+ # Calculate means for each pixel in this time unit, ignoring missing data (using masked array).
+ datam = ma.masked_array(data,np.logical_or(mask,datamask_at_this_timeunit))
+ meanstore[i,:,:] = ma.average(datam,axis=0)
+ # construct new times list
+ smyunit = str(myunit)
+ if len(smyunit)==4: # YYYY
+ yyyy = int(smyunit[0:4])
+ mm = 1
+ dd = 1
+ if len(smyunit)==6: # YYYYMM
+ yyyy = int(smyunit[0:4])
+ mm = int(smyunit[4:6])
+ dd = 1
+ if len(smyunit)==8: # YYYYMMDD
+ yyyy = int(smyunit[0:4])
+ mm = int(smyunit[4:6])
+ dd = int(smyunit[6:8])
+ if len(smyunit)==3: # Full time range
+ # Need to set an appropriate time representing the mid-point of the entire time span
+ dt = dateList[-1]-dateList[0]
+ halfway = dateList[0]+(dt/2)
+ yyyy = int(halfway.year)
+ mm = int(halfway.month)
+ dd = int(halfway.day)
+ newTimesList.append(datetime.datetime(yyyy,mm,dd,0,0,0,0))
+ i += 1
+
+ if not processing_required:
+ meanstorem = data
+
+ if processing_required:
+ # Create masked array (using missing data mask defined above)
+ datamask_store = np.array(datamask_store)
+ meanstorem = ma.masked_array(meanstore, datamask_store)
+
+ return meanstorem,newTimesList
+
+def decode_model_timesK(ifile,timeVarName,file_type):
+ #################################################################################################
+ # Convert model times ('hours since 1900...', 'days since ...') into a python datetime structure
+ # Input:
+ # filelist - list of model files
+ # timeVarName - name of the time variable in the model files
+ # Output:
+ # times - list of python datetime objects describing model data times
+ # Peter Lean February 2011
+ #################################################################################################
+ f = Nio.open_file(ifile,mode='r',options=None,format=file_type)
+ xtimes = f.variables[timeVarName]
+ timeFormat = xtimes.attributes['units']
+ #timeFormat = "days since 1979-01-01 00:00:00"
+ # search to check if 'since' appears in units
+ try:
+ sinceLoc = re.search('since',timeFormat).end()
+ except:
+ print 'Error decoding model times: time var attributes do not contain "since": Exit'
+ sys.exit()
+ # search for 'seconds','minutes','hours', 'days', 'months', 'years' so know units
+ # TODO: Swap out this section for a set of if...elseif statements
+ units = ''
+ try:
+ _ = re.search('minutes',timeFormat).end()
+ units = 'minutes'
+ except:
+ pass
+ try:
+ _ = re.search('hours',timeFormat).end()
+ units = 'hours'
+ except:
+ pass
+ try:
+ _ = re.search('days',timeFormat).end()
+ units = 'days'
+ except:
+ pass
+ try:
+ _ = re.search('months',timeFormat).end()
+ units = 'months'
+ except:
+ pass
+ try:
+ _ = re.search('years',timeFormat).end()
+ units = 'years'
+ except:
+ pass
+ # cut out base time (the bit following 'since')
+ base_time_string = string.lstrip(timeFormat[sinceLoc:])
+ # decode base time
+ # 9/25/2012: datetime.timedelta has problem with the argument because xtimes is NioVariable.
+ # Soln (J. Kim): use a temp variable ttmp in the for loop, then convert it into an integer xtime.
+ base_time = decodeTimeFromString(base_time_string)
+ times=[]
+ for ttmp in xtimes[:]:
+ xtime = int(ttmp)
+ if(units=='minutes'):
+ dt = datetime.timedelta(minutes=xtime); new_time = base_time + dt
+ if(units=='hours'):
+ dt = datetime.timedelta(hours=xtime); new_time = base_time + dt
+ if(units=='days'):
+ dt = datetime.timedelta(days=xtime); new_time = base_time + dt
+ if(units=='months'): # NB. adding months in python is complicated as month length varies and hence ambigous.
+ # Perform date arithmatic manually
+ # Assumption: the base_date will usually be the first of the month
+ # NB. this method will fail if the base time is on the 29th or higher day of month
+ # -as can't have, e.g. Feb 31st.
+ new_month = int(base_time.month + xtime % 12)
+ new_year = int(math.floor(base_time.year + xtime / 12.))
+ #print type(new_year),type(new_month),type(base_time.day),type(base_time.hour),type(base_time.second)
+ new_time = datetime.datetime(new_year,new_month,base_time.day,base_time.hour,base_time.second,0)
+ if(units=='years'):
+ dt = datetime.timedelta(years=xtime); new_time = base_time + dt
+ times.append(new_time)
+ return times
+
+
+def regrid_in_time(data,dateList,unit):
+ #################################################################################################
+ # Routine to calculate averages on longer time units than the data exists on.
+ # e.g. if the data is 6-hourly, calculate daily, or monthly means.
+ # Input:
+ # data - data values
+ # dateList - list of python datetime structures corresponding to data values
+ # unit - string describing time unit to average onto
+ # e.g. 'monthly', 'daily', 'pentad','annual','decadal'
+ # Output:
+ # meanstorem - numpy masked array of data values meaned over required time period
+ # newTimesList - a list of python datetime objects representing the data in the new averagin period
+ # NB. currently set to beginning of averaging period,
+ # i.e. mean Jan 1st - Jan 31st -> represented as Jan 1st, 00Z.
+ # ..............................
+ # Jinwon Kim, Sept 30, 2012
+ # Created from calc_average_on_new_time_unit_K, Peter's original, with the modification below:
+ # Instead of masked array used by Peter, use wh to defined data within the averaging range.
+ #################################################################################################
+
+ print '*** Begin calc_average_on_new_time_unit_KK ***'
+ # Check if the user-selected temporal grid is valid. If not, EXIT
+ acceptable = (unit=='full')|(unit=='annual')|(unit=='monthly')|(unit=='daily')|(unit=='pentad')
+ if not acceptable:
+ print 'Error: unknown unit type selected for time averaging: EXIT'; return -1,-1,-1,-1
+
+ # Calculate time arrays of: annual (yyyy, [2007]), monthly (yyyymm, [200701,200702]), daily (yyyymmdd, [20070101,20070102])
+ # "%02d" is similar to i2.2 in Fortran
+ if unit=='annual': # YYYY
+ timeunits = []
+ for i in np.arange(len(dateList)):
+ timeunits.append(str(dateList[i].year))
+ elif unit=='monthly': # YYYYMM
+ timeunits = []
+ for i in np.arange(len(dateList)):
+ timeunits.append(str(dateList[i].year) + str("%02d" % dateList[i].month))
+ elif unit=='daily': # YYYYMMDD
+ timeunits = []
+ for i in np.arange(len(dateList)):
+ timeunits.append(str(dateList[i].year) + str("%02d" % dateList[i].month) + str("%02d" % dateList[i].day))
+ elif unit=='full': # Full list: a special case
+ comment='Calculating means data over the entire time range: i.e., annual-mean climatology'
+ timeunits = []
+ for i in np.arange(len(dateList)):
+ timeunits.append(999) # i.e. we just want the same value for all times.
+ timeunits = np.array(timeunits,dtype=int)
+ print 'timeRegridOption= ',unit#'; output timeunits= ',timeunits
+ #print 'timeRegridOption= ',unit,'; output timeunits= ',timeunits
+
+ # empty list to store time levels after temporal regridding
+ newTimesList = []
+
+ # Decide whether or not you need to do any time averaging.
+ # i.e. if data are already on required time unit then just pass data through and calculate and return representative datetimes.
+ processing_required = True
+ if len(timeunits)==(len(np.unique(timeunits))):
+ processing_required = False
+ print 'processing_required= ',processing_required,': input time steps= ',len(timeunits),': regridded output time steps = ',len(np.unique(timeunits))
+ #print 'np.unique(timeunits): ',np.unique(timeunits)
+ print 'data.ndim= ',data.ndim
+
+ if data.ndim==1: # 1D data arrays, i.e. 1D time series
+ # Create array to store the temporally regridded data
+ meanstore = np.zeros(len(np.unique(timeunits)))
+ # Calculate the means across each unique time unit
+ i=0
+ for myunit in np.unique(timeunits):
+ if processing_required:
+ wh = timeunits==myunit
+ datam = data[wh]
+ meanstore[i] = ma.average(datam)
+ # construct new times list
+ smyunit = str(myunit)
+ if len(smyunit)==4: # YYYY
+ yyyy = int(myunit[0:4])
+ mm = 1
+ dd = 1
+ if len(smyunit)==6: # YYYYMM
+ yyyy = int(smyunit[0:4])
+ mm = int(smyunit[4:6])
+ dd = 1
+ if len(smyunit)==8: # YYYYMMDD
+ yyyy = int(smyunit[0:4])
+ mm = int(smyunit[4:6])
+ dd = int(smyunit[6:8])
+ if len(smyunit)==3: # Full time range
+ # Need to set an appropriate time representing the mid-point of the entire time span
+ dt = dateList[-1]-dateList[0]
+ halfway = dateList[0]+(dt/2)
+ yyyy = int(halfway.year)
+ mm = int(halfway.month)
+ dd = int(halfway.day)
+ newTimesList.append(datetime.datetime(yyyy,mm,dd,0,0,0,0))
+ i = i+1
+
+ elif data.ndim==3: # 3D data arrays, i.e. 2D time series
+ # Create array to store the resulting data
+ meanstore = np.zeros([len(np.unique(timeunits)),data.shape[1],data.shape[2]])
+ # Calculate the means across each unique time unit
+ i=0
+ datamask_store = []
+ for myunit in np.unique(timeunits):
+ if processing_required:
+ wh = timeunits==myunit
+ datam = data[wh,:,:]
+ meanstore[i,:,:] = ma.average(datam,axis=0)
+ # construct new times list
+ smyunit = str(myunit)
+ if len(smyunit)==4: # YYYY
+ yyyy = int(smyunit[0:4])
+ mm = 1
+ dd = 1
+ if len(smyunit)==6: # YYYYMM
+ yyyy = int(smyunit[0:4])
+ mm = int(smyunit[4:6])
+ dd = 1
+ if len(smyunit)==8: # YYYYMMDD
+ yyyy = int(smyunit[0:4])
+ mm = int(smyunit[4:6])
+ dd = int(smyunit[6:8])
+ if len(smyunit)==3: # Full time range
+ # Need to set an appropriate time representing the mid-point of the entire time span
+ dt = dateList[-1]-dateList[0]
+ halfway = dateList[0]+(dt/2)
+ yyyy = int(halfway.year)
+ mm = int(halfway.month)
+ dd = int(halfway.day)
+ newTimesList.append(datetime.datetime(yyyy,mm,dd,0,0,0,0))
+ i += 1
+
+ if not processing_required:
+ meanstorem = data
+ elif processing_required:
+ meanstorem = meanstore
+
+ print '*** End calc_average_on_new_time_unit_KK ***'
+ return meanstorem,newTimesList
http://git-wip-us.apache.org/repos/asf/climate/blob/a6aa1cd2/src/main/python/rcmes/toolkit/.svn/text-base/visualize.py.svn-base
----------------------------------------------------------------------
diff --git a/src/main/python/rcmes/toolkit/.svn/text-base/visualize.py.svn-base b/src/main/python/rcmes/toolkit/.svn/text-base/visualize.py.svn-base
new file mode 100755
index 0000000..20e4447
--- /dev/null
+++ b/src/main/python/rcmes/toolkit/.svn/text-base/visualize.py.svn-base
@@ -0,0 +1 @@
+"""Collection of functions to visualize the data analysis output artifacts"""
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/climate/blob/a6aa1cd2/src/main/python/rcmes/toolkit/__init__.py
----------------------------------------------------------------------
diff --git a/src/main/python/rcmes/toolkit/__init__.py b/src/main/python/rcmes/toolkit/__init__.py
new file mode 100755
index 0000000..11548e6
--- /dev/null
+++ b/src/main/python/rcmes/toolkit/__init__.py
@@ -0,0 +1,2 @@
+"""The toolkit Package is a collection of modules that provide a set of tools
+that can be used to process, analyze and plot the analysis results."""
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/climate/blob/a6aa1cd2/src/main/python/rcmes/toolkit/__init__.pyc
----------------------------------------------------------------------
diff --git a/src/main/python/rcmes/toolkit/__init__.pyc b/src/main/python/rcmes/toolkit/__init__.pyc
new file mode 100644
index 0000000..9b1707b
Binary files /dev/null and b/src/main/python/rcmes/toolkit/__init__.pyc differ
http://git-wip-us.apache.org/repos/asf/climate/blob/a6aa1cd2/src/main/python/rcmes/toolkit/do_data_prep.py
----------------------------------------------------------------------
diff --git a/src/main/python/rcmes/toolkit/do_data_prep.py b/src/main/python/rcmes/toolkit/do_data_prep.py
new file mode 100755
index 0000000..db9ea39
--- /dev/null
+++ b/src/main/python/rcmes/toolkit/do_data_prep.py
@@ -0,0 +1,461 @@
+"""Prepare Datasets both model and observations for analysis using metrics"""
+
+
+import numpy as np
+import numpy.ma as ma
+import sys, os
+
+from storage import db, files
+import process
+from utils import misc
+import scipy.interpolate
+
+# TODO: swap gridBox for Domain
+def prep_data(settings,obsSource,obsDatasetList,obsList,obsVarName,obsLonName,obsLatName,obsTimeName,obsTimestep,gridBox,modelList):
+ """
+
+ returns numOBSs,numMDLs,nT,ngrdY,ngrdX,Times,lons,lats,obsData,modelData,obsList
+
+ numOBSs - number of Observational Datasets. Really need to look at using len(obsDatasetList) instead
+ numMDLs - number of Models used. Again should use the len(modelList) instead
+ nT - Time value count after temporal regridding. Should use length of the time axis for a given dataset
+ ngrdY - size of the Y-Axis grid after spatial regridding
+ ngrdX - size of the X-Axis grid after spatial regridding
+ Times - list of python datetime objects the represent the list of time to be used in further calculations
+ lons -
+ lats -
+ obsData -
+ modelData -
+ obsList -
+
+ # 5/28/2013: Add 'obsSource' to indicate the source of the reference data file(s)
+ """
+
+ # TODO: Stop the object Deserialization and work on refactoring the core code here
+ cachedir = settings.cacheDir
+ workdir = settings.workDir
+ variableType = settings.variableType
+ variableType = variableType.lower()
+
+ # Use list comprehensions to deconstruct obsDatasetList
+ # ['TRMM_pr_mon', 'CRU3.1_pr'] Basically a list of Dataset NAME +'_' + parameter name - THE 'CRU*' one triggers unit conversion issues later
+ # the plan here is to use the obsDatasetList which contains a longName key we can use.
+ if obsSource == 0: # reference data from RCMED
+ obsList = [str(x['longname']) for x in obsDatasetList]
+ # Also using the obsDatasetList with a key of ['dataset_id']
+ obsDatasetId = [str(x['dataset_id']) for x in obsDatasetList]
+ # obsDatasetList ['paramter_id'] list
+ obsParameterId = [str(x['parameter_id']) for x in obsDatasetList]
+ obsTimestep = [str(x['timestep']) for x in obsDatasetList]
+ elif obsSource == 1: # reference data from users' disk
+ # in this case, obsList and obsTimestep have been defined in the calling routine and passed into this routine
+ pass
+ elif obsSource < 0: # no reference data are involved
+ pass
+
+ mdlList = [model.filename for model in modelList]
+
+ # Since all of the model objects in the modelList have the same "VarName" and "variableType", merely pull these from modelList[0]
+ modelTimeVarName = modelList[0].timeVariable
+ modelLatVarName = modelList[0].latVariable
+ modelLonVarName = modelList[0].lonVariable
+ modelVarName = modelList[0].varName
+ regridOption = settings.spatialGrid
+ timeRegridOption = settings.temporalGrid
+ print 'model parameter names ',modelVarName,modelTimeVarName,modelLatVarName,modelLonVarName
+
+ """
+ Routine to read-in and re-grid both obs and mdl datasets.
+ Processes both single and multiple files of obs and mdl or combinations in a general way.
+ i) retrieve observations from the database
+ ii) load in model data
+ iii) temporal regridding
+ iv) spatial regridding
+ v) area-averaging
+ Input:
+ cachedir - string describing directory path
+ workdir - string describing directory path
+ obsList - string describing the observation data files
+ obsDatasetId - int, db dataset id
+ obsParameterId - int, db parameter id
+ latMin, latMax, lonMin, lonMax, dLat, dLon, naLats, naLons: define the evaluation/analysis domain/grid system
+ latMin - float
+ latMax - float
+ lonMin - float
+ lonMax - float
+ dLat - float
+ dLon - float
+ naLats - integer
+ naLons - integer
+ mdlList - string describing model file name + path
+ modelVarName - string describing name of variable to evaluate (as written in model file)
+ variableType - string
+ modelTimeVarName - string describing name of time variable in model file
+ modelLatVarName - string describing name of latitude variable in model file
+ modelLonVarName - string describing name of longitude variable in model file
+ regridOption - string: 'obs'|'model'|'user'
+ timeRegridOption -string: 'full'|'annual'|'monthly'|'daily'
+ maskOption - Boolean
+
+ # TODO: This isn't true in the current codebase.
+ Instead the SubRegion's are being used. You can see that these values are not
+ being used in the code, at least they are not being passed in from the function
+
+ maskLatMin - float (only used if maskOption=1)
+ maskLatMax - float (only used if maskOption=1)
+ maskLonMin - float (only used if maskOption=1)
+ maskLonMax - float (only used if maskOption=1)
+ Output: image files of plots + possibly data
+ Jinwon Kim, 7/11/2012
+ """
+
+
+ # check the number of obs & model data files
+ numOBSs = len(obsList)
+ numMDLs = len(mdlList)
+
+ # assign parameters that must be preserved throughout the process
+
+ print 'start & end time = ', settings.startDate, settings.endDate
+ yymm0 = settings.startDate.strftime("%Y%m")
+ yymm1 = settings.endDate.strftime("%Y%m")
+ print 'start & end eval period = ', yymm0, yymm1
+
+
+ #TODO: Wrap in try except blocks instead
+ # 2b expanded to encompass mdl-only or obs-only cases (post 2.1 item)
+ if numMDLs < 1:
+ print 'No input model data file. EXIT'
+ sys.exit()
+ if numOBSs < 1:
+ print 'No input observation data file. EXIT'
+ sys.exit()
+
+ ## Part 1: retrieve observation data from the database and regrid them
+ ## NB. automatically uses local cache if already retrieved.
+
+ # preparation for spatial re-gridding: define the size of horizontal array of the target interpolation grid system (ngrdX and ngrdY)
+ print 'regridOption in prep_data= ', regridOption
+ if regridOption == 'model':
+ ifile = mdlList[0]
+ typeF = 'nc'
+ #lats, lons, mTimes = files.read_lolaT_from_file(ifile, modelLatVarName, modelLonVarName, modelTimeVarName, typeF)
+ lats, lons, mTimes = files.read_data_from_one_file(ifile, modelVarName,
+ modelLatVarName,
+ modelLonVarName,
+ modelTimeVarName,
+ typeF)[:3]
+ modelObject = modelList[0]
+ latMin = modelObject.latMin
+ latMax = modelObject.latMax
+ lonMin = modelObject.lonMin
+ lonMax = modelObject.lonMax
+ elif regridOption == 'user':
+ # Use the GridBox Object
+ latMin = gridBox.latMin
+ latMax = gridBox.latMax
+ lonMin = gridBox.lonMin
+ lonMax = gridBox.lonMax
+ naLats = gridBox.latCount
+ naLons = gridBox.lonCount
+ dLat = gridBox.latStep
+ dLon = gridBox.lonStep
+ lat = np.arange(naLats) * dLat + latMin
+ lon = np.arange(naLons) * dLon + lonMin
+ lons, lats = np.meshgrid(lon, lat)
+ lon = 0.
+ lat = 0.
+ else:
+ print "INVALID REGRID OPTION USED"
+ sys.exit()
+
+ ngrdY = lats.shape[0]
+ ngrdX = lats.shape[1]
+
+ # All user-provided reference data are to be in netCDF format
+ if obsSource == 1:
+ typeF = 'nc'
+ regObsData = []
+ print ''
+ regularGrid = raw_input('OBS data are on regular grids? [y/n] \n>')
+
+ for n in np.arange(numOBSs):
+
+ # read-in obs data and first do spatial regridding
+ if obsSource == 0: # reference data from RCMED
+ oLats, oLons, _, oTimes, oData = db.extractData(obsDatasetId[n],
+ obsParameterId[n],
+ latMin, latMax,
+ lonMin, lonMax,
+ settings.startDate, settings.endDate,
+ cachedir, obsTimestep[n])
+ # TODO: modify this if block with new metadata usage.
+ if (variableType == 'precipitation') and (obsList[n][0:3] == 'CRU'):
+ oData = 86400.0 * oData
+
+ elif obsSource == 1: # reference data from users' own file
+ ifile = obsDatasetList[n]
+ oLats, oLons, oTimes, oData, ovUnit = files.read_data_from_one_file(ifile,
+ obsVarName,
+ obsLatName,
+ obsLonName,
+ obsTimeName,
+ typeF)
+ print 'ovUnit=',ovUnit
+ if (variableType == 'precipitation') or (variableType == 'evaporation') or (variableType == 'runoff'):
+ if ((ovUnit=='KG M-2 S-1') or (ovUnit=='kg m-2 s-1') or (ovUnit == 'MM S-1') or (ovUnit == 'mm s-1')):
+ print 'convert model variable unit from mm/s to mm/day'
+ oData=86400.*oData
+ # extract the reference data for the specified period
+ obsT = []
+ oStp = len(oTimes)
+ for i in np.arange(oStp):
+ obsT.append(oTimes[i].strftime("%Y%m"))
+ wh = (np.array(obsT) >= yymm0) & (np.array(obsT) <= yymm1)
+ oTimes = list(np.array(oTimes)[wh])
+ oData = oData[wh,:,:]
+
+ nstOBSs = oData.shape[0] # note that the length of obs data can vary for different obs intervals (e.g., daily vs. monthly)
+ print 'Regrid OBS dataset onto the ', regridOption, ' grid system: ngrdY, ngrdX, nstOBSs= ', ngrdY, ngrdX, nstOBSs
+ print 'For dataset: %s' % obsList[n]
+
+ tmpOBS = ma.zeros((nstOBSs, ngrdY, ngrdX))
+ print 'tmpOBS shape = ', tmpOBS.shape
+ if regularGrid == 'y':
+ for t in np.arange(nstOBSs):
+ tmpOBS[t, :, :] = process.do_regrid(oData[t, :, :], oLats, oLons, lats, lons)
+ else:
+ # new intetpolation scheme "scipy.interpolate.griddata" is used. note that masked values must be specified using "np.nan"
+ # note that using "cubic" option causes problems in interpolation
+ for t in np.arange(nstOBSs):
+ inLats = oLats.flatten()
+ inLons = oLons.flatten()
+ length = len(inLats)
+ inPts = ma.zeros((length, 2))
+ inPts[:,0] = inLats
+ inPts[:,1] = inLons
+ inData = oData[t, :, :].flatten()
+ inData[inData < -999.] = np.nan
+ tmpOBS[t, :, :] = scipy.interpolate.griddata(inPts, inData, (lats, lons), method = 'linear')
+ inLats = 0.
+ inLons = 0.
+ inPts = 0.
+ inData = 0.
+ print 'spatial regridding of the obs data ',n,' is completed'
+
+ # TODO: Not sure this is needed with Python Garbage Collector
+ # The used memory should be freed when the objects are no longer referenced. If this continues to be an issue we may need to look
+ # at using generators where possible.
+ oLats = 0.0
+ oLons = 0.0 # release the memory occupied by the temporary variables oLats and oLons.
+
+ # temporally regrid the spatially regridded obs data
+ oData, newObsTimes = process.calc_average_on_new_time_unit_K(tmpOBS, oTimes, unit=timeRegridOption)
+
+ tmpOBS = 0.0 # release temporary array after regridding
+
+ # check the consistency of temporally regridded obs data
+ if n == 0:
+ oldObsTimes = newObsTimes
+ else:
+ if oldObsTimes != newObsTimes:
+ print 'temporally regridded obs data time levels do not match at ', n - 1, n
+ print '%s Time through Loop' % (n + 1)
+ print 'oldObsTimes Count: %s' % len(oldObsTimes)
+ print 'newObsTimes Count: %s' % len(newObsTimes)
+ # TODO: We need to handle these cases using Try Except Blocks or insert a sys.exit if appropriate
+ sys.exit()
+ else:
+ oldObsTimes = newObsTimes
+ # if everything's fine, append the spatially and temporally regridded data in the obs data array (obsData)
+ regObsData.append(oData)
+
+
+ """ all obs datasets have been read-in and regridded. convert the regridded obs data from 'list' to 'array'
+ also finalize 'obsTimes', the time coordinate values of the regridded obs data.
+ NOTE: using 'list_to_array' assigns values to the missing points; this has become a problem in handling the CRU data.
+ this problem disappears by using 'ma.array'."""
+
+ obsData = ma.array(regObsData)
+ obsTimes = newObsTimes
+ regObsData = 0
+ oldObsTimes = 0
+ nT = len(obsTimes)
+
+ # TODO: Refactor this into a function within the toolkit module
+ # compute the simple multi-obs ensemble if multiple obs are used
+ if numOBSs > 1:
+ print 'numOBSs = ', numOBSs
+ oData = obsData
+ print 'oData shape = ', oData.shape
+ obsData = ma.zeros((numOBSs + 1, nT, ngrdY, ngrdX))
+ print 'obsData shape = ', obsData.shape
+ avg = ma.zeros((nT, ngrdY, ngrdX))
+
+ for i in np.arange(numOBSs):
+ obsData[i, :, :, :] = oData[i, :, :, :]
+ avg[:, :, :] = avg[:, :, :] + oData[i, :, :, :]
+
+ avg = avg / float(numOBSs)
+ obsData[numOBSs, :, :, :] = avg[:, :, :] # store the obs-ensemble data
+ numOBSs = numOBSs + 1 # update the number of obs data to include the obs ensemble
+ obsList.append('ensOBS')
+ print 'OBS regridded: ', obsData.shape
+
+ ## Part 2: load in and regrid model data from file(s)
+ ## NOTE: tthe wo parameters, numMDLs and numMOmx are defined to represent the number of models (w/ all 240 mos) &
+ ## the total number of months, respectively, in later multi-model calculations.
+
+ typeF = 'nc'
+ regridMdlData = []
+ # extract the model names and store them in the list 'mdlList'
+ mdlName = []
+ mdlListReversed=[]
+ if len(mdlList) >1:
+ for element in mdlList:
+ mdlListReversed.append(element[::-1])
+ prefix=os.path.commonprefix(mdlList)
+ postfix=os.path.commonprefix(mdlListReversed)[::-1]
+ for element in mdlList:
+ mdlName.append(element.replace(prefix,'').replace(postfix,''))
+ else:
+ mdlName.append('model')
+
+ print ''
+ regularGrid = raw_input('MDL data are on regular grids? [y/n] \n>')
+
+ for n in np.arange(numMDLs):
+ # read model grid info, then model data
+ ifile = mdlList[n]
+ print 'ifile= ', ifile
+ #modelLats, modelLons, mTimes = files.read_lolaT_from_file(ifile, modelLatVarName, modelLonVarName, modelTimeVarName, typeF)
+ #mTime, mdlDat, mvUnit = files.read_data_from_one_file(ifile, modelVarName, modelTimeVarName, modelLats, typeF)
+ modelLats, modelLons, mTimes, mdlDat, mvUnit = files.read_data_from_one_file(ifile,
+ modelVarName,
+ modelLatVarName,
+ modelLonVarName,
+ modelTimeVarName,
+ typeF)
+ mdlT = []
+ mStep = len(mTimes)
+
+ for i in np.arange(mStep):
+ mdlT.append(mTimes[i].strftime("%Y%m"))
+
+ wh = (np.array(mdlT) >= yymm0) & (np.array(mdlT) <= yymm1)
+ modelTimes = list(np.array(mTimes)[wh])
+ mData = mdlDat[wh, :, :]
+ # convert the units of model water flux variables (precipitation, evaporation, runoff) into "mm/day" as necessary
+ if (variableType == 'precipitation') or (variableType == 'evaporation') or (variableType == 'runoff'):
+ if (mvUnit=='KG M-2 S-1') or (mvUnit=='kg m-2 s-1') or (mvUnit == 'MM S-1') or (mvUnit == 'mm s-1'):
+ print 'convert model variable unit from mm/s to mm/day'
+ mData = 86400. * mData
+ else:
+ pass
+ elif (variableType == 'SWE') or (variableType == 'swe'):
+ if (mvUnit=='m') or (mvUnit=='M') or (mvUnit=='meter') or (mvUnit=='METER'):
+ print 'convert model SWE unit from meters to mm'
+ mData = 1.e3 * mData
+ else:
+ pass
+
+ # determine the dimension size from the model time and latitude data.
+ nT = len(modelTimes)
+
+ # UNUSED VARIABLES - WILL DELETE AFTER TESTING
+ # nmdlY=modelLats.shape[0]
+ # nmdlX=modelLats.shape[1]
+ #print 'nT, ngrdY, ngrdX = ',nT,ngrdY, ngrdX,min(modelTimes),max(modelTimes)
+ print ' The shape of model data to be processed= ', mData.shape, ' for the period ', min(modelTimes), max(modelTimes)
+ # spatial regridding of the modl data
+ tmpMDL = ma.zeros((nT, ngrdY, ngrdX))
+
+ if regridOption != 'model':
+ if regularGrid == 'y':
+ for t in np.arange(nT):
+ tmpMDL[t, :, :] = process.do_regrid(mData[t, :, :], modelLats, modelLons, lats, lons)
+ else:
+ # new intetpolation scheme "scipy.interpolate.griddata" is used. note that masked values must be specified using "np.nan"
+ # note that using "cubic" option causes problems in interpolation
+ for t in np.arange(nstOBSs):
+ inLats = modelLats.flatten()
+ inLons = modelLons.flatten()
+ length = len(inLats)
+ inPts = ma.zeros((length, 2))
+ inPts[:,0] = inLats
+ inPts[:,1] = inLons
+ inData = mData[t, :, :].flatten()
+ inData[inData < -999.] = np.nan
+ tmpMDL[t, :, :] = scipy.interpolate.griddata(inPts, inData, (lats, lons), method = 'linear')
+ inLats = 0.
+ inLons = 0.
+ inPts = 0.
+ inData = 0.
+ else:
+ tmpMDL = mData
+
+ # temporally regrid the model data
+ mData, newMdlTimes = process.regrid_in_time(tmpMDL, modelTimes, unit=timeRegridOption)
+ tmpMDL = 0.0
+
+ # check data consistency for all models
+ if n == 0:
+ oldMdlTimes = newMdlTimes
+ else:
+ if oldMdlTimes != newMdlTimes:
+ print 'temporally regridded mdl data time levels do not match at ', n - 1, n
+ print len(oldMdlTimes), len(newMdlTimes)
+ sys.exit()
+ else:
+ oldMdlTimes = newMdlTimes
+
+ # if everything's fine, append the spatially and temporally regridded data in the obs data array (obsData)
+ regridMdlData.append(mData)
+
+ modelData = ma.array(regridMdlData)
+ modelTimes = newMdlTimes
+ regridMdlData = 0
+ oldMdlTimes = 0
+ newMdlTimes = 0
+ # check consistency between the time levels of the model and obs data
+ # this is the final update of time levels: 'Times' and 'nT'
+ if obsTimes != modelTimes:
+ print 'time levels of the obs and model data are not consistent. EXIT'
+ print 'obsTimes'
+ print obsTimes
+ print 'modelTimes'
+ print modelTimes
+ sys.exit()
+ # 'Times = modelTimes = obsTimes' has been established and modelTimes and obsTimes will not be used hereafter. (de-allocated)
+ Times = modelTimes
+ nT = len(modelTimes)
+ modelTimes = 0
+ obsTimes = 0
+
+ print 'Reading and regridding model data are completed'
+ print 'numMDLs, modelData.shape= ', numMDLs, modelData.shape
+
+ # TODO: Do we need to make this a user supplied flag, or do we just create an ensemble ALWAYS
+ # TODO: Add in Kyo's code here as well
+ # TODO: Commented out until I can talk with Jinwon about this
+ # compute the simple multi-model ensemble if multiple models are evaluated
+ if numMDLs > 1:
+ mdlData=modelData
+ modelData=ma.zeros((numMDLs+1,nT,ngrdY,ngrdX))
+ avg=ma.zeros((nT,ngrdY,ngrdX))
+ for i in np.arange(numMDLs):
+ modelData[i,:,:,:]=mdlData[i,:,:,:]
+ avg[:,:,:]=avg[:,:,:]+mdlData[i,:,:,:]
+ avg=avg/float(numMDLs)
+ modelData[numMDLs,:,:,:]=avg[:,:,:] # store the model-ensemble data
+ # THESE ARE NOT USED. WILL DELETE AFTER TESTING
+ # i0mdl=0
+ # i1mdl=numMDLs
+ numMDLs=numMDLs+1
+ mdlName.append('ensMDL')
+ print 'Eval mdl-mean timeseries for the obs periods: modelData.shape= ',modelData.shape
+ # GOODALE: This ensemble code should be refactored into process.py module since it's purpose is data processing
+
+ # Processing complete
+ print 'data_prep is completed: both obs and mdl data are re-gridded to a common analysis grid'
+ return numOBSs, numMDLs, nT, ngrdY, ngrdX, Times, lons, lats, obsData, modelData, obsList, mdlName, variableType
http://git-wip-us.apache.org/repos/asf/climate/blob/a6aa1cd2/src/main/python/rcmes/toolkit/do_data_prep.pyc
----------------------------------------------------------------------
diff --git a/src/main/python/rcmes/toolkit/do_data_prep.pyc b/src/main/python/rcmes/toolkit/do_data_prep.pyc
new file mode 100644
index 0000000..d2b708c
Binary files /dev/null and b/src/main/python/rcmes/toolkit/do_data_prep.pyc differ