Posted to commits@climate.apache.org by jo...@apache.org on 2014/05/09 04:03:50 UTC

[41/51] [abbrv] [partial] Adding Jinwon's custom RCMET

http://git-wip-us.apache.org/repos/asf/climate/blob/a6aa1cd2/src/main/python/rcmes/toolkit/.svn/text-base/process.py.svn-base
----------------------------------------------------------------------
diff --git a/src/main/python/rcmes/toolkit/.svn/text-base/process.py.svn-base b/src/main/python/rcmes/toolkit/.svn/text-base/process.py.svn-base
new file mode 100755
index 0000000..b5dde08
--- /dev/null
+++ b/src/main/python/rcmes/toolkit/.svn/text-base/process.py.svn-base
@@ -0,0 +1,1263 @@
+"""Collection of functions that process numpy arrays of varying shapes.  
+Functions can range from subsetting to regridding"""
+
+# Python Standard Library Imports
+import math
+import datetime
+import re
+import string
+import sys
+import time
+
+# 3rd Party Imports
+import Nio
+import numpy as np
+import numpy.ma as ma
+from scipy.ndimage import map_coordinates
+
+
+def extract_subregion_from_data_array(data, lats, lons, latmin, latmax, lonmin, lonmax):
+    '''
+     Extract a sub-region from a data array.
+
+     Example::
+       A user may load a global model file but only want to examine data over
+       North America. This function extracts a sub-domain from the original
+       data. The defined sub-region must be a regular lat/lon bounding box,
+       but the model data may be on a non-regular grid (e.g. rotated, or
+       Gaussian grid layout). Data are kept on the original model grid and a
+       rectangular (in model-space) region is extracted which contains the
+       rectangular (in lat/lon space) user-supplied region.
+
+     INPUT::
+       data - 3d masked data array
+       lats - 2d array of latitudes corresponding to data array
+       lons - 2d array of longitudes corresponding to data array
+       latmin, latmax, lonmin, lonmax - bounding box of required region to extract
+
+     OUTPUT::
+       data2 - subset of original data array containing only data from required subregion
+       lats2 - subset of original latitude data array
+       lons2 - subset of original longitude data array
+
+    '''
+
+    # Mask model data array to find grid points inside the supplied bounding box
+    whlat = (lats > latmin) & (lats < latmax)
+    whlon = (lons > lonmin) & (lons < lonmax)
+    wh = whlat & whlon
+
+    # Find required matrix indices describing the limits of the regular lat/lon bounding box
+    jidx, iidx = np.where(wh)
+    jmin, jmax = jidx.min(), jidx.max()
+    imin, imax = iidx.min(), iidx.max()
+
+    # Cut out the sub-region from the data array
+    #  NB. slice ends are jmax+1/imax+1 so the last in-box row and column are kept
+    data2 = data[:, jmin:jmax + 1, imin:imax + 1]
+
+    # Cut out sub-region from lats,lons arrays
+    lats2 = lats[jmin:jmax + 1, imin:imax + 1]
+    lons2 = lons[jmin:jmax + 1, imin:imax + 1]
+
+    return data2, lats2, lons2
+
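+# Illustrative usage of extract_subregion_from_data_array (a sketch, not part
+# of the original toolkit; the grid and box below are invented):
+#
+#   lons2d, lats2d = np.meshgrid(np.arange(0., 360., 2.5), np.arange(-90., 90., 2.5))
+#   data = ma.masked_array(np.zeros((12,) + lats2d.shape))    # 12 times, global grid
+#   sub, sublats, sublons = extract_subregion_from_data_array(
+#       data, lats2d, lons2d, 25., 50., 235., 295.)           # North America box
+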
+def calc_area_mean(data, lats, lons, mymask=None):
+    '''
+     Calculate the area average of data in a masked array
+     
+     INPUT::
+         data:  a masked array of data (NB. only data from one time expected to be passed at once)
+         lats:  2d array of regularly gridded latitudes
+         lons:  2d array of regularly gridded longitudes
+         mymask:  (optional) defines spatial region to do averaging over
+     
+     OUTPUT::
+         area_mean: a value for the mean inside the area
+
+    '''
+
+    # If mask not passed in, then set mask to cover whole data domain
+    if mymask is None:
+       mymask = np.empty(data.shape)
+       mymask[:] = False  # NB. mask means (don't show), so False everywhere means use everything.
+   
+    # Dimension check on lats, lons
+    #  Sometimes these arrays are 3d, sometimes 2d, sometimes 1d
+    #  This bit of code just converts to the required 2d array shape
+    if(len(lats.shape) == 3):
+       lats = lats[0, :, :]
+
+    if(len(lons.shape) == 3):
+       lons = lons[0, :, :]
+
+    if(np.logical_and(len(lats.shape) == 1, len(lons.shape) == 1)):
+       lons, lats = np.meshgrid(lons, lats)
+ 
+    # Calculate grid length (assuming regular lat/lon grid)
+    dlat = lats[1, 0] - lats[0, 0]
+    dlon = lons[0, 1] - lons[0, 0]
+
+    # Calculate weights for each grid box
+    myweights = calc_area_in_grid_box(lats, dlat, dlon)
+
+    # Create a new masked array covering just user selected area (defined in mymask)
+    #   NB. this preserves missing data points in the observations data
+    subdata = ma.masked_array(data, mask=mymask)
+ 
+    if(myweights.shape != subdata.shape):
+       myweights.resize(subdata.shape)
+       myweights[1:, :] = myweights[0, :]
+
+    # Calculate weighted mean using ma.average (which takes weights)
+    area_mean = ma.average(subdata, weights=myweights)
+ 
+    return area_mean
+
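+# Illustrative usage of calc_area_mean (a sketch; values invented): an
+# area-weighted mean of a single 10x10 time slice on a regular 1-degree grid:
+#
+#   lons, lats = np.meshgrid(np.arange(0., 10.), np.arange(40., 50.))
+#   field = ma.masked_array(np.random.rand(10, 10))
+#   avg = calc_area_mean(field, lats, lons)
+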
+def calc_area_in_grid_box(latitude, dlat, dlon):
+    '''
+     Calculate area of regular lat-lon grid box
+     
+     INPUT::
+        latitude: latitude of grid box (degrees)
+        dlat:     grid length in latitude direction (degrees)
+        dlon:     grid length in longitude direction (degrees)
+        
+     OUTPUT::
+        A:        area of the grid box (m^2, given R in metres)
+
+    '''
+
+    R = 6371000  # radius of Earth in metres
+    C = 2 * math.pi * R
+
+    latitude = np.radians(latitude)
+
+    A = (dlon * (C / 360.)) * (dlat * (C / 360.) * np.cos(latitude))
+
+    return A
+
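+# Quick sanity check (illustrative): a 1-degree x 1-degree box at the equator
+# is roughly 111 km x 111 km, since C/360 ~ 111.2 km per degree and cos(0) = 1:
+#
+#   A = calc_area_in_grid_box(np.array(0.0), 1.0, 1.0)   # ~1.24e10 m^2
+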
+def do_regrid(q, lat, lon, lat2, lon2, order=1, mdi= -999999999):
+    '''
+     Perform regridding from one set of lat,lon values onto a new set (lat2,lon2)
+    
+     Input::
+         q          - the variable to be regridded
+         lat,lon    - original co-ordinates corresponding to q values
+         lat2,lon2  - new set of latitudes and longitudes that you want to regrid q onto 
+         order      - (optional) interpolation order 1=bi-linear, 3=cubic spline
+         mdi        - (optional) fill value for missing data (used in creation of masked array)
+      
+     Output::
+         q2  - q regridded onto the new set of lat2,lon2 
+    
+    '''
+
+    nlat = q.shape[0]
+    nlon = q.shape[1]
+
+    nlat2 = lat2.shape[0]
+    nlon2 = lon2.shape[1]
+
+    # To make our lives easier down the road, let's 
+    # turn these into arrays of x & y coords
+    loni = lon2.ravel()
+    lati = lat2.ravel()
+
+    loni = loni.copy() # copy so the in-place edits below do not modify the caller's lon2 (ravel can return a view)
+    lati = lati.copy()
+
+    # Now, we'll set points outside the boundaries to lie along an edge
+    loni[loni > lon.max()] = lon.max()
+    loni[loni < lon.min()] = lon.min()
+    
+    # To deal with the "hard" break, we'll have to treat y differently,
+    # so we're just setting the min here...
+    lati[lati > lat.max()] = lat.max()
+    lati[lati < lat.min()] = lat.min()
+    
+    
+    # We need to convert these to (float) indices
+    #   (xi should range from 0 to (nx - 1), etc)
+    loni = (nlon - 1) * (loni - lon.min()) / (lon.max() - lon.min())
+    
+    # Deal with the "hard" break in the y-direction
+    lati = (nlat - 1) * (lati - lat.min()) / (lat.max() - lat.min())
+    
+    # Notes on dealing with MDI when regridding data.
+    #  Method adopted here:
+    #    Use bilinear interpolation of data by default (but user can specify other order using order=... in call)
+    #    Perform bilinear interpolation of data, and of mask.
+    #    To be conservative, new grid point which contained some missing data on the old grid is set to missing data.
+    #            -this is achieved by looking for any non-zero interpolated mask values.
+    #    To avoid issues with bilinear interpolation producing strong gradients leading into the MDI,
+    #     set values at MDI points to mean data value so little gradient visible = not ideal, but acceptable for now.
+    
+    # Set values in MDI so that similar to surroundings so don't produce large gradients when interpolating
+    # Preserve MDI mask, by only changing data part of masked array object.
+    for shift in (-1, 1):
+        for axis in (0, 1):
+            q_shifted = np.roll(q, shift=shift, axis=axis)
+            idx = ~q_shifted.mask * q.mask
+            q.data[idx] = q_shifted[idx]
+
+    # Now we actually interpolate
+    # map_coordinates does cubic interpolation by default, 
+    # use "order=1" to perform bilinear interpolation instead...
+    q2 = map_coordinates(q, [lati, loni], order=order)
+    q2 = q2.reshape([nlat2, nlon2])
+
+    # Set values to missing data outside of original domain
+    q2 = ma.masked_array(q2, mask=np.logical_or(np.logical_or(lat2 >= lat.max(), 
+                                                              lat2 <= lat.min()), 
+                                                np.logical_or(lon2 <= lon.min(), 
+                                                              lon2 >= lon.max())))
+    
+    # Interpolate a 0/1 missing-data field with the same interpolation order;
+    # any non-zero value in the result marks a new grid point contaminated by
+    # missing data, so mask these.
+    qmdi = np.zeros_like(q)
+    qmdi[q.mask] = 1.
+    qmdi[~q.mask] = 0.
+    qmdi_r = map_coordinates(qmdi, [lati, loni], order=order)
+    qmdi_r = qmdi_r.reshape([nlat2, nlon2])
+    mdimask = (qmdi_r != 0.0)
+    
+    # Combine missing data mask with the outside-domain mask defined above.
+    q2.mask = np.logical_or(mdimask, q2.mask)
+
+    return q2
+
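+# Illustrative usage of do_regrid (a sketch; both grids are invented):
+#
+#   lon_src, lat_src = np.meshgrid(np.arange(0., 10.), np.arange(40., 50.))
+#   lon_tgt, lat_tgt = np.meshgrid(np.arange(0.5, 9., 0.5), np.arange(40.5, 49., 0.5))
+#   q = ma.masked_array(np.random.rand(10, 10), mask=np.zeros((10, 10), dtype=bool))
+#   q_new = do_regrid(q, lat_src, lon_src, lat_tgt, lon_tgt)   # bilinear by default
+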
+def create_mask_using_threshold(masked_array, threshold=0.5):
+   '''
+    Routine to create a mask, depending on the proportion of times with missing data.
+    For each pixel, calculate proportion of times that are missing data,
+    **if** the proportion is above a specified threshold value,
+    then mark the pixel as missing data.
+   
+    Input::
+        masked_array - a numpy masked array of data (assumes time on axis 0, and space on axes 1 and 2).
+        threshold    - (optional) threshold proportion above which a pixel is marked as 'missing data'. 
+        NB. default threshold = 50%
+   
+    Output::
+        mymask       - a numpy array describing the mask. NB. not a masked array, just the mask itself.
+
+   '''
+   
+
+   # try/except used as some model files don't have a full mask, but a single bool;
+   #  the except catches this situation and assumes no missing data.
+   try:
+       nT = masked_array.mask.shape[0]
+
+       # For each pixel, count how many times are masked.
+       nMasked = masked_array.mask[:, :, :].sum(axis=0)
+
+       # Define new mask as when a pixel has over a defined threshold ratio of masked data
+       #   e.g. if the threshold is 75%, and there are 10 times,
+       #        then a pixel will be masked if more than 7 (i.e. 8 or more) times are masked.
+       mymask = nMasked > (nT * threshold)
+
+   except (AttributeError, IndexError):
+       mymask = np.zeros_like(masked_array.data[0, :, :])
+
+   return mymask
+
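+# Illustrative usage (a sketch): mask any pixel missing in more than half of
+# the time steps, then apply that 2d mask to every time slice:
+#
+#   mymask = create_mask_using_threshold(masked_data, threshold=0.5)
+#   masked_data.mask = np.logical_or(masked_data.mask, mymask[np.newaxis, :, :])
+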
+def calc_average_on_new_time_unit(data, dateList, unit='monthly'):
+   '''
+    Routine to calculate averages on longer time units than the data exists on.
+    
+    Example::
+        If the data is 6-hourly, calculate daily, or monthly means.
+   
+    Input::
+        data     - data values
+        dateList - list of python datetime structures corresponding to data values
+        unit     - string describing time unit to average onto. 
+        *Valid values are: 'monthly', 'daily', 'pentad','annual','decadal'*
+      
+     Output:
+        meanstorem - numpy masked array of data values meaned over required time period
+        newTimesList - a list of python datetime objects representing the data in the new averaging period
+        *NB.* currently set to beginning of averaging period, 
+        **i.e. mean Jan 1st - Jan 31st -> represented as Jan 1st, 00Z.**
+   '''
+
+   # First catch unknown values of time unit passed in by user
+   acceptable = unit in ('full', 'annual', 'monthly', 'daily', 'pentad')
+
+   if not acceptable:
+      print 'Error: unknown unit type selected for time averaging'
+      print '       Please check your code.'
+      return
+
+   # Calculate arrays of years (2007,2007),
+   #                     yearsmonths (200701,200702),
+   #                     or yearmonthdays (20070101,20070102) 
+   #  -depending on user selected averaging period.
+
+   # Year list
+   if unit == 'annual':
+      print 'Calculating annual mean data'
+      timeunits = []
+
+      for i in np.arange(len(dateList)):
+         timeunits.append(str(dateList[i].year))
+
+      timeunits = np.array(timeunits, dtype=int)
+         
+   # YearMonth format list
+   if unit == 'monthly':
+      print 'Calculating monthly mean data'
+      timeunits = []
+
+      for i in np.arange(len(dateList)):
+         timeunits.append(str(dateList[i].year) + str("%02d" % dateList[i].month))
+
+      timeunits = np.array(timeunits, dtype=int)
+
+   # YearMonthDay format list
+   if unit == 'daily':
+      print 'Calculating daily mean data'
+      timeunits = []
+ 
+      for i in np.arange(len(dateList)):
+         timeunits.append(str(dateList[i].year) + str("%02d" % dateList[i].month) + \
+                          str("%02d" % dateList[i].day))
+
+      timeunits = np.array(timeunits, dtype=int)
+
+
+   # TODO: add pentad setting using Julian days?
+
+
+   # Full list: a special case
+   if unit == 'full':
+      print 'Calculating mean data over the full time range'
+      timeunits = []
+
+      for i in np.arange(len(dateList)):
+         timeunits.append(999)  # i.e. we just want the same value for all times.
+
+      timeunits = np.array(timeunits, dtype=int)
+
+
+
+   # empty list to store new times
+   newTimesList = []
+
+   # Decide whether or not you need to do any time averaging.
+   #   i.e. if data are already on required time unit then just pass data through and 
+   #        calculate and return representative datetimes.
+   processing_required = True
+   if len(timeunits) == (len(np.unique(timeunits))):
+        processing_required = False
+
+   # 1D data arrays, i.e. time series
+   if data.ndim == 1:
+     # Create array to store the resulting data
+     meanstore = np.zeros(len(np.unique(timeunits)))
+  
+     # Calculate the means across each unique time unit
+     i = 0
+     for myunit in np.unique(timeunits):
+       if processing_required:
+         datam = ma.masked_array(data, timeunits != myunit)
+         meanstore[i] = ma.average(datam)
+
+       # construct new times list
+       smyunit = str(myunit)
+       if len(smyunit) == 4:  # YYYY
+        yyyy = int(smyunit[0:4])
+        mm = 1
+        dd = 1
+       if len(smyunit) == 6:  # YYYYMM
+        yyyy = int(smyunit[0:4])
+        mm = int(smyunit[4:6])
+        dd = 1
+       if len(smyunit) == 8:  # YYYYMMDD
+        yyyy = int(smyunit[0:4])
+        mm = int(smyunit[4:6])
+        dd = int(smyunit[6:8])
+       if len(smyunit) == 3:  # Full time range
+        # Need to set an appropriate time representing the mid-point of the entire time span
+        dt = dateList[-1] - dateList[0]
+        halfway = dateList[0] + (dt / 2)
+        yyyy = int(halfway.year)
+        mm = int(halfway.month)
+        dd = int(halfway.day)
+  
+       newTimesList.append(datetime.datetime(yyyy, mm, dd, 0, 0, 0, 0))
+       i = i + 1
+
+     # Define the output for the 1D case (pass data straight through if no averaging was required)
+     if processing_required:
+        meanstorem = meanstore
+     else:
+        meanstorem = data
+
+   # 3D data arrays
+   if data.ndim == 3:
+
+     #datamask = create_mask_using_threshold(data,threshold=0.75)
+
+     # Create array to store the resulting data
+     meanstore = np.zeros([len(np.unique(timeunits)), data.shape[1], data.shape[2]])
+  
+     # Calculate the means across each unique time unit
+     i = 0
+     datamask_store = []
+     for myunit in np.unique(timeunits):
+       if processing_required:
+
+         mask = np.zeros_like(data)
+         mask[timeunits != myunit, :, :] = 1.0
+
+         # Calculate missing data mask within each time unit...
+         datamask_at_this_timeunit = np.zeros_like(data)
+         datamask_at_this_timeunit[:] = create_mask_using_threshold(data[timeunits == myunit, :, :], threshold=0.75)
+         # Store results for masking later
+         datamask_store.append(datamask_at_this_timeunit[0])
+
+         # Calculate means for each pixel in this time unit, ignoring missing data (using masked array).
+         datam = ma.masked_array(data, np.logical_or(mask, datamask_at_this_timeunit))
+         meanstore[i, :, :] = ma.average(datam, axis=0)
+
+       # construct new times list
+       smyunit = str(myunit)
+       if len(smyunit) == 4:  # YYYY
+        yyyy = int(smyunit[0:4])
+        mm = 1
+        dd = 1
+       if len(smyunit) == 6:  # YYYYMM
+        yyyy = int(smyunit[0:4])
+        mm = int(smyunit[4:6])
+        dd = 1
+       if len(smyunit) == 8:  # YYYYMMDD
+        yyyy = int(smyunit[0:4])
+        mm = int(smyunit[4:6])
+        dd = int(smyunit[6:8])
+       if len(smyunit) == 3:  # Full time range
+        # Need to set an appropriate time representing the mid-point of the entire time span
+        dt = dateList[-1] - dateList[0]
+        halfway = dateList[0] + (dt / 2)
+        yyyy = int(halfway.year)
+        mm = int(halfway.month)
+        dd = int(halfway.day)
+
+       newTimesList.append(datetime.datetime(yyyy, mm, dd, 0, 0, 0, 0))
+
+       i += 1
+
+     if not processing_required:
+        meanstorem = data
+
+     if processing_required:
+        # Create masked array (using missing data mask defined above)
+        datamask_store = np.array(datamask_store)
+        meanstorem = ma.masked_array(meanstore, datamask_store)
+
+   return meanstorem, newTimesList
+
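+# Illustrative usage (a sketch; dates and data invented): average a leap year
+# of daily fields onto monthly means:
+#
+#   dates = [datetime.datetime(2000, 1, 1) + datetime.timedelta(days=i) for i in range(366)]
+#   data = ma.masked_array(np.random.rand(366, 4, 5))
+#   means, newTimes = calc_average_on_new_time_unit(data, dates, unit='monthly')
+#   # means.shape -> (12, 4, 5); newTimes[0] -> datetime.datetime(2000, 1, 1, 0, 0)
+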
+def calc_running_accum_from_period_accum(data):
+    '''
+     Routine to calculate running total accumulations from individual period accumulations.
+     ::
+    
+         e.g.  0,0,1,0,0,2,2,1,0,0
+            -> 0,0,1,1,1,3,5,6,6,6
+    
+     Input::
+          data: numpy array with time in the first axis
+    
+     Output::
+          running_acc: running accumulations
+    
+    '''
+    
+    
+    running_acc = np.zeros_like(data)
+    
+    if(len(data.shape) == 1):
+        running_acc[0] = data[0]
+    
+    if(len(data.shape) > 1):
+        running_acc[0, :] = data[0, :]
+    
+    for i in np.arange(data.shape[0] - 1):
+        if(len(data.shape) == 1):
+            running_acc[i + 1] = running_acc[i] + data[i + 1]
+    
+        if(len(data.shape) > 1):
+            running_acc[i + 1, :] = running_acc[i, :] + data[i + 1, :]
+    
+    return running_acc
+
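+# For plain (unmasked) arrays the loop above matches numpy's cumulative sum
+# along the time axis; a quick equivalence check (illustrative):
+#
+#   x = np.array([0, 0, 1, 0, 0, 2, 2, 1, 0, 0])
+#   assert (calc_running_accum_from_period_accum(x) == np.cumsum(x)).all()
+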
+def ignore_boundaries(data, rim=10):
+    '''
+     Routine to mask the lateral boundary regions of model data to ignore them from calculations.
+     
+     Input::
+         data - a masked array of model data
+         rim - (optional) number of grid points to ignore
+    
+     Output::
+         data - data array with boundary region masked
+    
+    '''
+    
+    nx = data.shape[1]
+    ny = data.shape[0]
+    
+    # NB. full-row/column slices so the final row and column are masked too
+    rimmask = np.zeros_like(data)
+    for j in np.arange(rim):
+        rimmask[j, :] = 1.0
+    
+    for j in ny - 1 - np.arange(rim):
+        rimmask[j, :] = 1.0
+    
+    for i in np.arange(rim):
+        rimmask[:, i] = 1.0
+    
+    for i in nx - 1 - np.arange(rim):
+        rimmask[:, i] = 1.0
+    
+    data = ma.masked_array(data, mask=rimmask)
+    
+    return data
+
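+# Illustrative usage (a sketch): mask a 10-point rim around a 2d model field
+#
+#   field = ma.masked_array(np.random.rand(120, 100))
+#   interior = ignore_boundaries(field, rim=10)   # outer 10 rows/columns masked
+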
+def normalizeDatetimes(datetimes, timestep):
+    """
+    Input::
+        datetimes - list of datetime objects that need to be normalized
+        timestep - string of value ('daily' | 'monthly')
+    Output::
+        normalDatetimes - list of datetime objects that have been normalized
+    
+    Normalization Rules::
+        Daily data will be forced to an hour value of 00:00:00
+        Monthly data will be forced to the first of the month at midnight 
+    """
+    normalDatetimes = []
+    if timestep.lower() == 'monthly':
+        for inputDatetime in datetimes:
+            # Force to the first of the month at midnight (per the rules above)
+            normalDatetimes.append(inputDatetime.replace(day=1, hour=0, minute=0,
+                                                         second=0, microsecond=0))
+
+    elif timestep.lower() == 'daily':
+        for inputDatetime in datetimes:
+            # Force the time-of-day to 00:00:00
+            normalDatetimes.append(inputDatetime.replace(hour=0, minute=0,
+                                                         second=0, microsecond=0))
+
+    return normalDatetimes
+
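+# Illustrative usage (a sketch):
+#
+#   dts = [datetime.datetime(2001, 2, 15, 12, 30)]
+#   normalizeDatetimes(dts, 'monthly')   # -> [datetime.datetime(2001, 2, 1, 0, 0)]
+#   normalizeDatetimes(dts, 'daily')     # -> [datetime.datetime(2001, 2, 15, 0, 0)]
+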
+def getModelTimes(modelFile, timeVarName):
+    '''
+    TODO:  Do a better job handling dates here
+    Routine to convert from model times ('hours since 1900...', 'days since ...')
+    into a python datetime structure
+
+    Input::
+        modelFile - path to the model file you want to extract the times list and modelTimeStep from
+        timeVarName - name of the time variable in the model file
+
+    Output::
+        times  - list of python datetime objects describing model data times
+        modelTimeStep - 'hourly','daily','monthly','annual'
+    '''
+
+    f = Nio.open_file(modelFile)
+    xtimes = f.variables[timeVarName]
+    timeFormat = xtimes.attributes['units']
+
+    # search to check if 'since' appears in units
+    try:
+        sinceLoc = re.search('since', timeFormat).end()
+
+    except AttributeError:
+        print 'Error decoding model times: time variable attributes do not contain "since"'
+        raise
+    
+    units = None
+    TIME_UNITS = ('minutes', 'hours', 'days', 'months', 'years')
+    # search for 'minutes', 'hours', 'days', 'months', 'years' so we know the units
+    for unit in TIME_UNITS:
+        if re.search(unit, timeFormat):
+            units = unit
+            break
+
+    # cut out base time (the bit following 'since')
+    base_time_string = string.lstrip(timeFormat[sinceLoc:])
+    # decode base time
+    base_time = decodeTimeFromString(base_time_string)
+    times = []
+
+    for xtime in xtimes[:]:
+        
+        # Cast time as an int
+        xtime = int(xtime)
+
+        if units == 'minutes':
+            dt = datetime.timedelta(minutes=xtime)
+            new_time = base_time + dt
+        elif units == 'hours':
+            dt = datetime.timedelta(hours=xtime)
+            new_time = base_time + dt
+        elif units == 'days':
+            dt = datetime.timedelta(days=xtime)
+            new_time = base_time + dt
+        elif units == 'months':
+            # NB. adding months in python is complicated as month length varies and hence is ambiguous.
+            # Perform date arithmetic manually
+            #  Assumption: the base_date will usually be the first of the month
+            #              NB. this method will fail if the base time is on the 29th or higher day of month
+            #                      -as can't have, e.g. Feb 31st.
+            months_total = base_time.month - 1 + xtime
+            new_year = base_time.year + months_total // 12
+            new_month = months_total % 12 + 1
+            new_time = datetime.datetime(new_year, new_month, base_time.day,
+                                         base_time.hour, base_time.minute, base_time.second)
+        elif units == 'years':
+            # NB. datetime.timedelta has no 'years' argument, so step the year directly
+            new_time = base_time.replace(year=base_time.year + xtime)
+
+        times.append(new_time)
+    
+    timeStepLength = int(xtimes[1] - xtimes[0] + 1.e-12)
+    modelTimeStep = getModelTimeStep(units, timeStepLength)
+    
+    times = normalizeDatetimes(times, modelTimeStep)
+    
+    return times, modelTimeStep
+
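+# Illustrative usage (a sketch; 'model.nc' and 'time' are placeholder names):
+#
+#   times, modelTimeStep = getModelTimes('model.nc', 'time')
+#   # times -> list of datetime objects; modelTimeStep -> e.g. 'monthly'
+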
+def getModelTimeStep(units, stepSize):
+    # Time units are now determined. Determine the time intervals of input data (mdlTimeStep)
+
+    if units == 'minutes':
+        if stepSize == 60:
+            modelTimeStep = 'hourly'
+        elif stepSize == 1440:
+            modelTimeStep = 'daily'
+        # 28 days through 31 days
+        elif 40320 <= stepSize <= 44640:
+            modelTimeStep = 'monthly'
+        # 365 days through 366 days 
+        elif 525600 <= stepSize <= 527040:
+            modelTimeStep = 'annual' 
+        else:
+            raise Exception('model data time step interval exceeds the max time interval (annual)', units, stepSize)
+
+    elif units == 'hours':
+        if stepSize == 1:
+            modelTimeStep = 'hourly'
+        elif stepSize == 24:
+            modelTimeStep = 'daily'
+        elif 672 <= stepSize <= 744:
+            modelTimeStep = 'monthly' 
+        elif 8760 <= stepSize <= 8784:
+            modelTimeStep = 'annual' 
+        else:
+            raise Exception('model data time step interval exceeds the max time interval (annual)', units, stepSize)
+
+    elif units == 'days':
+        if stepSize == 1:
+            modelTimeStep = 'daily'
+        elif 28 <= stepSize <= 31:
+            modelTimeStep = 'monthly'
+        elif 365 <= stepSize <= 366:
+            modelTimeStep = 'annual'
+        else:
+            raise Exception('model data time step interval exceeds the max time interval (annual)', units, stepSize)
+
+    elif units == 'months':
+        if stepSize == 1:
+            modelTimeStep = 'monthly'
+        elif stepSize == 12:
+            modelTimeStep = 'annual'
+        else:
+            raise Exception('model data time step interval exceeds the max time interval (annual)', units, stepSize)
+
+    elif units == 'years':
+        if stepSize == 1:
+            modelTimeStep = 'annual'
+        else:
+            raise Exception('model data time step interval exceeds the max time interval (annual)', units, stepSize)
+
+    else:
+        errorMessage = 'the time unit %s is not currently handled in this version.' % units
+        raise Exception(errorMessage)
+
+    return modelTimeStep
+
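+# Worked examples (illustrative) of the interval classification above:
+#
+#   getModelTimeStep('hours', 24)     # -> 'daily'
+#   getModelTimeStep('days', 30)      # -> 'monthly'  (28..31 days)
+#   getModelTimeStep('months', 12)    # -> 'annual'
+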
+def decodeTimeFromString(time_string):
+    '''
+     Decodes string into a python datetime object
+     *Method:* tries a bunch of different time format possibilities and hopefully one of them will hit.
+     ::
+
+       **Input:**  time_string - a string that represents a date/time
+       
+       **Output:** mytime - a python datetime object
+    '''
+
+    # This will deal with times that use decimal seconds: truncate at the '.'
+    if '.' in time_string:
+        time_string = time_string.split('.')[0]
+
+    # Try each known format in turn; the first one that parses wins.
+    TIME_FORMATS = ('%Y-%m-%d %H:%M:%S', '%Y/%m/%d %H:%M:%S',
+                    '%Y%m%d %H:%M:%S', '%Y:%m:%d %H:%M:%S',
+                    '%Y%m%d%H%M%S', '%Y-%m-%d %H:%M', '%Y-%m-%d')
+
+    for timeFormat in TIME_FORMATS:
+        try:
+            mytime = time.strptime(time_string, timeFormat)
+            return datetime.datetime(*mytime[0:6])
+        except ValueError:
+            pass
+
+    print 'Error decoding time string: string does not match a predefined time format'
+    return 0
+
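+# Illustrative usage (a sketch):
+#
+#   decodeTimeFromString('1979-01-01 00:00:00')   # -> datetime.datetime(1979, 1, 1, 0, 0)
+#   decodeTimeFromString('19790101')              # -> 0 (no matching format)
+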
+def regrid_wrapper(regrid_choice, obsdata, obslats, obslons, wrfdata, wrflats, wrflons):
+   '''
+    Wrapper routine for regridding.
+    
+    Either regrids model to obs grid, or obs to model grid, depending on user choice
+    
+        Inputs::
+            regrid_choice - [0] = Regrid obs data onto model grid or 
+                            [1] = Regrid model data onto obs grid
+
+            obsdata,wrfdata - data arrays
+            obslats,obslons - observation lat,lon arrays
+            wrflats,wrflons - model lat,lon arrays
+   
+        Output::
+            rdata,rdata2 - regridded data
+            lats,lons    - latitudes and longitudes of regridded data
+   
+   '''
+
+   # Regrid obs data onto model grid
+   if(regrid_choice == '0'):
+
+      ndims = len(obsdata.shape)
+      if(ndims == 3):
+         newshape = [obsdata.shape[0], wrfdata.shape[1], wrfdata.shape[2]]
+         nT = obsdata.shape[0]
+
+      if(ndims == 2):
+         newshape = [wrfdata.shape[0], wrfdata.shape[1]]
+         nT = 0
+
+      rdata = wrfdata
+      lats, lons = wrflats, wrflons
+
+      # Create a new masked array with the required dimensions
+      tmp = np.zeros(newshape)
+      rdata2 = ma.MaskedArray(tmp, mask=(tmp == 0))
+      tmp = 0 # free up some memory
+
+      rdata2[:] = 0.0
+
+      if(nT > 0):
+       for t in np.arange(nT):
+         rdata2[t, :, :] = do_regrid(obsdata[t, :, :], obslats[:, :], obslons[:, :], wrflats[:, :], wrflons[:, :])
+
+      if(nT == 0):
+         rdata2[:, :] = do_regrid(obsdata[:, :], obslats[:, :], obslons[:, :], wrflats[:, :], wrflons[:, :])
+
+
+   # Regrid model data onto obs grid
+   if(regrid_choice == '1'):
+      ndims = len(wrfdata.shape)
+      if(ndims == 3):
+         newshape = [wrfdata.shape[0], obsdata.shape[1], obsdata.shape[2]]
+         # NB. nT counts the model's time levels, since rdata is filled per model time
+         nT = wrfdata.shape[0]
+
+      if(ndims == 2):
+         newshape = [obsdata.shape[0], obsdata.shape[1]]
+         nT = 0
+
+      rdata2 = obsdata
+      lats, lons = obslats, obslons
+
+      tmp = np.zeros(newshape)
+      rdata = ma.MaskedArray(tmp, mask=(tmp == 0))
+      tmp = 0 # free up some memory
+
+      rdata[:] = 0.0
+ 
+      if(nT > 0):
+        for t in np.arange(nT):
+          rdata[t, :, :] = do_regrid(wrfdata[t, :, :], wrflats[:, :], wrflons[:, :], obslats[:, :], obslons[:, :])
+
+      if(nT == 0):
+        rdata[:, :] = do_regrid(wrfdata[:, :], wrflats[:, :], wrflons[:, :], obslats[:, :], obslons[:, :])
+
+  
+   return rdata, rdata2, lats, lons
+
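+# Illustrative usage (a sketch; the argument names are placeholders): regrid
+# the observations onto the model grid (choice '0'):
+#
+#   rdata, rdata2, lats, lons = regrid_wrapper('0', obsdata, obslats, obslons,
+#                                              wrfdata, wrflats, wrflons)
+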
+def extract_sub_time_selection(allTimes, subTimes, data):
+    '''
+     Routine to extract a sub-selection of times from a data array.
+
+     Example::
+           Suppose a data array has 30 time values for daily data for a whole month, 
+           but you only want the data from the 5th-15th of the month.
+    
+     Input::
+           allTimes - list of datetimes describing what times the data in the data array correspond to
+           subTimes - the times you want to extract data from
+           data     - the data array
+    
+     Output::
+           subdata     - subselection of data array
+    
+    '''
+    # Create new array to store the subselection
+    subdata = np.zeros([len(subTimes), data.shape[1], data.shape[2]])
+    
+    # Loop over all times and, when a time is in the required subselection, copy the data across
+    i = 0     # counter for allTimes
+    subi = 0  # counter for subTimes
+    for t in allTimes:
+        if t in subTimes:
+            subdata[subi, :, :] = data[i, :, :]
+            subi += 1
+        i += 1
+    
+    return subdata
+
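+# Illustrative usage (a sketch; dates invented): pull the 5th-15th out of a
+# month of daily fields:
+#
+#   allT = [datetime.datetime(2000, 6, d) for d in range(1, 31)]
+#   subT = [datetime.datetime(2000, 6, d) for d in range(5, 16)]
+#   subset = extract_sub_time_selection(allT, subT, np.random.rand(30, 4, 5))
+#   # subset.shape -> (11, 4, 5)
+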
+def calc_average_on_new_time_unit_K(data, dateList, unit):
+    '''
+    Routine to calculate averages on longer time units than the data exists on.
+    e.g. if the data is 6-hourly, calculate daily, or monthly means.
+     
+    Input:
+        data     - data values
+        dateList - list of python datetime structures corresponding to data values
+        unit     - string describing time unit to average onto 
+                      e.g. 'monthly', 'daily', 'pentad','annual','decadal'
+    Output:
+        meanstorem - numpy masked array of data values meaned over required time period
+        newTimesList - a list of python datetime objects representing the data in the new averaging period
+                           NB. currently set to beginning of averaging period, 
+                           i.e. mean Jan 1st - Jan 31st -> represented as Jan 1st, 00Z.
+    '''
+
+    # Check if the user-selected temporal grid is valid. If not, EXIT
+    acceptable = unit in ('full', 'annual', 'monthly', 'daily', 'pentad')
+    if not acceptable:
+        print 'Error: unknown unit type selected for time averaging: EXIT'
+        return -1, -1
+
+    # Calculate arrays of: annual timeseries: year (2007,2007),
+    #                      monthly time series: year-month (200701,200702),
+    #                      daily timeseries:  year-month-day (20070101,20070102) 
+    #  depending on user-selected averaging period.
+
+    # Year list
+    if unit=='annual':
+        timeunits = []
+        for i in np.arange(len(dateList)):
+            timeunits.append(str(dateList[i].year))
+        timeunits = np.array(timeunits, dtype=int)
+         
+    # YearMonth format list
+    if unit=='monthly':
+        timeunits = []
+        for i in np.arange(len(dateList)):
+            timeunits.append(str(dateList[i].year) + str("%02d" % dateList[i].month))
+        timeunits = np.array(timeunits,dtype=int)
+
+    # YearMonthDay format list
+    if unit=='daily':
+        timeunits = []
+        for i in np.arange(len(dateList)):
+            timeunits.append(str(dateList[i].year) + str("%02d" % dateList[i].month) + str("%02d" % dateList[i].day))
+        timeunits = np.array(timeunits,dtype=int)
+
+    # TODO: add pentad setting using Julian days?
+
+    # Full list: a special case
+    if unit == 'full':
+        comment = 'Calculating mean data over the entire time range: i.e., annual-mean climatology'
+        timeunits = []
+        for i in np.arange(len(dateList)):
+            timeunits.append(999)  # i.e. we just want the same value for all times.
+        timeunits = np.array(timeunits, dtype=int)
+
+    # empty list to store new times
+    newTimesList = []
+
+    # Decide whether or not you need to do any time averaging.
+    #   i.e. if data are already on required time unit then just pass data through and 
+    #        calculate and return representative datetimes.
+    processing_required = True
+    if len(timeunits)==(len(np.unique(timeunits))):
+        processing_required = False
+
+    # 1D data arrays, i.e. time series
+    if data.ndim==1:
+        # Create array to store the resulting data
+        meanstore = np.zeros(len(np.unique(timeunits)))
+  
+        # Calculate the means across each unique time unit
+        i=0
+        for myunit in np.unique(timeunits):
+            if processing_required:
+                datam=ma.masked_array(data,timeunits!=myunit)
+                meanstore[i] = ma.average(datam)
+            
+            # construct new times list
+            smyunit = str(myunit)
+            if len(smyunit)==4:  # YYYY
+                yyyy = int(smyunit[0:4])
+                mm = 1
+                dd = 1
+            if len(smyunit)==6:  # YYYYMM
+                yyyy = int(smyunit[0:4])
+                mm = int(smyunit[4:6])
+                dd = 1
+            if len(smyunit)==8:  # YYYYMMDD
+                yyyy = int(smyunit[0:4])
+                mm = int(smyunit[4:6])
+                dd = int(smyunit[6:8])
+            if len(smyunit)==3:  # Full time range
+                # Need to set an appropriate time representing the mid-point of the entire time span
+                dt = dateList[-1]-dateList[0]
+                halfway = dateList[0]+(dt/2)
+                yyyy = int(halfway.year)
+                mm = int(halfway.month)
+                dd = int(halfway.day)
+
+            newTimesList.append(datetime.datetime(yyyy,mm,dd,0,0,0,0))
+            i = i+1
+
+        # Define the output for the 1D case (pass data straight through if no averaging was required)
+        if processing_required:
+            meanstorem = meanstore
+        else:
+            meanstorem = data
+
+    # 3D data arrays
+    if data.ndim==3:
+        # datamask = create_mask_using_threshold(data,threshold=0.75)
+        # Create array to store the resulting data
+        meanstore = np.zeros([len(np.unique(timeunits)),data.shape[1],data.shape[2]])
+  
+        # Calculate the means across each unique time unit
+        i=0
+        datamask_store = []
+        for myunit in np.unique(timeunits):
+            if processing_required:
+                mask = np.zeros_like(data)
+                mask[timeunits!=myunit,:,:] = 1.0
+                # Calculate missing data mask within each time unit...
+                datamask_at_this_timeunit = np.zeros_like(data)
+                datamask_at_this_timeunit[:]= create_mask_using_threshold(data[timeunits==myunit,:,:],threshold=0.75)
+                # Store results for masking later
+                datamask_store.append(datamask_at_this_timeunit[0])
+                # Calculate means for each pixel in this time unit, ignoring missing data (using masked array).
+                datam = ma.masked_array(data,np.logical_or(mask,datamask_at_this_timeunit))
+                meanstore[i,:,:] = ma.average(datam,axis=0)
+            # construct new times list
+            smyunit = str(myunit)
+            if len(smyunit)==4:  # YYYY
+                yyyy = int(smyunit[0:4])
+                mm = 1
+                dd = 1
+            if len(smyunit)==6:  # YYYYMM
+                yyyy = int(smyunit[0:4])
+                mm = int(smyunit[4:6])
+                dd = 1
+            if len(smyunit)==8:  # YYYYMMDD
+                yyyy = int(smyunit[0:4])
+                mm = int(smyunit[4:6])
+                dd = int(smyunit[6:8])
+            if len(smyunit)==3:  # Full time range
+                # Need to set an appropriate time representing the mid-point of the entire time span
+                dt = dateList[-1]-dateList[0]
+                halfway = dateList[0]+(dt/2)
+                yyyy = int(halfway.year)
+                mm = int(halfway.month)
+                dd = int(halfway.day)
+            newTimesList.append(datetime.datetime(yyyy,mm,dd,0,0,0,0))
+            i += 1
+
+        if not processing_required:
+            meanstorem = data
+
+        if processing_required:
+            # Create masked array (using missing data mask defined above)
+            datamask_store = np.array(datamask_store)
+            meanstorem = ma.masked_array(meanstore, datamask_store)
+
+    return meanstorem,newTimesList
+
+def decode_model_timesK(ifile,timeVarName,file_type):
+    #################################################################################################
+    #  Convert model times ('hours since 1900...', 'days since ...') into a python datetime structure
+    #  Input:
+    #      ifile - path to a single model file
+    #      timeVarName - name of the time variable in the model file
+    #      file_type - file format string passed through to Nio
+    #  Output:
+    #      times  - list of python datetime objects describing model data times
+    #     Peter Lean February 2011
+    #################################################################################################
+    f = Nio.open_file(ifile,mode='r',options=None,format=file_type)
+    xtimes = f.variables[timeVarName]
+    timeFormat = xtimes.attributes['units']
+    #timeFormat = "days since 1979-01-01 00:00:00"
+    # search to check if 'since' appears in units
+    try:
+        sinceLoc = re.search('since',timeFormat).end()
+    except AttributeError:
+        print 'Error decoding model times: time var attributes do not contain "since": Exit'
+        sys.exit()
+    # search for 'minutes', 'hours', 'days', 'months', 'years' so we know the units
+    units = ''
+    for unit in ('minutes', 'hours', 'days', 'months', 'years'):
+        if re.search(unit, timeFormat):
+            units = unit
+            break
+    # cut out base time (the bit following 'since')
+    base_time_string = string.lstrip(timeFormat[sinceLoc:])
+    # decode base time
+    # 9/25/2012: datetime.timedelta has problem with the argument because xtimes is NioVariable.
+    # Soln (J. Kim): use a temp variable ttmp in the for loop, then convert it into an integer xtime.
+    base_time = decodeTimeFromString(base_time_string)
+    times=[]
+    for ttmp in xtimes[:]:
+        xtime = int(ttmp)
+        if units == 'minutes':
+            new_time = base_time + datetime.timedelta(minutes=xtime)
+        if units == 'hours':
+            new_time = base_time + datetime.timedelta(hours=xtime)
+        if units == 'days':
+            new_time = base_time + datetime.timedelta(days=xtime)
+        if units == 'months':
+            # NB. adding months in python is complicated as month length varies and hence is ambiguous.
+            # Perform date arithmetic manually
+            #  Assumption: the base_date will usually be the first of the month
+            #              NB. this method will fail if the base time is on the 29th or higher day of month
+            #                      -as can't have, e.g. Feb 31st.
+            months_total = base_time.month - 1 + xtime
+            new_year = base_time.year + months_total // 12
+            new_month = months_total % 12 + 1
+            new_time = datetime.datetime(new_year, new_month, base_time.day,
+                                         base_time.hour, base_time.minute, base_time.second)
+        if units == 'years':
+            # NB. datetime.timedelta has no 'years' argument, so step the year directly
+            new_time = base_time.replace(year=base_time.year + xtime)
+        times.append(new_time)
+    return times
+
+
+def regrid_in_time(data,dateList,unit):
+   #################################################################################################
+   # Routine to calculate averages on longer time units than the data exists on.
+   #  e.g. if the data is 6-hourly, calculate daily, or monthly means.
+   #  Input:
+   #         data     - data values
+   #         dateList - list of python datetime structures corresponding to data values
+   #         unit     - string describing time unit to average onto 
+   #                       e.g. 'monthly', 'daily', 'pentad','annual','decadal'
+   #  Output:
+   #         meanstorem - numpy masked array of data values meaned over required time period
+   #         newTimesList - a list of python datetime objects representing the data in the new averaging period
+   #                            NB. currently set to beginning of averaging period, 
+   #                            i.e. mean Jan 1st - Jan 31st -> represented as Jan 1st, 00Z.
+   # ..............................
+   #   Jinwon Kim, Sept 30, 2012
+   #   Created from calc_average_on_new_time_unit_K, Peter's original, with the modification below:
+   #   Instead of masked array used by Peter, use wh to defined data within the averaging range.
+   #################################################################################################
+
+   print '***  Begin regrid_in_time  ***'
+   # Check if the user-selected temporal grid is valid. If not, EXIT
+   acceptable = unit in ('full', 'annual', 'monthly', 'daily', 'pentad')
+   if not acceptable:
+     print 'Error: unknown unit type selected for time averaging: EXIT'
+     return -1, -1
+
+   # Calculate time arrays of: annual (yyyy, [2007]), monthly (yyyymm, [200701,200702]), daily (yyyymmdd, [20070101,20070102]) 
+   # "%02d" is similar to i2.2 in Fortran
+   if unit=='annual':                      # YYYY
+      timeunits = []
+      for i in np.arange(len(dateList)):
+         timeunits.append(str(dateList[i].year))
+   elif unit=='monthly':                   # YYYYMM
+      timeunits = []
+      for i in np.arange(len(dateList)):
+         timeunits.append(str(dateList[i].year) + str("%02d" % dateList[i].month))
+   elif unit=='daily':                     # YYYYMMDD
+      timeunits = []
+      for i in np.arange(len(dateList)):
+         timeunits.append(str(dateList[i].year) + str("%02d" % dateList[i].month) + str("%02d" % dateList[i].day))
+   elif unit=='full':                      # Full list: a special case
+      comment = 'Calculating mean data over the entire time range: i.e., annual-mean climatology'
+      timeunits = []
+      for i in np.arange(len(dateList)):
+         timeunits.append(999)             # i.e. we just want the same value for all times.
+   timeunits = np.array(timeunits,dtype=int)
+   print 'timeRegridOption = ', unit
+
+   # empty list to store time levels after temporal regridding
+   newTimesList = []
+
+   # Decide whether or not you need to do any time averaging.
+   #   i.e. if data are already on required time unit then just pass data through and calculate and return representative datetimes.
+   processing_required = True
+   if len(timeunits)==(len(np.unique(timeunits))):
+        processing_required = False
+   print 'processing_required= ',processing_required,': input time steps= ',len(timeunits),': regridded output time steps = ',len(np.unique(timeunits))
+   #print 'np.unique(timeunits): ',np.unique(timeunits)
+   print 'data.ndim= ',data.ndim
+
+   if data.ndim==1:                                           # 1D data arrays, i.e. 1D time series
+     # Create array to store the temporally regridded data
+     meanstore = np.zeros(len(np.unique(timeunits)))
+     # Calculate the means across each unique time unit
+     i=0
+     for myunit in np.unique(timeunits):
+       if processing_required:
+         wh = timeunits==myunit
+         datam = data[wh]
+         meanstore[i] = ma.average(datam)
+        # construct new times list
+       smyunit = str(myunit)
+       if len(smyunit)==4:  # YYYY
+        yyyy = int(smyunit[0:4])
+        mm = 1
+        dd = 1
+       if len(smyunit)==6:  # YYYYMM
+        yyyy = int(smyunit[0:4])
+        mm = int(smyunit[4:6])
+        dd = 1
+       if len(smyunit)==8:  # YYYYMMDD
+        yyyy = int(smyunit[0:4])
+        mm = int(smyunit[4:6])
+        dd = int(smyunit[6:8])
+       if len(smyunit)==3:  # Full time range
+        # Need to set an appropriate time representing the mid-point of the entire time span
+        dt = dateList[-1]-dateList[0]
+        halfway = dateList[0]+(dt/2)
+        yyyy = int(halfway.year)
+        mm = int(halfway.month)
+        dd = int(halfway.day)
+       newTimesList.append(datetime.datetime(yyyy,mm,dd,0,0,0,0))
+       i = i+1
+
+     # Define the output for the 1D case (pass data straight through if no averaging was required)
+     if processing_required:
+        meanstorem = meanstore
+     else:
+        meanstorem = data
+
+   elif data.ndim==3:                                         # 3D data arrays, i.e. 2D time series
+     # Create array to store the resulting data
+     meanstore = np.zeros([len(np.unique(timeunits)),data.shape[1],data.shape[2]])
+     # Calculate the means across each unique time unit
+     i=0
+     datamask_store = []
+     for myunit in np.unique(timeunits):
+       if processing_required:
+         wh = timeunits==myunit
+         datam = data[wh,:,:]
+         meanstore[i,:,:] = ma.average(datam,axis=0)
+        # construct new times list
+       smyunit = str(myunit)
+       if len(smyunit)==4:  # YYYY
+        yyyy = int(smyunit[0:4])
+        mm = 1
+        dd = 1
+       if len(smyunit)==6:  # YYYYMM
+        yyyy = int(smyunit[0:4])
+        mm = int(smyunit[4:6])
+        dd = 1
+       if len(smyunit)==8:  # YYYYMMDD
+        yyyy = int(smyunit[0:4])
+        mm = int(smyunit[4:6])
+        dd = int(smyunit[6:8])
+       if len(smyunit)==3:  # Full time range
+        # Need to set an appropriate time representing the mid-point of the entire time span
+        dt = dateList[-1]-dateList[0]
+        halfway = dateList[0]+(dt/2)
+        yyyy = int(halfway.year)
+        mm = int(halfway.month)
+        dd = int(halfway.day)
+       newTimesList.append(datetime.datetime(yyyy,mm,dd,0,0,0,0))
+       i += 1
+
+     if not processing_required:
+        meanstorem = data
+     elif processing_required:
+        meanstorem = meanstore
+
+   print '***  End regrid_in_time  ***'
+   return meanstorem,newTimesList

http://git-wip-us.apache.org/repos/asf/climate/blob/a6aa1cd2/src/main/python/rcmes/toolkit/.svn/text-base/visualize.py.svn-base
----------------------------------------------------------------------
diff --git a/src/main/python/rcmes/toolkit/.svn/text-base/visualize.py.svn-base b/src/main/python/rcmes/toolkit/.svn/text-base/visualize.py.svn-base
new file mode 100755
index 0000000..20e4447
--- /dev/null
+++ b/src/main/python/rcmes/toolkit/.svn/text-base/visualize.py.svn-base
@@ -0,0 +1 @@
+"""Collection of functions to visualize the data analysis output artifacts"""
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/climate/blob/a6aa1cd2/src/main/python/rcmes/toolkit/__init__.py
----------------------------------------------------------------------
diff --git a/src/main/python/rcmes/toolkit/__init__.py b/src/main/python/rcmes/toolkit/__init__.py
new file mode 100755
index 0000000..11548e6
--- /dev/null
+++ b/src/main/python/rcmes/toolkit/__init__.py
@@ -0,0 +1,2 @@
+"""The toolkit Package is a collection of modules that provide a set of tools
+that can be used to process, analyze and plot the analysis results."""
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/climate/blob/a6aa1cd2/src/main/python/rcmes/toolkit/__init__.pyc
----------------------------------------------------------------------
diff --git a/src/main/python/rcmes/toolkit/__init__.pyc b/src/main/python/rcmes/toolkit/__init__.pyc
new file mode 100644
index 0000000..9b1707b
Binary files /dev/null and b/src/main/python/rcmes/toolkit/__init__.pyc differ

http://git-wip-us.apache.org/repos/asf/climate/blob/a6aa1cd2/src/main/python/rcmes/toolkit/do_data_prep.py
----------------------------------------------------------------------
diff --git a/src/main/python/rcmes/toolkit/do_data_prep.py b/src/main/python/rcmes/toolkit/do_data_prep.py
new file mode 100755
index 0000000..db9ea39
--- /dev/null
+++ b/src/main/python/rcmes/toolkit/do_data_prep.py
@@ -0,0 +1,461 @@
+"""Prepare Datasets both model and observations for analysis using metrics"""
+
+
+import numpy as np
+import numpy.ma as ma
+import sys, os
+
+from storage import db, files
+import process
+from utils import misc
+import scipy.interpolate
+
+# TODO:  swap gridBox for Domain
+def prep_data(settings,obsSource,obsDatasetList,obsList,obsVarName,obsLonName,obsLatName,obsTimeName,obsTimestep,gridBox,modelList):
+    """
+    
+    returns numOBSs,numMDLs,nT,ngrdY,ngrdX,Times,lons,lats,obsData,modelData,obsList
+    
+        numOBSs - number of Observational Datasets.  Really need to look at using len(obsDatasetList) instead
+        numMDLs - number of Models used.  Again should use the len(modelList) instead
+        nT - Time value count after temporal regridding. Should use length of the time axis for a given dataset
+        ngrdY - size of the Y-Axis grid after spatial regridding
+        ngrdX - size of the X-Axis grid after spatial regridding
+        Times - list of python datetime objects that represent the times to be used in further calculations
+        lons - 2d array of longitudes on the common analysis grid
+        lats - 2d array of latitudes on the common analysis grid
+        obsData - observation data regridded onto the common analysis grid
+        modelData - model data regridded onto the common analysis grid
+        obsList - list of observation dataset names
+        
+    # 5/28/2013: Add 'obsSource' to indicate the source of the reference data file(s)
+    """
+    
+    # TODO:  Stop the object Deserialization and work on refactoring the core code here
+    cachedir = settings.cacheDir
+    workdir = settings.workDir
+    variableType = settings.variableType
+    variableType = variableType.lower()
+
+    # Use list comprehensions to deconstruct obsDatasetList
+    #  e.g. ['TRMM_pr_mon', 'CRU3.1_pr']: a list of dataset name + '_' + parameter name
+    #  (NB. the 'CRU*' entries trigger unit-conversion handling later)
+    # The plan here is to use the obsDatasetList, which contains a longName key we can use.
+    if obsSource == 0:              # reference data from RCMED
+        obsList = [str(x['longname']) for x in obsDatasetList]
+        # Also using the obsDatasetList with a key of ['dataset_id']
+        obsDatasetId = [str(x['dataset_id']) for x in obsDatasetList]
+        # obsDatasetList ['parameter_id'] list
+        obsParameterId = [str(x['parameter_id']) for x in obsDatasetList]
+        obsTimestep = [str(x['timestep']) for x in obsDatasetList]
+    elif obsSource == 1:              # reference data from users' disk
+        # in this case, obsList and obsTimestep have been defined in the calling routine and passed into this routine
+        pass
+    elif obsSource < 0:               # no reference data are involved
+        pass
+
+    mdlList = [model.filename for model in modelList]
+
+    # Since all of the model objects in the modelList have the same "VarName" and "variableType", merely pull these from modelList[0]
+    modelTimeVarName = modelList[0].timeVariable
+    modelLatVarName = modelList[0].latVariable
+    modelLonVarName = modelList[0].lonVariable
+    modelVarName = modelList[0].varName
+    regridOption = settings.spatialGrid
+    timeRegridOption = settings.temporalGrid
+    print 'model parameter names ',modelVarName,modelTimeVarName,modelLatVarName,modelLonVarName
+    
+    """
+     Routine to read-in and re-grid both obs and mdl datasets.
+     Processes both single and multiple files of obs and mdl or combinations in a general way.
+           i)    retrieve observations from the database
+           ii)   load in model data
+           iii)  temporal regridding
+           iv)   spatial regridding
+           v)    area-averaging
+           Input:
+                   cachedir 	- string describing directory path
+                   workdir 	- string describing directory path
+                   obsList        - string describing the observation data files
+                   obsDatasetId 	- int, db dataset id
+                   obsParameterId	- int, db parameter id 
+                   latMin, latMax, lonMin, lonMax, dLat, dLon, naLats, naLons: define the evaluation/analysis domain/grid system
+    	         latMin		- float
+                   latMax		- float
+                   lonMin		- float
+                   lonMax		- float
+                   dLat  		- float
+                   dLon  		- float
+                   naLats		- integer
+                   naLons		- integer
+                   mdlList	- string describing model file name + path
+                   modelVarName	- string describing name of variable to evaluate (as written in model file)
+                 variableType   - string
+                   modelTimeVarName - string describing name of time variable in model file 	
+                   modelLatVarName  - string describing name of latitude variable in model file 
+                   modelLonVarName  - string describing name of longitude variable in model file 
+                   regridOption 	 - string: 'obs'|'model'|'user'
+                   timeRegridOption -string: 'full'|'annual'|'monthly'|'daily'
+                   maskOption - Boolean
+                   
+                   # TODO:  This isn't true in the current codebase.
+                   Instead the SubRegion's are being used.  You can see that these values are not
+                   being used in the code, at least they are not being passed in from the function
+                   
+                   maskLatMin - float (only used if maskOption=1)
+                   maskLatMax - float (only used if maskOption=1)
+    	         maskLonMin - float (only used if maskOption=1)
+                   maskLonMax - float (only used if maskOption=1)
+           Output: image files of plots + possibly data
+           Jinwon Kim, 7/11/2012
+    """
+
+
+    # check the number of obs & model data files
+    numOBSs = len(obsList)
+    numMDLs = len(mdlList)
+    
+    # assign parameters that must be preserved throughout the process
+    
+    print 'start & end time = ', settings.startDate, settings.endDate
+    yymm0 = settings.startDate.strftime("%Y%m")
+    yymm1 = settings.endDate.strftime("%Y%m")
+    print 'start & end eval period = ', yymm0, yymm1
+
+
+    #TODO: Wrap in try except blocks instead
+    # To be expanded to encompass mdl-only or obs-only cases (post-2.1 item)
+    if numMDLs < 1:
+        print 'No input model data file. EXIT'
+        sys.exit()
+    if numOBSs < 1:
+        print 'No input observation data file. EXIT'
+        sys.exit()
+
+    ## Part 1: retrieve observation data from the database and regrid them
+    ##       NB. automatically uses local cache if already retrieved.
+
+    # preparation for spatial regridding: define the horizontal dimensions (ngrdX and ngrdY) of the target interpolation grid
+    print 'regridOption in prep_data= ', regridOption
+    if regridOption == 'model':
+        ifile = mdlList[0]
+        typeF = 'nc'
+        #lats, lons, mTimes = files.read_lolaT_from_file(ifile, modelLatVarName, modelLonVarName, modelTimeVarName, typeF)
+        lats, lons, mTimes = files.read_data_from_one_file(ifile, modelVarName,
+                                                           modelLatVarName,
+                                                           modelLonVarName,
+                                                           modelTimeVarName,
+                                                           typeF)[:3]
+        modelObject = modelList[0]
+        latMin = modelObject.latMin
+        latMax = modelObject.latMax
+        lonMin = modelObject.lonMin
+        lonMax = modelObject.lonMax
+    elif regridOption == 'user':
+        # Use the GridBox Object
+        latMin = gridBox.latMin
+        latMax = gridBox.latMax
+        lonMin = gridBox.lonMin
+        lonMax = gridBox.lonMax
+        naLats = gridBox.latCount
+        naLons = gridBox.lonCount
+        dLat = gridBox.latStep
+        dLon = gridBox.lonStep
+        lat = np.arange(naLats) * dLat + latMin
+        lon = np.arange(naLons) * dLon + lonMin
+        lons, lats = np.meshgrid(lon, lat)
+        lon = 0.
+        lat = 0.
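+        # e.g., latMin = 30.0, dLat = 0.5, naLats = 3 gives lat = [30.0, 30.5, 31.0];
+        # np.meshgrid then expands the 1-d axes into 2-d coordinate arrays of
+        # shape (naLats, naLons), matching the (y, x) layout used below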
+    else:
+        print "INVALID REGRID OPTION USED"
+        sys.exit()
+        
+    ngrdY = lats.shape[0]
+    ngrdX = lats.shape[1]
+
+    # All user-provided reference data are to be in netCDF format
+    if obsSource == 1:
+        typeF = 'nc'
+    regObsData = []
+    print ''
+    regularGrid = raw_input('OBS data are on regular grids? [y/n] \n>')
+    
+    for n in np.arange(numOBSs):
+
+        # read-in obs data and first do spatial regridding
+        if obsSource == 0:         # reference data from RCMED
+            oLats, oLons, _, oTimes, oData = db.extractData(obsDatasetId[n],
+                                                            obsParameterId[n],
+                                                            latMin, latMax,
+                                                            lonMin, lonMax,
+                                                            settings.startDate, settings.endDate,
+                                                            cachedir, obsTimestep[n])
+            # TODO: modify this if block with new metadata usage.
+            if (variableType == 'precipitation') and (obsList[n][0:3] == 'CRU'):
+                oData = 86400.0 * oData
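+            # Rationale (assumed, pending the metadata TODO above): CRU precipitation
+            # is presumably stored as a rate (kg m-2 s-1, i.e. mm/s for liquid water),
+            # so multiplying by 86400 (seconds per day) converts it to mm/day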
+
+        elif obsSource == 1:      # reference data from users' own file
+            ifile = obsDatasetList[n]
+            oLats, oLons, oTimes, oData, ovUnit = files.read_data_from_one_file(ifile,
+                                                                                obsVarName,
+                                                                                obsLatName,
+                                                                                obsLonName,
+                                                                                obsTimeName,
+                                                                                typeF)
+            print 'ovUnit = ', ovUnit
+            if variableType in ('precipitation', 'evaporation', 'runoff'):
+                if ovUnit in ('KG M-2 S-1', 'kg m-2 s-1', 'MM S-1', 'mm s-1'):
+                    print 'convert obs variable unit from mm/s to mm/day'
+                    oData = 86400. * oData
+            # extract the reference data for the specified period
+            obsT = []
+            oStp = len(oTimes)
+            for i in np.arange(oStp):
+                obsT.append(oTimes[i].strftime("%Y%m"))
+            wh = (np.array(obsT) >= yymm0) & (np.array(obsT) <= yymm1)
+            oTimes = list(np.array(oTimes)[wh])
+            oData = oData[wh,:,:]
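+            # NB. fixed-width "%Y%m" strings sort lexicographically in time order,
+            # so the string comparison above selects exactly the requested period
+            # (e.g. '199501' <= t <= '199512' picks all of 1995)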
+
+        nstOBSs = oData.shape[0]         # note that the length of obs data can vary for different obs intervals (e.g., daily vs. monthly)
+        print 'Regrid OBS dataset onto the ', regridOption, ' grid system: ngrdY, ngrdX, nstOBSs= ', ngrdY, ngrdX, nstOBSs
+        print 'For dataset: %s' % obsList[n]
+        
+        tmpOBS = ma.zeros((nstOBSs, ngrdY, ngrdX))
+        print 'tmpOBS shape = ', tmpOBS.shape
+        if regularGrid == 'y':
+            for t in np.arange(nstOBSs):
+                tmpOBS[t, :, :] = process.do_regrid(oData[t, :, :], oLats, oLons, lats, lons)
+        else:
+            # The new interpolation scheme "scipy.interpolate.griddata" is used; note that masked values must be specified using "np.nan"
+            #     and that the "cubic" option causes problems in the interpolation
+            for t in np.arange(nstOBSs):
+                inLats = oLats.flatten()
+                inLons = oLons.flatten()
+                length = len(inLats)
+                inPts = ma.zeros((length, 2))
+                inPts[:,0] = inLats
+                inPts[:,1] = inLons
+                inData = oData[t, :, :].flatten()
+                inData[inData < -999.] = np.nan
+                tmpOBS[t, :, :] = scipy.interpolate.griddata(inPts, inData, (lats, lons), method = 'linear')
+            inLats = 0.
+            inLons = 0.
+            inPts  = 0.
+            inData = 0.
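+            # Minimal sketch of the griddata call above (illustrative only, not
+            # part of the pipeline): interpolate f(lat, lon) = lat + lon from
+            # four scattered points onto one target point.
+            #   >>> import numpy as np
+            #   >>> from scipy.interpolate import griddata
+            #   >>> pts = np.array([[0., 0.], [0., 1.], [1., 0.], [1., 1.]])
+            #   >>> vals = np.array([0., 1., 1., 2.])
+            #   >>> griddata(pts, vals, (np.array([[0.5]]), np.array([[0.5]])), method='linear')
+            #   array([[ 1.]])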
+        print 'spatial regridding of the obs data ', n, ' is completed'
+            
+        # TODO:  Not sure this is needed with Python Garbage Collector
+        # The used memory should be freed when the objects are no longer referenced.  If this continues to be an issue we may need to look
+        # at using generators where possible.
+        oLats = 0.0
+        oLons = 0.0       # release the memory occupied by the temporary variables oLats and oLons.
+        
+        # temporally regrid the spatially regridded obs data
+        oData, newObsTimes = process.calc_average_on_new_time_unit_K(tmpOBS, oTimes, unit=timeRegridOption)
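+        # calc_average_on_new_time_unit_K groups the time steps by the target
+        # unit and averages within each group. A sketch of the assumed
+        # 'monthly' behavior (see process.py for the actual implementation):
+        #   keys = np.array([t.strftime("%Y%m") for t in oTimes])
+        #   months = sorted(set(keys))
+        #   monthlyMeans = [tmpOBS[keys == m].mean(axis=0) for m in months]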
+
+        tmpOBS = 0.0      # release temporary array after regridding
+        
+        # check the consistency of temporally regridded obs data
+        if n == 0:
+            oldObsTimes = newObsTimes
+        else:
+            if oldObsTimes != newObsTimes:
+                print 'temporally regridded obs data time levels do not match at ', n - 1, n
+                print '%s Time through Loop' % (n + 1)
+                print 'oldObsTimes Count: %s' % len(oldObsTimes)
+                print 'newObsTimes Count: %s' % len(newObsTimes)
+                # TODO:  We need to handle these cases using Try Except Blocks or insert a sys.exit if appropriate
+                sys.exit()
+            else:
+                oldObsTimes = newObsTimes
+        # if everything's fine, append the spatially and temporally regridded data to the obs data list (regObsData)
+        regObsData.append(oData)
+
+
+    """ all obs datasets have been read-in and regridded. convert the regridded obs data from 'list' to 'array'
+    also finalize 'obsTimes', the time coordinate values of the regridded obs data.
+    NOTE: using 'list_to_array' assigns values to the missing points; this has become a problem in handling the CRU data.
+          this problem disappears by using 'ma.array'."""
+
+    obsData = ma.array(regObsData)
+    obsTimes = newObsTimes
+    regObsData = 0
+    oldObsTimes = 0
+    nT = len(obsTimes)
+
+    # TODO:  Refactor this into a function within the toolkit module
+    # compute the simple multi-obs ensemble if multiple obs are used
+    if numOBSs > 1:
+        print 'numOBSs = ', numOBSs
+        oData = obsData
+        print 'oData shape = ', oData.shape
+        obsData = ma.zeros((numOBSs + 1, nT, ngrdY, ngrdX))
+        print 'obsData shape = ', obsData.shape
+        avg = ma.zeros((nT, ngrdY, ngrdX))
+        
+        for i in np.arange(numOBSs):
+            obsData[i, :, :, :] = oData[i, :, :, :]
+            avg[:, :, :] = avg[:, :, :] + oData[i, :, :, :]
+
+        avg = avg / float(numOBSs)
+        obsData[numOBSs, :, :, :] = avg[:, :, :]     # store the obs-ensemble data
+        numOBSs = numOBSs + 1                        # update the number of obs data to include the obs ensemble
+        obsList.append('ensOBS')
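+        # NOTE: a close vectorized alternative to the loop above would be
+        #   obsData = ma.concatenate((oData, oData.mean(axis=0)[np.newaxis]), axis=0)
+        # with one subtle difference: ma.mean skips masked points, whereas the
+        # explicit sum above marks a grid cell masked if ANY obs masks it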
+    print 'OBS regridded: ', obsData.shape
+
+    ## Part 2: load in and regrid model data from file(s)
+    ## NOTE: the two parameters, numMDLs and numMOmx, are defined to represent the number of models (w/ all 240 mos) and
+    ##       the total number of months, respectively, in later multi-model calculations.
+
+    typeF = 'nc'
+    regridMdlData = []
+    # extract the model names and store them in the list 'mdlName'
+    mdlName = []
+    mdlListReversed = []
+    if len(mdlList) > 1:
+        for element in mdlList:
+            mdlListReversed.append(element[::-1])
+        prefix = os.path.commonprefix(mdlList)
+        postfix = os.path.commonprefix(mdlListReversed)[::-1]
+        for element in mdlList:
+            mdlName.append(element.replace(prefix, '').replace(postfix, ''))
+    else:
+        mdlName.append('model')
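+    # e.g., mdlList = ['out/wrf_prec.nc', 'out/mm5_prec.nc'] yields
+    # prefix = 'out/', postfix = '_prec.nc', and mdlName = ['wrf', 'mm5']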
+
+    print ''
+    regularGrid = raw_input('MDL data are on regular grids? [y/n] \n>')
+    
+    for n in np.arange(numMDLs):
+        # read model grid info, then model data
+        ifile = mdlList[n]
+        print 'ifile= ', ifile
+        #modelLats, modelLons, mTimes = files.read_lolaT_from_file(ifile, modelLatVarName, modelLonVarName, modelTimeVarName, typeF)
+        #mTime, mdlDat, mvUnit = files.read_data_from_one_file(ifile, modelVarName, modelTimeVarName, modelLats, typeF)
+        modelLats, modelLons, mTimes, mdlDat, mvUnit = files.read_data_from_one_file(ifile,
+                                                                                     modelVarName,
+                                                                                     modelLatVarName,
+                                                                                     modelLonVarName,
+                                                                                     modelTimeVarName,
+                                                                                     typeF)
+        mdlT = []
+        mStep = len(mTimes)
+
+        for i in np.arange(mStep):
+            mdlT.append(mTimes[i].strftime("%Y%m"))
+
+        wh = (np.array(mdlT) >= yymm0) & (np.array(mdlT) <= yymm1)
+        modelTimes = list(np.array(mTimes)[wh])
+        mData = mdlDat[wh, :, :]
+        # convert the units of model water flux variables (precipitation, evaporation, runoff) into "mm/day" as necessary
+        if variableType in ('precipitation', 'evaporation', 'runoff'):
+            if mvUnit in ('KG M-2 S-1', 'kg m-2 s-1', 'MM S-1', 'mm s-1'):
+                print 'convert model variable unit from mm/s to mm/day'
+                mData = 86400. * mData
+        elif variableType in ('SWE', 'swe'):
+            if mvUnit in ('m', 'M', 'meter', 'METER'):
+                print 'convert model SWE unit from meters to mm'
+                mData = 1.e3 * mData
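+        # Unit rationale: for liquid water (density ~1000 kg/m^3) 1 kg m-2 equals
+        # 1 mm of water, so kg m-2 s-1 == mm/s and multiplying by 86400 s/day
+        # gives mm/day; likewise 1 m of SWE is 1000 mm, hence the 1.e3 factor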
+   
+        # determine the dimension size from the model time and latitude data.
+        nT = len(modelTimes)
+        
+        # UNUSED VARIABLES - WILL DELETE AFTER TESTING
+        # nmdlY=modelLats.shape[0]
+        # nmdlX=modelLats.shape[1]
+        #print 'nT, ngrdY, ngrdX = ',nT,ngrdY, ngrdX,min(modelTimes),max(modelTimes)
+        print '  The shape of model data to be processed= ', mData.shape, ' for the period ', min(modelTimes), max(modelTimes)
+        # spatial regridding of the model data
+        tmpMDL = ma.zeros((nT, ngrdY, ngrdX))
+        
+        if regridOption != 'model':
+            if regularGrid == 'y':
+                for t in np.arange(nT):
+                    tmpMDL[t, :, :] = process.do_regrid(mData[t, :, :], modelLats, modelLons, lats, lons)
+            else:
+                # The new interpolation scheme "scipy.interpolate.griddata" is used; note that masked values must be specified using "np.nan"
+                #     and that the "cubic" option causes problems in the interpolation (see the worked example in the obs section above)
+                for t in np.arange(nT):
+                    inLats = modelLats.flatten()
+                    inLons = modelLons.flatten()
+                    length = len(inLats)
+                    inPts = ma.zeros((length, 2))
+                    inPts[:,0] = inLats
+                    inPts[:,1] = inLons
+                    inData = mData[t, :, :].flatten()
+                    inData[inData < -999.] = np.nan
+                    tmpMDL[t, :, :] = scipy.interpolate.griddata(inPts, inData, (lats, lons), method = 'linear')
+                inLats = 0.
+                inLons = 0.
+                inPts  = 0.
+                inData = 0.
+        else:
+            tmpMDL = mData
+
+        # temporally regrid the model data
+        mData, newMdlTimes = process.regrid_in_time(tmpMDL, modelTimes, unit=timeRegridOption)
+        tmpMDL = 0.0
+        
+        # check data consistency for all models 
+        if n == 0:
+            oldMdlTimes = newMdlTimes
+        else:
+            if oldMdlTimes != newMdlTimes:
+                print 'temporally regridded mdl data time levels do not match at ', n - 1, n
+                print len(oldMdlTimes), len(newMdlTimes)
+                sys.exit()
+            else:
+                oldMdlTimes = newMdlTimes
+
+        # if everything's fine, append the spatially and temporally regridded data to the model data list (regridMdlData)
+        regridMdlData.append(mData)
+
+    modelData = ma.array(regridMdlData)
+    modelTimes = newMdlTimes
+    regridMdlData = 0
+    oldMdlTimes = 0
+    newMdlTimes = 0
+    # check consistency between the time levels of the model and obs data
+    #   this is the final update of time levels: 'Times' and 'nT'
+    if obsTimes != modelTimes:
+        print 'time levels of the obs and model data are not consistent. EXIT'
+        print 'obsTimes'
+        print obsTimes
+        print 'modelTimes'
+        print modelTimes
+        sys.exit()
+    #  'Times = modelTimes = obsTimes' has been established and modelTimes and obsTimes will not be used hereafter. (de-allocated)
+    Times = modelTimes
+    nT = len(modelTimes)
+    modelTimes = 0
+    obsTimes = 0
+
+    print 'Reading and regridding of the model data is completed'
+    print 'numMDLs, modelData.shape= ', numMDLs, modelData.shape
+    
+    # TODO: Do we need to make this a user supplied flag, or do we just create an ensemble ALWAYS
+    # TODO: Add in Kyo's code here as well
+    # TODO:  Commented out until I can talk with Jinwon about this
+    # compute the simple multi-model ensemble if multiple models are evaluated
+    if numMDLs > 1:
+        mdlData = modelData
+        modelData = ma.zeros((numMDLs + 1, nT, ngrdY, ngrdX))
+        avg = ma.zeros((nT, ngrdY, ngrdX))
+        for i in np.arange(numMDLs):
+            modelData[i, :, :, :] = mdlData[i, :, :, :]
+            avg[:, :, :] = avg[:, :, :] + mdlData[i, :, :, :]
+        avg = avg / float(numMDLs)
+        modelData[numMDLs, :, :, :] = avg[:, :, :]     # store the model-ensemble data
+        # THESE ARE NOT USED.  WILL DELETE AFTER TESTING
+        # i0mdl=0
+        # i1mdl=numMDLs
+        numMDLs = numMDLs + 1
+        mdlName.append('ensMDL')
+        print 'Eval mdl-mean timeseries for the obs periods: modelData.shape= ', modelData.shape
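+        # (The vectorized ma.concatenate alternative sketched for the obs
+        # ensemble above applies here as well.)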
+    # GOODALE:  This ensemble code should be refactored into the process.py module since its purpose is data processing
+
+    # Processing complete
+    print 'data_prep is completed: both obs and mdl data are re-gridded to a common analysis grid'
+    return numOBSs, numMDLs, nT, ngrdY, ngrdX, Times, lons, lats, obsData, modelData, obsList, mdlName, variableType

http://git-wip-us.apache.org/repos/asf/climate/blob/a6aa1cd2/src/main/python/rcmes/toolkit/do_data_prep.pyc
----------------------------------------------------------------------
diff --git a/src/main/python/rcmes/toolkit/do_data_prep.pyc b/src/main/python/rcmes/toolkit/do_data_prep.pyc
new file mode 100644
index 0000000..d2b708c
Binary files /dev/null and b/src/main/python/rcmes/toolkit/do_data_prep.pyc differ