You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@climate.apache.org by pr...@apache.org on 2013/08/27 07:35:49 UTC
svn commit: r1517753 [8/33] - in /incubator/climate/branches/rcmet-2.1.1: ./
src/ src/main/ src/main/python/ src/main/python/bin/ src/main/python/docs/
src/main/python/docs/_static/ src/main/python/docs/_templates/
src/main/python/rcmes/ src/main/pytho...
Added: incubator/climate/branches/rcmet-2.1.1/src/main/python/rcmes/toolkit/do_data_prep.py
URL: http://svn.apache.org/viewvc/incubator/climate/branches/rcmet-2.1.1/src/main/python/rcmes/toolkit/do_data_prep.py?rev=1517753&view=auto
==============================================================================
--- incubator/climate/branches/rcmet-2.1.1/src/main/python/rcmes/toolkit/do_data_prep.py (added)
+++ incubator/climate/branches/rcmet-2.1.1/src/main/python/rcmes/toolkit/do_data_prep.py Tue Aug 27 05:35:42 2013
@@ -0,0 +1,434 @@
+"""Prepare Datasets both model and observations for analysis using metrics"""
+
+
+import numpy as np
+import numpy.ma as ma
+import sys, os
+
+from storage import db, files
+import process
+from utils import misc
+import scipy.interpolate
+
+# TODO: swap gridBox for Domain
+def prep_data(settings,obsSource,obsDatasetList,obsList,obsVarName,obsLonName,obsLatName,obsTimeName,obsTimestep,gridBox,modelList):
+ """
+
+ returns numOBSs,numMDLs,nT,ngrdY,ngrdX,Times,lons,lats,obsData,modelData,obsList
+
+ numOBSs - number of Observational Datasets. Really need to look at using len(obsDatasetList) instead
+ numMDLs - number of Models used. Again should use the len(modelList) instead
+ nT - Time value count after temporal regridding. Should use length of the time axis for a given dataset
+ ngrdY - size of the Y-Axis grid after spatial regridding
+ ngrdX - size of the X-Axis grid after spatial regridding
+ Times - list of python datetime objects the represent the list of time to be used in further calculations
+ lons -
+ lats -
+ obsData -
+ modelData -
+ obsList -
+
+ # 5/28/2013: Add 'obsSource' to indicate the source of the reference data file(s)
+ """
+
+ # TODO: Stop the object Deserialization and work on refactoring the core code here
+ cachedir = settings.cacheDir
+ workdir = settings.workDir
+
+ # Use list comprehensions to deconstruct obsDatasetList
+ # ['TRMM_pr_mon', 'CRU3.1_pr'] Basically a list of Dataset NAME +'_' + parameter name - THE 'CRU*' one triggers unit conversion issues later
+ # the plan here is to use the obsDatasetList which contains a longName key we can use.
+ if obsSource == 0: # reference data from RCMED
+ obsList = [str(x['longname']) for x in obsDatasetList]
+ # Also using the obsDatasetList with a key of ['dataset_id']
+ obsDatasetId = [str(x['dataset_id']) for x in obsDatasetList]
+ # obsDatasetList ['paramter_id'] list
+ obsParameterId = [str(x['parameter_id']) for x in obsDatasetList]
+ obsTimestep = [str(x['timestep']) for x in obsDatasetList]
+ elif obsSource == 1: # reference data from users' disk
+ # in this case, obsList and obsTimestep have been defined in the calling routine and passed into this routine
+ pass
+ elif obsSource < 0: # no reference data are involved
+ pass
+
+ mdlList = [model.filename for model in modelList]
+
+ # Since all of the model objects in the modelList have the same VarName and PrecipFlag, I am going to merely
+ # pull this from modelList[0] for now
+ modelVarName = modelList[0].varName
+ precipFlag = modelList[0].precipFlag
+ modelTimeVarName = modelList[0].timeVariable
+ modelLatVarName = modelList[0].latVariable
+ modelLonVarName = modelList[0].lonVariable
+ regridOption = settings.spatialGrid
+ timeRegridOption = settings.temporalGrid
+ print 'model parameter names ',modelVarName,modelTimeVarName,modelLatVarName,modelLonVarName
+
+ """
+ Routine to read-in and re-grid both obs and mdl datasets.
+ Processes both single and multiple files of obs and mdl or combinations in a general way.
+ i) retrieve observations from the database
+ ii) load in model data
+ iii) temporal regridding
+ iv) spatial regridding
+ v) area-averaging
+ Input:
+ cachedir - string describing directory path
+ workdir - string describing directory path
+ obsList - string describing the observation data files
+ obsDatasetId - int, db dataset id
+ obsParameterId - int, db parameter id
+ latMin, latMax, lonMin, lonMax, dLat, dLon, naLats, naLons: define the evaluation/analysis domain/grid system
+ latMin - float
+ latMax - float
+ lonMin - float
+ lonMax - float
+ dLat - float
+ dLon - float
+ naLats - integer
+ naLons - integer
+ mdlList - string describing model file name + path
+ modelVarName - string describing name of variable to evaluate (as written in model file)
+ precipFlag - bool (is this precipitation data? True/False)
+ modelTimeVarName - string describing name of time variable in model file
+ modelLatVarName - string describing name of latitude variable in model file
+ modelLonVarName - string describing name of longitude variable in model file
+ regridOption - string: 'obs'|'model'|'user'
+ timeRegridOption -string: 'full'|'annual'|'monthly'|'daily'
+ maskOption - Boolean
+
+ # TODO: This isn't true in the current codebase.
+ Instead the SubRegion's are being used. You can see that these values are not
+ being used in the code, at least they are not being passed in from the function
+
+ maskLatMin - float (only used if maskOption=1)
+ maskLatMax - float (only used if maskOption=1)
+ maskLonMin - float (only used if maskOption=1)
+ maskLonMax - float (only used if maskOption=1)
+ Output: image files of plots + possibly data
+ Jinwon Kim, 7/11/2012
+ """
+
+
+ # check the number of obs & model data files
+ numOBSs = len(obsList)
+ numMDLs = len(mdlList)
+
+ # assign parameters that must be preserved throughout the process
+
+ print 'start & end time = ', settings.startDate, settings.endDate
+ yymm0 = settings.startDate.strftime("%Y%m")
+ yymm1 = settings.endDate.strftime("%Y%m")
+ print 'start & end eval period = ', yymm0, yymm1
+
+
+ #TODO: Wrap in try except blocks instead
+ # 2b expanded to encompass mdl-only or obs-only cases (post 2.1 item)
+ if numMDLs < 1:
+ print 'No input model data file. EXIT'
+ sys.exit()
+ if numOBSs < 1:
+ print 'No input observation data file. EXIT'
+ sys.exit()
+
+ ## Part 1: retrieve observation data from the database and regrid them
+ ## NB. automatically uses local cache if already retrieved.
+
+ # preparation for spatial re-gridding: define the size of horizontal array of the target interpolation grid system (ngrdX and ngrdY)
+ print 'regridOption in prep_data= ', regridOption
+ if regridOption == 'model':
+ ifile = mdlList[0]
+ typeF = 'nc'
+ lats, lons, mTimes = files.read_lolaT_from_file(ifile, modelLatVarName, modelLonVarName, modelTimeVarName, typeF)
+ modelObject = modelList[0]
+ latMin = modelObject.latMin
+ latMax = modelObject.latMax
+ lonMin = modelObject.lonMin
+ lonMax = modelObject.lonMax
+ elif regridOption == 'user':
+ # Use the GridBox Object
+ latMin = gridBox.latMin
+ latMax = gridBox.latMax
+ lonMin = gridBox.lonMin
+ lonMax = gridBox.lonMax
+ naLats = gridBox.latCount
+ naLons = gridBox.lonCount
+ dLat = gridBox.latStep
+ dLon = gridBox.lonStep
+ lat = np.arange(naLats) * dLat + latMin
+ lon = np.arange(naLons) * dLon + lonMin
+ lons, lats = np.meshgrid(lon, lat)
+ lon = 0.
+ lat = 0.
+ else:
+ print "INVALID REGRID OPTION USED"
+ sys.exit()
+
+ ngrdY = lats.shape[0]
+ ngrdX = lats.shape[1]
+
+ # All user-provided reference data are to be in netCDF format
+ if obsSource == 1:
+ typeF = 'nc'
+ regObsData = []
+ print ''
+ regularGrid = raw_input('OBS data are on regular grids? [y/n] \n>')
+
+ for n in np.arange(numOBSs):
+
+ # read-in obs data and first do spatial regridding
+ if obsSource == 0: # reference data from RCMED
+ oLats, oLons, _, oTimes, oData = db.extractData(obsDatasetId[n],
+ obsParameterId[n],
+ latMin, latMax,
+ lonMin, lonMax,
+ settings.startDate, settings.endDate,
+ cachedir, obsTimestep[n])
+ # TODO: modify this if block with new metadata usage.
+ if precipFlag == True and obsList[n][0:3] == 'CRU':
+ oData = 86400.0 * oData
+ elif obsSource == 1: # reference data from users' own file
+ ifile = obsDatasetList[n]
+ oLats, oLons, oTimes = files.read_lolaT_from_file(ifile, obsLatName, obsLonName, obsTimeName, typeF)
+ oTime, oData, ovUnit = files.read_data_from_one_file(ifile, obsVarName, obsTimeName, oLats, typeF)
+ if precipFlag & ((ovUnit=='KG M-2 S-1') or (ovUnit=='kg m-2 s-1') or (ovUnit == 'MM S-1') or (ovUnit == 'mm s-1')):
+ print 'convert model variable unit from mm/s to mm/day'
+ oData=86400.*oData
+ # extract the reference data for the specified period
+ obsT = []
+ oStp = len(oTimes)
+ for i in np.arange(oStp):
+ obsT.append(oTimes[i].strftime("%Y%m"))
+ wh = (np.array(obsT) >= yymm0) & (np.array(obsT) <= yymm1)
+ oTimes = list(np.array(oTimes)[wh])
+ oData = oData[wh,:,:]
+
+ nstOBSs = oData.shape[0] # note that the length of obs data can vary for different obs intervals (e.g., daily vs. monthly)
+ print 'Regrid OBS dataset onto the ', regridOption, ' grid system: ngrdY, ngrdX, nstOBSs= ', ngrdY, ngrdX, nstOBSs
+ print 'For dataset: %s' % obsList[n]
+
+ tmpOBS = ma.zeros((nstOBSs, ngrdY, ngrdX))
+ print 'tmpOBS shape = ', tmpOBS.shape
+ if regularGrid == 'y':
+ for t in np.arange(nstOBSs):
+ tmpOBS[t, :, :] = process.do_regrid(oData[t, :, :], oLats, oLons, lats, lons)
+ else:
+ # new intetpolation scheme "scipy.interpolate.griddata" is used. note that masked values must be specified using "np.nan"
+ # note that using "cubic" option causes problems in interpolation
+ for t in np.arange(nstOBSs):
+ inLats = oLats.flatten()
+ inLons = oLons.flatten()
+ length = len(inLats)
+ inPts = ma.zeros((length, 2))
+ inPts[:,0] = inLats
+ inPts[:,1] = inLons
+ inData = oData[t, :, :].flatten()
+ inData[inData < -999.] = np.nan
+ tmpOBS[t, :, :] = scipy.interpolate.griddata(inPts, inData, (lats, lons), method = 'linear')
+ inLats = 0.
+ inLons = 0.
+ inPts = 0.
+ inData = 0.
+ print 'spatial regridding of the obs data ',n,' is completed'
+
+ # TODO: Not sure this is needed with Python Garbage Collector
+ # The used memory should be freed when the objects are no longer referenced. If this continues to be an issue we may need to look
+ # at using generators where possible.
+ oLats = 0.0
+ oLons = 0.0 # release the memory occupied by the temporary variables oLats and oLons.
+
+ # temporally regrid the spatially regridded obs data
+ oData, newObsTimes = process.calc_average_on_new_time_unit_K(tmpOBS, oTimes, unit=timeRegridOption)
+
+ tmpOBS = 0.0 # release temporary array after regridding
+
+ # check the consistency of temporally regridded obs data
+ if n == 0:
+ oldObsTimes = newObsTimes
+ else:
+ if oldObsTimes != newObsTimes:
+ print 'temporally regridded obs data time levels do not match at ', n - 1, n
+ print '%s Time through Loop' % (n + 1)
+ print 'oldObsTimes Count: %s' % len(oldObsTimes)
+ print 'newObsTimes Count: %s' % len(newObsTimes)
+ # TODO: We need to handle these cases using Try Except Blocks or insert a sys.exit if appropriate
+ sys.exit()
+ else:
+ oldObsTimes = newObsTimes
+ # if everything's fine, append the spatially and temporally regridded data in the obs data array (obsData)
+ regObsData.append(oData)
+
+
+ """ all obs datasets have been read-in and regridded. convert the regridded obs data from 'list' to 'array'
+ also finalize 'obsTimes', the time coordinate values of the regridded obs data.
+ NOTE: using 'list_to_array' assigns values to the missing points; this has become a problem in handling the CRU data.
+ this problem disappears by using 'ma.array'."""
+
+ obsData = ma.array(regObsData)
+ obsTimes = newObsTimes
+ regObsData = 0
+ oldObsTimes = 0
+ nT = len(obsTimes)
+
+ # TODO: Refactor this into a function within the toolkit module
+ # compute the simple multi-obs ensemble if multiple obs are used
+ if numOBSs > 1:
+ print 'numOBSs = ', numOBSs
+ oData = obsData
+ print 'oData shape = ', oData.shape
+ obsData = ma.zeros((numOBSs + 1, nT, ngrdY, ngrdX))
+ print 'obsData shape = ', obsData.shape
+ avg = ma.zeros((nT, ngrdY, ngrdX))
+
+ for i in np.arange(numOBSs):
+ obsData[i, :, :, :] = oData[i, :, :, :]
+ avg[:, :, :] = avg[:, :, :] + oData[i, :, :, :]
+
+ avg = avg / float(numOBSs)
+ obsData[numOBSs, :, :, :] = avg[:, :, :] # store the obs-ensemble data
+ numOBSs = numOBSs + 1 # update the number of obs data to include the obs ensemble
+ obsList.append('ensOBS')
+ print 'OBS regridded: ', obsData.shape
+
+ ## Part 2: load in and regrid model data from file(s)
+ ## NOTE: tthe wo parameters, numMDLs and numMOmx are defined to represent the number of models (w/ all 240 mos) &
+ ## the total number of months, respectively, in later multi-model calculations.
+
+ typeF = 'nc'
+ regridMdlData = []
+ # extract the model names and store them in the list 'mdlList'
+ mdlName = []
+ mdlListReversed=[]
+ if len(mdlList) >1:
+ for element in mdlList:
+ mdlListReversed.append(element[::-1])
+ prefix=os.path.commonprefix(mdlList)
+ postfix=os.path.commonprefix(mdlListReversed)[::-1]
+ for element in mdlList:
+ mdlName.append(element.replace(prefix,'').replace(postfix,''))
+ else:
+ mdlName.append('model')
+
+ print ''
+ regularGrid = raw_input('MDL data are on regular grids? [y/n] \n>')
+
+ for n in np.arange(numMDLs):
+ # read model grid info, then model data
+ ifile = mdlList[n]
+ print 'ifile= ', ifile
+ modelLats, modelLons, mTimes = files.read_lolaT_from_file(ifile, modelLatVarName, modelLonVarName, modelTimeVarName, typeF)
+ mTime, mdlDat, mvUnit = files.read_data_from_one_file(ifile, modelVarName, modelTimeVarName, modelLats, typeF)
+ mdlT = []
+ mStep = len(mTimes)
+
+ for i in np.arange(mStep):
+ mdlT.append(mTimes[i].strftime("%Y%m"))
+
+ wh = (np.array(mdlT) >= yymm0) & (np.array(mdlT) <= yymm1)
+ modelTimes = list(np.array(mTimes)[wh])
+ mData = mdlDat[wh, :, :]
+
+ # determine the dimension size from the model time and latitude data.
+ nT = len(modelTimes)
+
+ # UNUSED VARIABLES - WILL DELETE AFTER TESTING
+ # nmdlY=modelLats.shape[0]
+ # nmdlX=modelLats.shape[1]
+ #print 'nT, ngrdY, ngrdX = ',nT,ngrdY, ngrdX,min(modelTimes),max(modelTimes)
+ print ' The shape of model data to be processed= ', mData.shape, ' for the period ', min(modelTimes), max(modelTimes)
+ # spatial regridding of the modl data
+ tmpMDL = ma.zeros((nT, ngrdY, ngrdX))
+
+ if regridOption != 'model':
+ if regularGrid == 'y':
+ for t in np.arange(nT):
+ tmpMDL[t, :, :] = process.do_regrid(mData[t, :, :], modelLats, modelLons, lats, lons)
+ else:
+ # new intetpolation scheme "scipy.interpolate.griddata" is used. note that masked values must be specified using "np.nan"
+ # note that using "cubic" option causes problems in interpolation
+ for t in np.arange(nstOBSs):
+ inLats = modelLats.flatten()
+ inLons = modelLons.flatten()
+ length = len(inLats)
+ inPts = ma.zeros((length, 2))
+ inPts[:,0] = inLats
+ inPts[:,1] = inLons
+ inData = mData[t, :, :].flatten()
+ inData[inData < -999.] = np.nan
+ tmpMDL[t, :, :] = scipy.interpolate.griddata(inPts, inData, (lats, lons), method = 'linear')
+ inLats = 0.
+ inLons = 0.
+ inPts = 0.
+ inData = 0.
+ else:
+ tmpMDL = mData
+
+ # temporally regrid the model data
+ mData, newMdlTimes = process.regrid_in_time(tmpMDL, modelTimes, unit=timeRegridOption)
+ tmpMDL = 0.0
+
+ # check data consistency for all models
+ if n == 0:
+ oldMdlTimes = newMdlTimes
+ else:
+ if oldMdlTimes != newMdlTimes:
+ print 'temporally regridded mdl data time levels do not match at ', n - 1, n
+ print len(oldMdlTimes), len(newMdlTimes)
+ sys.exit()
+ else:
+ oldMdlTimes = newMdlTimes
+
+ # if everything's fine, append the spatially and temporally regridded data in the obs data array (obsData)
+ regridMdlData.append(mData)
+
+ modelData = ma.array(regridMdlData)
+ modelTimes = newMdlTimes
+ regridMdlData = 0
+ oldMdlTimes = 0
+ newMdlTimes = 0
+ if precipFlag & ((mvUnit=='KG M-2 S-1') or (mvUnit=='kg m-2 s-1') or (mvUnit == 'MM S-1') or (mvUnit == 'mm s-1')):
+ print 'convert model variable unit from mm/s to mm/day'
+ modelData = 86400.*modelData
+
+ # check consistency between the time levels of the model and obs data
+ # this is the final update of time levels: 'Times' and 'nT'
+ if obsTimes != modelTimes:
+ print 'time levels of the obs and model data are not consistent. EXIT'
+ print 'obsTimes'
+ print obsTimes
+ print 'modelTimes'
+ print modelTimes
+ sys.exit()
+ # 'Times = modelTimes = obsTimes' has been established and modelTimes and obsTimes will not be used hereafter. (de-allocated)
+ Times = modelTimes
+ nT = len(modelTimes)
+ modelTimes = 0
+ obsTimes = 0
+
+ print 'Reading and regridding model data are completed'
+ print 'numMDLs, modelData.shape= ', numMDLs, modelData.shape
+
+ # TODO: Do we need to make this a user supplied flag, or do we just create an ensemble ALWAYS
+ # TODO: Add in Kyo's code here as well
+ # TODO: Commented out until I can talk with Jinwon about this
+ # compute the simple multi-model ensemble if multiple models are evaluated
+ if numMDLs > 1:
+ mdlData=modelData
+ modelData=ma.zeros((numMDLs+1,nT,ngrdY,ngrdX))
+ avg=ma.zeros((nT,ngrdY,ngrdX))
+ for i in np.arange(numMDLs):
+ modelData[i,:,:,:]=mdlData[i,:,:,:]
+ avg[:,:,:]=avg[:,:,:]+mdlData[i,:,:,:]
+ avg=avg/float(numMDLs)
+ modelData[numMDLs,:,:,:]=avg[:,:,:] # store the model-ensemble data
+ # THESE ARE NOT USED. WILL DELETE AFTER TESTING
+ # i0mdl=0
+ # i1mdl=numMDLs
+ numMDLs=numMDLs+1
+ mdlName.append('ensMDL')
+ print 'Eval mdl-mean timeseries for the obs periods: modelData.shape= ',modelData.shape
+ # GOODALE: This ensemble code should be refactored into process.py module since it's purpose is data processing
+
+ # Processing complete
+ print 'data_prep is completed: both obs and mdl data are re-gridded to a common analysis grid'
+ return numOBSs, numMDLs, nT, ngrdY, ngrdX, Times, lons, lats, obsData, modelData, obsList, mdlName
Propchange: incubator/climate/branches/rcmet-2.1.1/src/main/python/rcmes/toolkit/do_data_prep.py
------------------------------------------------------------------------------
svn:executable = *