Posted to commits@climate.apache.org by jo...@apache.org on 2014/11/10 17:05:52 UTC
[05/10] climate git commit: CLIMATE-551 - Drop old RCMET toolkit from codebase
http://git-wip-us.apache.org/repos/asf/climate/blob/8c142c35/rcmet/src/main/python/rcmes/storage/db.py
----------------------------------------------------------------------
diff --git a/rcmet/src/main/python/rcmes/storage/db.py b/rcmet/src/main/python/rcmes/storage/db.py
deleted file mode 100644
index 22a1dfd..0000000
--- a/rcmet/src/main/python/rcmes/storage/db.py
+++ /dev/null
@@ -1,359 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-"""Collection of functions used to interface with the database and to create netCDF file
-"""
-import os
-import urllib2
-import re
-import numpy as np
-import numpy.ma as ma
-import json
-import netCDF4
-
-from classes import RCMED
-from toolkit import process
-from datetime import timedelta, datetime
-from calendar import monthrange
-
-def reorderXYT(lons, lats, times, values):
-    # Re-order the values array so that, when reshaped, every point lands where it should be
-    # (the DB doesn't necessarily return everything in order)
- order = np.lexsort((lons, lats, times))
- counter = 0
- sortedValues = np.zeros_like(values)
- sortedLats = np.zeros_like(lats)
- sortedLons = np.zeros_like(lons)
- for i in order:
- sortedValues[counter] = values[i]
- sortedLats[counter] = lats[i]
- sortedLons[counter] = lons[i]
- counter += 1
-
- return sortedValues, sortedLats, sortedLons
-
-def findUnique(seq, idfun=None):
- """
- Function to find unique values (used in construction of unique datetime list)
- NB. order preserving
- Input: seq - a list of randomly ordered values
- Output: result - list of ordered values
- """
- if idfun is None:
- def idfun(x):
- return x
-
-    seen = {}
-    result = []
-
- for item in seq:
- marker = idfun(item)
- # in old Python versions:
- # if seen.has_key(marker)
- # but in new ones:
- if marker in seen: continue
- seen[marker] = 1
- result.append(item)
- return result
-
-def get_param_info(url):
-
-    '''
-    Fetch the general information for the given URL from the parameter table.
-    '''
- url = url + "&info=yes"
- result = urllib2.urlopen(url)
- datastring = result.read()
-    datastring = json.loads(datastring)
-    database = datastring["database"]
-    timestep = datastring["timestep"]
-    realm = datastring["realm"]
-    instrument = datastring["instrument"]
-    start_date = datastring["start_date"]
-    end_date = datastring["end_date"]
-    unit = datastring["units"]
-
- return database, timestep, realm, instrument, start_date, end_date, unit
-
-def get_data(url):
-
-    '''
-    Query the database at the given URL and return the datapoints' latitudes, longitudes, levels, times and values.
-    '''
-
- result = urllib2.urlopen(url)
- datastring = result.read()
- d = re.search('data: \r\n', datastring)
- data = datastring[d.end():len(datastring)]
-
-    # Create a list of all datapoints
-    data = data.split('\r\n')
-
- latitudes = []
- longitudes = []
- levels = []
- values = []
- timestamps = []
-
- # To make a series of lists from datapoints
- for i in range(len(data)-1): # Because the last row is empty, "len(data)-1" is used.
-        row = data[i].split(',')
- latitudes.append(np.float32(row[0]))
- longitudes.append(np.float32(row[1]))
- levels.append(np.float32(row[2]))
- # timestamps are strings so we will leave them alone for now
- timestamps.append(row[3])
- values.append(np.float32(row[4]))
-
- return latitudes, longitudes, levels, values, timestamps
-
-
-def create_netCDF(latitudes, longitudes, levels, values, timestamps, database, latMin, latMax, lonMin, lonMax, startTime, endTime, unit, netCD_fileName):
-
- '''
- This function will generate netCDF files.
- '''
-
- # To generate netCDF file from database
-    netcdf = netCDF4.Dataset(netCD_fileName, mode='w')
-    string = "The netCDF file for parameter: " + database + ", latMin: " + str(latMin) + ", latMax: " + str(latMax) + ", lonMin: " + str(lonMin) + ", lonMax: " + str(lonMax) + " startTime: " + str(startTime) + " and endTime: " + str(endTime) + "."
- netcdf.globalAttName = str(string)
- netcdf.createDimension('dim', len(latitudes))
- latitude = netcdf.createVariable('lat', 'd', ('dim',))
- longitude = netcdf.createVariable('lon', 'd', ('dim',))
- level = netcdf.createVariable('lev', 'd', ('dim',))
- time = netcdf.createVariable('time', 'd', ('dim',))
- value = netcdf.createVariable('value', 'd', ('dim',))
-
- netcdf.variables['lat'].varAttName = 'latitude'
- netcdf.variables['lat'].units = 'degrees_north'
- netcdf.variables['lon'].varAttName = 'longitude'
- netcdf.variables['lon'].units = 'degrees_east'
- netcdf.variables['time'].varAttName = 'time'
- netcdf.variables['time'].units = 'hours since ' + str(startTime)
- netcdf.variables['value'].varAttName = 'value'
- netcdf.variables['value'].units = str(unit)
- netcdf.variables['lev'].varAttName = 'level'
- netcdf.variables['lev'].units = 'hPa'
-
-    hours = []
-    timeFormat = "%Y-%m-%d %H:%M:%S"
-    base_date = startTime
-    # Convert each timestamp to hours since the base date
-    # NB. diff.days*24 truncates any sub-daily offset to whole days
-    for t in timestamps:
-        date = datetime.strptime(t, timeFormat)
-        diff = date - base_date
-        hours.append(diff.days * 24)
-
-    latitude[:] = latitudes[:]
-    longitude[:] = longitudes[:]
-    level[:] = levels[:]
-    time[:] = hours[:]
-    value[:] = values[:]
- netcdf.close()
-
-def read_netcdf(netCD_fileName):
-
-    '''
-    Read an existing netCDF file, convert the hours from the netCDF time variable,
-    and return latitudes, longitudes, levels, times and values.
-    '''
- # To use the created netCDF file
- netcdf = netCDF4.Dataset(netCD_fileName , mode='r')
- # To get all data from netCDF file
- latitudes = netcdf.variables['lat'][:]
- longitudes = netcdf.variables['lon'][:]
- levels = netcdf.variables['lev'][:]
- hours = netcdf.variables['time'][:]
- values = ma.array(netcdf.variables['value'][:])
-
-    # Get the base date
-    time_unit = netcdf.variables['time'].units.encode()
-    time_unit = time_unit.split(' ')
-    base_date = time_unit[2] + " " + time_unit[3]
-
- netcdf.close()
-
- timeFormat = "%Y-%m-%d %H:%M:%S"
-
-    # Because time in the netCDF file is stored as hours since a specific date, it needs to be converted back to date format
-    times = []
-    # Convert the base date to the Python datetime format
-    base_date = datetime.strptime(base_date, timeFormat)
-    for each in range(len(hours)):
-        # timedelta()'s first positional argument is days, so convert hours to days
-        hour = timedelta(hours[each] / 24)
-        eachTime = base_date + hour
-        times.append(eachTime.strftime(timeFormat))
-
- return latitudes, longitudes, levels, times, values
-
-
-def improve_data(latitudes, longitudes, levels, times, values, timestep):
-
- # Make arrays of unique latitudes, longitudes, levels and times
- uniqueLatitudes = np.unique(latitudes)
- uniqueLongitudes = np.unique(longitudes)
- uniqueLevels = np.unique(levels)
- uniqueTimestamps = np.unique(times)
-
- # Calculate nx and ny
- uniqueLongitudeCount = len(uniqueLongitudes)
- uniqueLatitudeCount = len(uniqueLatitudes)
- uniqueLevelCount = len(uniqueLevels)
- uniqueTimeCount = len(uniqueTimestamps)
-
- values, latitudes, longitudes = reorderXYT(longitudes, latitudes, times, values)
-
-    # Convert each unique time string into a Python datetime object
-    timeFormat = "%Y-%m-%d %H:%M:%S"
- timesUnique = [datetime.strptime(t, timeFormat) for t in uniqueTimestamps]
- timesUnique.sort()
- timesUnique = process.normalizeDatetimes(timesUnique, timestep)
-
- # Reshape arrays
- latitudes = latitudes.reshape(uniqueTimeCount, uniqueLatitudeCount, uniqueLongitudeCount, uniqueLevelCount)
- longitudes = longitudes.reshape(uniqueTimeCount, uniqueLatitudeCount, uniqueLongitudeCount, uniqueLevelCount)
- levels = np.array(levels).reshape(uniqueTimeCount, uniqueLatitudeCount, uniqueLongitudeCount, uniqueLevelCount)
- values = values.reshape(uniqueTimeCount, uniqueLatitudeCount, uniqueLongitudeCount, uniqueLevelCount)
-
- # Flatten dimension if only single level
- if uniqueLevelCount == 1:
- values = values[:, :, :, 0]
- latitudes = latitudes[0, :, :, 0]
- longitudes = longitudes[0, :, :, 0]
-
-    # Create a masked array to deal with missing values
-    #  - this makes functions like values.mean(), values.max() etc. ignore missing values
- mdi = -9999 # TODO: extract this value from the DB retrieval metadata
- mdata = ma.masked_array(values, mask=(values == mdi))
-
-
- return latitudes, longitudes, uniqueLevels, timesUnique, mdata
-
-
-def extractData ( datasetID, paramID, latMin, latMax, lonMin, lonMax, userStartTime, userEndTime, cachedir, timestep ):
-
- """
- Main function to extract data from DB into numpy masked arrays, and also to create monthly netCDF file as cache
-
- Input::
- datasetID, paramID: required identifiers of data in database
- latMin, latMax, lonMin, lonMax: location range to extract data for
- startTime, endTime: python datetime objects describing required time range to extract
- cachedir: directory path used to store temporary cache files
- timestep: "daily" | "monthly" so we can be sure to query the RCMED properly
- Output:
- uniqueLatitudes,uniqueLongitudes: 1d-numpy array of latitude and longitude grid values
- uniqueLevels: 1d-numpy array of vertical level values
- timesUnique: list of python datetime objects describing times of returned data
- mdata: masked numpy arrays of data values
- """
-
- url = RCMED.jplUrl(datasetID, paramID, latMin, latMax, lonMin, lonMax, userStartTime, userEndTime, cachedir, timestep)
-
- # To get the parameter's information from parameter table
- database, timestep, realm, instrument, dbStartDate, dbEndDate, unit = get_param_info(url)
-
- # Create a directory inside the cache directory
- name = []
-    # activity is a fixed value
-    activity = "obs4cmip5"
-    name.append(activity)
-    # product is a fixed value
-    product = "observations"
-    name.append(product)
-    # realm, variable, frequency and instrument come from the parameter table
-    name.append(realm)
-    variable = database
-    name.append(variable)
-    frequency = timestep
-    name.append(frequency)
-    data_structure = "grid"
-    name.append(data_structure)
-    institution = "NASA"
-    name.append(institution)
-    project = "RCMES"
-    name.append(project)
-    name.append(instrument)
-    version = "v1"
-    name.append(version)
-
-    # Create the netCDF cache directory hierarchy if it does not already exist
-    temp_path = cachedir
-    for n in name:
-        path = os.path.join(temp_path, n)
-        if not os.path.exists(path):
-            os.mkdir(path)
-        temp_path = path
-
- processing_level = 'L3'
- processing_version = "processing_version" # the processing version is still unknown and can be added later
-
- timeFormat = "%Y-%m-%d %H:%M:%S"
-
- date_list, lats, longs, uniqueLevls, uniqueTimes, vals = [], [], [], [], [], []
-
-    # Make a list (date_list) of all months available based on the user's time request
-    while userStartTime <= userEndTime:
-        # Beginning of the month
-        beginningOfMonth = str("%04d" % userStartTime.year) + "-" + str("%02d" % userStartTime.month) + "-" + "01 00:00:00"
-        # End of the month
-        endOfMonth = str("%04d" % userStartTime.year) + "-" + str("%02d" % userStartTime.month) + "-" + str(monthrange(userStartTime.year, userStartTime.month)[1]) + " 00:00:00"
-        # Convert both beginning and end of month from string to Python datetime format
-        beginningOfMonth = datetime.strptime(beginningOfMonth, timeFormat)
-        endOfMonth = datetime.strptime(endOfMonth, timeFormat)
-        # Add the beginning and end of month as a pair to date_list
-        date_list.append([beginningOfMonth, endOfMonth])
-        # Advance to the beginning of the next month
-        userStartTime = endOfMonth + timedelta(days=1)
-
-
- # To loop over all months and return data
- for i, date in enumerate(date_list):
- netCDF_name = variable + '_' + project + '_' + processing_level + '_' + processing_version + '_' + str(latMin) + '_' + str(latMax) + '_' + str(lonMin) + '_' + str(lonMax) + '_' + str("%04d" % date[0].year) + str("%02d" % date[0].month) + '.nc'
-
- # To check if netCDF file exists, then use it
- if os.path.exists(path+"/"+ netCDF_name):
- latitudes, longitudes, levels, times, values = read_netcdf(path + "/" + netCDF_name)
-
- # If the netCDF file does not exist, then create one and read it.
- else:
-            # Query the DB for just one month of data at a time
-            print "%s of %s Database Download(s) Complete" % (i, len(date_list))
- url = RCMED.jplUrl(datasetID, paramID, latMin, latMax, lonMin, lonMax, date[0], date[1], cachedir, timestep)
-
- # To get data from DB
- latitudes, longitudes, levels, values, timestamps = get_data(url)
- create_netCDF(latitudes, longitudes, levels, values, timestamps, database, latMin, latMax, lonMin, lonMax, date[0], date[1], unit, path + "/" + netCDF_name)
-
- # To read from netCDF files
- latitudes, longitudes, levels, times, values = read_netcdf(path + "/" + netCDF_name)
-
-        lats = np.append(lats, latitudes)
-        longs = np.append(longs, longitudes)
-        uniqueLevls = np.append(uniqueLevls, levels)
-        uniqueTimes = np.append(uniqueTimes, times)
-        vals = np.append(vals, values)
-
- latitudes, longitudes, uniqueLevels, timesUnique, mdata = improve_data(lats, longs, uniqueLevls, uniqueTimes, vals, timestep)
-
- return latitudes, longitudes, uniqueLevels, timesUnique, mdata
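
For reference: reorderXYT() above applies the sort permutation point-by-point in a Python loop. A minimal sketch of the same reordering as a single numpy fancy-index pass (Python 3; the helper name reorder_xyt is illustrative, not part of the removed module):

    import numpy as np

    def reorder_xyt(lons, lats, times, values):
        # np.lexsort treats the last key as the primary sort key, so this
        # orders points by time, then latitude, then longitude (matching
        # the original reorderXYT()) and applies the permutation in one pass.
        order = np.lexsort((lons, lats, times))
        return values[order], lats[order], lons[order]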
http://git-wip-us.apache.org/repos/asf/climate/blob/8c142c35/rcmet/src/main/python/rcmes/storage/files.py
----------------------------------------------------------------------
diff --git a/rcmet/src/main/python/rcmes/storage/files.py b/rcmet/src/main/python/rcmes/storage/files.py
deleted file mode 100644
index b238754..0000000
--- a/rcmet/src/main/python/rcmes/storage/files.py
+++ /dev/null
@@ -1,783 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-"""
-Module for handling data input files. Requires netCDF4 and NumPy to be
-installed.
-
-This module can easily open NetCDF, HDF and Grib files. Search the netCDF4
-documentation for a complete list of supported formats.
-"""
-
-from os import path
-import netCDF4
-import numpy as np
-import numpy.ma as ma
-import sys
-
-from toolkit import process
-from utils import fortranfile
-from utils import misc
-
-
-VARIABLE_NAMES = {'time': ['time', 'times', 'date', 'dates', 'julian'],
- 'latitude': ['latitude', 'lat', 'lats', 'latitudes'],
- 'longitude': ['longitude', 'lon', 'lons', 'longitudes']
- }
-
-
-def findunique(seq):
- keys = {}
- for e in seq:
- keys[e] = 1
- return keys.keys()
-
-def getVariableByType(filename, variableType):
- """
- Function that will try to return the variable from a file based on a provided
- parameter type.
-
- Input::
- filename - the file to inspect
- variableType - time | latitude | longitude
-
- Output::
- variable name OR list of all variables in the file if a single variable
- name match cannot be found.
- """
- try:
- f = netCDF4.Dataset(filename, mode='r')
- except:
- print "netCDF4Error:", sys.exc_info()[0]
- raise
-
- variableKeys = f.variables.keys()
- f.close()
- variableKeys = [variable.encode().lower() for variable in variableKeys]
- variableMatch = VARIABLE_NAMES[variableType]
-
- commonVariables = list(set(variableKeys).intersection(variableMatch))
-
- if len(commonVariables) == 1:
- return str(commonVariables[0])
-
- else:
- return variableKeys
-
-def getVariableRange(filename, variableName):
- """
- Function to return the min and max values of the given variable in
- the supplied filename.
-
- Input::
- filename - absolute path to a file
- variableName - variable whose min and max values should be returned
-
- Output::
- variableRange - tuple of order (variableMin, variableMax)
- """
- try:
- f = netCDF4.Dataset(filename, mode='r')
- except:
- print "netCDF4Error:", sys.exc_info()[0]
- raise
-
- varArray = f.variables[variableName][:]
- return (varArray.min(), varArray.max())
-
-
-def read_data_from_file_list(filelist, myvar, timeVarName, latVarName, lonVarName):
- '''
- Read in data from a list of model files into a single data structure
-
-    Input:
-       filelist - list of filenames (including path)
-       myvar - string containing name of variable to load in (as it appears in file)
-       timeVarName, latVarName, lonVarName - names of the time, latitude and longitude variables
-    Output:
-       lat, lon - 2D arrays of latitude and longitude values
-       timestore - list of times
-       t2store - numpy array containing data from all files
-
-    NB. originally written specifically for WRF netCDF output files,
-    modified to make it more general (Feb 2011)
-
- Peter Lean July 2010
- '''
-
-    # Crash nicely if 'filelist' is zero length (check before indexing into it)
-    # TODO: raise an Error instead
-    if len(filelist) == 0:
-        print 'Error: no files have been passed to read_data_from_file_list()'
-        sys.exit()
-
-    filelist.sort()
-    filename = filelist[0]
-
- # Open the first file in the list to:
- # i) read in lats, lons
- # ii) find out how many timesteps in the file
- # (assume same ntimes in each file in list)
- # -allows you to create an empty array to store variable data for all times
- tmp = netCDF4.Dataset(filename, mode='r')
- latsraw = tmp.variables[latVarName][:]
- lonsraw = tmp.variables[lonVarName][:]
-    if latsraw.ndim == 1:
-        lon, lat = np.meshgrid(lonsraw, latsraw)
-    elif latsraw.ndim == 2:
-        lon = lonsraw
-        lat = latsraw
-
- timesraw = tmp.variables[timeVarName]
- ntimes = len(timesraw)
-
- print 'Lats and lons read in for first file in filelist'
-
- # Create a single empty masked array to store model data from all files
- t2store = ma.zeros((ntimes * len(filelist), len(lat[:, 0]), len(lon[0, :])))
- timestore = ma.zeros((ntimes * len(filelist)))
-
- # Now load in the data for real
- # NB. no need to reload in the latitudes and longitudes -assume invariant
- i = 0
- timesaccu = 0 # a counter for number of times stored so far in t2store
- # NB. this method allows for missing times in data files
- # as no assumption made that same number of times in each file...
-
-
- for i, ifile in enumerate(filelist):
-
- #print 'Loading data from file: ',filelist[i]
- f = netCDF4.Dataset(ifile, mode='r')
- t2raw = ma.array(f.variables[myvar][:])
- timesraw = f.variables[timeVarName]
- time = timesraw[:]
- ntimes = len(time)
- print 'file= ', i, 'ntimes= ', ntimes, filelist[i]
- print 't2raw shape: ', t2raw.shape
-
- # Flatten dimensions which needn't exist, i.e. level
- # e.g. if for single level then often data have 4 dimensions, when 3 dimensions will do.
- # Code requires data to have dimensions, (time,lat,lon)
- # i.e. remove level dimensions
-        # Remove 1d axes from the t2raw array
-        # Example: t2raw.shape == (365, 180, 360, 1) <maps to (time, lat, lon, height)>
-        # After the squeeze you will be left with (365, 180, 360) instead
-        t2tmp = t2raw.squeeze()
-        # NB. if this happens to be data for a single time only, then we just flattened it by accident;
-        # let's put it back...
- if t2tmp.ndim == 2:
- t2tmp = np.expand_dims(t2tmp, 0)
-
- t2store[timesaccu + np.arange(ntimes), :, :] = t2tmp[:, :, :]
- timestore[timesaccu + np.arange(ntimes)] = time
- timesaccu += ntimes
- f.close()
-
- print 'Data read in successfully with dimensions: ', t2store.shape
-
- # TODO: search for duplicated entries (same time) and remove duplicates.
- # Check to see if number of unique times == number of times, if so then no problem
-
- if(len(np.unique(timestore)) != len(np.where(timestore != 0)[0].view())):
- print 'WARNING: Possible duplicated times'
-
-    # Decode model times into python datetime objects. Note: timestore becomes a list (no longer an array) here
- timestore, _ = process.getModelTimes(filename, timeVarName)
-
- # Make sure latlon grid is monotonically increasing and that the domains
- # are correct
- lat, lon, t2store = checkLatLon(lat, lon, t2store)
- data_dict = {'lats': lat, 'lons': lon, 'times': timestore, 'data': t2store}
- return data_dict
-
-def select_var_from_file(myfile, fmt='not set'):
- '''
- Routine to act as user interface to allow users to select variable of interest from a file.
-
- Input:
- myfile - filename
- fmt - (optional) specify fileformat for netCDF4 if filename suffix is non-standard
-
- Output:
- myvar - variable name in file
-
- Peter Lean September 2010
- '''
-
-    if fmt == 'not set':
-        f = netCDF4.Dataset(myfile, mode='r')
-    else:
-        f = netCDF4.Dataset(myfile, mode='r', format=fmt)
-
- keylist = [key.encode().lower() for key in f.variables.keys()]
-
-    for i, v in enumerate(keylist):
-        print '[', i, '] ', f.variables[v].long_name, ' (', v, ')'
-
-    user_selection = raw_input('Please select variable : [0 - ' + str(len(keylist) - 1) + '] ')
-
- myvar = keylist[int(user_selection)]
-
- return myvar
-
-def select_var_from_wrf_file(myfile):
- '''
- Routine to act as user interface to allow users to select variable of interest from a wrf netCDF file.
-
- Input:
- myfile - filename
-
- Output:
- mywrfvar - variable name in wrf file
-
- Peter Lean September 2010
- '''
-
- f = netCDF4.Dataset(myfile, mode='r', format='NETCDF4')
- keylist = [key.encode().lower() for key in f.variables.keys()]
-
-    for i, v in enumerate(keylist):
-        try:
-            print '[', i, '] ', f.variables[v].description, ' (', v, ')'
-        except:
-            print ''
-
-    user_selection = raw_input('Please select WRF variable : [0 - ' + str(len(keylist) - 1) + '] ')
-
- mywrfvar = keylist[int(user_selection)]
-
- return mywrfvar
-
-def read_data_from_one_file(ifile, myvar, latVarName, lonVarName, timeVarName, file_type):
- """
- Purpose::
- Read in data from one file at a time
-
-    Input::
-        ifile - filename (including path)
-        myvar - string containing name of variable to load in (as it appears in file)
-        latVarName - name of the Latitude Variable
-        lonVarName - name of the Longitude Variable
-        timeVarName - name of the Time Variable
-        file_type - type of file we are trying to parse
-
-    Output::
-        lat, lon - 2D arrays of latitude and longitude values
-        times - list of times
-        t2store - numpy array containing data from the file for the requested variable
-        varUnit - units for the variable given by t2store
-    """
- f = netCDF4.Dataset(ifile, mode='r')
- try:
- varUnit = f.variables[myvar].units.encode().upper()
- except:
- varUnit = raw_input('Enter the model variable unit: \n> ').upper()
- t2raw = ma.array(f.variables[myvar][:])
- t2tmp = t2raw.squeeze()
- if t2tmp.ndim == 2:
- t2tmp = np.expand_dims(t2tmp, 0)
-
- lonsraw = f.variables[lonVarName][:]
- latsraw = f.variables[latVarName][:]
-    if latsraw.ndim == 1:
-        lon, lat = np.meshgrid(lonsraw, latsraw)
-    elif latsraw.ndim == 2:
-        lon = lonsraw
-        lat = latsraw
-
- f.close()
- print ' success read_data_from_one_file: VarName=', myvar, ' Shape(Full)= ', t2tmp.shape, ' Unit= ', varUnit
- timestore = process.decode_model_timesK(ifile, timeVarName, file_type)
-
- # Make sure latlon grid is monotonically increasing and that the domains
- # are correct
- lat, lon, t2store = checkLatLon(lat, lon, t2tmp)
- return lat, lon, timestore, t2store, varUnit
-
-def findTimeVariable(filename):
- """
- Function to find what the time variable is called in a model file.
- Input::
- filename - file to crack open and check for a time variable
- Output::
- timeName - name of the input file's time variable
- variableNameList - list of variable names from the input filename
- """
- try:
- f = netCDF4.Dataset(filename, mode='r')
- except:
- print("Unable to open '%s' to try and read the Time variable" % filename)
- raise
-
- variableNameList = [variable.encode() for variable in f.variables.keys()]
- # convert all variable names into lower case
- varNameListLowerCase = [x.lower() for x in variableNameList]
-
- # Use "set" types for finding common variable name from in the file and from the list of possibilities
- possibleTimeNames = set(['time', 'times', 'date', 'dates', 'julian'])
-
- # Use the sets to find the intersection where variable names are in possibleNames
- timeNameSet = possibleTimeNames.intersection(varNameListLowerCase)
-
- if len(timeNameSet) == 0:
- print("Unable to autodetect the Time Variable Name in the '%s'" % filename)
- timeName = misc.askUserForVariableName(variableNameList, targetName ="Time")
-
- else:
- timeName = timeNameSet.pop()
-
- return timeName, variableNameList
-
-
-def findLatLonVarFromFile(filename):
- """
- Function to find what the latitude and longitude variables are called in a model file.
-
- Input::
- -filename
- Output::
- -latVarName
- -lonVarName
- -latMin
- -latMax
- -lonMin
- -lonMax
- """
- try:
- f = netCDF4.Dataset(filename, mode='r')
- except:
- print("Unable to open '%s' to try and read the Latitude and Longitude variables" % filename)
- raise
-
- variableNameList = [variable.encode() for variable in f.variables.keys()]
- # convert all variable names into lower case
- varNameListLowerCase = [x.lower() for x in variableNameList]
-
- # Use "set" types for finding common variable name from in the file and from the list of possibilities
- possibleLatNames = set(['latitude', 'lat', 'lats', 'latitudes'])
- possibleLonNames = set(['longitude', 'lon', 'lons', 'longitudes'])
-
- # Use the sets to find the intersection where variable names are in possibleNames
- latNameSet = possibleLatNames.intersection(varNameListLowerCase)
- lonNameSet = possibleLonNames.intersection(varNameListLowerCase)
-
- if len(latNameSet) == 0 or len(lonNameSet) == 0:
- print("Unable to autodetect Latitude and/or Longitude values in the file")
- latName = misc.askUserForVariableName(variableNameList, targetName ="Latitude")
- lonName = misc.askUserForVariableName(variableNameList, targetName ="Longitude")
-
- else:
- latName = latNameSet.pop()
- lonName = lonNameSet.pop()
-
- lats = np.array(f.variables[latName][:])
- lons = np.array(f.variables[lonName][:])
-
- latMin = lats.min()
- latMax = lats.max()
-
- # Convert the lons from 0:360 into -180:180
- lons[lons > 180] = lons[lons > 180] - 360.
- lonMin = lons.min()
- lonMax = lons.max()
-
- return latName, lonName, latMin, latMax, lonMin, lonMax
-
-
-def read_data_from_file_list_K(filelist, myvar, timeVarName, latVarName, lonVarName, file_type):
-    ##################################################################################
-    # Read in data from a list of model files into a single data structure
-    # Input: filelist - list of filenames (including path)
-    #        myvar - string containing name of variable to load in (as it appears in file)
-    # Output: lat, lon - 2D array of latitude and longitude values
-    #         times - list of times
-    #         t2store - numpy array containing data from all files
-    # Modified from read_data_from_file_list to read data from multiple models into a 4-D array
-    # 1. The code now processes model data that completely covers the 20-yr period. Thus,
-    #    all model data must have the same time levels (ntimes). Unlike in the original, ntimes
-    #    is fixed here.
-    # 2. Because one of the model datasets exceeds 240 months (243 months), the model data must be
-    #    truncated to 240 months using the ntimes determined from the first file.
-    ##################################################################################
- filelist.sort()
- nfiles = len(filelist)
- # Crash nicely if 'filelist' is zero length
- if nfiles == 0:
-        print 'Error: no files have been passed to read_data_from_file_list_K(): Exit'
- sys.exit()
-
- # Open the first file in the list to:
- # i) read in lats, lons
- # ii) find out how many timesteps in the file (assume same ntimes in each file in list)
- # -allows you to create an empty array to store variable data for all times
- tmp = netCDF4.Dataset(filelist[0], mode='r', format=file_type)
- latsraw = tmp.variables[latVarName][:]
- lonsraw = tmp.variables[lonVarName][:]
- timesraw = tmp.variables[timeVarName]
-
- if(latsraw.ndim == 1):
- lon, lat = np.meshgrid(lonsraw, latsraw)
-
- elif(latsraw.ndim == 2):
- lon = lonsraw
- lat = latsraw
-    ntimes = len(timesraw)
-    nygrd = len(lat[:, 0])
-    nxgrd = len(lon[0, :])
-
- print 'Lats and lons read in for first file in filelist'
-
- # Create a single empty masked array to store model data from all files
- #t2store = ma.zeros((ntimes*nfiles,nygrd,nxgrd))
- t2store = ma.zeros((nfiles, ntimes, nygrd, nxgrd))
- #timestore=ma.zeros((ntimes*nfiles))
-
- ## Now load in the data for real
- ## NB. no need to reload in the latitudes and longitudes -assume invariant
- #timesaccu=0 # a counter for number of times stored so far in t2store
- # NB. this method allows for missing times in data files
- # as no assumption made that same number of times in each file...
-
- for i, ifile in enumerate(filelist):
- #print 'Loading data from file: ',filelist[i]
- f = netCDF4.Dataset(ifile, mode='r')
- t2raw = ma.array(f.variables[myvar][:])
- timesraw = f.variables[timeVarName]
- #ntimes=len(time)
- #print 'file= ',i,'ntimes= ',ntimes,filelist[i]
- ## Flatten dimensions which needn't exist, i.e. level
- ## e.g. if for single level then often data have 4 dimensions, when 3 dimensions will do.
- ## Code requires data to have dimensions, (time,lat,lon)
- ## i.e. remove level dimensions
- t2tmp = t2raw.squeeze()
-        ## NB. if data happen to be for a single time, we flattened it by accident; let's put it back...
- if t2tmp.ndim == 2:
- t2tmp = np.expand_dims(t2tmp, 0)
- #t2store[timesaccu+np.arange(ntimes),:,:]=t2tmp[0:ntimes,:,:]
- t2store[i, 0:ntimes, :, :] = t2tmp[0:ntimes, :, :]
- #timestore[timesaccu+np.arange(ntimes)]=time
- #timesaccu=timesaccu+ntimes
- f.close()
-
- print 'Data read in successfully with dimensions: ', t2store.shape
-
-    # Decode model times into python datetime objects. Note: timestore becomes a list (no longer an array) here
- ifile = filelist[0]
- timestore, _ = process.getModelTimes(ifile, timeVarName)
-
- # Make sure latlon grid is monotonically increasing and that the domains
- # are correct
- lat, lon, t2store = checkLatLon(lat, lon, t2store)
- return lat, lon, timestore, t2store
-
-def find_latlon_ranges(filelist, lat_var_name, lon_var_name):
- # Function to return the latitude and longitude ranges of the data in a file,
- # given the identifying variable names.
- #
- # Input:
- # filelist - list of filenames (data is read in from first file only)
- # lat_var_name - variable name of the 'latitude' variable
- # lon_var_name - variable name of the 'longitude' variable
- #
- # Output:
- # latMin, latMax, lonMin, lonMax - self explanatory
- #
- # Peter Lean March 2011
-
- filename = filelist[0]
-
- try:
- f = netCDF4.Dataset(filename, mode='r')
-
- lats = f.variables[lat_var_name][:]
- latMin = lats.min()
- latMax = lats.max()
-
- lons = f.variables[lon_var_name][:]
- lons[lons > 180] = lons[lons > 180] - 360.
- lonMin = lons.min()
- lonMax = lons.max()
-
- return latMin, latMax, lonMin, lonMax
-
- except:
- print 'Error: there was a problem with finding the latitude and longitude ranges in the file'
- print ' Please check that you specified the filename, and variable names correctly.'
-
- sys.exit()
-
-def writeBN_lola(fileName, lons, lats):
- # write a binary data file that include longitude (1-d) and latitude (1-d) values
-
-    F = fortranfile.FortranFile(fileName, mode='w')
-    ngrdY = lons.shape[0]
-    ngrdX = lons.shape[1]
-    tmpDat = ma.zeros(ngrdX)
-    tmpDat[:] = lons[0, :]
-    F.writeReals(tmpDat)
-    tmpDat = ma.zeros(ngrdY)
-    tmpDat[:] = lats[:, 0]
-    F.writeReals(tmpDat)
- # release temporary arrays
- tmpDat = 0
- F.close()
-
-def writeBNdata(fileName, numOBSs, numMDLs, nT, ngrdX, ngrdY, numSubRgn, obsData, mdlData, obsRgnAvg, mdlRgnAvg):
- #(fileName,maskOption,numOBSs,numMDLs,nT,ngrdX,ngrdY,numSubRgn,obsData,mdlData,obsRgnAvg,mdlRgnAvg):
- # write spatially- and regionally regridded data into a binary data file
- missing = -1.e26
- F = fortranfile.FortranFile(fileName, mode='w')
-    # construct a data array to replace mask flags with a missing value (missing = -1.e26) for printing
- data = ma.zeros((nT, ngrdY, ngrdX))
- for m in np.arange(numOBSs):
-        data[:, :, :] = obsData[m, :, :, :]
-        msk = data.mask
-        for n in np.arange(nT):
-            for j in np.arange(ngrdY):
-                for i in np.arange(ngrdX):
-                    if msk[n, j, i]:
-                        data[n, j, i] = missing
-
-    # write observed data; only one row may be written at a time
- tmpDat = ma.zeros(ngrdX)
- for n in np.arange(nT):
- for j in np.arange(ngrdY):
- tmpDat[:] = data[n, j, :]
- F.writeReals(tmpDat)
-
- # write model data (dep. on the number of models).
- for m in np.arange(numMDLs):
- data[:, :, :] = mdlData[m, :, :, :]
- msk = data.mask
- for n in np.arange(nT):
- for j in np.arange(ngrdY):
- for i in np.arange(ngrdX):
- if msk[n, j, i]:
- data[n, j, i] = missing
-
- for n in np.arange(nT):
- for j in np.arange(ngrdY):
- tmpDat[:] = data[n, j, :]
- F.writeReals(tmpDat)
-
- data = 0 # release the array allocated for data
- # write data in subregions
- if numSubRgn > 0:
- print 'Also included are the time series of the means over ', numSubRgn, ' areas from obs and model data'
-        tmpDat = ma.zeros(nT)
- for m in np.arange(numOBSs):
- for n in np.arange(numSubRgn):
- tmpDat[:] = obsRgnAvg[m, n, :]
- F.writeReals(tmpDat)
- for m in np.arange(numMDLs):
- for n in np.arange(numSubRgn):
- tmpDat[:] = mdlRgnAvg[m, n, :]
- F.writeReals(tmpDat)
- tmpDat = 0 # release the array allocated for tmpDat
- F.close()
-
-def writeNCfile(fileName, numSubRgn, lons, lats, obsData, mdlData, obsRgnAvg, mdlRgnAvg, obsList, mdlList, subRegions):
-    # write an output file of variables up to 3 dimensions
-    # fileName: the name of the output data file
-    # numSubRgn: the number of subregions
-    # lons[ngrdX]: longitudes
-    # lats[ngrdY]: latitudes
-    # obsData[numOBSs,nT,ngrdY,ngrdX]: the obs time series over the entire model domain
-    # mdlData[numMDLs,nT,ngrdY,ngrdX]: the mdl time series over the entire model domain
-    # obsRgnAvg[numOBSs,numSubRgn,nT]: the obs time series for all subregions
-    # mdlRgnAvg[numMDLs,numSubRgn,nT]: the mdl time series for all subregions
- dimO = obsData.shape[0] # the number of obs data
- dimM = mdlData.shape[0] # the number of mdl data
- dimT = mdlData.shape[1] # the number of time levels
- dimY = mdlData.shape[2] # y-dimension
- dimX = mdlData.shape[3] # x-dimension
- dimR = obsRgnAvg.shape[1] # the number of subregions
- f = netCDF4.Dataset(fileName, mode='w', format='NETCDF4')
- print mdlRgnAvg.shape, dimM, dimR, dimT
- #create global attributes
- f.description = ''
- # create dimensions
- print 'Creating Dimensions within the NetCDF Object...'
- f.createDimension('unity', 1)
- f.createDimension('time', dimT)
- f.createDimension('west_east', dimX)
- f.createDimension('south_north', dimY)
- f.createDimension('obs', dimO)
- f.createDimension('models', dimM)
-
- # create the variable (real*4) to be written in the file
- print 'Creating Variables...'
- f.createVariable('lon', 'd', ('south_north', 'west_east'))
- f.createVariable('lat', 'd', ('south_north', 'west_east'))
- f.createVariable('oDat', 'd', ('obs', 'time', 'south_north', 'west_east'))
- f.createVariable('mDat', 'd', ('models', 'time', 'south_north', 'west_east'))
-
- if subRegions:
- f.createDimension('regions', dimR)
- f.createVariable('oRgn', 'd', ('obs', 'regions', 'time'))
- f.createVariable('mRgn', 'd', ('models', 'regions', 'time'))
- f.variables['oRgn'].varAttName = 'Observation time series: Subregions'
- f.variables['mRgn'].varAttName = 'Model time series: Subregions'
-
- loadDataIntoNetCDF(f, obsList, obsData, 'Observation')
- print 'Loaded the Observations into the NetCDF'
-
- loadDataIntoNetCDF(f, mdlList, mdlData, 'Model')
-
- # create attributes and units for the variable
- print 'Creating Attributes and Units...'
- f.variables['lon'].varAttName = 'Longitudes'
- f.variables['lon'].varUnit = 'degrees East'
- f.variables['lat'].varAttName = 'Latitudes'
- f.variables['lat'].varUnit = 'degrees North'
- f.variables['oDat'].varAttName = 'Observation time series: entire domain'
- f.variables['mDat'].varAttName = 'Model time series: entire domain'
-
- # assign the values to the variable and write it
- f.variables['lon'][:] = lons[:]
- f.variables['lat'][:] = lats[:]
- if subRegions:
- f.variables['oRgn'][:] = obsRgnAvg[:]
- f.variables['mRgn'][:] = mdlRgnAvg[:]
-
- f.close()
-
-def loadDataIntoNetCDF(fileObject, datasets, dataArray, dataType):
- """
- Input::
- fileObject - netCDF4 file object data will be loaded into
- datasets - List of dataset names
- dataArray - Multi-dimensional array of data to be loaded into the NetCDF file
- dataType - String with value of either 'Model' or 'Observation'
- Output::
- No return value. netCDF4 file object is updated in place
- """
-    for datasetCount, dataset in enumerate(datasets):
- if dataType.lower() == 'observation':
- datasetName = dataset.replace(' ','')
- elif dataType.lower() == 'model':
- datasetName = path.splitext(path.basename(dataset))[0]
- print "Creating variable %s" % datasetName
- fileObject.createVariable(datasetName, 'd', ('time', 'south_north', 'west_east'))
-        fileObject.variables[datasetName].varAttName = dataType + ' time series: entire domain'
- print 'Loading values into %s' % datasetName
- fileObject.variables[datasetName][:] = dataArray[datasetCount,:,:,:]
-
-def checkLatLon(latsin, lonsin, datain):
- """
- Purpose::
- Checks whether latitudes and longitudes are monotonically increasing
- within the domains [-90, 90) and [-180, 180) respectively, and rearranges the input data
- accordingly if they are not.
-
- Input::
- latsin - Array of latitudes read from a raw netcdf file
- lonsin - Array of longitudes read from a raw netcdf file
- datain - Array of data values read from a raw netcdf file.
- The shape is assumed to be (..., nLat, nLon).
-
- Output::
- latsout - 2D array of (rearranged) latitudes
- lonsout - 2D array of (rearranged) longitudes
- dataout - Array of (rearranged) data
- """
- # Avoid unnecessary shifting if all lons are higher than 180
- if lonsin.min() > 180:
- lonsin -= 360
-
- # Make sure lats and lons are monotonically increasing
- latsDecreasing = np.diff(latsin[:, 0]) < 0
- lonsDecreasing = np.diff(lonsin[0]) < 0
-
- # If all values are decreasing then they just need to be reversed
- latsReversed, lonsReversed = latsDecreasing.all(), lonsDecreasing.all()
-
- # If the lat values are unsorted then raise an exception
- if not latsReversed and latsDecreasing.any():
- raise ValueError('Latitudes must be monotonically increasing.')
-
- # Perform same checks now for lons
- if not lonsReversed and lonsDecreasing.any():
- raise ValueError('Longitudes must be monotonically increasing.')
-
- # Also check if lons go from [0, 360), and convert to [-180, 180)
- # if necessary
- lonsShifted = lonsin.max() > 180
- latsout, lonsout, dataout = latsin[:], lonsin[:], datain[:]
- # Now correct data if latlon grid needs to be shifted
- if latsReversed:
- latsout = latsout[::-1]
- dataout = dataout[..., ::-1, :]
-
- if lonsReversed:
- lonsout = lonsout[..., ::-1]
- dataout = dataout[..., ::-1]
-
- if lonsShifted:
- lat1d = latsout[:, 0]
- dataout, lon1d = shiftgrid(180, dataout, lonsout[0], start=False)
- lonsout, latsout = np.meshgrid(lon1d, lat1d)
-
- return latsout, lonsout, dataout
-
-def shiftgrid(lon0, datain, lonsin, start= True, cyclic=360.0):
- """
- Purpose::
- Shift global lat/lon grid east or west. This function is taken directly
- from the (unreleased) basemap 1.0.7 source code as version 1.0.6 does not
- currently support arrays with more than two dimensions.
- https://github.com/matplotlib/basemap
-
- Input::
- lon0 - starting longitude for shifted grid (ending longitude if start=False).
- lon0 must be on input grid (within the range of lonsin).
- datain - original data with longitude the right-most dimension.
- lonsin - original longitudes.
- start - if True, lon0 represents the starting longitude of the new grid.
- if False, lon0 is the ending longitude. Default True.
- cyclic - width of periodic domain (default 360)
-
- Output::
- dataout - data on shifted grid
- lonsout - lons on shifted grid
- """
- if np.fabs(lonsin[-1]-lonsin[0]-cyclic) > 1.e-4:
- # Use all data instead of raise ValueError, 'cyclic point not included'
- start_idx = 0
- else:
- # If cyclic, remove the duplicate point
- start_idx = 1
- if lon0 < lonsin[0] or lon0 > lonsin[-1]:
- raise ValueError('lon0 outside of range of lonsin')
- i0 = np.argmin(np.fabs(lonsin-lon0))
- i0_shift = len(lonsin)-i0
- if ma.isMA(datain):
- dataout = ma.zeros(datain.shape,datain.dtype)
- else:
- dataout = np.zeros(datain.shape,datain.dtype)
- if ma.isMA(lonsin):
- lonsout = ma.zeros(lonsin.shape,lonsin.dtype)
- else:
- lonsout = np.zeros(lonsin.shape,lonsin.dtype)
- if start:
- lonsout[0:i0_shift] = lonsin[i0:]
- else:
- lonsout[0:i0_shift] = lonsin[i0:]-cyclic
- dataout[...,0:i0_shift] = datain[...,i0:]
- if start:
- lonsout[i0_shift:] = lonsin[start_idx:i0+start_idx]+cyclic
- else:
- lonsout[i0_shift:] = lonsin[start_idx:i0+start_idx]
- dataout[...,i0_shift:] = datain[...,start_idx:i0+start_idx]
- return dataout,lonsout
\ No newline at end of file
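
For reference: files.py autodetected coordinate variable names by intersecting a file's variable names with sets of known candidates (see findTimeVariable and findLatLonVarFromFile above). A minimal sketch of that pattern (Python 3 with the netCDF4 package; the helper name and CANDIDATES table are illustrative):

    import netCDF4

    CANDIDATES = {
        'time': {'time', 'times', 'date', 'dates', 'julian'},
        'latitude': {'latitude', 'lat', 'lats', 'latitudes'},
        'longitude': {'longitude', 'lon', 'lons', 'longitudes'},
    }

    def autodetect_variable(filename, kind):
        # Intersect the file's (lower-cased) variable names with the known
        # candidates; a single match identifies the variable, otherwise the
        # caller must ask the user, as the original code did.
        f = netCDF4.Dataset(filename, mode='r')
        names = {str(name).lower() for name in f.variables.keys()}
        f.close()
        matches = CANDIDATES[kind] & names
        return matches.pop() if len(matches) == 1 else None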
http://git-wip-us.apache.org/repos/asf/climate/blob/8c142c35/rcmet/src/main/python/rcmes/storage/rcmed.py
----------------------------------------------------------------------
diff --git a/rcmet/src/main/python/rcmes/storage/rcmed.py b/rcmet/src/main/python/rcmes/storage/rcmed.py
deleted file mode 100644
index c99334c..0000000
--- a/rcmet/src/main/python/rcmes/storage/rcmed.py
+++ /dev/null
@@ -1,129 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-'''This is a collection of functions that provide a single interface to
-rcmed. The initial design includes several functions to interact with the
-available parameters within rcmed and their metadata.
-
-Future work includes rolling the rcmed querying code into this module as well.
-'''
-
-import requests, json
-
-paramUri = 'http://rcmes.jpl.nasa.gov/bottle/rcmed/param.json'
-
-def getParams(uri=paramUri):
- '''This will return all of the parameters from the database as
- a list of dictionaries.
-
- If the database is not available, then the method will return None'''
- # Use a get request to call the Web Service
- try:
- httpRequest = requests.get(uri)
- except:
- print "HTTPRequest failed. Bottle WebServer is offline"
- raise
- # TODO Check the request status code if it is 400 or 500 range then
- # return None
- # if the status code is 200 then return the request.text's param list
- # http_request.status_code is an int we can inspect
- paramDict = json.loads(httpRequest.text)
- paramList = paramDict['param']
-
-    filteredParams = []
-    # Filter the list to remove params with missing (None) values
-    for param in paramList:
-        paramGood = True
-        for key, value in param.iteritems():
-            if value is None:
-                paramGood = False
-
-        if paramGood:
-            filteredParams.append(param)
-
-
- return filteredParams
-
-
-
-#class Parameter(object):
-#
-# def __init__(self):
-# self.param_query_uri = 'http://some.url'
-# self.param_list = self.param_metadata()
-#
-# def param_metadata(self):
-# '''This method will return a list of python dict's. Each dict will
-# contain a complete record for each parameter from rcmed'''
-# # 1. Query the Parameters Metadata Endpoint using param_query_uri
-# # 2. Parse the returned data and re-format into a dict
-# # 3. define self.para_met_dict
-# test_list = [{"id":12,
-# "description":"ERA Dataset 2 Metre Temperature",
-# "type":'temp'
-# },
-# {"id":13,
-# "description":"ERA Dataset 2 Metre Dewpoint Temperature",
-# 'type':'temp'
-# },
-# {"id":14,
-# "description":"TRMM Dataset HRF parameter",
-# 'type':'hrf'
-# }
-# ]
-# print "self.param_met_dict has been created"
-# return test_list
-#
-# def get_param_by_id(self, id):
-# '''This will take in a parameter id and return a single dict. Can we
-# safely assume we will always hold a unique parameter id? - Currently
-# this is True'''
-# for p in self.param_list:
-# if p['id'] == id:
-# return p
-# else:
-# pass
-#
-# def get_params_by_type(self, type):
-# '''This will take in a parameter type like precip, temp, pressure, etc.
-# and will return a list of all the params that are of the given type.'''
-# param_list = [] #empty list to collect the param dicts
-# for p in self.param_list:
-# if p['type'] == type:
-# param_list.append(p)
-# else:
-# pass
-# return param_list
-#
-#
-#class ObsData(object):
-#
-# def __init__(self):
-# self.query_url = 'http://rcmes/rcmed....' #can we merely insert the query criteria into the url attribute?
-# self.param_id = 6
-# self.dataset_id = 1
-# self.lat_range = [25.4,55.0]
-# self.lon_range = [0.0,10.7]
-# self.time_range = [start,end]
-#
-# def set_param(self, param_dict):
-# self.param_id = param_dict['id']
-# self.dataset_id = null
-# # look up the dataset id using the parameter id and set it
-# p = Parameter.get_param_by_id(id)
-
\ No newline at end of file
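
The TODO in getParams() above asks for the HTTP status code to be checked before the response is parsed. A minimal sketch of that check using requests (illustrative; not the removed implementation):

    import requests

    def get_params_checked(uri):
        # Fail fast on 4xx/5xx responses instead of trying to parse an
        # error page as JSON; on success return the 'param' list.
        response = requests.get(uri)
        response.raise_for_status()
        return response.json()['param']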
http://git-wip-us.apache.org/repos/asf/climate/blob/8c142c35/rcmet/src/main/python/rcmes/toolkit/__init__.py
----------------------------------------------------------------------
diff --git a/rcmet/src/main/python/rcmes/toolkit/__init__.py b/rcmet/src/main/python/rcmes/toolkit/__init__.py
deleted file mode 100644
index c980999..0000000
--- a/rcmet/src/main/python/rcmes/toolkit/__init__.py
+++ /dev/null
@@ -1,18 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-"""The toolkit Package is a collection of modules that provide a set of tools
-that can be used to process, analyze and plot the analysis results."""
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/climate/blob/8c142c35/rcmet/src/main/python/rcmes/toolkit/do_data_prep.py
----------------------------------------------------------------------
diff --git a/rcmet/src/main/python/rcmes/toolkit/do_data_prep.py b/rcmet/src/main/python/rcmes/toolkit/do_data_prep.py
deleted file mode 100644
index 0b910c4..0000000
--- a/rcmet/src/main/python/rcmes/toolkit/do_data_prep.py
+++ /dev/null
@@ -1,366 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-"""Prepare Datasets both model and observations for analysis using metrics"""
-
-
-import numpy as np
-import numpy.ma as ma
-import sys, os
-
-from storage import db, files
-import process
-from utils import misc
-
-# TODO: swap gridBox for Domain
-def prep_data(settings, obsDatasetList, gridBox, modelList):
- """
-
-    returns numOBSs, numMDLs, nT, ngrdY, ngrdX, Times, lons, lats, obsData, modelData, obsList, mdlName
-
-    numOBSs - number of Observational Datasets. Really need to look at using len(obsDatasetList) instead
-    numMDLs - number of Models used. Again, should use len(modelList) instead
-    nT - Time value count after temporal regridding. Should use the length of the time axis for a given dataset
-    ngrdY - size of the Y-Axis grid after spatial regridding
-    ngrdX - size of the X-Axis grid after spatial regridding
-    Times - list of python datetime objects that represent the times to be used in further calculations
-    lons, lats - 2D arrays of the analysis grid longitudes and latitudes
-    obsData - masked array of spatially and temporally regridded observation data
-    modelData - masked array of spatially and temporally regridded model data
-    obsList - list of observation dataset names
-    mdlName - list of model names
-    """
-
- # TODO: Stop the object Deserialization and work on refactoring the core code here
- cachedir = settings.cacheDir
- workdir = settings.workDir
-
-    # Use list comprehensions to deconstruct obsDatasetList
-    #  e.g. ['TRMM_pr_mon', 'CRU3.1_pr'] - basically a list of dataset name + '_' + parameter name.
-    #  The 'CRU*' entries trigger unit conversion issues later.
-    #  The plan here is to use the obsDatasetList, which contains a 'longname' key we can use.
-    obsList = [str(x['longname']) for x in obsDatasetList]
-    # Also use the obsDatasetList 'dataset_id' key
-    obsDatasetId = [str(x['dataset_id']) for x in obsDatasetList]
-    # obsDatasetList 'parameter_id' list
- obsParameterId = [str(x['parameter_id']) for x in obsDatasetList]
- obsTimestep = [str(x['timestep']) for x in obsDatasetList]
- mdlList = [model.filename for model in modelList]
-
- # Since all of the model objects in the modelList have the same Varnames and Precip Flag, I am going to merely
- # pull this from modelList[0] for now
- modelVarName = modelList[0].varName
- precipFlag = modelList[0].precipFlag
- modelTimeVarName = modelList[0].timeVariable
- modelLatVarName = modelList[0].latVariable
- modelLonVarName = modelList[0].lonVariable
- regridOption = settings.spatialGrid
- timeRegridOption = settings.temporalGrid
-
- """
- Routine to read-in and re-grid both obs and mdl datasets.
-    Processes single or multiple files of obs and mdl data, or combinations thereof, in a general way.
- i) retrieve observations from the database
- ii) load in model data
- iii) spatial regridding
- iv) temporal regridding
- v) area-averaging
- Input:
- cachedir - string describing directory path
- workdir - string describing directory path
- obsList - string describing the observation data files
- obsDatasetId - int, db dataset id
- obsParameterId - int, db parameter id
- latMin, latMax, lonMin, lonMax, dLat, dLon, naLats, naLons: define the evaluation/analysis domain/grid system
- latMin - float
- latMax - float
- lonMin - float
- lonMax - float
- dLat - float
- dLon - float
- naLats - integer
- naLons - integer
- mdlList - string describing model file name + path
- modelVarName - string describing name of variable to evaluate (as written in model file)
- precipFlag - bool (is this precipitation data? True/False)
- modelTimeVarName - string describing name of time variable in model file
- modelLatVarName - string describing name of latitude variable in model file
- modelLonVarName - string describing name of longitude variable in model file
- regridOption - string: 'obs'|'model'|'user'
-        timeRegridOption - string: 'full'|'annual'|'monthly'|'daily'
- maskOption - Boolean
-
- # TODO: This isn't true in the current codebase.
- Instead the SubRegion's are being used. You can see that these values are not
- being used in the code, at least they are not being passed in from the function
-
- maskLatMin - float (only used if maskOption=1)
- maskLatMax - float (only used if maskOption=1)
- maskLonMin - float (only used if maskOption=1)
- maskLonMax - float (only used if maskOption=1)
- Output: image files of plots + possibly data
- Jinwon Kim, 7/11/2012
- """
-
-
- # check the number of obs & model data files
- numOBSs = len(obsList)
- numMDLs = len(mdlList)
-
- # assign parameters that must be preserved throughout the process
-
- print 'start & end time = ', settings.startDate, settings.endDate
- yymm0 = settings.startDate.strftime("%Y%m")
- yymm1 = settings.endDate.strftime("%Y%m")
- print 'start & end eval period = ', yymm0, yymm1
-
-
-
- #TODO: Wrap in try except blocks instead
- if numMDLs < 1:
- print 'No input model data file. EXIT'
- sys.exit()
- if numOBSs < 1:
- print 'No input observation data file. EXIT'
- sys.exit()
-
- ## Part 0: Setup the regrid variables based on the regridOption given
-
- # preparation for spatial re-gridding: define the size of horizontal array of the target interpolation grid system (ngrdX and ngrdY)
- print 'regridOption in prep_data= ', regridOption
- if regridOption == 'model':
- ifile = mdlList[0]
- typeF = 'nc'
- lats, lons, mTimes = files.read_data_from_one_file(ifile, modelVarName,
- modelLatVarName,
- modelLonVarName,
- modelTimeVarName,
- typeF)[:3]
- modelObject = modelList[0]
- latMin = modelObject.latMin
- latMax = modelObject.latMax
- lonMin = modelObject.lonMin
- lonMax = modelObject.lonMax
- elif regridOption == 'user':
- # Use the GridBox Object
- latMin = gridBox.latMin
- latMax = gridBox.latMax
- lonMin = gridBox.lonMin
- lonMax = gridBox.lonMax
- naLats = gridBox.latCount
- naLons = gridBox.lonCount
- dLat = gridBox.latStep
- dLon = gridBox.lonStep
- lat = np.arange(naLats) * dLat + latMin
- lon = np.arange(naLons) * dLon + lonMin
- lons, lats = np.meshgrid(lon, lat)
- lon = 0.
- lat = 0.
- else:
- print "INVALID REGRID OPTION USED"
- sys.exit()
-
- ngrdY = lats.shape[0]
- ngrdX = lats.shape[1]
-
- regObsData = []
-
-
-
- ## Part 1: retrieve observation data from the database and regrid them
- ## NB. automatically uses local cache if already retrieved.
-
- for n in np.arange(numOBSs):
- # spatial regridding
- oLats, oLons, _, oTimes, oData = db.extractData(obsDatasetId[n],
- obsParameterId[n],
- latMin, latMax,
- lonMin, lonMax,
- settings.startDate, settings.endDate,
- cachedir, obsTimestep[n])
-
- # TODO: modify this if block with new metadata usage.
-        if precipFlag and obsList[n][0:3] == 'CRU':
- oData = 86400.0 * oData
-
- nstOBSs = oData.shape[0] # note that the length of obs data can vary for different obs intervals (e.g., daily vs. monthly)
- print 'Regrid OBS dataset onto the ', regridOption, ' grid system: ngrdY, ngrdX, nstOBSs= ', ngrdY, ngrdX, nstOBSs
- print 'For dataset: %s' % obsList[n]
-
- tmpOBS = ma.zeros((nstOBSs, ngrdY, ngrdX))
-
- print 'tmpOBS shape = ', tmpOBS.shape
-
- # OBS SPATIAL REGRIDING
- for t in np.arange(nstOBSs):
- tmpOBS[t, :, :] = process.do_regrid(oData[t, :, :], oLats, oLons, lats, lons)
-
- # TODO: Not sure this is needed with Python Garbage Collector
- # The used memory should be freed when the objects are no longer referenced. If this continues to be an issue we may need to look
- # at using generators where possible.
- oLats = 0.0
- oLons = 0.0 # release the memory occupied by the temporary variables oLats and oLons.
-
- # OBS TEMPORAL REGRIDING
- oData, newObsTimes = process.calc_average_on_new_time_unit_K(tmpOBS, oTimes, unit=timeRegridOption)
-
- tmpOBS = 0.0
-
- # check the consistency of temporally regridded obs data
- if n == 0:
- oldObsTimes = newObsTimes
- else:
- if oldObsTimes != newObsTimes:
- print 'temporally regridded obs data time levels do not match at ', n - 1, n
- print '%s Time through Loop' % (n + 1)
- print 'oldObsTimes Count: %s' % len(oldObsTimes)
- print 'newObsTimes Count: %s' % len(newObsTimes)
- # TODO: We need to handle these cases using Try Except Blocks or insert a sys.exit if appropriate
- sys.exit()
- else:
- oldObsTimes = newObsTimes
- # if everything's fine, append the spatially and temporally regridded data in the obs data array (obsData)
- regObsData.append(oData)
-
-
- """ all obs datasets have been read-in and regridded. convert the regridded obs data from 'list' to 'array'
- also finalize 'obsTimes', the time coordinate values of the regridded obs data.
- NOTE: using 'list_to_array' assigns values to the missing points; this has become a problem in handling the CRU data.
- this problem disappears by using 'ma.array'."""
-
- obsData = ma.array(regObsData)
- obsTimes = newObsTimes
-
- # compute the simple multi-obs ensemble if multiple obs are used
- if numOBSs > 1:
- ensemble = np.mean(regObsData, axis=0)
- regObsData.append(ensemble)
- numOBSs = len(regObsData)
- obsList.append('ENS-OBS')
- obsData = ma.array(regObsData) # Cast to a masked array
-
-
- ## Part 2: load in and regrid model data from file(s)
-    ## NOTE: the two parameters, numMDLs and numMOmx, are defined to represent the number of models (w/ all 240 mos) &
-    ## the total number of months, respectively, in later multi-model calculations.
-
- typeF = 'nc'
- regridMdlData = []
-    # extract the model names and store them in the list 'mdlName'
-    mdlName = []
-    mdlListReversed = []
-    if len(mdlList) > 1:
-        for element in mdlList:
-            mdlListReversed.append(element[::-1])
-        prefix = os.path.commonprefix(mdlList)
-        postfix = os.path.commonprefix(mdlListReversed)[::-1]
-        for element in mdlList:
-            mdlName.append(element.replace(prefix, '').replace(postfix, ''))
-    else:
-        mdlName.append('model')
-
-
- for n in np.arange(numMDLs):
- # read model grid info, then model data
- ifile = mdlList[n]
- print 'ifile= ', ifile
- modelLats, modelLons, mTimes, mdlDat, mvUnit = files.read_data_from_one_file(ifile,
- modelVarName,
- modelLatVarName,
- modelLonVarName,
- modelTimeVarName,
- typeF)
- mdlT = []
- mStep = len(mTimes)
-
- for i in np.arange(mStep):
- mdlT.append(mTimes[i].strftime("%Y%m"))
-
- wh = (np.array(mdlT) >= yymm0) & (np.array(mdlT) <= yymm1)
- modelTimes = list(np.array(mTimes)[wh])
- mData = mdlDat[wh, :, :]
-
- # determine the dimension size from the model time and latitude data.
- nT = len(modelTimes)
-
- print ' The shape of model data to be processed= ', mData.shape, ' for the period ', min(modelTimes), max(modelTimes)
-        # spatial regridding of the model data
- tmpMDL = ma.zeros((nT, ngrdY, ngrdX))
-
- if regridOption != 'model':
- for t in np.arange(nT):
- tmpMDL[t, :, :] = process.do_regrid(mData[t, :, :], modelLats, modelLons, lats, lons)
- else:
- tmpMDL = mData
-
- # temporally regrid the model data
- mData, newMdlTimes = process.regrid_in_time(tmpMDL, modelTimes, unit=timeRegridOption)
- tmpMDL = 0.0
-
- # check data consistency for all models
- if n == 0:
- oldMdlTimes = newMdlTimes
- else:
- if oldMdlTimes != newMdlTimes:
- print 'temporally regridded mdl data time levels do not match at ', n - 1, n
- print len(oldMdlTimes), len(newMdlTimes)
- sys.exit()
- else:
- oldMdlTimes = newMdlTimes
-
- # if everything's fine, append the spatially and temporally regridded data in the obs data array (obsData)
- regridMdlData.append(mData)
-
- modelData = ma.array(regridMdlData)
- modelTimes = newMdlTimes
-
-    if precipFlag and mvUnit == 'KG M-2 S-1':
- print 'convert model variable unit from mm/s to mm/day'
- modelData = 86400.*modelData
-
- # check consistency between the time levels of the model and obs data
- # this is the final update of time levels: 'Times' and 'nT'
- if obsTimes != modelTimes:
- print 'time levels of the obs and model data are not consistent. EXIT'
- print 'obsTimes'
- print obsTimes
- print 'modelTimes'
- print modelTimes
- sys.exit()
- # 'Times = modelTimes = obsTimes' has been established and modelTimes and obsTimes will not be used hereafter. (de-allocated)
- Times = modelTimes
- nT = len(modelTimes)
- modelTimes = 0
- obsTimes = 0
-
- print 'Reading and regridding model data are completed'
- print 'numMDLs, modelData.shape= ', numMDLs, modelData.shape
-
-    # compute the simple multi-model ensemble if multiple models are evaluated
- if numMDLs > 1:
- model_ensemble = np.mean(regridMdlData, axis=0)
- regridMdlData.append(model_ensemble)
- numMDLs = len(regridMdlData)
- modelData = ma.array(regridMdlData)
- mdlName.append('ENS-MODEL')
- print 'Eval mdl-mean timeseries for the obs periods: modelData.shape= ',modelData.shape
-    # GOODALE: This ensemble code should be refactored into the process.py module since its purpose is data processing
-
- # Processing complete
- print 'data_prep is completed: both obs and mdl data are re-gridded to a common analysis grid'
- return numOBSs, numMDLs, nT, ngrdY, ngrdX, Times, lons, lats, obsData, modelData, obsList, mdlName
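
The common prefix/suffix trick used in prep_data() above to derive short model names (reversing each path so the shared suffix becomes a shared prefix) can be isolated into a small helper. A minimal sketch (Python 3; the helper name is illustrative, and it assumes the shared prefix and suffix do not overlap):

    import os.path

    def short_model_names(paths):
        # A single path carries no distinguishing part; fall back to the
        # generic name, as the original code does.
        if len(paths) < 2:
            return ['model']
        prefix = os.path.commonprefix(paths)
        # Reversed strings turn the common suffix into a common prefix.
        suffix = os.path.commonprefix([p[::-1] for p in paths])[::-1]
        return [p[len(prefix):len(p) - len(suffix)] for p in paths]

    # e.g. ['out/wrf_prec.nc', 'out/wrf_temp.nc'] -> ['prec', 'temp']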