Posted to commits@climate.apache.org by bo...@apache.org on 2013/04/30 00:53:37 UTC
svn commit: r1477399 -
/incubator/climate/trunk/rcmet/src/main/python/rcmes/storage/db.py
Author: boustani
Date: Mon Apr 29 22:53:37 2013
New Revision: 1477399
URL: http://svn.apache.org/r1477399
Log:
new version of db.py that can generate netCDF files as cache files (the netCDF files will be broken down month by month)
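In outline, the monthly cache scheme introduced by this revision works as sketched below. This is a minimal illustration only; the helper name monthly_ranges and the example dates are hypothetical and do not appear in the committed code, which builds the equivalent date_list inside extractData().

    from datetime import datetime, timedelta
    from calendar import monthrange

    def monthly_ranges(start, end):
        # Yield (first day, last day) datetimes for every month touched by [start, end],
        # mirroring the date_list construction in the new extractData().
        while start <= end:
            first = datetime(start.year, start.month, 1)
            last = datetime(start.year, start.month, monthrange(start.year, start.month)[1])
            yield first, last
            start = last + timedelta(days=1)

    # Hypothetical example: a request from mid-January to mid-March 2002 covers three
    # months, so three monthly netCDF cache files would be created or reused.
    for first, last in monthly_ranges(datetime(2002, 1, 15), datetime(2002, 3, 10)):
        print first, '-', last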
Modified:
incubator/climate/trunk/rcmet/src/main/python/rcmes/storage/db.py
Modified: incubator/climate/trunk/rcmet/src/main/python/rcmes/storage/db.py
URL: http://svn.apache.org/viewvc/incubator/climate/trunk/rcmet/src/main/python/rcmes/storage/db.py?rev=1477399&r1=1477398&r2=1477399&view=diff
==============================================================================
--- incubator/climate/trunk/rcmet/src/main/python/rcmes/storage/db.py (original)
+++ incubator/climate/trunk/rcmet/src/main/python/rcmes/storage/db.py Mon Apr 29 22:53:37 2013
@@ -11,6 +11,7 @@ import Nio
from classes import RCMED
from toolkit import process
from datetime import timedelta ,datetime
+from calendar import monthrange
def reorderXYT(lons, lats, times, values):
# Re-order values in values array such that when reshaped everywhere is where it should be
@@ -52,89 +53,12 @@ def findUnique(seq, idfun=None):
result.append(item)
return result
-def extractData(datasetID, paramID, latMin, latMax, lonMin, lonMax, startTime, endTime, cachedir, timestep):
- """
- Main function to extract data from DB into numpy masked arrays
-
- Input::
- datasetID, paramID: required identifiers of data in database
- latMin, latMax, lonMin, lonMax: location range to extract data for
- startTime, endTime: python datetime objects describing required time range to extract
- cachedir: directory path used to store temporary cache files
- timestep: "daily" | "monthly" so we can be sure to query the RCMED properly
- Output:
- uniqueLatitudes,uniqueLongitudes: 1d-numpy array of latitude and longitude grid values
- uniqueLevels: 1d-numpy array of vertical level values
- timesUnique: list of python datetime objects describing times of returned data
- mdata: masked numpy arrays of data values
-
- """
-
- url = RCMED.jplUrl(datasetID, paramID, latMin, latMax, lonMin, lonMax, startTime, endTime, cachedir, timestep)
-
- # To get the parameter's information from parameter table
- database,timestep,realm,instrument,start_date,end_date,unit=get_param_info(url)
-
- # Create a directory inside the cachedir folder
- name=[]
- # activity is a fix value
- activity="obs4cmip5"
- name.append(activity)
- # product is a fix value
- product="observations"
- name.append(product)
- # realm, variable,frequency and instrument will be get from parameter table
- realm=realm
- name.append(realm)
- variable=database
- name.append(variable)
- frequency=timestep
- name.append(frequency)
- data_structure="grid"
- name.append(data_structure)
- institution="NASA"
- name.append(institution)
- project="RCMES"
- name.append(project)
- instrument=instrument
- name.append(instrument)
- version="v1"
- name.append(version)
-
- # Check to see whether the folder is already created for netCDF or not, then it will be created
- os.chdir(cachedir)
- path=os.getcwd()
- for n in name:
- if os.path.exists(path+"/"+n):
- os.chdir(path+"/"+n)
- path=os.getcwd()
- else:
- os.mkdir(n)
- os.chdir(path+"/"+n)
- path=os.getcwd()
-
- # To establish the netCDF file name
- processing_level='L3'
- processing_version="processing_version" # the processing version is still unknown
- start_date=str(startTime)[0:4]+str(startTime)[5:7]+str(startTime)[8:10]
- end_date=str(endTime)[0:4]+str(endTime)[5:7]+str(endTime)[8:10]
- netCD_fileName=variable + '_' + project + '_' + processing_level + '_' + processing_version + '_' + str(latMin) + '_' + str(latMax) + '_' + str(lonMin) + '_' + str(lonMax) + '_' + start_date + '_' + end_date + '.nc'
-
- # To check if netCDF file exists, then use it
- if os.path.exists(path+"/"+netCD_fileName):
- latitudes, longitudes, uniqueLevels, timesUnique, mdata=read_netcdf(path+"/"+netCD_fileName,timestep)
- # If the netCDF file does not exists, then create one.
- else:
- latitudes, longitudes, uniqueLevels, timesUnique, mdata=create_netCDF(url, database, latMin, latMax, lonMin, lonMax, startTime, endTime, unit, path+"/"+netCD_fileName,timestep)
-
- return latitudes, longitudes, uniqueLevels, timesUnique, mdata
-
def get_param_info(url):
'''
This function will get the general information by given URL from the parameter table.
'''
- url=url + "&info=yes"
+ url = url + "&info=yes"
result = urllib2.urlopen(url)
datastring = result.read()
datastring=json.loads(datastring)
@@ -146,13 +70,14 @@ def get_param_info(url):
end_date=datastring["end_date"]
unit=datastring["units"]
- return database,timestep,realm,instrument,start_date,end_date,unit
-
-
-def create_netCDF(url, database, latMin, latMax, lonMin, lonMax, startTime, endTime, unit, netCD_fileName, timestep):
+ return database, timestep, realm, instrument, start_date, end_date, unit
+def get_data(url):
+    '''
+    This function takes the URL, queries the database, and returns the data points'
+    latitudes, longitudes, levels, times and values.
+    '''
- print 'Starting retrieval from DB (this may take several minutes)'
result = urllib2.urlopen(url)
datastring = result.read()
d = re.search('data: \r\n', datastring)
@@ -177,28 +102,18 @@ def create_netCDF(url, database, latMin,
timestamps.append(row[3])
values.append(np.float32(row[4]))
- # Need to sort time to make sure start and end times are correct
- time_label_list = timestamps
- print "start sort"
- time_label_list.sort()
- print "done sort"
- start_time_label = time_label_list[0]
- end_time_label = time_label_list[-1]
-
- hours=[]
- timeFormat = "%Y-%m-%d %H:%M:%S"
- base_date=datetime.strptime(start_time_label, timeFormat)
- # To convert the date to hours
- for t in timestamps:
- date=datetime.strptime(t, timeFormat)
- dif=date-base_date
- hours.append(dif.days*24)
+ return latitudes, longitudes, levels, values, timestamps
+
+def create_netCDF(latitudes, longitudes, levels, values, timestamps, database, latMin, latMax, lonMin, lonMax, startTime, endTime, unit, netCD_fileName):
+ '''
+ This function will generate a netCDF cache file from the retrieved data.
+ '''
+
# To generate netCDF file from database
- print "Generating netCDF file in the cache directory...."
netcdf = Nio.open_file(netCD_fileName,'w')
- string="The netCDF file for parameter: " + database + ", latMin: " + str(latMin) + ", latMax: " + str(latMax) + ", lonMin: " + str(lonMin) + ", lonMax: " + str(lonMax) + " startTime: " + str(start_time_label) + " and endTime: " + str(end_time_label) + "."
+ string="The netCDF file for parameter: " + database + ", latMin: " + str(latMin) + ", latMax: " + str(latMax) + ", lonMin: " + str(lonMin) + ", lonMax: " + str(lonMax) + " startTime: " + str(startTime) + " and endTime: " + str(endTime) + "."
netcdf.globalAttName = str(string)
netcdf.create_dimension('dim', len(latitudes))
latitude = netcdf.create_variable('lat', 'd', ('dim',))
@@ -212,28 +127,35 @@ def create_netCDF(url, database, latMin,
netcdf.variables['lon'].varAttName = 'longitude'
netcdf.variables['lon'].units = 'degrees_east'
netcdf.variables['time'].varAttName = 'time'
- netcdf.variables['time'].units = 'hours since ' + str(start_time_label)
+ netcdf.variables['time'].units = 'hours since ' + str(startTime)
netcdf.variables['value'].varAttName = 'value'
netcdf.variables['value'].units = str(unit)
netcdf.variables['lev'].varAttName = 'level'
netcdf.variables['lev'].units = 'hPa'
+
+ hours=[]
+ timeFormat = "%Y-%m-%d %H:%M:%S"
+ base_date=startTime
+ # To convert each timestamp to hours since the base date (startTime)
+ for t in timestamps:
+ date=datetime.strptime(t, timeFormat)
+ diff=date-base_date
+ hours.append(diff.days*24)
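+ # For example (hypothetical values): with startTime 2002-01-01 00:00:00, a timestamp
+ # of 2002-01-03 00:00:00 becomes 48; note that diff.days*24 keeps whole days only,
+ # so any sub-day part of a timestamp is dropped.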
+
latitude[:]=latitudes[:]
longitude[:]=longitudes[:]
level[:]=levels[:]
time[:]=hours[:]
value[:]=values[:]
netcdf.close()
+
+def read_netcdf(netCD_fileName):
- print "Data stored as netCDF file (cache file)"
-
- latitudes, longitudes, uniqueLevels, timesUnique, mdata = read_netcdf(netCD_fileName,timestep)
-
- return latitudes, longitudes, uniqueLevels, timesUnique, mdata
-
-def read_netcdf(netCD_fileName,timestep):
-
+ '''
+ This function will read the existing netCDF file, convert the hours from the netCDF time variable,
+ and return latitudes, longitudes, levels, times and values.
+ '''
# To use the created netCDF file
- print 'Retrieving data from cache (netCDF file)'
netcdf = Nio.open_file(netCD_fileName , 'r')
# To get all data from netCDF file
latitudes = netcdf.variables['lat'][:]
@@ -245,7 +167,7 @@ def read_netcdf(netCD_fileName,timestep)
# To get the base date
time_unit=netcdf.variables['time'].units
time_unit=time_unit.split(' ')
- base_data=time_unit[2] + " " + time_unit[3]
+ base_date=time_unit[2] + " " + time_unit[3]
netcdf.close()
@@ -254,11 +176,16 @@ def read_netcdf(netCD_fileName,timestep)
# Because time in netCDF file is based on hours since a specific date, it needs to be converted to date format
times=[]
# To convert the base date to the python datetime format
- dt = datetime.strptime(base_data, timeFormat)
- for t in range(len(hours)):
- d=timedelta(hours[t]/24)
- add=dt+d
- times.append(str(add.year) + '-' + str("%02d" % (add.month)) + '-' + str("%02d" % (add.day)) + ' ' + str("%02d" % (add.hour)) + ':' + str("%02d" % (add.minute)) + ':' + str("%02d" % (add.second)))
+ base_date = datetime.strptime(base_date, timeFormat)
+ for each in range(len(hours)):
+ hour=timedelta(hours[each]/24)
+ eachTime=base_date + hour
+ times.append(str(eachTime.year) + '-' + str("%02d" % (eachTime.month)) + '-' + str("%02d" % (eachTime.day)) + ' ' + str("%02d" % (eachTime.hour)) + ':' + str("%02d" % (eachTime.minute)) + ':' + str("%02d" % (eachTime.second)))
+
+ return latitudes, longitudes, levels, times, values
+
+
+def improve_data(latitudes, longitudes, levels, times, values, timestep):
# Make arrays of unique latitudes, longitudes, levels and times
uniqueLatitudes = np.unique(latitudes)
@@ -276,6 +203,7 @@ def read_netcdf(netCD_fileName,timestep)
# Convert each unique time from strings into list of Python datetime objects
# TODO - LIST COMPS!
+ timeFormat = "%Y-%m-%d %H:%M:%S"
timesUnique = [datetime.strptime(t, timeFormat) for t in uniqueTimestamps]
timesUnique.sort()
timesUnique = process.normalizeDatetimes(timesUnique, timestep)
@@ -296,5 +224,120 @@ def read_netcdf(netCD_fileName,timestep)
# -these make functions like values.mean(), values.max() etc ignore missing values
mdi = -9999 # TODO: extract this value from the DB retrieval metadata
mdata = ma.masked_array(values, mask=(values == mdi))
+
+
+ return latitudes, longitudes, uniqueLevels, timesUnique, mdata
+
+
+def extractData(datasetID, paramID, latMin, latMax, lonMin, lonMax, userStartTime, userEndTime, cachedir, timestep):
+ """
+ Main function to extract data from the DB into numpy masked arrays, and also to create monthly netCDF files as a cache
+
+ Input::
+ datasetID, paramID: required identifiers of data in database
+ latMin, latMax, lonMin, lonMax: location range to extract data for
+ userStartTime, userEndTime: python datetime objects describing required time range to extract
+ cachedir: directory path used to store temporary cache files
+ timestep: "daily" | "monthly" so we can be sure to query the RCMED properly
+ Output:
+ uniqueLatitudes,uniqueLongitudes: 1d-numpy array of latitude and longitude grid values
+ uniqueLevels: 1d-numpy array of vertical level values
+ timesUnique: list of python datetime objects describing times of returned data
+ mdata: masked numpy arrays of data values
+ """
+
+ url = RCMED.jplUrl(datasetID, paramID, latMin, latMax, lonMin, lonMax, userStartTime, userEndTime, cachedir, timestep)
+
+ # To get the parameter's information from parameter table
+ database, timestep, realm, instrument, dbStartDate, dbEndDate, unit = get_param_info(url)
+
+ # Create a directory inside the cache directory
+ name = []
+ # activity is a fixed value
+ activity = "obs4cmip5"
+ name.append(activity)
+ # product is a fixed value
+ product = "observations"
+ name.append(product)
+ # realm, variable, frequency and instrument are taken from the parameter table
+ realm = realm
+ name.append(realm)
+ variable = database
+ name.append(variable)
+ frequency = timestep
+ name.append(frequency)
+ data_structure = "grid"
+ name.append(data_structure)
+ institution = "NASA"
+ name.append(institution)
+ project = "RCMES"
+ name.append(project)
+ instrument = instrument
+ name.append(instrument)
+ version = "v1"
+ name.append(version)
+
+ # Check whether the netCDF cache folder already exists; if not, create it
+ os.chdir(cachedir)
+ path=os.getcwd()
+ for n in name:
+ if os.path.exists(path + "/" + n):
+ os.chdir(path + "/" + n)
+ path=os.getcwd()
+ else:
+ os.mkdir(n)
+ os.chdir(path + "/" + n)
+ path=os.getcwd()
+
+ processing_level = 'L3'
+ processing_version = "processing_version" # the processing version is still unknown and can be added later
+
+ timeFormat = "%Y-%m-%d %H:%M:%S"
+
+ date_list, lats, longs, uniqueLevls, uniqueTimes, vals = [], [], [], [], [], []
+
+ # To make a list (date_list) of [beginning, end] datetime pairs for every month in the user's requested time range
+ while userStartTime <= userEndTime:
+ #To get the beginning of month
+ beginningOfMonth = str("%04d" % userStartTime.year) + "-" + str("%02d" % userStartTime.month) + "-" + "01 00:00:00"
+ #To get the end of month
+ endOfMonth = str("%04d" % userStartTime.year) + "-" + str("%02d" % userStartTime.month) + "-" + str(monthrange(userStartTime.year,userStartTime.month)[1]) + " 00:00:00"
+ #To convert both beginning and end of month from string to Python datetime format
+ beginningOfMonth = datetime.strptime(beginningOfMonth, timeFormat)
+ endOfMonth = datetime.strptime(endOfMonth, timeFormat)
+ #To add beginning and end of month as a list to the date_list list
+ date_list.append([beginningOfMonth, endOfMonth])
+ #To get the beginning of next month
+ userStartTime= endOfMonth + timedelta(days=1)
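+ # For example (hypothetical request): userStartTime 2002-01-15 and userEndTime 2002-03-10
+ # give date_list = [[2002-01-01, 2002-01-31], [2002-02-01, 2002-02-28], [2002-03-01, 2002-03-31]];
+ # each month in the range is covered in full, even if the request starts or ends mid-month.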
+
+ print 'Starting data retrieval (this may take several minutes) ......'
+ # To loop over all months and collect data
+ for date in date_list:
+ netCDF_name = variable + '_' + project + '_' + processing_level + '_' + processing_version + '_' + str(latMin) + '_' + str(latMax) + '_' + str(lonMin) + '_' + str(lonMax) + '_' + str("%04d" % date[0].year) + str("%02d" % date[0].month) + '.nc'
+
+ # To check if netCDF file exists, then use it
+ if os.path.exists(path+"/"+ netCDF_name):
+ latitudes, longitudes, levels, times, values = read_netcdf(path + "/" + netCDF_name)
+
+ # If the netCDF file does not exist, then create one and read it.
+ else:
+ # To query for just one month of data
+ url = RCMED.jplUrl(datasetID, paramID, latMin, latMax, lonMin, lonMax, date[0], date[1], cachedir, timestep)
+
+ # To get data from DB
+ latitudes, longitudes, levels, values, timestamps = get_data(url)
+ create_netCDF(latitudes, longitudes, levels, values, timestamps, database, latMin, latMax, lonMin, lonMax, date[0], date[1], unit, path + "/" + netCDF_name)
+
+ # To read from netCDF files
+ latitudes, longitudes, levels, times, values = read_netcdf(path + "/" + netCDF_name)
+
+ lats=np.append(lats,latitudes)
+ longs=np.append(longs,longitudes)
+ uniqueLevls=np.append(uniqueLevls,levels)
+ uniqueTimes=np.append(uniqueTimes,times)
+ vals=np.append(vals,values)
+
+ latitudes, longitudes, uniqueLevels, timesUnique, mdata = improve_data(lats, longs, uniqueLevls, uniqueTimes, vals, timestep)
+
return latitudes, longitudes, uniqueLevels, timesUnique, mdata
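For reference, a minimal usage sketch of the reworked extractData(). The dataset/parameter IDs, bounding box, cache directory and import path below are hypothetical placeholders, not values taken from this commit, and assume the rcmes package is on the Python path:

    from datetime import datetime
    from storage import db

    # Hypothetical identifiers and region; substitute real RCMED dataset/parameter IDs.
    lats, lons, levels, times, mdata = db.extractData(
        3, 36,                                   # datasetID, paramID (placeholders)
        25.0, 45.0, -120.0, -100.0,              # latMin, latMax, lonMin, lonMax
        datetime(2002, 1, 1), datetime(2002, 12, 31),
        '/tmp/rcmes_cache', 'monthly')
    # On the first call one netCDF cache file per month is written under cachedir;
    # later calls for the same parameter, region and months reuse those files.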