You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@climate.apache.org by jo...@apache.org on 2014/07/01 16:49:27 UTC
[03/56] [partial] gh-pages clean up
http://git-wip-us.apache.org/repos/asf/climate/blob/a53e3af5/rcmet/src/main/python/rcmes/toolkit/metrics.py
----------------------------------------------------------------------
diff --git a/rcmet/src/main/python/rcmes/toolkit/metrics.py b/rcmet/src/main/python/rcmes/toolkit/metrics.py
deleted file mode 100644
index 7fbe174..0000000
--- a/rcmet/src/main/python/rcmes/toolkit/metrics.py
+++ /dev/null
@@ -1,1080 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-'''
-Module storing functions to calculate statistical metrics from numpy arrays
-'''
-
-import subprocess
-import os, sys
-import datetime
-import numpy as np
-import numpy.ma as ma
-import scipy.stats as stats
-import matplotlib.pyplot as plt
-from toolkit import plots, process
-from utils import misc
-from storage import files
-
-def calcAnnualCycleMeans(dataset1):
- '''
- Purpose::
- Calculate annual cycle in terms of monthly means at every grid point.
-
- Input::
- dataset1 - 3d numpy array of data in (t,lat,lon) or 1d numpy array timeseries
-
- Output::
- means - if 3d numpy was entered, 3d (# of months,lat,lon), if 1d data entered
- it is a timeseries of the data of length # of months
-
- '''
- data = misc.reshapeMonthlyData(dataset1)
- means = data.mean(axis = 0)
- return data, means
-
-def calcAnnualCycleMeansSubRegion(dataset1):
- '''
- Purpose::
- Calculate annual cycle in terms of monthly means at every sub-region.
-
- Input::
- dataset1 - 2d numpy array of data in (region, t)
-
- Output::
- means - (region, # of months)
-
- '''
- nregion, nT = dataset1.shape
- data = dataset1.reshape[nregion, nT/12, 12]
- means = data.mean(axis = 1)
- return data, means
-
-
-def calcClimYear(dataset1):
- '''
- Purpose::
- Calculate annual mean timeseries and climatology for both 2-D and point time series.
-
- Input::
- dataset1 - 3d numpy array of data in (t,lat,lon) or 1d numpy array timeseries
-
- Output::
- tSeries - if 3d numpy was entered, 3d (nYr,lat,lon), if 1d data entered
- it is a timeseries of the data of length nYr
- means - if 3d numpy was entered, 2d (lat,lon), if 1d data entered
- it is a floating point number representing the overall mean
- '''
- data = misc.reshapeMonthlyData(dataset1)
- tSeries = data.mean(axis = 1)
- means = tSeries.mean(axis = 0)
- return tSeries, means
-
-def calcClimSeason(monthBegin, monthEnd, dataset1):
- '''
- Purpose ::
- Calculate seasonal mean montheries and climatology for both 3-D and point time series.
- For example, to calculate DJF mean time series, monthBegin = 12, monthEnd =2
- This can handle monthBegin=monthEnd i.e. for climatology of a specific month
-
- Input::
- monthBegin - an integer for the beginning month (Jan =1)
- monthEnd - an integer for the ending month (Jan = 1)
- dataset1 - 3d numpy array of data in (t,lat,lon) or 1d numpy array montheries
-
- Output::
- tSeries - if 3d numpy was entered, 3d (number of years/number of years -1 if monthBegin > monthEnd,lat,lon),
- if 1d data entered it is a montheries of the data of length number of years
- means - if 3d numpy was entered, 2d (lat,lon), if 1d data entered
- it is a floating point number representing the overall mean
- '''
- if monthBegin > monthEnd:
- # Offset the original array so that the the first month
- # becomes monthBegin, note that this cuts off the first year of data
- offset = slice(monthBegin - 1, monthBegin - 13)
- data = misc.reshapeMonthlyData(dataset1[offset])
- monthIndex = slice(0, 13 - monthBegin + monthEnd)
- else:
- # Since monthBegin <= monthEnd, just take a slice containing those months
- data = misc.reshapeMonthlyData(dataset1)
- monthIndex = slice(monthBegin - 1, monthEnd)
-
- tSeries = data[:, monthIndex].mean(axis = 1)
- means = tSeries.mean(axis = 0)
- return tSeries, means
-
-def calcClimSeasonSubRegion(monthBegin, monthEnd, dataset1):
- '''
- Purpose ::
- Calculate seasonal mean montheries and climatology for both 3-D and point time series.
- For example, to calculate DJF mean time series, monthBegin = 12, monthEnd =2
- This can handle monthBegin=monthEnd i.e. for climatology of a specific month
-
- Input::
- monthBegin - an integer for the beginning month (Jan =1)
- monthEnd - an integer for the ending month (Jan = 1)
- dataset1 - 3d numpy array of data in (region, t) or 1d numpy array montheries
-
- Output::
- tSeries - (region, number of years/number of years -1 if monthBegin > monthEnd,lat,lon),
- means - (region)
- '''
- nregion, nT = dataset1.shape
- nYR = nT/12
- if monthBegin > monthEnd:
- # Offset the original array so that the the first month
- # becomes monthBegin, note that this cuts off the first year of data
- offset = slice(monthBegin - 1, monthBegin - 13)
- data = dataset1[:,offset].reshape([nregion,nYR-1, 12])
- monthIndex = slice(0, 13 - monthBegin + monthEnd)
- else:
- # Since monthBegin <= monthEnd, just take a slice containing those months
- data = dataset1.reshape([nregion,nYR,12])
- monthIndex = slice(monthBegin - 1, monthEnd)
-
- tSeries = data[:, :, monthIndex].mean(axis = 2)
- means = tSeries.mean(axis = 1)
- return tSeries, means
-
-def calcAnnualCycleStdev(dataset1):
- '''
- Purpose::
- Calculate monthly standard deviations for every grid point
-
- Input::
- dataset1 - 3d numpy array of data in (12* number of years,lat,lon)
-
- Output::
- stds - if 3d numpy was entered, 3d (12,lat,lon)
- '''
- data = misc.reshapeMonthlyData(dataset1)
- stds = data.std(axis = 0, ddof = 1)
- return stds
-
-def calcAnnualCycleStdevSubRegion(dataset1):
- '''
- Purpose::
- Calculate monthly standard deviations for every sub-region
-
- Input::
- dataset1 - 2d numpy array of data in (nregion, 12* number of years)
-
- Output::
- stds - (nregion, 12)
- '''
- nregion, nT = data1.shape
- data = dataset1.reshape([nregion, nT/12, 12])
- stds = data.std(axis = 1, ddof = 1)
- return stds
-
-def calcAnnualCycleDomainMeans(dataset1):
- '''
- Purpose::
- Calculate spatially averaged monthly climatology and standard deviation
-
- Input::
- dataset1 - 3d numpy array of data in (12* number of years,lat,lon)
-
- Output::
- means - time series (12)
- '''
- data = misc.reshapeMonthlyData(dataset1)
-
- # Calculate the means, month by month
- means = np.zeros(12)
- for i in np.arange(12):
- means[i] = data[:, i, :, :].mean()
-
- return means
-
-def calcSpatialStdevRatio(evaluationData, referenceData):
- '''
- Purpose ::
- Calculate the ratio of spatial standard deviation (model standard deviation)/(observed standard deviation)
-
- Input ::
- evaluationData - model data array (lat, lon)
- referenceData- observation data array (lat,lon)
-
- Output::
- ratio of standard deviation (a scholar)
-
- '''
- stdevRatio = evaluationData[(evaluationData.mask==False) & (referenceData.mask==False)].std()/ \
- referenceData[(evaluationData.mask==False) & (referenceData.mask==False)].std()
- return stdevRatio
-
-def calcTemporalStdev(dataset1):
- '''
- Purpose::
- Calculate sample standard deviations over the time
-
- Input::
- dataset1 - 3d numpy array of data in (time,lat,lon)
-
-
- Output::
- stds - time series (lat, lon)
- '''
- stds = dataset1.std(axis = 0, ddof = 1)
- return stds
-
-def calcAnnualCycleDomainStdev(dataset1):
- '''
- Purpose::
- Calculate sample standard deviations representing the domain in each month
-
- Input::
- dataset1 - 3d numpy array of data in (12* number of years,lat,lon)
-
- Output::
- stds - time series (12)
- '''
- data = misc.reshapeMonthlyData(dataset1)
-
- # Calculate the standard deviation, month by months
- stds = np.zeros(12)
- for i in np.arange(12):
- stds[i] = data[:, i, :, :].std(ddof = 1)
-
- return stds
-
-def calcBiasAveragedOverTime(evaluationData, referenceData, option): # Mean Bias
- '''
- Purpose::
- Calculate the mean difference between two fields over time for each grid point.
-
- Input::
- referenceData - array of data
- evaluationData - array of data with same dimension of referenceData
- option - string indicating absolute values or not
-
- Output::
- bias - difference between referenceData and evaluationData averaged over the first dimension
-
- '''
- # Calculate mean difference between two fields over time for each grid point
- # Precrocessing of both obs and model data ensures the absence of missing values
- diff = evaluationData - referenceData
- if(option == 'abs'):
- diff = abs(diff)
- bias = diff.mean(axis = 0)
- return bias
-
-
-def calcBiasAveragedOverTimeAndSigLev(evaluationData, referenceData):
- '''
- Purpose::
- Calculate mean difference between two fields over time for each grid point
-
- Classify missing data resulting from multiple times (using threshold
- data requirement)
-
- i.e. if the working time unit is monthly data, and we are dealing with
- multiple months of data then when we show mean of several months, we need
- to decide what threshold of missing data we tolerate before classifying a
- data point as missing data.
-
-
- Input::
- referenceData - array of data
- evaluationData - array of data with same dimension of referenceData
-
- Output::
- bias - difference between referenceData and evaluationData averaged over the first dimension
- sigLev - significance of the difference (masked array)
- For example: sig[iy,ix] = 0.95 means that the observation and model is different at 95% confidence level
- at X=ix and Y=iy
- '''
- # If either gridcell in each data set is missing, set that cell to
- # missing for the output significance level
- evaluationDataMask = process.create_mask_using_threshold(evaluationData, threshold = 0.75)
- referenceDataMask = process.create_mask_using_threshold(referenceData, threshold = 0.75)
-
- # The overall mask associated with missing data
- overallMask = np.logical_or(evaluationDataMask, referenceDataMask)
-
- diff = evaluationData - referenceData
- bias = diff.mean(axis = 0)
- sigLev = 1 - stats.ttest_rel(evaluationData, referenceData, axis = 0)[1]
- sigLev[overallMask] = -100.
- sigLev = ma.masked_equal(sigLev, -100.)
- # Set mask for bias metric using missing data in obs or model data series
- # i.e. if obs contains more than threshold (e.g.50%) missing data
- # then classify time average bias as missing data for that location.
- bias = ma.masked_array(bias.data, overallMask)
- return bias, sigLev
-
-
-def calcBiasAveragedOverTimeAndDomain(evaluationData, referenceData):
- '''
- Purpose::
- Calculate the mean difference between two fields over time and domain
- Input::
- referenceData - array of data
- evaluationData - array of data with same dimension of referenceData
-
- Output::
- bias - difference between referenceData and evaluationData averaged over time and space
-
- '''
-
- diff = evaluationData - referenceData
-
- bias = diff.mean()
- return bias
-
-def calcBias(evaluationData, referenceData):
- '''
- Purpose::
- Calculate the difference between two fields at each grid point
-
- Input::
- referenceData - array of data
- evaluationData - array of data with same dimension of referenceData
-
- Output::
- diff - difference between referenceData and evaluationData
-
- '''
-
- diff = evaluationData - referenceData
- return diff
-
-def calcRootMeanSquaredDifferenceAveragedOverTime(evaluationData, referenceData):
- '''
- Purpose::
- Calculate root mean squared difference (RMS errors) averaged over time between two fields for each grid point
-
- Input::
- referenceData - array of data
- evaluationData - array of data with same dimension of referenceData
-
- Output::
- rms - root mean squared difference, if the input is 1-d data, the output becomes a single floating number.
-
- '''
- sqdiff = (evaluationData - referenceData)** 2
- rms = np.sqrt(sqdiff.mean(axis = 0))
- return rms
-
-
-def calcRootMeanSquaredDifferenceAveragedOverTimeAndDomain(evaluationData, referenceData):
- '''
- Purpose::
- Calculate root mean squared difference (RMS errors) averaged over time and space between two fields
-
- Input::
- referenceData - array of data (should be 3-d array)
- evaluationData - array of data with same dimension of referenceData
-
- Output::
- rms - root mean squared difference averaged over time and space
- '''
- sqdiff = (evaluationData - referenceData)** 2
- rms = np.sqrt(sqdiff.mean())
- return rms
-
-def calcTemporalCorrelation(evaluationData, referenceData):
- '''
- Purpose ::
- Calculate the temporal correlation.
-
- Assumption(s) ::
- The first dimension of two datasets is the time axis.
-
- Input ::
- evaluationData - model data array of any shape
- referenceData- observation data array of any shape
-
- Output::
- temporalCorelation - A 2-D array of temporal correlation coefficients at each subregion
- sigLev - A 2-D array of confidence levels related to temporalCorelation
-
- REF: 277-281 in Stat methods in atmos sci by Wilks, 1995, Academic Press, 467pp.
- sigLev: the correlation between model and observation is significant at sigLev * 100 %
- '''
- evaluationDataMask = process.create_mask_using_threshold(evaluationData, threshold = 0.75)
- referenceDataMask = process.create_mask_using_threshold(referenceData, threshold = 0.75)
-
- nregion = evaluationData.shape[0]
- temporalCorrelation = ma.zeros([nregion])-100.
- sigLev = ma.zeros([nregion])-100.
- for iregion in np.arange(nregion):
- temporalCorrelation[iregion], sigLev[iregion] = stats.pearsonr(evaluationData[iregion,:], referenceData[iregion,:])
- sigLev[iregion] = 1 - sigLev[iregion]
-
- temporalCorrelation=ma.masked_equal(temporalCorrelation.data, -100.)
- sigLev=ma.masked_equal(sigLev.data, -100.)
-
- return temporalCorrelation, sigLev
-
-def calcTemporalCorrelationSubRegion(evaluationData, referenceData):
- '''
- Purpose ::
- Calculate the temporal correlation.
-
- Assumption(s) ::
- both evaluation and reference data are subregion averaged time series
-
- Input ::
- evaluationData - model data array [region,t]
- referenceData- observation data [region, t]
-
- Output::
- temporalCorelation - A 1-D array of temporal correlation coefficients at each grid point.
- sigLev - A 1-D array of confidence levels related to temporalCorelation
-
- REF: 277-281 in Stat methods in atmos sci by Wilks, 1995, Academic Press, 467pp.
- sigLev: the correlation between model and observation is significant at sigLev * 100 %
- '''
- evaluationDataMask = process.create_mask_using_threshold(evaluationData, threshold = 0.75)
- referenceDataMask = process.create_mask_using_threshold(referenceData, threshold = 0.75)
-
- ngrdY = evaluationData.shape[1]
- ngrdX = evaluationData.shape[2]
- temporalCorrelation = ma.zeros([ngrdY,ngrdX])-100.
- sigLev = ma.zeros([ngrdY,ngrdX])-100.
- for iy in np.arange(ngrdY):
- for ix in np.arange(ngrdX):
- if not evaluationDataMask[iy,ix] and not referenceDataMask[iy,ix]:
- t1=evaluationData[:,iy,ix]
- t2=referenceData[:,iy,ix]
- if t1.min()!=t1.max() and t2.min()!=t2.max():
- temporalCorrelation[iy,ix], sigLev[iy,ix]=stats.pearsonr(t1[(t1.mask==False) & (t2.mask==False)],
- t2[(t1.mask==False) & (t2.mask==False)])
- sigLev[iy,ix]=1-sigLev[iy,ix] # p-value => confidence level
-
- temporalCorrelation=ma.masked_equal(temporalCorrelation.data, -100.)
- sigLev=ma.masked_equal(sigLev.data, -100.)
-
- return temporalCorrelation, sigLev
-
-def calcPatternCorrelation(evaluationData, referenceData):
- '''
- Purpose ::
- Calculate the spatial correlation.
-
- Input ::
- evaluationData - model data array (lat, lon)
- referenceData- observation data array (lat,lon)
-
- Output::
- patternCorrelation - a single floating point
- sigLev - a single floating point representing the confidence level
-
- '''
-
- patternCorrelation, sigLev = stats.pearsonr(evaluationData[(evaluationData.mask==False) & (referenceData.mask==False)],
- referenceData[(evaluationData.mask==False) & (referenceData.mask==False)])
- return patternCorrelation, sigLev
-
-
-def calcPatternCorrelationEachTime(evaluationData, referenceData):
- '''
- Purpose ::
- Calculate the spatial correlation for each time
-
- Assumption(s) ::
- The first dimension of two datasets is the time axis.
-
- Input ::
- evaluationData - model data array (time,lat, lon)
- referenceData- observation data array (time,lat,lon)
-
- Output::
- patternCorrelation - a timeseries (time)
- sigLev - a time series (time)
- '''
- nT = evaluationData.shape[0]
- patternCorrelation = ma.zeros(nT)-100.
- sigLev = ma.zeros(nT)-100.
- for it in np.arange(nT):
- patternCorrelation[it], sigLev[it] = calcPatternCorrelation(evaluationData[it,:,:], referenceData[it,:,:])
-
- return patternCorrelation,sigLev
-
-def calcNashSutcliff(evaluationData, referenceData):
- '''
- Assumption(s)::
- Both evaluationData and referenceData are the same shape.
- * lat, lon must match up
- * time steps must align (i.e. months vs. months)
-
- Input::
- evaluationData - 3d (time, lat, lon) array of data
- referenceData - 3d (time, lat, lon) array of data
-
- Output:
- nashcor - 1d array aligned along the time dimension of the input
- datasets. Time Series of Nash-Sutcliff Coefficient of efficiency
-
- '''
- # Flatten the spatial dimensions
- data1 = evaluationData[:]
- data2 = referenceData[:]
- nT = data1.shape[0]
- data1.shape = nT, data1.size / nT
- data2.shape = nT, data2.size / nT
- meanData2 = data2.mean(axis = 1)
-
- # meanData2 must be reshaped to 2D as to obey
- # numpy broadcasting rules
- meanData2.shape = nT, 1
- nashcor = 1 - ((((data2 - data1) ** 2).sum(axis = 1)) /
- (((data2 - meanData2) ** 2).sum(axis = 1)))
- return nashcor
-
-def getPdfInputValues():
- print '****PDF input values from user required **** \n'
- nbins = int (raw_input('Please enter the number of bins to use. \n'))
- minEdge = float(raw_input('Please enter the minimum value to use for the edge. \n'))
- maxEdge = float(raw_input('Please enter the maximum value to use for the edge. \n'))
-
- return nbins, minEdge, maxEdge
-
-
-def calcPdf(evaluationData, referenceData, settings=None):
- '''
- Routine to calculate a normalized Probability Distribution Function with
- bins set according to data range.
- Equation from Perkins et al. 2007
-
- PS=sum(min(Z_O_i, Z_M_i)) where Z is the distribution (histogram of the data for either set)
- called in do_rcmes_processing_sub.py
-
- Inputs::
- evaluationData (3D numpy array): array shape (time, lat, lon)
- referenceData (3D numpy array): array shape (time, lat, lon)
- settings (tuple): [optional] format (binCount, minEdge, maxEdge)
- binCount (int): number of bins to use
- minEdge (int|float): minimum edge
- maxEdge (int|float): maximum edge
-
- NB, time here is the number of time values eg for time period
- 199001010000 - 199201010000
-
- Assumptions::
- If annual means-opt 1, was chosen, then referenceData.shape = (YearsCount,lat,lon)
-
- If monthly means - opt 2, was choosen, then referenceData.shape = (MonthsCount,lat,lon)
-
- Output::
-
- one float which represents the PDF for the year
-
- TODO: Clean up this docstring so we have a single purpose statement
-
- Routine to calculate a normalised PDF with bins set according to data range.
-
- Input::
- 2 data arrays, modelData and obsData
-
- Output::
- PDF for the year
-
- '''
- similarityScore = 0.0
- print 'min modelData', evaluationData[:, :, :].min()
- print 'max modelData', evaluationData[:, :, :].max()
- print 'min obsData', referenceData[:, :, :].min()
- print 'max obsData', referenceData[:, :, :].max()
-
- if settings == None:
- nbins, minEdge, maxEdge = getPdfInputValues()
- else:
- nbins, minEdge, maxEdge = settings
-
-
- mybins = np.linspace(minEdge, maxEdge, nbins)
- print 'nbins is', nbins, 'mybins are', mybins
-
- pdfMod, edges = np.histogram(evaluationData, bins = mybins, normed = True, density = True)
- print 'evaluationData distribution and edges', pdfMod, edges
- pdfObs, edges = np.histogram(referenceData, bins = mybins, normed = True, density = True)
- print 'referenceData distribution and edges', pdfObs, edges
-
- #find minimum at each bin between lists
- i = 0
- for model_value in pdfMod :
- print 'model_value is', model_value, 'pdfObs[', i, '] is', pdfObs[i]
- if model_value < pdfObs[i]:
- similarityScore += model_value
- else:
- similarityScore += pdfObs[i]
- i += 1
- print 'similarity_score is', similarityScore
- return similarityScore
-
-
-def metrics_plots(varName, numOBS, numMDL, nT, ngrdY, ngrdX, Times, lons,
- lats, obsData, mdlData, obsList, mdlList, workdir,subRegions, FoutOption):
- '''
- Calculate evaluation metrics and generate plots.
- '''
- ##################################################################################################################
- # Routine to compute evaluation metrics and generate plots
- # (1) metric calculation
- # (2) plot production
- # Input:
- # numOBS - the number of obs data. either 1 or >2 as obs ensemble is added for multi-obs cases
- # numMDL - the number of mdl data. either 1 or >2 as obs ensemble is added for multi-mdl cases
- # nT - the length of the data in the time dimension
- # ngrdY - the length of the data in Y dimension
- # ngrdX - the length of the data in the X dimension
- # Times - time stamps
- # lons,lats - longitude & latitude values of the data domain (same for model & obs)
- # obsData - obs data, either a single or multiple + obs_ensemble, interpolated onto the time- and
- # grid for analysis
- # mdlData - mdl data, either a single or multiple + obs_ensemble, interpolated onto the time- and
- # spatial grid for analysis
- #JK2.0: obsRgn - obs time series averaged for subregions: Local variable in v2.0
- #JK2.0: mdlRgn - obs time series averaged for subregions: Local variable in v2.0
- # obsList - string describing the observation data files
- # mdlList - string describing model file names
- # workdir - string describing the directory path for storing results and plots
- #JK2.0: mdlSelect - the mdl data to be evaluated: Locally determined in v.2.0
- #JK2.0: obsSelect - the obs data to be used as the reference for evaluation: Locally determined in v.2.0
- #JK2.0: numSubRgn - the number of subregions: Locally determined in v.2.0
- #JK2.0: subRgnName - the names of subregions: Locally determined in v.2.0
- #JK2.0: rgnSelect - the region for which the area-mean time series is to be
- # evaluated/plotted: Locally determined in v.2.0
- # obsParameterId - int, db parameter id. ** this is non-essential once the correct
- # metadata use is implemented
- # precipFlag - to be removed once the correct metadata use is implemented
- # timeRegridOption - string: 'full'|'annual'|'monthly'|'daily'
- # seasonalCycleOption - int (=1 if set) (probably should be bool longterm)
- # metricOption - string: 'bias'|'mae'|'acc'|'pdf'|'patcor'|'rms'|'diff'
- # titleOption - string describing title to use in plot graphic
- # plotFileNameOption - string describing filename stub to use for plot graphic i.e. {stub}.png
- # Output: image files of plots + possibly data
- #******************************************************
- # JK2.0: Only the data interpolated temporally and spatially onto the analysis grid
- # are transferred into this routine. The rest of processing (e.g., area-averaging, etc.)
- # are to be performed in this routine. Do not overwrite obsData[numOBs,nt,ngrdY,ngrdX] &
- # mdlData[numMDL,nt,ngrdY,ngrdX]. These are the raw, re-gridded data to be used repeatedly
- # for multiple evaluation steps as desired by an evaluator
- ##################################################################################################################
-
- #####################################################################################################
- # JK2.0: Compute evaluation metrics and plots to visualize the results
- #####################################################################################################
- # (mp.001) Sub-regions for local timeseries analysis
- #--------------------------------
- # Enter the location of the subrgns via screen input of data;
- # modify this to add an option to read-in from data file(s)
- #----------------------------------------------------------------------------------------------------
- if subRegions:
- numSubRgn = len(subRegions)
- subRgnName = [ x.name for x in subRegions ]
- subRgnLon0 = [ x.lonMin for x in subRegions ]
- subRgnLon1 = [ x.lonMax for x in subRegions ]
- subRgnLat0 = [ x.latMin for x in subRegions ]
- subRgnLat1 = [ x.latMax for x in subRegions ]
- else:
- print ''
- ans = raw_input('Calculate area-mean timeseries for subregions? y/n: \n> ')
- print ''
- if ans == 'y':
- ans = raw_input('Input subregion info interactively? y/n: \n> ')
- if ans == 'y':
- numSubRgn, subRgnName, subRgnLon0, subRgnLon1, subRgnLat0, subRgnLat1 = misc.createSubRegionObjectInteractively()
- else:
- print 'Read subregion info from a pre-fabricated text file'
- ans = raw_input('Read from a defaule file (workdir + "/sub_regions.txt")? y/n: \n> ')
- if ans == 'y':
- subRgnFileName = workdir + "/sub_regions.txt"
- else:
- subRgnFileName = raw_input('Enter the subRgnFileName to read from \n')
- print 'subRgnFileName ', subRgnFileName
- numSubRgn, subRgnName, subRgnLon0, subRgnLon1, subRgnLat0, subRgnLat1 = misc.assign_subRgns_from_a_text_file(subRgnFileName)
- print subRgnName, subRgnLon0, subRgnLon1, subRgnLat0, subRgnLat1
- else:
- numSubRgn = 0
-
- # compute the area-mean timeseries for all subregions if subregion(s) are defined.
- # the number of subregions is usually small and memory usage is usually not a concern
- obsRgn = ma.zeros((numOBS, numSubRgn, nT))
- mdlRgn = ma.zeros((numMDL, numSubRgn, nT))
- if numSubRgn > 0:
- print 'Enter area-averaging: mdlData.shape, obsData.shape ', mdlData.shape, obsData.shape
- print 'Use Latitude/Longitude Mask for Area Averaging'
- for n in np.arange(numSubRgn):
- # Define mask using regular lat/lon box specified by users ('mask=True' defines the area to be excluded)
- maskLonMin = subRgnLon0[n]
- maskLonMax = subRgnLon1[n]
- maskLatMin = subRgnLat0[n]
- maskLatMax = subRgnLat1[n]
- mask = np.logical_or(np.logical_or(lats <= maskLatMin, lats >= maskLatMax),
- np.logical_or(lons <= maskLonMin, lons >= maskLonMax))
- # Calculate area-weighted averages within this region and store in a new list
- for k in np.arange(numOBS): # area-average obs data
- Store = []
- for t in np.arange(nT):
- Store.append(process.calc_area_mean(obsData[k, t, :, :], lats, lons, mymask = mask))
- obsRgn[k, n, :] = ma.array(Store[:])
- for k in np.arange(numMDL): # area-average mdl data
- Store = []
- for t in np.arange(nT):
- Store.append(process.calc_area_mean(mdlData[k, t, :, :], lats, lons, mymask = mask))
- mdlRgn[k, n, :] = ma.array(Store[:])
- Store = [] # release the memory allocated by temporary vars
-
- #-------------------------------------------------------------------------
- # (mp.002) FoutOption: The option to create a binary or netCDF file of processed
- # (re-gridded and regionally-averaged) data for user-specific processing.
- # This option is useful for advanced users who need more than
- # the metrics and vidualization provided in the basic package.
- #----------------------------------------------------------------------------------------------------
- print ''
- if not FoutOption:
- FoutOption = raw_input('Option for output files of obs/model data: Enter no/bn/nc \
- for no, binary, netCDF file \n> ').lower()
- print ''
- # write a binary file for post-processing if desired
- if FoutOption == 'bn':
- fileName = workdir + '/lonlat_eval_domain' + '.bn'
- if(os.path.exists(fileName) == True):
- cmnd = 'rm -f ' + fileName
- subprocess.call(cmnd, shell=True)
- files.writeBN_lola(fileName, lons, lats)
- fileName = workdir + '/Tseries_' + varName + '.bn'
- print "Create regridded data file ", fileName, " for offline processingr"
- print 'The file includes time series of ', numOBS, ' obs and ', numMDL, \
- ' models ', nT, ' steps ', ngrdX, 'x', ngrdY, ' grids'
- if(os.path.exists(fileName) == True):
- cmnd = 'rm -f ' + fileName
- subprocess.call(cmnd, shell=True)
- files.writeBNdata(fileName, numOBS, numMDL, nT, ngrdX, ngrdY, numSubRgn, obsData, mdlData, obsRgn, mdlRgn)
- # write a netCDF file for post-processing if desired
- if FoutOption == 'nc':
- fileName = workdir + '/'+varName+'_Tseries.nc'
- if(os.path.exists(fileName) == True):
- print "removing %s from the local filesystem, so it can be replaced..." % (fileName,)
- files.writeNCfile(fileName, numSubRgn, lons, lats, obsData, mdlData, obsRgn, mdlRgn, obsList, mdlList, subRegions)
- if FoutOption == 'bn':
- print 'The regridded obs and model data are written in the binary file ', fileName
- elif FoutOption == 'nc':
- print 'The regridded obs and model data are written in the netCDF file ', fileName
-
- #####################################################################################################
- ###################### Metrics calculation and plotting cycle starts from here ######################
- #####################################################################################################
-
- print ''
- print 'OBS and MDL data have been prepared for the evaluation step'
- print ''
- doMetricsOption = raw_input('Want to calculate metrics and plot them? [y/n]\n> ').lower()
- print ''
-
- ####################################################
- # Terminate job if no metrics are to be calculated #
- ####################################################
-
- neval = 0
-
- while doMetricsOption == 'y':
- neval += 1
- print ' '
- print 'neval= ', neval
- print ' '
- #--------------------------------
- # (mp.003) Preparation
- #----------------------------------------------------------------------------------------------------
- # Determine info on years (the years in the record [YR] and its number[numYR])
- yy = ma.zeros(nT, 'i')
- for n in np.arange(nT):
- yy[n] = Times[n].strftime("%Y")
- YR = np.unique(yy)
- yy = 0
- nYR = len(YR)
- print 'nYR, YR = ', nYR, YR
-
- # Select the eval domain: over the entire domain (spatial distrib) or regional time series
- anlDomain = 'n'
- anlRgn = 'n'
- print ' '
- analSelect = int(raw_input('Eval over domain (Enter 0) or time series of selected regions (Enter 1) \n> '))
- print ' '
- if analSelect == 0:
- anlDomain = 'y'
- elif analSelect == 1:
- anlRgn = 'y'
- else:
- print 'analSelect= ', analSelect, ' is Not a valid option: CRASH'
- sys.exit()
-
- #--------------------------------
- # (mp.004) Select the model and data to be used in the evaluation step
- #----------------------------------------------------------------------------------------------------
- mdlSelect = misc.select_data(numMDL, Times, mdlList, 'mdl')
- obsSelect = misc.select_data(numOBS, Times, obsList, 'obs')
- mdlName = mdlList[mdlSelect]
- obsName = obsList[obsSelect]
- print 'selected obs and model for evaluation= ', obsName, mdlName
-
-
- #--------------------------------
- # (mp.005) Spatial distribution analysis/Evaluation (anlDomain='y')
- # All climatology variables are 2-d arrays (e.g., mClim = ma.zeros((ngrdY,ngrdX))
- #----------------------------------------------------------------------------------------------------
- if anlDomain == 'y':
- # first determine the temporal properties to be evaluated
- print ''
- timeOption = misc.select_timOpt()
- print 'timeOption=',timeOption
- if timeOption == 1:
- timeScale = 'Annual'
- # compute the annual-mean time series and climatology.
- # mTser=ma.zeros((nYR,ngrdY,ngrdX)), mClim = ma.zeros((ngrdY,ngrdX))
- mTser, mClim = calcClimYear(mdlData[mdlSelect, :, :, :])
- oTser, oClim = calcClimYear(obsData[obsSelect, :, :, :])
- elif timeOption == 2:
- timeScale = 'Seasonal'
- # select the timeseries and climatology for a season specifiec by a user
- print ' '
- moBgn = int(raw_input('Enter the beginning month for your season. 1-12: \n> '))
- moEnd = int(raw_input('Enter the ending month for your season. 1-12: \n> '))
- print ' '
- mTser, mClim = calcClimSeason(moBgn, moEnd, mdlData[mdlSelect, :, :, :])
- oTser, oClim = calcClimSeason(moBgn, moEnd, obsData[obsSelect, :, :, :])
- elif timeOption == 3:
- timeScale = 'Monthly'
- # compute the monthly-mean time series and climatology
- # Note that the shapes of the output vars are:
- # mTser = ma.zeros((nYR,12,ngrdY,ngrdX)) & mClim = ma.zeros((12,ngrdY,ngrdX))
- # Also same for oTser = ma.zeros((nYR,12,ngrdY,ngrdX)) &,oClim = ma.zeros((12,ngrdY,ngrdX))
- mTser, mClim = calcAnnualCycleMeans(mdlData[mdlSelect, :, :, :])
- oTser, oClim = calcAnnualCycleMeans(obsData[obsSelect, :, :, :])
- else:
- # undefined process options. exit
- print 'The desired temporal scale is not available this time. END the job'
- sys.exit()
-
- #--------------------------------
- # (mp.006) Select metric to be calculated
- # bias, mae, acct, accs, pcct, pccs, rmst, rmss, pdfSkillScore
- #----------------------------------------------------------------------------------------------------
- print ' '
- metricOption = misc.select_metrics()
- print ' '
-
- # metrics below yields 2-d array, i.e., metricDat = ma.array((ngrdY,ngrdX))
- if metricOption == 'BIAS':
- metricDat, sigLev = calcBiasAveragedOverTimeAndSigLev(mTser, oTser)
- oStdv = calcTemporalStdev(oTser)
- elif metricOption == 'MAE':
- metricDat= calcBiasAveragedOverTime(mTser, oTser, 'abs')
- # metrics below yields 1-d time series
- elif metricOption == 'PCt':
- metricDat, sigLev = calcPatternCorrelationEachTime(mTser, oTser)
- # metrics below yields 2-d array, i.e., metricDat = ma.array((ngrdY,ngrdX))
- elif metricOption == 'TC':
-
- metricDat, sigLev = calcTemporalCorrelation(mTser, oTser)
- # metrics below yields a scalar values
- elif metricOption == 'PCC':
- metricDat, sigLev = calcPatternCorrelation(mClim, oClim)
- # metrics below yields 2-d array, i.e., metricDat = ma.array((ngrdY,ngrdX))
- elif metricOption == 'RMSt':
- metricDat = calcRootMeanSquaredDifferenceAveragedOverTime(mTser, oTser)
-
- #--------------------------------
- # (mp.007) Plot the metrics. First, enter plot info
- #----------------------------------------------------------------------------------------------------
-
- # 2-d contour plots
- if metricDat.ndim == 2:
- # assign plot file name and delete old (un-saved) plot files
- plotFileName = workdir + '/' + timeScale + '_' + varName + '_' + metricOption
- if(os.path.exists(plotFileName) == True):
- cmnd = 'rm -f ' + plotFileName
- subprocess.call(cmnd, shell=True)
- # assign plot title
- plotTitle = metricOption + '_' + varName
- # Data-specific plot options: i.e. adjust model data units & set plot color bars
- #cMap = 'rainbow'
- cMap = plt.cm.RdBu_r
- #cMap = 'BlWhRe'
- #cMap = 'BlueRed'
- #cMap = 'GreyWhiteGrey'
- # Calculate color bar ranges for data such that same range is used
- # in obs and model plots for like-with-like comparison.
- obsDataMask = np.zeros_like(oClim.data[:, :])
- metricDat = ma.masked_array(metricDat, obsDataMask)
- oClim = ma.masked_array(oClim, obsDataMask)
- oStdv = ma.masked_array(oClim, obsDataMask)
- plotDat = metricDat
- mymax = plotDat.max()
- mymin = plotDat.min()
- if metricOption == 'BIAS':
- abs_mymin = abs(mymin)
- if abs_mymin <= mymax:
- mymin = -mymax
- else:
- mymax = abs_mymin
- print 'Plot bias over the model domain: data MIN/MAX= ', mymin, mymax
- ans = raw_input('Do you want to use different scale for plotting? [y/n]\n> ').lower()
- if ans == 'y':
- mymin = float(raw_input('Enter the minimum plot scale \n> '))
- mymax = float(raw_input('Enter the maximum plot scale \n> '))
- wksType = 'png'
- # TODO This shouldn't return anything. Handle a "status" the proper way
- _ = plots.draw_cntr_map_single(plotDat, lats, lons, mymin, mymax,
- plotTitle, plotFileName, cMap=cMap)
- # if bias, plot also normalzied values and means: first, normalized by mean
- if metricOption == 'BIAS':
- print ''
- makePlot = raw_input('Plot bias in terms of % of OBS mean? [y/n]\n> ').lower()
- print ''
- if makePlot == 'y':
- plotDat = 100.*metricDat / oClim
- mymax = plotDat.max()
- mymin = plotDat.min()
- mymn = -100.
- mymx = 105.
- print 'Plot mean-normalized bias: data MIN/MAX= ', mymin, mymax, \
- ' Default MIN/MAX= ', mymn, mymx
- ans = raw_input('Do you want to use different scale for plotting? [y/n]\n> ').lower()
- if ans == 'y':
- mymin = float(raw_input('Enter the minimum plot scale \n> '))
- mymax = float(raw_input('Enter the maximum plot scale \n> '))
- else:
- mymin = mymn
- mymax = mymx
- plotFileName = workdir + '/' + timeScale + '_' + varName + '_' + metricOption + '_Mean'
- if(os.path.exists(plotFileName) == True):
- cmnd = 'rm -f ' + plotFileName
- subprocess.call(cmnd, shell = True)
- plotTitle = 'Bias (% MEAN)'
- # TODO Again, this shouldn't return a status
- _ = plots.draw_cntr_map_single(plotDat, lats, lons, mymin, mymax,
- plotTitle, plotFileName, wksType, cMap)
- # normalized by sigma
- makePlot = raw_input('Plot bias in terms of % of interann sigma? [y/n]\n> ').lower()
- if makePlot == 'y':
- plotDat = 100.*metricDat / oStdv
- mymax = plotDat.max()
- mymin = plotDat.min()
- mymn = -200.
- mymx = 205.
- print 'Plot STD-normalized bias: data MIN/MAX= ', mymin, mymax, ' Default MIN/MAX= ', mymn, mymx
- ans = raw_input('Do you want to use different scale for plotting? [y/n]\n> ').lower()
- if ans == 'y':
- mymin = float(raw_input('Enter the minimum plot scale \n> '))
- mymax = float(raw_input('Enter the maximum plot scale \n> '))
- else:
- mymin = mymn
- mymax = mymx
- plotFileName = workdir + '/' + timeScale + '_' + varName + '_' + metricOption + '_SigT'
- if(os.path.exists(plotFileName) == True):
- cmnd = 'rm -f ' + plotFileName
- subprocess.call(cmnd, shell = True)
- plotTitle = 'Bias (% SIGMA_time)'
- # TODO Hay look! A todo re. status returns!
- _ = plots.draw_cntr_map_single(plotDat, lats, lons, mymin, mymax,
- plotTitle, plotFileName, wksType, cMap)
- # obs climatology
- makePlot = raw_input('Plot observation? [y/n]\n> ').lower()
- if makePlot == 'y':
- if varName == 'pr':
- cMap = plt.cm.jet
- else:
- cMap = plt.cm.jet
- plotDat = oClim
- mymax = plotDat.max()
- mymin = plotDat.min()
- print 'Plot STD-normalized bias over the model domain: data MIN/MAX= ', mymin, mymax
- ans = raw_input('Do you want to use different scale for plotting? [y/n]\n> ').lower()
- if ans == 'y':
- mymin = float(raw_input('Enter the minimum plot scale \n> '))
- mymax = float(raw_input('Enter the maximum plot scale \n> '))
- plotFileName = workdir + '/' + timeScale + '_' + varName + '_OBS'
- if(os.path.exists(plotFileName) == True):
- cmnd = 'rm -f ' + plotFileName
- subprocess.call(cmnd, shell=True)
- plotTitle = 'OBS (mm/day)'
- # TODO Yep
- _ = plots.draw_cntr_map_single(plotDat, lats, lons, mymin, mymax,
- plotTitle, plotFileName, wksType, cMap)
-
- # Repeat for another metric
- print ''
- print 'Evaluation completed'
- print ''
- doMetricsOption = raw_input('Do you want to perform another evaluation? [y/n]\n> ').lower()
- print ''
-
- # metrics and plots for regional time series
- elif anlRgn == 'y':
- # select the region(s) for evaluation. model and obs have been selected before entering this if block
- print 'There are ', numSubRgn, ' subregions. Select the subregion(s) for evaluation'
- rgnSelect = misc.selectSubRegion(subRegions)
- print 'selected region for evaluation= ', rgnSelect
- # Select the model & obs data to be evaluated
- oData = ma.zeros(nT)
- mData = ma.zeros(nT)
- oData = obsRgn[obsSelect, rgnSelect, :]
- mData = mdlRgn[mdlSelect, rgnSelect, :]
-
- # compute the monthly-mean climatology to construct the annual cycle
- obsTseries, obsAnnualCycle = calcAnnualCycleMeans(oData)
- mdlTseries, mdlAnnualCycle = calcAnnualCycleMeans(mData)
- print 'obsAnnCyc= ', obsAnnualCycle
- print 'mdlAnnCyc= ', mdlAnnualCycle
-
- #--------------------------------
- # (mp.008) Select performance metric
- #----------------------------------------------------------------------------------------------------
- #metricOption = misc.select_metrics()
- # Temporarily, compute the RMSE and pattern correlation for the simulated
- # and observed annual cycle based on monthly means
- # TODO This aren't used. Missing something here???
-
- tempRMS = calcRootMeanSquaredDifferenceAveragedOverTime(mdlAnnualCycle, obsAnnualCycle)
- tempCOR, tempSIG = stats.pearsonr(mdlAnnualCycle, obsAnnualCycle)
-
- #--------------------------------
- # (mp.009) Plot results
- #----------------------------------------------------------------------------------------------------
- # Data-specific plot options: i.e. adjust model data units & set plot color bars
- colorbar = 'rainbow'
- if varName == 'pr': # set color bar for prcp
- colorbar = plt.cm.jet
-
- # 1-d data, e.g. Time series plots
- plotFileName = 'anncyc_' + varName + '_' + subRgnName[rgnSelect][0:3]
- if(os.path.exists(plotFileName) == True):
- cmnd = 'rm -f ' + plotFileName
- subprocess.call(cmnd, shell = True)
- year_labels = False # for annual cycle plots
- mytitle = 'Annual Cycle of ' + varName + ' at Sub-Region ' + subRgnName[rgnSelect][0:3]
- # Create a list of datetimes to represent the annual cycle, one per month.
- times = []
- for m in xrange(12):
- times.append(datetime.datetime(2000, m + 1, 1, 0, 0, 0, 0))
- #for i in np.arange(12):
- # times.append(i+1)
- _ = plots.draw_time_series_plot(mdlAnnualCycle, times, plotFileName, workdir,
- data2 = obsAnnualCycle, mytitle = mytitle, ytitle = 'Y',
- xtitle = 'MONTH', year_labels = year_labels)
-
- # Repeat for another metric
- doMetricsOption = raw_input('Do you want to perform another evaluation? [y/n]\n> ').lower()
-
- # Processing complete if a user enters 'n' for 'doMetricsOption'
- print 'RCMES processing completed.'
http://git-wip-us.apache.org/repos/asf/climate/blob/a53e3af5/rcmet/src/main/python/rcmes/toolkit/metrics_kyo.py
----------------------------------------------------------------------
diff --git a/rcmet/src/main/python/rcmes/toolkit/metrics_kyo.py b/rcmet/src/main/python/rcmes/toolkit/metrics_kyo.py
deleted file mode 100644
index cdc4c9d..0000000
--- a/rcmet/src/main/python/rcmes/toolkit/metrics_kyo.py
+++ /dev/null
@@ -1,718 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-'''
-Module storing functions to calculate statistical metrics from numpy arrays
-'''
-
-import subprocess
-import os, sys
-import datetime
-import numpy as np
-import numpy.ma as ma
-import scipy.stats as stats
-import matplotlib.pyplot as plt
-from toolkit import plots, process
-from ocw import plotter
-from utils import misc
-from storage import files
-
-def calcAnnualCycleMeans(dataset1):
- '''
- Purpose::
- Calculate annual cycle in terms of monthly means at every grid point.
-
- Input::
- dataset1 - 3d numpy array of data in (t,lat,lon) or 1d numpy array timeseries
-
- Output::
- means - if 3d numpy was entered, 3d (# of months,lat,lon), if 1d data entered
- it is a timeseries of the data of length # of months
-
- '''
- data = misc.reshapeMonthlyData(dataset1)
- means = data.mean(axis = 0)
- return data, means
-
-def calcAnnualCycleMeansSubRegion(dataset1):
- '''
- Purpose::
- Calculate annual cycle in terms of monthly means at every sub-region.
-
- Input::
- dataset1 - 2d numpy array of data in (region, t)
-
- Output::
- means - (region, # of months)
-
- '''
- nregion, nT = dataset1.shape
- data = dataset1.reshape([nregion, nT/12, 12])
- means = data.mean(axis = 1)
- return data, means
-
-
-def calcClimYear(dataset1):
- '''
- Purpose::
- Calculate annual mean timeseries and climatology for both 2-D and point time series.
-
- Input::
- dataset1 - 3d numpy array of data in (t,lat,lon) or 1d numpy array timeseries
-
- Output::
- tSeries - if 3d numpy was entered, 3d (nYr,lat,lon), if 1d data entered
- it is a timeseries of the data of length nYr
- means - if 3d numpy was entered, 2d (lat,lon), if 1d data entered
- it is a floating point number representing the overall mean
- '''
- data = misc.reshapeMonthlyData(dataset1)
- tSeries = data.mean(axis = 1)
- means = tSeries.mean(axis = 0)
- return tSeries, means
-
-def calcClimSeason(monthBegin, monthEnd, dataset1):
- '''
- Purpose ::
- Calculate seasonal mean montheries and climatology for both 3-D and point time series.
- For example, to calculate DJF mean time series, monthBegin = 12, monthEnd =2
- This can handle monthBegin=monthEnd i.e. for climatology of a specific month
-
- Input::
- monthBegin - an integer for the beginning month (Jan =1)
- monthEnd - an integer for the ending month (Jan = 1)
- dataset1 - 3d numpy array of data in (t,lat,lon) or 1d numpy array montheries
-
- Output::
- tSeries - if 3d numpy was entered, 3d (number of years/number of years -1 if monthBegin > monthEnd,lat,lon),
- if 1d data entered it is a montheries of the data of length number of years
- means - if 3d numpy was entered, 2d (lat,lon), if 1d data entered
- it is a floating point number representing the overall mean
- '''
- if monthBegin > monthEnd:
- # Offset the original array so that the the first month
- # becomes monthBegin, note that this cuts off the first year of data
- offset = slice(monthBegin - 1, monthBegin - 13)
- data = misc.reshapeMonthlyData(dataset1[offset])
- monthIndex = slice(0, 13 - monthBegin + monthEnd)
- else:
- # Since monthBegin <= monthEnd, just take a slice containing those months
- data = misc.reshapeMonthlyData(dataset1)
- monthIndex = slice(monthBegin - 1, monthEnd)
-
- tSeries = data[:, monthIndex].mean(axis = 1)
- means = tSeries.mean(axis = 0)
- return tSeries, means
-
-def calcClimSeasonSubRegion(monthBegin, monthEnd, dataset1):
- '''
- Purpose ::
- Calculate seasonal mean montheries and climatology for both 3-D and point time series.
- For example, to calculate DJF mean time series, monthBegin = 12, monthEnd =2
- This can handle monthBegin=monthEnd i.e. for climatology of a specific month
-
- Input::
- monthBegin - an integer for the beginning month (Jan =1)
- monthEnd - an integer for the ending month (Jan = 1)
- dataset1 - 3d numpy array of data in (region, t) or 1d numpy array montheries
-
- Output::
- tSeries - (region, number of years/number of years -1 if monthBegin > monthEnd,lat,lon),
- means - (region)
- '''
- nregion, nT = dataset1.shape
- nYR = nT/12
- if monthBegin > monthEnd:
- # Offset the original array so that the the first month
- # becomes monthBegin, note that this cuts off the first year of data
- offset = slice(monthBegin - 1, monthBegin - 13)
- data = dataset1[:,offset].reshape([nregion,nYR-1, 12])
- monthIndex = slice(0, 13 - monthBegin + monthEnd)
- else:
- # Since monthBegin <= monthEnd, just take a slice containing those months
- data = dataset1.reshape([nregion,nYR,12])
- monthIndex = slice(monthBegin - 1, monthEnd)
-
- tSeries = data[:, :, monthIndex].mean(axis = 2)
- means = tSeries.mean(axis = 1)
- return tSeries, means
-
-def calcAnnualCycleStdev(dataset1):
- '''
- Purpose::
- Calculate monthly standard deviations for every grid point
-
- Input::
- dataset1 - 3d numpy array of data in (12* number of years,lat,lon)
-
- Output::
- stds - if 3d numpy was entered, 3d (12,lat,lon)
- '''
- data = misc.reshapeMonthlyData(dataset1)
- stds = data.std(axis = 0, ddof = 1)
- return stds
-
-def calcAnnualCycleStdevSubRegion(dataset1):
- '''
- Purpose::
- Calculate monthly standard deviations for every sub-region
-
- Input::
- dataset1 - 2d numpy array of data in (nregion, 12* number of years)
-
- Output::
- stds - (nregion, 12)
- '''
- nregion, nT = dataset1.shape
- data = dataset1.reshape([nregion, nT/12, 12])
- stds = data.std(axis = 1, ddof = 1)
- return stds
-
-def calcAnnualCycleDomainMeans(dataset1):
- '''
- Purpose::
- Calculate spatially averaged monthly climatology and standard deviation
-
- Input::
- dataset1 - 3d numpy array of data in (12* number of years,lat,lon)
-
- Output::
- means - time series (12)
- '''
- data = misc.reshapeMonthlyData(dataset1)
-
- # Calculate the means, month by month
- means = np.zeros(12)
- for i in np.arange(12):
- means[i] = data[:, i, :, :].mean()
-
- return means
-
-def calcSpatialStdevRatio(evaluationData, referenceData):
- '''
- Purpose ::
- Calculate the ratio of spatial standard deviation (model standard deviation)/(observed standard deviation)
-
- Input ::
- evaluationData - model data array (lat, lon)
- referenceData- observation data array (lat,lon)
-
- Output::
- ratio of standard deviation (a scholar)
-
- '''
- stdevRatio = evaluationData[(evaluationData.mask==False) & (referenceData.mask==False)].std()/ \
- referenceData[(evaluationData.mask==False) & (referenceData.mask==False)].std()
- return stdevRatio
-
-def calcTemporalStdev(dataset1):
- '''
- Purpose::
- Calculate sample standard deviations over the time
-
- Input::
- dataset1 - 3d numpy array of data in (time,lat,lon)
-
-
- Output::
- stds - time series (lat, lon)
- '''
- stds = dataset1.std(axis = 0, ddof = 1)
- return stds
-
-def calcAnnualCycleDomainStdev(dataset1):
- '''
- Purpose::
- Calculate sample standard deviations representing the domain in each month
-
- Input::
- dataset1 - 3d numpy array of data in (12* number of years,lat,lon)
-
- Output::
- stds - time series (12)
- '''
- data = misc.reshapeMonthlyData(dataset1)
-
- # Calculate the standard deviation, month by months
- stds = np.zeros(12)
- for i in np.arange(12):
- stds[i] = data[:, i, :, :].std(ddof = 1)
-
- return stds
-
-def calcBiasAveragedOverTime(evaluationData, referenceData, option): # Mean Bias
- '''
- Purpose::
- Calculate the mean difference between two fields over time for each grid point.
-
- Input::
- referenceData - array of data
- evaluationData - array of data with same dimension of referenceData
- option - string indicating absolute values or not
-
- Output::
- bias - difference between referenceData and evaluationData averaged over the first dimension
-
- '''
- # Calculate mean difference between two fields over time for each grid point
- # Precrocessing of both obs and model data ensures the absence of missing values
- diff = evaluationData - referenceData
- if(option == 'abs'):
- diff = abs(diff)
- bias = diff.mean(axis = 0)
- return bias
-
-
-def calcBiasAveragedOverTimeAndSigLev(evaluationData, referenceData):
- '''
- Purpose::
- Calculate mean difference between two fields over time for each grid point
-
- Classify missing data resulting from multiple times (using threshold
- data requirement)
-
- i.e. if the working time unit is monthly data, and we are dealing with
- multiple months of data then when we show mean of several months, we need
- to decide what threshold of missing data we tolerate before classifying a
- data point as missing data.
-
-
- Input::
- referenceData - array of data
- evaluationData - array of data with same dimension of referenceData
-
- Output::
- bias - difference between referenceData and evaluationData averaged over the first dimension
- sigLev - significance of the difference (masked array)
- For example: sig[iy,ix] = 0.95 means that the observation and model is different at 95% confidence level
- at X=ix and Y=iy
- '''
- # If either gridcell in each data set is missing, set that cell to
- # missing for the output significance level
- evaluationDataMask = process.create_mask_using_threshold(evaluationData, threshold = 0.75)
- referenceDataMask = process.create_mask_using_threshold(referenceData, threshold = 0.75)
-
- # The overall mask associated with missing data
- overallMask = np.logical_or(evaluationDataMask, referenceDataMask)
-
- diff = evaluationData - referenceData
- bias = diff.mean(axis = 0)
- sigLev = 1 - stats.ttest_rel(evaluationData, referenceData, axis = 0)[1]
- sigLev[overallMask] = -100.
- sigLev = ma.masked_equal(sigLev, -100.)
- # Set mask for bias metric using missing data in obs or model data series
- # i.e. if obs contains more than threshold (e.g.50%) missing data
- # then classify time average bias as missing data for that location.
- bias = ma.masked_array(bias.data, overallMask)
- return bias, sigLev
-
-
-def calcBiasAveragedOverTimeAndDomain(evaluationData, referenceData):
- '''
- Purpose::
- Calculate the mean difference between two fields over time and domain
- Input::
- referenceData - array of data
- evaluationData - array of data with same dimension of referenceData
-
- Output::
- bias - difference between referenceData and evaluationData averaged over time and space
-
- '''
-
- diff = evaluationData - referenceData
-
- bias = diff.mean()
- return bias
-
-def calcBias(evaluationData, referenceData):
- '''
- Purpose::
- Calculate the difference between two fields at each grid point
-
- Input::
- referenceData - array of data
- evaluationData - array of data with same dimension of referenceData
-
- Output::
- diff - difference between referenceData and evaluationData
-
- '''
-
- diff = evaluationData - referenceData
- return diff
-
-def calcRootMeanSquaredDifferenceAveragedOverTime(evaluationData, referenceData):
- '''
- Purpose::
- Calculate root mean squared difference (RMS errors) averaged over time between two fields for each grid point
-
- Input::
- referenceData - array of data
- evaluationData - array of data with same dimension of referenceData
-
- Output::
- rms - root mean squared difference, if the input is 1-d data, the output becomes a single floating number.
-
- '''
- sqdiff = (evaluationData - referenceData)** 2
- rms = np.sqrt(sqdiff.mean(axis = 0))
- return rms
-
-
-def calcRootMeanSquaredDifferenceAveragedOverTimeAndDomain(evaluationData, referenceData):
- '''
- Purpose::
- Calculate root mean squared difference (RMS errors) averaged over time and space between two fields
-
- Input::
- referenceData - array of data (should be 3-d array)
- evaluationData - array of data with same dimension of referenceData
-
- Output::
- rms - root mean squared difference averaged over time and space
- '''
- sqdiff = (evaluationData - referenceData)** 2
- rms = np.sqrt(sqdiff.mean())
- return rms
-
-def calcTemporalCorrelation(evaluationData, referenceData):
- '''
- Purpose ::
- Calculate the temporal correlation.
-
- Assumption(s) ::
- The first dimension of two datasets is the time axis.
-
- Input ::
- evaluationData - model data array of any shape
- referenceData- observation data array of any shape
-
- Output::
- temporalCorelation - A 2-D array of temporal correlation coefficients at each subregion
- sigLev - A 2-D array of confidence levels related to temporalCorelation
-
- REF: 277-281 in Stat methods in atmos sci by Wilks, 1995, Academic Press, 467pp.
- sigLev: the correlation between model and observation is significant at sigLev * 100 %
- '''
- evaluationDataMask = process.create_mask_using_threshold(evaluationData, threshold = 0.75)
- referenceDataMask = process.create_mask_using_threshold(referenceData, threshold = 0.75)
-
- nregion = evaluationData.shape[0]
- temporalCorrelation = ma.zeros([nregion])-100.
- sigLev = ma.zeros([nregion])-100.
- for iregion in np.arange(nregion):
- temporalCorrelation[iregion], sigLev[iregion] = stats.pearsonr(evaluationData[iregion,:], referenceData[iregion,:])
- sigLev[iregion] = 1 - sigLev[iregion]
-
- temporalCorrelation=ma.masked_equal(temporalCorrelation.data, -100.)
- sigLev=ma.masked_equal(sigLev.data, -100.)
-
- return temporalCorrelation, sigLev
-
-def calcTemporalCorrelationSubRegion(evaluationData, referenceData):
- '''
- Purpose ::
- Calculate the temporal correlation.
-
- Assumption(s) ::
- both evaluation and reference data are subregion averaged time series
-
- Input ::
- evaluationData - model data array [region,t]
- referenceData- observation data [region, t]
-
- Output::
- temporalCorelation - A 1-D array of temporal correlation coefficients at each grid point.
- sigLev - A 1-D array of confidence levels related to temporalCorelation
-
- REF: 277-281 in Stat methods in atmos sci by Wilks, 1995, Academic Press, 467pp.
- sigLev: the correlation between model and observation is significant at sigLev * 100 %
- '''
-
-
- temporalCorrelation = 0.
- sigLev = 0.
- t1=evaluationData[:]
- t2=referenceData[:]
- if t1.min()!=t1.max() and t2.min()!=t2.max():
- temporalCorrelation, sigLev=stats.pearsonr(t1,t2)
- sigLev=1.-sigLev # p-value => confidence level
-
- return temporalCorrelation
-
-def calcPatternCorrelation(evaluationData, referenceData):
- '''
- Purpose ::
- Calculate the spatial correlation.
-
- Input ::
- evaluationData - model data array (lat, lon)
- referenceData- observation data array (lat,lon)
-
- Output::
- patternCorrelation - a single floating point
- sigLev - a single floating point representing the confidence level
-
- '''
-
- patternCorrelation, sigLev = stats.pearsonr(evaluationData[(evaluationData.mask==False) & (referenceData.mask==False)],
- referenceData[(evaluationData.mask==False) & (referenceData.mask==False)])
- return patternCorrelation, sigLev
-
-
-def calcPatternCorrelationEachTime(evaluationData, referenceData):
- '''
- Purpose ::
- Calculate the spatial correlation for each time
-
- Assumption(s) ::
- The first dimension of two datasets is the time axis.
-
- Input ::
- evaluationData - model data array (time,lat, lon)
- referenceData- observation data array (time,lat,lon)
-
- Output::
- patternCorrelation - a timeseries (time)
- sigLev - a time series (time)
- '''
- nT = evaluationData.shape[0]
- patternCorrelation = ma.zeros(nT)-100.
- sigLev = ma.zeros(nT)-100.
- for it in np.arange(nT):
- patternCorrelation[it], sigLev[it] = calcPatternCorrelation(evaluationData[it,:,:], referenceData[it,:,:])
-
- return patternCorrelation,sigLev
-
-def calcNashSutcliff(evaluationData, referenceData):
- '''
- Assumption(s)::
- Both evaluationData and referenceData are the same shape.
- * lat, lon must match up
- * time steps must align (i.e. months vs. months)
-
- Input::
- evaluationData - 3d (time, lat, lon) array of data
- referenceData - 3d (time, lat, lon) array of data
-
- Output:
- nashcor - 1d array aligned along the time dimension of the input
- datasets. Time Series of Nash-Sutcliff Coefficient of efficiency
-
- '''
- # Flatten the spatial dimensions
- data1 = evaluationData[:]
- data2 = referenceData[:]
- nT = data1.shape[0]
- data1.shape = nT, data1.size / nT
- data2.shape = nT, data2.size / nT
- meanData2 = data2.mean(axis = 1)
-
- # meanData2 must be reshaped to 2D as to obey
- # numpy broadcasting rules
- meanData2.shape = nT, 1
- nashcor = 1 - ((((data2 - data1) ** 2).sum(axis = 1)) /
- (((data2 - meanData2) ** 2).sum(axis = 1)))
- return nashcor
-
-
-def calcPdf(evaluationData, referenceData):
- '''
- Routine to calculate a normalized Probability Distribution Function with
- bins set according to data range.
- Equation from Perkins et al. 2007
-
- PS=sum(min(Z_O_i, Z_M_i)) where Z is the distribution (histogram of the data for either set)
- called in do_rcmes_processing_sub.py
-
- Inputs::
- 2 arrays of data
- t1 is the modelData and t2 is 3D obsdata - time,lat, lon NB, time here
- is the number of time values eg for time period 199001010000 - 199201010000
-
- if annual means-opt 1, was chosen, then t2.shape = (2,lat,lon)
-
- if monthly means - opt 2, was choosen, then t2.shape = (24,lat,lon)
-
- User inputs: number of bins to use and edges (min and max)
- Output:
-
- one float which represents the PDF for the year
-
- TODO: Clean up this docstring so we have a single purpose statement
-
- Routine to calculate a normalised PDF with bins set according to data range.
-
- Input::
- 2 data arrays, modelData and obsData
-
- Output::
- PDF for the year
-
- '''
- # float to store the final PDF similarity score
- similarityScore = 0.0
-
- print 'min modelData', evaluationData[:, :, :].min()
- print 'max modelData', evaluationData[:, :, :].max()
- print 'min obsData', referenceData[:, :, :].min()
- print 'max obsData', referenceData[:, :, :].max()
- # find a distribution for the entire dataset
- #prompt the user to enter the min, max and number of bin values.
- # The max, min info above is to help guide the user with these choises
- print '****PDF input values from user required **** \n'
- nbins = int (raw_input('Please enter the number of bins to use. \n'))
- minEdge = float(raw_input('Please enter the minimum value to use for the edge. \n'))
- maxEdge = float(raw_input('Please enter the maximum value to use for the edge. \n'))
-
- mybins = np.linspace(minEdge, maxEdge, nbins)
- print 'nbins is', nbins, 'mybins are', mybins
-
- pdfMod, edges = np.histogram(evaluationData, bins = mybins, normed = True, density = True)
- print 'evaluationData distribution and edges', pdfMod, edges
- pdfObs, edges = np.histogram(referenceData, bins = mybins, normed = True, density = True)
- print 'referenceData distribution and edges', pdfObs, edges
-
- #find minimum at each bin between lists
- i = 0
- for model_value in pdfMod :
- print 'model_value is', model_value, 'pdfObs[', i, '] is', pdfObs[i]
- if model_value < pdfObs[i]:
- similarityScore += model_value
- else:
- similarityScore += pdfObs[i]
- i += 1
- print 'similarity_score is', similarityScore
- return similarityScore
-
-
-
-def calculate_metrics_and_make_plots(varName, workdir, lons, lats, obsData, mdlData, obsRgn, mdlRgn, obsList, mdlList, subRegions, \
- subRgnLon0, subRgnLon1, subRgnLat0, subRgnLat1):
- '''
- Purpose::
- Calculate all the metrics used in Kim et al. [2013] paper and plot them
-
- Input::
- varName - evaluating variable
- workdir -
- lons -
- lats -
- obsData -
- mdlData -
- obsRgn -
- mdlRgn -
- obsList -
- mdlList -
- subRegions -
- subRgnLon0, subRgnLat0 - southwest boundary of sub-regions [numSubRgn]
- subRgnLon1, subRgnLat1 - northeast boundary of sub-regions [numSubRgn]
- Output::
- png files
-
- '''
-
-
- nobs, nt, ny, nx = obsData.shape
- nmodel = mdlData.shape[0]
- ### TODO: unit conversion (K to C)
- if varName == 'temp':
- obsData[0, :, :, :] = obsData[0, :, :, :] - 273.15
- if subRegions:
- obsRgn[0, :, :] = obsRgn[0, :, :] - 273.15
- if varName == 'prec' and obsData.max() > mdlData.max()*1000.:
- mdlData[:, :, :, :] = mdlData[:, :, :, :]*86400.
- if subRegions:
- mdlRgn[:, :, :] = mdlRgn[:, :, :]*86400.
-
- oTser, oClim = calcClimYear( obsData[0, :, :, :])
- bias_of_overall_average = ma.zeros([nmodel, ny, nx])
- spatial_stdev_ratio = np.zeros([nmodel])
- spatial_corr = np.zeros([nmodel])
- mdlList.append('ENS')
-
- for imodel in np.arange(nmodel):
- mTser, mClim = calcClimYear( mdlData[imodel,:,:,:])
- bias_of_overall_average[imodel,:,:] = calcBias(mClim, oClim)
- spatial_corr[imodel], sigLev = calcPatternCorrelation(oClim, mClim)
- spatial_stdev_ratio[imodel] = calcSpatialStdevRatio(mClim, oClim)
- fig_return = plotter.draw_contour_map(oClim, lats, lons, workdir+'/observed_climatology_'+varName, fmt='png', gridshape=(1, 1),
- clabel='', ptitle='', subtitles=obsList, cmap=None,
- clevs=None, nlevs=10, parallels=None, meridians=None,
- extend='neither')
- # TODO:
- # Be sure to update "gridshape" argument to be the number of sub plots (rows,columns). This should be improved so that the
- # gridshape is optimally determined for a given number of models. For example:
- # For 3 models, a gridshape of (2,2) would be sensible:
- # X X
- # X
- #
- fig_return = plotter.draw_contour_map(bias_of_overall_average, lats, lons, workdir+'/bias_of_climatology_'+varName, fmt='png', gridshape=(6, 2),
- clabel='', ptitle='', subtitles=mdlList, cmap=None,
- clevs=None, nlevs=10, parallels=None, meridians=None,
- extend='neither')
- Taylor_data = np.array([spatial_stdev_ratio, spatial_corr]).transpose()
-
- fig_return = plotter.draw_taylor_diagram(Taylor_data, mdlList, refname='CRU', fname = workdir+'/Taylor_'+varName, fmt='png',frameon=False)
-
- if subRegions:
- nseason = 2 # (0: summer and 1: winter)
- nregion = len(subRgnLon0)
- season_name = ['summer','winter']
- rowlabels = ['PNw','PNe','CAn','CAs','SWw','SWe','COL','GPn','GPc','GC','GL','NE','SE','FL']
- collabels = ['M1','M2','M3','M4','M5','M6','ENS']
- collabels[nmodel-1] = 'ENS'
-
- for iseason in [0,1]:
- portrait_subregion = np.zeros([4, nregion, nmodel])
- portrait_titles = ['(a) Normalized Bias', '(b) Normalized STDV', '(c) Normalized RMSE', '(d) Correlation']
- if iseason == 0:
- monthBegin=6
- monthEnd=8
- if iseason == 1:
- monthBegin=12
- monthEnd=2
-
- obsTser,obsClim = calcClimSeasonSubRegion(6,8,obsRgn[0,:,:])
- for imodel in np.arange(nmodel):
- mTser, mClim = calcClimSeasonSubRegion(6,8,mdlRgn[imodel,:,:])
- for iregion in np.arange(nregion):
- portrait_subregion[0,iregion,imodel] = calcBias(mClim[iregion],obsClim[iregion])/calcTemporalStdev(obsTser[iregion,:])
- portrait_subregion[1,iregion,imodel] = calcTemporalStdev(mTser[iregion,:])/ calcTemporalStdev(obsTser[iregion,:])
- portrait_subregion[2,iregion,imodel] = calcRootMeanSquaredDifferenceAveragedOverTime(mTser[iregion,:], obsTser[iregion,:])/calcTemporalStdev(obsTser[iregion,:])
- portrait_subregion[3,iregion, imodel] = calcTemporalCorrelationSubRegion(mTser[iregion,:],obsTser[iregion,:])
- portrait_return = plotter.draw_portrait_diagram(portrait_subregion, rowlabels, collabels[0:nmodel], workdir+'/portrait_diagram_'+season_name[iseason]+'_'+varName, fmt='png',
- gridshape=(2, 2), xlabel='', ylabel='', clabel='',
- ptitle='', subtitles=portrait_titles, cmap=None, clevs=None,
- nlevs=10, extend='neither')
- # annual cycle
- nmonth = 12
- times = np.arange(nmonth)
- data_names = [obsList[0]] + list(mdlList)
- annual_cycle = np.zeros([nregion, nmonth, nmodel+1])
- obsTser, annual_cycle[:, :, 0] = calcAnnualCycleMeansSubRegion(obsRgn[0,:,:])
- obsStd = calcAnnualCycleStdevSubRegion(obsRgn[0,:,:])
- for imodel in np.arange(nmodel):
- mdlTser, annual_cycle[:, :, imodel+1] = calcAnnualCycleMeansSubRegion(mdlRgn[imodel, :, :])
- # Make annual_cycle shape compatible with draw_time_series
- annual_cycle = annual_cycle.swapaxes(1, 2)
- tseries_return = plotter.draw_time_series(annual_cycle, times, data_names, workdir+'/time_series_'+varName, gridshape=(7, 2),
- subtitles=rowlabels, label_month=True)
-
-
-