Posted to commits@couchdb.apache.org by Apache Wiki <wi...@apache.org> on 2009/01/16 00:33:00 UTC

[Couchdb Wiki] Update of "View Snippets" by MarcaJames

Dear Wiki user,

You have subscribed to a wiki page or wiki category on "Couchdb Wiki" for change notification.

The following page has been changed by MarcaJames:
http://wiki.apache.org/couchdb/View_Snippets

------------------------------------------------------------------------------
  }
  }}}
  
+ 
+ == Computing simple summary statistics (min, max, mean, standard deviation) ==
+ 
+ Here is some code I have developed to compute standard deviation.  It does so in two ways, both of which differ from jchris' github version (add link?).  In practice, of course, you would only need one.  The view is specialized to my dataset, but the reduce function might be useful to others.
+ 
+ I've only ever tested it in Futon.  See the note just below for what the `group` parameter does to the output.
+ 
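+ With this map function's `[weekday, time]` array keys, querying the view with `group=true` runs the reduce once per distinct key, which is how you get one row of statistics per five-minute slot; `group_level=1` would instead aggregate over whole weekdays.  A sketch of the query, assuming the database is called `mydb` and the view is saved as `risk` in a design doc called `stats` (all three names are made up):
+ 
+ {{{
+ GET /mydb/_design/stats/_view/risk?group=true
+ GET /mydb/_design/stats/_view/risk?group_level=1
+ }}}
+ 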
+ {{{
+ // Map function
+ function(doc) {
+     var risk_exponent = 
+ 	-3.194 +
+ 	doc.CV_VOLOCC_1                 *1.080 +
+ 	doc.CV_VOLOCC_M                 *0.627 +
+ 	doc.CV_VOLOCC_R                 *0.553 +
+ 	doc.CORR_VOLOCC_1M              *1.439 +
+ 	doc.CORR_VOLOCC_MR              *0.658 +
+ 	doc.LAG1_OCC_M                  *0.412 +
+ 	doc.LAG1_OCC_R                  *1.424 +
+ 	doc.MU_VOL_1                    *0.038 +
+ 	doc.MU_VOL_M                    *0.100 +
+ 	doc["CORR_OCC_1M X MU_VOL_M"]      *-0.168 +
+ 	doc["CORR_OCC_1M X SD_VOL_R" ]     *0.479 +
+ 	doc["CORR_OCC_1M X LAG1_OCC_R"]    *-1.462 ;
+     
+     var risk = Math.exp(risk_exponent);
+     
+     // parse the date and "chunk" it up
+     var pattern = new RegExp("(.*)-0?(.*)-0?(.*)T0?(.*):0?(.*):0?(.*)(-0800)");
+     var result = pattern.exec(doc.EstimateTime);
+     if(result){
+         //new Date(year, month, day, hours, minutes, seconds, ms)
+         // force rounding down to 5 minutes, 0 seconds, for aggregation into 5 minute chunks
+         var fivemin = 5 * Math.floor(result[5]/5);
+         var day = new Date(result[1],result[2]-1,result[3],result[4],fivemin,0);
+         var weekdays = ["Sun","Mon","Tue","Wed","Thu","Fri","Sat"];
+         // only emit when the date parsed; otherwise day would be
+         // undefined and day.getDay() would throw
+         emit([weekdays[day.getDay()],day.toLocaleTimeString()],{'risk':risk});
+     }
+ }
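+ 
+ // For illustration (made-up input): a doc with EstimateTime
+ // "2009-01-13T08:07:30-0800" falls on a Tuesday and rounds down to
+ // the 08:05 slot, so it would emit a row like
+ //     (["Tue", "08:05:00"], {'risk': <value of the formula above>})
+ // (the exact time string depends on the JS engine's locale settings)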
+ 
+ // Reduce function
+ function (keys, values, rereduce) {
+ 
+     // algorithm for on-line computation of moments from 
+     // 
+     //    Tony F. Chan, Gene H. Golub, and Randall J. LeVeque: "Updating
+     //    Formulae and a Pairwise Algorithm for Computing Sample
+     //    Variances." Technical Report STAN-CS-79-773, Department of
+     //    Computer Science, Stanford University, November 1979.  url:
+     //    ftp://reports.stanford.edu/pub/cstr/reports/cs/tr/79/773/CS-TR-79-773.pdf
+     
+     // there is some weirdness in that the original was Fortran,
+     // indexed from 1, and built around plain arrays (no lists, no
+     // hash tables)
+     
+ 
+     // also consulted http://people.xiph.org/~tterribe/notes/homs.html
+     // and http://www.jstor.org/stable/2683386
+     // and (ick!) the wikipedia description of Knuth's algorithm
+     // to clarify what was going on with http://www.slamb.org/svn/repos/trunk/projects/common/src/java/org/slamb/common/stats/Sample.java
+ 
+     /*
+        Combine the variance estimates for two partitions, A and B.
+        partitionA and partitionB should each contain
+         { S :  the current estimate of the second moment
+           Sum : the sum of the observed values
+           M : the number of observations used in the partition to compute S and Sum
+         }
+ 
+     The output is an object of the same shape, containing the S, Sum,
+     and M for the combination of partitions A and B.
+ 
+     This routine is derived from the original Fortran code in Chan et
+     al. (1979).
+ 
+     But it is easily re-derived once you see that all you're doing is
+     weighting each partition's S and Sum by its respective count M and
+     dividing by the new count Ma + Mb; the arrangement of the diff
+     term is just rearranging things to make them look nice.  The sums
+     and the counts simply add.
+     */
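+     // In symbols (a transcription of the code below): with
+     //     d = Ma*SumB/Mb - SumA,
+     // the combined second moment is
+     //     S = Sa + Sb + Mb*d*d / (Ma*(Ma + Mb)),
+     // and the sums and counts simply add: Sum = SumA + SumB, M = Ma + Mb.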
+     function combine_S(partitionA,partitionB){
+ 	var NewS=partitionA.S;
+ 	var NewSum=partitionA.Sum;
+ 	var min = partitionA.min;
+ 	var max = partitionA.max;
+ 	var M = partitionB.M;
+ 	if(!M){M=0;}
+ 	if(M){
+ 	    var diff = 
+ 		((partitionA.M * partitionB.Sum / partitionB.M) - partitionA.Sum );
+ 	    
+ 	    NewS += partitionB.S + partitionB.M*diff*diff/(partitionA.M * (partitionA.M+partitionB.M) );
+ 	    NewSum += partitionB.Sum ;
+ 
+ 	    min = Math.min(partitionB.min, min);
+ 	    max = Math.max(partitionB.max, max);
+ 	}
+ 	return {'S':NewS,'Sum':NewSum, 'M': partitionA.M+M, 'min':min, 'max':max };
+     }
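+     // A quick numeric check (made-up values): combining
+     //     A = {S:0.5, Sum:3, M:2, min:1, max:2}   // the observations {1, 2}
+     //     B = {S:0,   Sum:3, M:1, min:3, max:3}   // the single observation {3}
+     // gives d = 2*3/1 - 3 = 3, S = 0.5 + 0 + 1*9/(2*3) = 2, Sum = 6, M = 3;
+     // S/M = 2/3 is exactly the population variance of {1, 2, 3}.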
+ 	    
+ 
+     /*
+ 
+     This routine is derived from the original Fortran code in Chan et
+     al. (1979), with the combination step split out above so that it
+     can be called independently in the rereduce step.
+ 
+     Arguments:  
+ 
+     The first argument (values) is an array of objects.  The
+     assumption is that the key to the variable of interest is 'risk'.
+     If this is not the case, the seventh argument should be the correct
+     key to use.  More complicated data structures are not supported.
+ 
+     The second, third, and fourth arguments are for the case where
+     this is a running tally.  You can pass in existing values for M
+     (the number of observations already processed), Sum (the running
+     sum of those M observations), and S (the current sum of squared
+     deviations for those M observations).  All optional, defaulting
+     to zero.
+ 
+     The fifth parameter is the running min, and the sixth the running
+     max.
+ 
+     Pass nulls for parameters 2 through 6 if you need to pass a key
+     in the seventh slot.
+ 
+     Some notes on the algorithm.  There is a precious bit of trickery
+     with stack pointers and the like that keeps the amount of
+     temporary storage to a minimum.  All of this was in the original
+     algorithm.  It probably doesn't make much sense to keep, given
+     that I've got gobs of RAM and am most likely processor bound, but
+     it reminded me of programming in assembly so I kept it in.
+ 
+     If you watch the progress of this algorithm in a debugger or
+     Firebug, you'll see that the stack stays quite small, with the
+     bottom ([0]) entry staying at zero, the [1] entry containing a
+     power of two (2, 4, 8, 16, etc.), the [2] entry containing the
+     next power of two down from [1], and so on.  As the slots of the
+     stack fill up, they get cascaded together by the inner loop.
+ 
+     You could skip all that and just process the values pairwise,
+     repeatedly, until the list of intermediate values is empty.  There
+     is also a small efficiency gain in combining two groups with
+     identical support: you don't have to account for different Ma and
+     Mb in the computation (one less divide, I guess).
+ 
+     */
+     function pairwise_update (values, M, Sum, S, min, max, key){
+ 	if(!key){key='risk';}
+ 	if(!Sum){Sum = 0; S = 0; M=0;}
+ 	if(!S){Sum = 0; S = 0; M=0;}
+ 	if(!M){Sum = 0; S = 0; M=0;}
+ 	if(!min){ min = Infinity; }
+ 	if(!max){ max = -Infinity; }
+ 	var T;
+ 	var stack_ptr=1;
+ 	var N = values.length;
+ 	var half = Math.floor(N/2);
+ 	var NewSum;
+ 	var NewS ;
+ 	var SumA=[];
+ 	var SA=[];
+ 	var Terms=[];
+ 	Terms[0]=0;
+ 	if(N == 1){
+ 	    // a single value: the sum is the value itself and the
+ 	    // second moment of one observation is zero
+ 	    NewSum=values[0][key];
+ 	    NewS=0;
+ 	    T=1;
+ 	    min = Math.min(values[0][key], min);
+ 	    max = Math.max(values[0][key], max);
+ 	}else if(N > 1){
+ 	    // loop over the data pairwise
+ 	    for(var i = 0; i < half; i++){
+ 		// check min max
+ 		if(values[2*i+1][key] < values[2*i][key] ){
+ 		    min = Math.min(values[2*i+1][key], min);
+ 		    max = Math.max(values[2*i][key], max);
+ 		}else{
+ 		    min = Math.min(values[2*i][key], min);
+ 		    max = Math.max(values[2*i+1][key], max);
+ 		}
+ 		SumA[stack_ptr]=values[2*i+1][key] + values[2*i][key];
+ 		var diff = values[2*i + 1][key] - values[2*i][key] ;
+ 		SA[stack_ptr]=( diff * diff ) / 2;
+ 		Terms[stack_ptr]=2;
+ 		while( Terms[stack_ptr] == Terms[stack_ptr-1]){
+ 		    // combine the top two elements in storage, as
+ 		    // they have equal numbers of support terms.  this
+ 		    // should happen for powers of two (2, 4, 8, etc).
+ 		    // Everything else gets cleaned up below
+ 		    stack_ptr--;
+ 		    Terms[stack_ptr]*=2;
+ 		    // compare this diff with the below diff.  Here
+ 		    // there is no multiplication and division of the
+ 		    // first sum (SumA[stack_ptr]) because it is the
+ 		    // same size as the other.
+ 		    var diff = SumA[stack_ptr] - SumA[stack_ptr+1];
+ 		    SA[stack_ptr]=  SA[stack_ptr] + SA[stack_ptr+1] +
+ 			(diff * diff)/Terms[stack_ptr];
+ 		    SumA[stack_ptr] += SumA[stack_ptr+1];
+ 		} // repeat as needed
+ 		stack_ptr++;
+ 	    }
+ 	    stack_ptr--;
+ 	    // check if N is odd
+ 	    if(N % 2 !=  0){
+ 		// handle that dangling entry
+ 		stack_ptr++;
+ 		Terms[stack_ptr]=1;
+ 		SumA[stack_ptr]=values[N-1][key];
+ 		SA[stack_ptr]=0;  // the variance of a single observation is zero!
+ 		min = Math.min(values[N-1][key], min);
+ 		max = Math.max(values[N-1][key], max);
+ 	    }
+ 	    T=Terms[stack_ptr];
+ 	    NewSum=SumA[stack_ptr];
+ 	    NewS= SA[stack_ptr];
+ 	    if(stack_ptr > 1){
+ 		// values.length is not power of two, so not
+ 		// everything has been scooped up in the inner loop
+ 		// above.  Here handle the remainders
+ 		for(var i = stack_ptr-1; i>=1 ; i--){
+ 		    // compare this diff with the above diff---one
+ 		    // more multiply and divide on the current sum,
+ 		    // because the size of the sets (SumA[i] and NewSum)
+ 		    // are different.
+ 		    var diff = Terms[i]*NewSum/T-SumA[i]; 
+ 		    NewS = NewS + SA[i] + 
+ 			( T * diff * diff )/
+ 			(Terms[i] * (Terms[i] + T));
+ 		    NewSum += SumA[i];
+ 		    T += Terms[i];
+ 		}
+ 	    }
+ 	}
+ 	// finally, combine NewS and NewSum with S and Sum
+ 	return 	combine_S(
+ 	    {'S':NewS,'Sum':NewSum, 'M': T ,  'min':min, 'max':max},
+ 	    {'S':S,'Sum':Sum, 'M': M ,  'min':min, 'max':max});
+     }
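+ 
+     // Sanity check (made-up input): pairwise_update([{risk:1},{risk:2},{risk:3}])
+     // returns {S:2, Sum:6, M:3, min:1, max:3}, so S/M = 2/3, matching the
+     // population variance of {1, 2, 3} and the combine_S example above.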
+ 
+ 
+     /*
+ 
+     This function is attributed to Knuth, The Art of Computer
+     Programming.  Donald Knuth is a math god, so I am sure it is
+     numerically stable, but I haven't read the original source, so
+     who knows.
+ 
+     The first parameter is again values, a list of objects, with the
+     expectation that the variable of interest is stored under the key
+     'risk'.  If this is not the case, pass the correct key as the
+     seventh argument.
+ 
+     Parameters 2 through 6 are all optional.  Pass nulls if you need
+     to pass a key.
+ 
+     In order they are
+ 
+     M2:   the current estimate of the second moment (sum of squared deviations)
+     n:    the count of observations used in the current estimate
+     mean: the current mean value estimate
+     min:  the current min value observed
+     max:  the current max value observed
+     */
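+     // The per-observation update (Welford's recurrence, as coded in
+     // the loop below), for the k-th observation x:
+     //     mean_k = mean_{k-1} + (x - mean_{k-1}) / k
+     //     M2_k   = M2_{k-1} + (x - mean_{k-1}) * (x - mean_k)
+     // After n observations, M2/n is the population variance.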
+     function KnuthianOnLineVariance(values, M2, n, mean, min, max,  key){
+ 	if(!M2){ M2 = 0; }
+ 	if(!n){ n = 0; }
+ 	if(!mean){ mean  = 0; }
+ 	if(!min){ min = Infinity; }
+ 	if(!max){ max = -Infinity; }
+ 	if(!key){ key = 'risk'; }
+ 
+ 	// this algorithm is apparently a special case of the above
+ 	// pairwise algorithm, in which you just apply one more value
+ 	// to the running total.  I don't know why, but Chan et al.
+ 	// (1979), and again in their later paper, claim that updating
+ 	// with blocks of M greater than 1 is always better than not.
+ 
+ 	// but this code is certainly cleaner!  code based on Scott
+ 	// Lamb's Java found at
+ 	// http://www.slamb.org/svn/repos/trunk/projects/common/src/java/org/slamb/common/stats/Sample.java
+ 	// but modified a bit
+ 
+ 	for(var i=0; i<values.length; i++ ){
+ 	    var diff = (values[i][key] - mean);
+             var newmean = mean +  diff / (n+i+1);
+             M2 += diff * (values[i][key] - newmean);
+             mean = newmean;
+             min = Math.min(values[i][key], min);
+             max = Math.max(values[i][key], max);
+         }
+ 	return {'M2': M2, 'n': n + values.length, 'mean': mean, 'min':min, 'max':max };
+     }
+ 
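+     /*
+     Combine two running Welford estimates; this is the "parallel"
+     merge formula from Chan et al. (1979).  With d = meanB - meanA
+     and n = nA + nB:
+         mean = meanA + d * nB / n
+         M2   = M2A + M2B + d*d * nA*nB / n
+     partitionB may be an empty object (the initial rereduce state),
+     in which case partitionA is returned unchanged.
+     */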
+     function KnuthCombine(partitionA,partitionB){
+ 	if(partitionB.n){
+ 	    var newn = partitionA.n + partitionB.n;
+             var diff = partitionB.mean - partitionA.mean;
+             var newmean = partitionA.mean + diff*(partitionB.n/newn);
+             var M2 = partitionA.M2 + partitionB.M2 + (diff * diff * partitionA.n * partitionB.n / newn );
+             var min = Math.min(partitionB.min, partitionA.min);
+             var max = Math.max(partitionB.max, partitionA.max);
+ 	    return {'M2': M2, 'n': newn, 'mean': newmean, 'min':min, 'max':max };
+         } else {
+             return partitionA;
+         }
+     }
+ 
+     var output={};
+     var knuthOutput={};
+ 
+     // two cases in the application of reduce.  In the first reduce
+     // case the rereduce flag is false, and we have raw values.  We
+     // also have keys, but that isn't applicable here.
+     // 
+     // In the rereduce case, rereduce is true, and we are being passed
+     // output for identical keys that needs to be combined further.
+ 
+     if(!rereduce)
+     {
+ 	output = pairwise_update(values);
+ 	output.variance_n=output.S/output.M;
+ 	output.mean = output.Sum/output.M;
+ 	knuthOutput = KnuthianOnLineVariance(values);
+ 	knuthOutput.variance_n=knuthOutput.M2/knuthOutput.n;
+ 	output.knuthOutput=knuthOutput;
+ 
+     } else {
+ 	/*
+            we have an existing pass, so should have multiple outputs to combine  
+         */
+ 	for(var v = 0; v < values.length; v++){
+ 	    output = combine_S(values[v],output);
+ 	    knuthOutput = KnuthCombine(values[v].knuthOutput, knuthOutput);
+ 	}
+ 	output.variance_n=output.S/output.M;
+ 	output.mean = output.Sum/output.M;
+ 	knuthOutput.variance_n=knuthOutput.M2/knuthOutput.n;
+ 	output.knuthOutput=knuthOutput;
+     }
+     // and done
+     return output;
+ }
+ 
+ }}}
+ 
+ Sample output.  Note that the two methods differ only in the very last few decimal places.
+ 
+ ||`["Tue", "08:00:00"] `|| ` {"S": 1276.8988123975391, "Sum": 1257.4497350063903, "M": 955, "min": 0.033031734767263086, "max": 6.011336961717487,`  `"variance_n": 1.3370668192644388, "mean": 1.3167012932004087,`  `"knuthOutput": {"M2": 1276.898812397539, "n": 955, "mean": 1.3167012932004083, "min": 0.033031734767263086, "max": 6.011336961717487,` `"variance_n": 1.3370668192644386}} `||
+ ||`["Tue", "08:05:00"]`||` {"S": 1363.1444727834003, "Sum": 1303.08214106713, "M": 939, "min": 0.03216066554751794, "max": 5.93544645899576,`  `"variance_n": 1.4516980540824285, "mean": 1.387733909549659,`  `"knuthOutput": {"M2": 1363.1444727834005, "n": 939, "mean": 1.3877339095496595, "min": 0.03216066554751794, "max": 5.93544645899576,` `"variance_n": 1.4516980540824287}} `||
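+ 
+ The reduce returns the variance (`variance_n`, computed as S/M, i.e. the population variance) rather than the standard deviation itself, because partial standard deviations can't be combined in a rereduce, while the (S, Sum, M) triples can.  Taking the square root is left to the client; a minimal sketch, where `row` stands for one row of the grouped view output above:
+ 
+ {{{
+ // client-side (hypothetical): recover the standard deviation
+ var stats = row.value;
+ var sd = Math.sqrt(stats.variance_n);                       // pairwise method
+ var sdKnuth = Math.sqrt(stats.knuthOutput.variance_n);      // Knuth/Welford method
+ }}}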
+