You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by jv...@apache.org on 2010/08/17 03:00:56 UTC
svn commit: r986163 - in /hadoop/hive/trunk: ./ data/files/
ql/src/java/org/apache/hadoop/hive/ql/exec/
ql/src/java/org/apache/hadoop/hive/ql/udf/generic/
ql/src/test/queries/clientpositive/ ql/src/test/results/clientpositive/
Author: jvs
Date: Tue Aug 17 01:00:56 2010
New Revision: 986163
URL: http://svn.apache.org/viewvc?rev=986163&view=rev
Log:
HIVE-1529. Add ANSI SQL covariance aggregate functions: covar_pop
and covar_samp
(Pierre Huyn via jvs)
Added:
hadoop/hive/trunk/data/files/covar_tab.txt
hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFCovariance.java
hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFCovarianceSample.java
hadoop/hive/trunk/ql/src/test/queries/clientpositive/udaf_covar_pop.q
hadoop/hive/trunk/ql/src/test/queries/clientpositive/udaf_covar_samp.q
hadoop/hive/trunk/ql/src/test/results/clientpositive/udaf_covar_pop.q.out
hadoop/hive/trunk/ql/src/test/results/clientpositive/udaf_covar_samp.q.out
Modified:
hadoop/hive/trunk/CHANGES.txt
hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java
hadoop/hive/trunk/ql/src/test/results/clientpositive/show_functions.q.out
Modified: hadoop/hive/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/CHANGES.txt?rev=986163&r1=986162&r2=986163&view=diff
==============================================================================
--- hadoop/hive/trunk/CHANGES.txt (original)
+++ hadoop/hive/trunk/CHANGES.txt Tue Aug 17 01:00:56 2010
@@ -42,6 +42,10 @@ Trunk - Unreleased
HIVE-1528. JSON UDTF function
(Ning Zhang via jvs)
+ HIVE-1529. Add ANSI SQL covariance aggregate functions: covar_pop
+ and covar_samp
+ (Pierre Huyn via jvs)
+
IMPROVEMENTS
HIVE-1394. Do not update transient_lastDdlTime if the partition is modified by a housekeeping
Added: hadoop/hive/trunk/data/files/covar_tab.txt
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/data/files/covar_tab.txt?rev=986163&view=auto
==============================================================================
--- hadoop/hive/trunk/data/files/covar_tab.txt (added)
+++ hadoop/hive/trunk/data/files/covar_tab.txt Tue Aug 17 01:00:56 2010
@@ -0,0 +1,6 @@
+1 15
+2 3
+3 7 12
+4 4 14
+5 8 17
+6 2 11
Modified: hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java?rev=986163&r1=986162&r2=986163&view=diff
==============================================================================
--- hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java (original)
+++ hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java Tue Aug 17 01:00:56 2010
@@ -134,6 +134,8 @@ import org.apache.hadoop.hive.ql.udf.gen
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFBridge;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFCollectSet;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFCount;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFCovariance;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFCovarianceSample;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFHistogramNumeric;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFMax;
@@ -366,6 +368,8 @@ public final class FunctionRegistry {
registerGenericUDAF("variance", new GenericUDAFVariance());
registerGenericUDAF("var_pop", new GenericUDAFVariance());
registerGenericUDAF("var_samp", new GenericUDAFVarianceSample());
+ registerGenericUDAF("covar_pop", new GenericUDAFCovariance());
+ registerGenericUDAF("covar_samp", new GenericUDAFCovarianceSample());
registerGenericUDAF("histogram_numeric", new GenericUDAFHistogramNumeric());
registerGenericUDAF("percentile_approx", new GenericUDAFPercentileApprox());
Added: hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFCovariance.java
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFCovariance.java?rev=986163&view=auto
==============================================================================
--- hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFCovariance.java (added)
+++ hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFCovariance.java Tue Aug 17 01:00:56 2010
@@ -0,0 +1,335 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.udf.generic;
+
+import java.util.ArrayList;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.serde2.io.DoubleWritable;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.StructField;
+import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils;
+import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.util.StringUtils;
+
+/**
+ * Compute the covariance covar_pop(x, y), using the following one-pass method
+ * (ref. "Formulas for Robust, One-Pass Parallel Computation of Covariances and
+ * Arbitrary-Order Statistical Moments", Philippe Pebay, Sandia Labs):
+ *
+ * Incremental:
+ * n : <count>
+ * mx_n = mx_(n-1) + [x_n - mx_(n-1)]/n : <xavg>
+ * my_n = my_(n-1) + [y_n - my_(n-1)]/n : <yavg>
+ * c_n = c_(n-1) + (x_n - mx_(n-1))*(y_n - my_n) : <covariance * n>
+ *
+ * Merge:
+ * c_X = c_A + c_B + (mx_A - mx_B)*(my_A - my_B)*n_A*n_B/n_X
+ *
+ */
+@Description(name = "covariance,covar_pop",
+ value = "_FUNC_(x,y) - Returns the population covariance of a set of number pairs",
+ extended = "The function takes as arguments any pair of numeric types and returns a double.\n"
+ + "Any pair with a NULL is ignored. If the function is applied to an empty set, NULL\n"
+ + "will be returned. Otherwise, it computes the following:\n"
+ + " (SUM(x*y)-SUM(x)*SUM(y)/COUNT(x,y))/COUNT(x,y)\n"
+ + "where neither x nor y is null.")
+public class GenericUDAFCovariance extends AbstractGenericUDAFResolver {
+
+ static final Log LOG = LogFactory.getLog(GenericUDAFCovariance.class.getName());
+
+  /**
+   * Validates the argument types of a covar_pop(x, y) call and returns the
+   * evaluator for it.
+   *
+   * Checks run in this order: argument count, then that argument 0 and
+   * argument 1 are primitive, then that argument 0 and argument 1 are numeric.
+   *
+   * @param parameters type information for the arguments of the call
+   * @return a new GenericUDAFCovarianceEvaluator when both arguments are
+   *         BYTE, SHORT, INT, LONG, FLOAT or DOUBLE
+   * @throws UDFArgumentTypeException if the argument count is not two, or an
+   *           argument is not a primitive numeric type
+   */
+  @Override
+  public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters) throws SemanticException {
+    if (parameters.length != 2) {
+      throw new UDFArgumentTypeException(parameters.length - 1,
+          "Exactly two arguments are expected.");
+    }
+
+    // Reject complex (non-primitive) arguments first, in argument order.
+    for (int i = 0; i < parameters.length; i++) {
+      if (parameters[i].getCategory() != ObjectInspector.Category.PRIMITIVE) {
+        throw new UDFArgumentTypeException(i,
+            "Only primitive type arguments are accepted but "
+            + parameters[i].getTypeName() + " is passed.");
+      }
+    }
+
+    // Every primitive must be numeric. STRING and BOOLEAN are rejected, so the
+    // message must not claim that string arguments are accepted.
+    for (int i = 0; i < parameters.length; i++) {
+      switch (((PrimitiveTypeInfo) parameters[i]).getPrimitiveCategory()) {
+      case BYTE:
+      case SHORT:
+      case INT:
+      case LONG:
+      case FLOAT:
+      case DOUBLE:
+        break;
+      default:
+        throw new UDFArgumentTypeException(i,
+            "Only numeric type arguments are accepted but "
+            + parameters[i].getTypeName() + " is passed.");
+      }
+    }
+
+    return new GenericUDAFCovarianceEvaluator();
+  }
+
+ /**
+ * Evaluate the covariance using the algorithm described in
+ * http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance,
+ * presumably by Pébay, Philippe (2008), in "Formulas for Robust,
+ * One-Pass Parallel Computation of Covariances and Arbitrary-Order
+ * Statistical Moments", Technical Report SAND2008-6212,
+ * Sandia National Laboratories,
+ * http://infoserve.sandia.gov/sand_doc/2008/086212.pdf
+ *
+ * Incremental:
+ * n : <count>
+ * mx_n = mx_(n-1) + [x_n - mx_(n-1)]/n : <xavg>
+ * my_n = my_(n-1) + [y_n - my_(n-1)]/n : <yavg>
+ * c_n = c_(n-1) + (x_n - mx_(n-1))*(y_n - my_n) : <covariance * n>
+ *
+ * Merge:
+ * c_X = c_A + c_B + (mx_A - mx_B)*(my_A - my_B)*n_A*n_B/n_X
+ *
+ * This one-pass algorithm is stable.
+ *
+ */
+ public static class GenericUDAFCovarianceEvaluator extends GenericUDAFEvaluator {
+
+ // For PARTIAL1 and COMPLETE
+ private PrimitiveObjectInspector xInputOI;
+ private PrimitiveObjectInspector yInputOI;
+
+ // For PARTIAL2 and FINAL
+ private StructObjectInspector soi;
+ private StructField countField;
+ private StructField xavgField;
+ private StructField yavgField;
+ private StructField covarField;
+ private LongObjectInspector countFieldOI;
+ private DoubleObjectInspector xavgFieldOI;
+ private DoubleObjectInspector yavgFieldOI;
+ private DoubleObjectInspector covarFieldOI;
+
+ // For PARTIAL1 and PARTIAL2
+ private Object[] partialResult;
+
+ // For FINAL and COMPLETE
+ private DoubleWritable result;
+
+    /**
+     * Caches the object inspectors needed for the given mode and returns the
+     * inspector describing this evaluator's output.
+     *
+     * In PARTIAL1 and COMPLETE the two parameters are taken as the inspectors
+     * of the x and y arguments. In the other modes the single parameter is the
+     * inspector of the partial struct with fields count, xavg, yavg and covar.
+     *
+     * @param m aggregation mode, forwarded to the superclass
+     * @param parameters input object inspectors for that mode
+     * @return a struct inspector (long, double, double, double) in PARTIAL1 and
+     *         PARTIAL2, otherwise the writable double inspector
+     */
+    @Override
+    public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveException {
+      super.init(m, parameters);
+
+      final boolean consumesRawArguments = (mode == Mode.PARTIAL1 || mode == Mode.COMPLETE);
+      final boolean producesPartial = (mode == Mode.PARTIAL1 || mode == Mode.PARTIAL2);
+
+      // Input side.
+      if (!consumesRawArguments) {
+        assert (parameters.length == 1);
+        soi = (StructObjectInspector) parameters[0];
+
+        // Look up all four struct fields first, then their inspectors.
+        countField = soi.getStructFieldRef("count");
+        xavgField = soi.getStructFieldRef("xavg");
+        yavgField = soi.getStructFieldRef("yavg");
+        covarField = soi.getStructFieldRef("covar");
+
+        countFieldOI = (LongObjectInspector) countField.getFieldObjectInspector();
+        xavgFieldOI = (DoubleObjectInspector) xavgField.getFieldObjectInspector();
+        yavgFieldOI = (DoubleObjectInspector) yavgField.getFieldObjectInspector();
+        covarFieldOI = (DoubleObjectInspector) covarField.getFieldObjectInspector();
+      } else {
+        assert (parameters.length == 2);
+        xInputOI = (PrimitiveObjectInspector) parameters[0];
+        yInputOI = (PrimitiveObjectInspector) parameters[1];
+      }
+
+      // Output side: a final value is a single double.
+      if (!producesPartial) {
+        setResult(new DoubleWritable(0));
+        return PrimitiveObjectInspectorFactory.writableDoubleObjectInspector;
+      }
+
+      // A partial aggregation is a struct holding a long count, two double
+      // averages, and a double covariance accumulator.
+      ArrayList<String> fieldNames = new ArrayList<String>();
+      ArrayList<ObjectInspector> fieldInspectors = new ArrayList<ObjectInspector>();
+
+      fieldNames.add("count");
+      fieldInspectors.add(PrimitiveObjectInspectorFactory.writableLongObjectInspector);
+      fieldNames.add("xavg");
+      fieldInspectors.add(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector);
+      fieldNames.add("yavg");
+      fieldInspectors.add(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector);
+      fieldNames.add("covar");
+      fieldInspectors.add(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector);
+
+      partialResult = new Object[] {
+          new LongWritable(0),
+          new DoubleWritable(0),
+          new DoubleWritable(0),
+          new DoubleWritable(0),
+      };
+
+      return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldInspectors);
+    }
+
+    /**
+     * Running state of one covariance aggregation.
+     */
+    static class StdAgg implements AggregationBuffer {
+      /** Number n of (x, y) pairs accumulated so far. */
+      long count;
+      /** Average of the x elements. */
+      double xavg;
+      /** Average of the y elements. */
+      double yavg;
+      /** n times the covariance. */
+      double covar;
+    }
+
+    /**
+     * Creates an aggregation buffer and zeroes it through reset().
+     *
+     * @return a new StdAgg whose fields are all zero
+     */
+    @Override
+    public AggregationBuffer getNewAggregationBuffer() throws HiveException {
+      final StdAgg buffer = new StdAgg();
+      reset(buffer);
+      return buffer;
+    }
+
+    /**
+     * Returns the given buffer to its empty state: no pairs seen, both averages
+     * and the covariance accumulator at zero.
+     *
+     * @param agg the buffer to clear; must be a StdAgg
+     */
+    @Override
+    public void reset(AggregationBuffer agg) throws HiveException {
+      final StdAgg buffer = (StdAgg) agg;
+      buffer.count = 0L;
+      buffer.xavg = 0.0;
+      buffer.yavg = 0.0;
+      buffer.covar = 0.0;
+    }
+
+ private boolean warned = false;
+
+    /**
+     * Folds one (x, y) pair into the buffer. A pair is ignored when either
+     * value is null.
+     *
+     * The update order is significant: the y average is advanced first, the
+     * covariance accumulator then uses the previous x average together with the
+     * new y average, and only afterwards is the x average advanced.
+     *
+     * @param agg the buffer to update; must be a StdAgg
+     * @param parameters the x value at index 0 and the y value at index 1
+     */
+    @Override
+    public void iterate(AggregationBuffer agg, Object[] parameters) throws HiveException {
+      assert (parameters.length == 2);
+      final Object rawX = parameters[0];
+      final Object rawY = parameters[1];
+      if (rawX == null || rawY == null) {
+        return;
+      }
+
+      final StdAgg acc = (StdAgg) agg;
+      final double x = PrimitiveObjectInspectorUtils.getDouble(rawX, xInputOI);
+      final double y = PrimitiveObjectInspectorUtils.getDouble(rawY, yInputOI);
+
+      acc.count++;
+      acc.yavg += (y - acc.yavg) / acc.count;
+      // The first pair contributes nothing to the covariance accumulator.
+      if (acc.count > 1) {
+        acc.covar += (x - acc.xavg) * (y - acc.yavg);
+      }
+      acc.xavg += (x - acc.xavg) / acc.count;
+    }
+
+    /**
+     * Copies the buffer's state into the reusable partialResult array, in the
+     * order count, xavg, yavg, covar.
+     *
+     * @param agg the buffer to export; must be a StdAgg
+     * @return the shared partialResult array, overwritten on every call
+     */
+    @Override
+    public Object terminatePartial(AggregationBuffer agg) throws HiveException {
+      final StdAgg acc = (StdAgg) agg;
+
+      final LongWritable countOut = (LongWritable) partialResult[0];
+      countOut.set(acc.count);
+      final DoubleWritable xavgOut = (DoubleWritable) partialResult[1];
+      xavgOut.set(acc.xavg);
+      final DoubleWritable yavgOut = (DoubleWritable) partialResult[2];
+      yavgOut.set(acc.yavg);
+      final DoubleWritable covarOut = (DoubleWritable) partialResult[3];
+      covarOut.set(acc.covar);
+
+      return partialResult;
+    }
+
+    /**
+     * Merges a partial aggregation (a struct of count, xavg, yavg, covar) into
+     * the buffer, using
+     *
+     *   c_X = c_A + c_B + (mx_A - mx_B)*(my_A - my_B)*n_A*n_B/n_X
+     *
+     * where A is the buffer and B is the partial. A null partial is ignored.
+     * When the buffer is empty the partial is copied over as is; when the
+     * partial is empty the buffer is left untouched.
+     *
+     * @param agg the buffer to merge into; must be a StdAgg
+     * @param partial the partial-aggregation struct, or null
+     */
+    @Override
+    public void merge(AggregationBuffer agg, Object partial) throws HiveException {
+      if (partial == null) {
+        return;
+      }
+
+      StdAgg myagg = (StdAgg) agg;
+
+      Object partialCount = soi.getStructFieldData(partial, countField);
+      Object partialXAvg = soi.getStructFieldData(partial, xavgField);
+      Object partialYAvg = soi.getStructFieldData(partial, yavgField);
+      Object partialCovar = soi.getStructFieldData(partial, covarField);
+
+      long nA = myagg.count;
+      long nB = countFieldOI.get(partialCount);
+
+      if (nA == 0) {
+        // Nothing accumulated so far: just copy the partial.
+        myagg.count = nB;
+        myagg.xavg = xavgFieldOI.get(partialXAvg);
+        myagg.yavg = yavgFieldOI.get(partialYAvg);
+        myagg.covar = covarFieldOI.get(partialCovar);
+      } else if (nB != 0) {
+        // Merge the two non-empty partials.
+        double xavgA = myagg.xavg;
+        double yavgA = myagg.yavg;
+        double xavgB = xavgFieldOI.get(partialXAvg);
+        double yavgB = yavgFieldOI.get(partialYAvg);
+        double covarB = covarFieldOI.get(partialCovar);
+
+        myagg.count += nB;
+        myagg.xavg = (xavgA * nA + xavgB * nB) / myagg.count;
+        myagg.yavg = (yavgA * nA + yavgB * nB) / myagg.count;
+        // Widen to double BEFORE multiplying: nA * nB in long arithmetic
+        // silently overflows once the product of the two counts exceeds
+        // Long.MAX_VALUE.
+        myagg.covar +=
+            covarB + (xavgA - xavgB) * (yavgA - yavgB) * (((double) nA * nB) / myagg.count);
+      }
+    }
+
+    /**
+     * Produces the population covariance: the accumulated covar divided by the
+     * number of pairs.
+     *
+     * @param agg the buffer to finalize; must be a StdAgg
+     * @return null when no pair was accumulated (SQL standard), otherwise the
+     *         shared result writable holding covar / count
+     */
+    @Override
+    public Object terminate(AggregationBuffer agg) throws HiveException {
+      final StdAgg totals = (StdAgg) agg;
+
+      // SQL standard - return null for zero elements
+      if (totals.count == 0) {
+        return null;
+      }
+
+      getResult().set(totals.covar / totals.count);
+      return getResult();
+    }
+
+ /**
+ * Replaces the writable that getResult() returns and that terminate()
+ * fills with the final value.
+ *
+ * @param result the writable to use from now on
+ */
+ public void setResult(DoubleWritable result) {
+ this.result = result;
+ }
+
+ /**
+ * Returns the writable last passed to setResult().
+ *
+ * @return the current result writable; null if setResult() was never called
+ */
+ public DoubleWritable getResult() {
+ return result;
+ }
+ }
+
+}
Added: hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFCovarianceSample.java
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFCovarianceSample.java?rev=986163&view=auto
==============================================================================
--- hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFCovarianceSample.java (added)
+++ hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFCovarianceSample.java Tue Aug 17 01:00:56 2010
@@ -0,0 +1,117 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.udf.generic;
+
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+
+/**
+ * Compute the sample covariance by extending GenericUDAFCovariance and overriding
+ * the terminate() method of the evaluator.
+ *
+ */
+@Description(name = "covar_samp",
+ value = "_FUNC_(x,y) - Returns the sample covariance of a set of number pairs",
+ extended = "The function takes as arguments any pair of numeric types and returns a double.\n"
+ + "Any pair with a NULL is ignored. If the function is applied to an empty set, NULL\n"
+ + "will be returned. Otherwise, it computes the following:\n"
+ + " (SUM(x*y)-SUM(x)*SUM(y)/COUNT(x,y))/(COUNT(x,y)-1)\n"
+ + "where neither x nor y is null.")
+public class GenericUDAFCovarianceSample extends GenericUDAFCovariance {
+
+  /**
+   * Validates the argument types of a covar_samp(x, y) call and returns the
+   * evaluator for it.
+   *
+   * Checks run in this order: argument count, then that argument 0 and
+   * argument 1 are primitive, then that argument 0 and argument 1 are numeric.
+   *
+   * @param parameters type information for the arguments of the call
+   * @return a new GenericUDAFCovarianceSampleEvaluator when both arguments are
+   *         BYTE, SHORT, INT, LONG, FLOAT or DOUBLE
+   * @throws UDFArgumentTypeException if the argument count is not two, or an
+   *           argument is not a primitive numeric type
+   */
+  @Override
+  public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters)
+      throws SemanticException {
+    if (parameters.length != 2) {
+      throw new UDFArgumentTypeException(parameters.length - 1,
+          "Exactly two arguments are expected.");
+    }
+
+    // Reject complex (non-primitive) arguments first, in argument order.
+    for (int i = 0; i < parameters.length; i++) {
+      if (parameters[i].getCategory() != ObjectInspector.Category.PRIMITIVE) {
+        throw new UDFArgumentTypeException(i,
+            "Only primitive type arguments are accepted but "
+            + parameters[i].getTypeName() + " is passed.");
+      }
+    }
+
+    // Every primitive must be numeric. STRING and BOOLEAN are rejected, so the
+    // message must not claim that string arguments are accepted.
+    for (int i = 0; i < parameters.length; i++) {
+      switch (((PrimitiveTypeInfo) parameters[i]).getPrimitiveCategory()) {
+      case BYTE:
+      case SHORT:
+      case INT:
+      case LONG:
+      case FLOAT:
+      case DOUBLE:
+        break;
+      default:
+        throw new UDFArgumentTypeException(i,
+            "Only numeric type arguments are accepted but "
+            + parameters[i].getTypeName() + " is passed.");
+      }
+    }
+
+    return new GenericUDAFCovarianceSampleEvaluator();
+  }
+
+  /**
+   * Evaluator for the sample covariance. It inherits all accumulation and
+   * merging from GenericUDAFCovarianceEvaluator and differs only in
+   * terminate(), which divides by (count - 1) instead of count.
+   */
+  public static class GenericUDAFCovarianceSampleEvaluator extends
+      GenericUDAFCovarianceEvaluator {
+
+    /**
+     * Produces the sample covariance from the accumulated state.
+     *
+     * @param agg the buffer to finalize; must be a StdAgg
+     * @return null when no pair was accumulated (SQL standard); otherwise the
+     *         shared result writable, set to 0 for a single pair and to
+     *         covar / (count - 1) for more than one pair
+     */
+    @Override
+    public Object terminate(AggregationBuffer agg) throws HiveException {
+      final StdAgg totals = (StdAgg) agg;
+
+      // SQL standard - return null for zero elements
+      if (totals.count == 0) {
+        return null;
+      }
+
+      if (totals.count <= 1) {
+        // the covariance of a singleton set is always 0
+        getResult().set(0);
+      } else {
+        getResult().set(totals.covar / (totals.count - 1));
+      }
+      return getResult();
+    }
+  }
+
+}
Added: hadoop/hive/trunk/ql/src/test/queries/clientpositive/udaf_covar_pop.q
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/test/queries/clientpositive/udaf_covar_pop.q?rev=986163&view=auto
==============================================================================
--- hadoop/hive/trunk/ql/src/test/queries/clientpositive/udaf_covar_pop.q (added)
+++ hadoop/hive/trunk/ql/src/test/queries/clientpositive/udaf_covar_pop.q Tue Aug 17 01:00:56 2010
@@ -0,0 +1,16 @@
+DROP TABLE covar_tab;
+CREATE TABLE covar_tab (a INT, b INT, c INT)
+ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
+STORED AS TEXTFILE;
+LOAD DATA LOCAL INPATH '../data/files/covar_tab.txt' OVERWRITE
+INTO TABLE covar_tab;
+
+DESCRIBE FUNCTION covar_pop;
+DESCRIBE FUNCTION EXTENDED covar_pop;
+SELECT covar_pop(b, c) FROM covar_tab WHERE a < 1;
+SELECT covar_pop(b, c) FROM covar_tab WHERE a < 3;
+SELECT covar_pop(b, c) FROM covar_tab WHERE a = 3;
+SELECT a, covar_pop(b, c) FROM covar_tab GROUP BY a ORDER BY a;
+SELECT covar_pop(b, c) FROM covar_tab;
+
+DROP TABLE covar_tab;
Added: hadoop/hive/trunk/ql/src/test/queries/clientpositive/udaf_covar_samp.q
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/test/queries/clientpositive/udaf_covar_samp.q?rev=986163&view=auto
==============================================================================
--- hadoop/hive/trunk/ql/src/test/queries/clientpositive/udaf_covar_samp.q (added)
+++ hadoop/hive/trunk/ql/src/test/queries/clientpositive/udaf_covar_samp.q Tue Aug 17 01:00:56 2010
@@ -0,0 +1,16 @@
+DROP TABLE covar_tab;
+CREATE TABLE covar_tab (a INT, b INT, c INT)
+ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
+STORED AS TEXTFILE;
+LOAD DATA LOCAL INPATH '../data/files/covar_tab.txt' OVERWRITE
+INTO TABLE covar_tab;
+
+DESCRIBE FUNCTION covar_samp;
+DESCRIBE FUNCTION EXTENDED covar_samp;
+SELECT covar_samp(b, c) FROM covar_tab WHERE a < 1;
+SELECT covar_samp(b, c) FROM covar_tab WHERE a < 3;
+SELECT covar_samp(b, c) FROM covar_tab WHERE a = 3;
+SELECT a, covar_samp(b, c) FROM covar_tab GROUP BY a ORDER BY a;
+SELECT covar_samp(b, c) FROM covar_tab;
+
+DROP TABLE covar_tab;
Modified: hadoop/hive/trunk/ql/src/test/results/clientpositive/show_functions.q.out
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/test/results/clientpositive/show_functions.q.out?rev=986163&r1=986162&r2=986163&view=diff
==============================================================================
--- hadoop/hive/trunk/ql/src/test/results/clientpositive/show_functions.q.out (original)
+++ hadoop/hive/trunk/ql/src/test/results/clientpositive/show_functions.q.out Tue Aug 17 01:00:56 2010
@@ -40,6 +40,8 @@ concat_ws
conv
cos
count
+covar_pop
+covar_samp
date_add
date_sub
datediff
@@ -166,6 +168,8 @@ concat_ws
conv
cos
count
+covar_pop
+covar_samp
PREHOOK: query: SHOW FUNCTIONS '.*e$'
PREHOOK: type: SHOWFUNCTIONS
POSTHOOK: query: SHOW FUNCTIONS '.*e$'
Added: hadoop/hive/trunk/ql/src/test/results/clientpositive/udaf_covar_pop.q.out
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/test/results/clientpositive/udaf_covar_pop.q.out?rev=986163&view=auto
==============================================================================
--- hadoop/hive/trunk/ql/src/test/results/clientpositive/udaf_covar_pop.q.out (added)
+++ hadoop/hive/trunk/ql/src/test/results/clientpositive/udaf_covar_pop.q.out Tue Aug 17 01:00:56 2010
@@ -0,0 +1,90 @@
+PREHOOK: query: DROP TABLE covar_tab
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE covar_tab
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: CREATE TABLE covar_tab (a INT, b INT, c INT)
+ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
+STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE covar_tab (a INT, b INT, c INT)
+ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
+STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@covar_tab
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/covar_tab.txt' OVERWRITE
+INTO TABLE covar_tab
+PREHOOK: type: LOAD
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/covar_tab.txt' OVERWRITE
+INTO TABLE covar_tab
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@covar_tab
+PREHOOK: query: DESCRIBE FUNCTION covar_pop
+PREHOOK: type: DESCFUNCTION
+POSTHOOK: query: DESCRIBE FUNCTION covar_pop
+POSTHOOK: type: DESCFUNCTION
+covar_pop(x,y) - Returns the population covariance of a set of number pairs
+PREHOOK: query: DESCRIBE FUNCTION EXTENDED covar_pop
+PREHOOK: type: DESCFUNCTION
+POSTHOOK: query: DESCRIBE FUNCTION EXTENDED covar_pop
+POSTHOOK: type: DESCFUNCTION
+covar_pop(x,y) - Returns the population covariance of a set of number pairs
+The function takes as arguments any pair of numeric types and returns a double.
+Any pair with a NULL is ignored. If the function is applied to an empty set, NULL
+will be returned. Otherwise, it computes the following:
+ (SUM(x*y)-SUM(x)*SUM(y)/COUNT(x,y))/COUNT(x,y)
+where neither x nor y is null.
+PREHOOK: query: SELECT covar_pop(b, c) FROM covar_tab WHERE a < 1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@covar_tab
+PREHOOK: Output: file:/tmp/hadoop/hive_2010-08-13_13-32-57_079_1117526904113027917/-mr-10000
+POSTHOOK: query: SELECT covar_pop(b, c) FROM covar_tab WHERE a < 1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@covar_tab
+POSTHOOK: Output: file:/tmp/hadoop/hive_2010-08-13_13-32-57_079_1117526904113027917/-mr-10000
+NULL
+PREHOOK: query: SELECT covar_pop(b, c) FROM covar_tab WHERE a < 3
+PREHOOK: type: QUERY
+PREHOOK: Input: default@covar_tab
+PREHOOK: Output: file:/tmp/hadoop/hive_2010-08-13_13-33-00_492_6431652878883191819/-mr-10000
+POSTHOOK: query: SELECT covar_pop(b, c) FROM covar_tab WHERE a < 3
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@covar_tab
+POSTHOOK: Output: file:/tmp/hadoop/hive_2010-08-13_13-33-00_492_6431652878883191819/-mr-10000
+NULL
+PREHOOK: query: SELECT covar_pop(b, c) FROM covar_tab WHERE a = 3
+PREHOOK: type: QUERY
+PREHOOK: Input: default@covar_tab
+PREHOOK: Output: file:/tmp/hadoop/hive_2010-08-13_13-33-03_895_3704658650575135394/-mr-10000
+POSTHOOK: query: SELECT covar_pop(b, c) FROM covar_tab WHERE a = 3
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@covar_tab
+POSTHOOK: Output: file:/tmp/hadoop/hive_2010-08-13_13-33-03_895_3704658650575135394/-mr-10000
+0.0
+PREHOOK: query: SELECT a, covar_pop(b, c) FROM covar_tab GROUP BY a ORDER BY a
+PREHOOK: type: QUERY
+PREHOOK: Input: default@covar_tab
+PREHOOK: Output: file:/tmp/hadoop/hive_2010-08-13_13-33-06_957_9138283116312967779/-mr-10000
+POSTHOOK: query: SELECT a, covar_pop(b, c) FROM covar_tab GROUP BY a ORDER BY a
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@covar_tab
+POSTHOOK: Output: file:/tmp/hadoop/hive_2010-08-13_13-33-06_957_9138283116312967779/-mr-10000
+1 NULL
+2 NULL
+3 0.0
+4 0.0
+5 0.0
+6 0.0
+PREHOOK: query: SELECT covar_pop(b, c) FROM covar_tab
+PREHOOK: type: QUERY
+PREHOOK: Input: default@covar_tab
+PREHOOK: Output: file:/tmp/hadoop/hive_2010-08-13_13-33-12_804_1964811148333306251/-mr-10000
+POSTHOOK: query: SELECT covar_pop(b, c) FROM covar_tab
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@covar_tab
+POSTHOOK: Output: file:/tmp/hadoop/hive_2010-08-13_13-33-12_804_1964811148333306251/-mr-10000
+3.624999999999999
+PREHOOK: query: DROP TABLE covar_tab
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE covar_tab
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Output: default@covar_tab
Added: hadoop/hive/trunk/ql/src/test/results/clientpositive/udaf_covar_samp.q.out
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/test/results/clientpositive/udaf_covar_samp.q.out?rev=986163&view=auto
==============================================================================
--- hadoop/hive/trunk/ql/src/test/results/clientpositive/udaf_covar_samp.q.out (added)
+++ hadoop/hive/trunk/ql/src/test/results/clientpositive/udaf_covar_samp.q.out Tue Aug 17 01:00:56 2010
@@ -0,0 +1,90 @@
+PREHOOK: query: DROP TABLE covar_tab
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE covar_tab
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: CREATE TABLE covar_tab (a INT, b INT, c INT)
+ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
+STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE covar_tab (a INT, b INT, c INT)
+ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
+STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@covar_tab
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/covar_tab.txt' OVERWRITE
+INTO TABLE covar_tab
+PREHOOK: type: LOAD
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/covar_tab.txt' OVERWRITE
+INTO TABLE covar_tab
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@covar_tab
+PREHOOK: query: DESCRIBE FUNCTION covar_samp
+PREHOOK: type: DESCFUNCTION
+POSTHOOK: query: DESCRIBE FUNCTION covar_samp
+POSTHOOK: type: DESCFUNCTION
+covar_samp(x,y) - Returns the sample covariance of a set of number pairs
+PREHOOK: query: DESCRIBE FUNCTION EXTENDED covar_samp
+PREHOOK: type: DESCFUNCTION
+POSTHOOK: query: DESCRIBE FUNCTION EXTENDED covar_samp
+POSTHOOK: type: DESCFUNCTION
+covar_samp(x,y) - Returns the sample covariance of a set of number pairs
+The function takes as arguments any pair of numeric types and returns a double.
+Any pair with a NULL is ignored. If the function is applied to an empty set, NULL
+will be returned. Otherwise, it computes the following:
+ (SUM(x*y)-SUM(x)*SUM(y)/COUNT(x,y))/(COUNT(x,y)-1)
+where neither x nor y is null.
+PREHOOK: query: SELECT covar_samp(b, c) FROM covar_tab WHERE a < 1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@covar_tab
+PREHOOK: Output: file:/tmp/hadoop/hive_2010-08-13_13-34-57_138_3229670210627723371/-mr-10000
+POSTHOOK: query: SELECT covar_samp(b, c) FROM covar_tab WHERE a < 1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@covar_tab
+POSTHOOK: Output: file:/tmp/hadoop/hive_2010-08-13_13-34-57_138_3229670210627723371/-mr-10000
+NULL
+PREHOOK: query: SELECT covar_samp(b, c) FROM covar_tab WHERE a < 3
+PREHOOK: type: QUERY
+PREHOOK: Input: default@covar_tab
+PREHOOK: Output: file:/tmp/hadoop/hive_2010-08-13_13-35-00_608_8390642785069332434/-mr-10000
+POSTHOOK: query: SELECT covar_samp(b, c) FROM covar_tab WHERE a < 3
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@covar_tab
+POSTHOOK: Output: file:/tmp/hadoop/hive_2010-08-13_13-35-00_608_8390642785069332434/-mr-10000
+NULL
+PREHOOK: query: SELECT covar_samp(b, c) FROM covar_tab WHERE a = 3
+PREHOOK: type: QUERY
+PREHOOK: Input: default@covar_tab
+PREHOOK: Output: file:/tmp/hadoop/hive_2010-08-13_13-35-04_028_8390943423795233918/-mr-10000
+POSTHOOK: query: SELECT covar_samp(b, c) FROM covar_tab WHERE a = 3
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@covar_tab
+POSTHOOK: Output: file:/tmp/hadoop/hive_2010-08-13_13-35-04_028_8390943423795233918/-mr-10000
+0.0
+PREHOOK: query: SELECT a, covar_samp(b, c) FROM covar_tab GROUP BY a ORDER BY a
+PREHOOK: type: QUERY
+PREHOOK: Input: default@covar_tab
+PREHOOK: Output: file:/tmp/hadoop/hive_2010-08-13_13-35-07_002_4813467946285728110/-mr-10000
+POSTHOOK: query: SELECT a, covar_samp(b, c) FROM covar_tab GROUP BY a ORDER BY a
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@covar_tab
+POSTHOOK: Output: file:/tmp/hadoop/hive_2010-08-13_13-35-07_002_4813467946285728110/-mr-10000
+1 NULL
+2 NULL
+3 0.0
+4 0.0
+5 0.0
+6 0.0
+PREHOOK: query: SELECT covar_samp(b, c) FROM covar_tab
+PREHOOK: type: QUERY
+PREHOOK: Input: default@covar_tab
+PREHOOK: Output: file:/tmp/hadoop/hive_2010-08-13_13-35-13_103_2218406934546016491/-mr-10000
+POSTHOOK: query: SELECT covar_samp(b, c) FROM covar_tab
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@covar_tab
+POSTHOOK: Output: file:/tmp/hadoop/hive_2010-08-13_13-35-13_103_2218406934546016491/-mr-10000
+4.833333333333332
+PREHOOK: query: DROP TABLE covar_tab
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE covar_tab
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Output: default@covar_tab