You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by jv...@apache.org on 2010/08/17 03:00:56 UTC

svn commit: r986163 - in /hadoop/hive/trunk: ./ data/files/ ql/src/java/org/apache/hadoop/hive/ql/exec/ ql/src/java/org/apache/hadoop/hive/ql/udf/generic/ ql/src/test/queries/clientpositive/ ql/src/test/results/clientpositive/

Author: jvs
Date: Tue Aug 17 01:00:56 2010
New Revision: 986163

URL: http://svn.apache.org/viewvc?rev=986163&view=rev
Log:
HIVE-1529. Add ANSI SQL covariance aggregate functions: covar_pop
and covar_samp
(Pierre Huyn via jvs)


Added:
    hadoop/hive/trunk/data/files/covar_tab.txt
    hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFCovariance.java
    hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFCovarianceSample.java
    hadoop/hive/trunk/ql/src/test/queries/clientpositive/udaf_covar_pop.q
    hadoop/hive/trunk/ql/src/test/queries/clientpositive/udaf_covar_samp.q
    hadoop/hive/trunk/ql/src/test/results/clientpositive/udaf_covar_pop.q.out
    hadoop/hive/trunk/ql/src/test/results/clientpositive/udaf_covar_samp.q.out
Modified:
    hadoop/hive/trunk/CHANGES.txt
    hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java
    hadoop/hive/trunk/ql/src/test/results/clientpositive/show_functions.q.out

Modified: hadoop/hive/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/CHANGES.txt?rev=986163&r1=986162&r2=986163&view=diff
==============================================================================
--- hadoop/hive/trunk/CHANGES.txt (original)
+++ hadoop/hive/trunk/CHANGES.txt Tue Aug 17 01:00:56 2010
@@ -42,6 +42,10 @@ Trunk -  Unreleased
     HIVE-1528. JSON UDTF function
     (Ning Zhang via jvs)
 
+    HIVE-1529. Add ANSI SQL covariance aggregate functions: covar_pop
+    and covar_samp
+    (Pierre Huyn via jvs)
+
   IMPROVEMENTS
 
     HIVE-1394. Do not update transient_lastDdlTime if the partition is modified by a housekeeping

Added: hadoop/hive/trunk/data/files/covar_tab.txt
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/data/files/covar_tab.txt?rev=986163&view=auto
==============================================================================
--- hadoop/hive/trunk/data/files/covar_tab.txt (added)
+++ hadoop/hive/trunk/data/files/covar_tab.txt Tue Aug 17 01:00:56 2010
@@ -0,0 +1,6 @@
+1		15
+2	3	
+3	7	12
+4	4	14
+5	8	17
+6	2	11

Modified: hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java?rev=986163&r1=986162&r2=986163&view=diff
==============================================================================
--- hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java (original)
+++ hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java Tue Aug 17 01:00:56 2010
@@ -134,6 +134,8 @@ import org.apache.hadoop.hive.ql.udf.gen
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFBridge;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFCollectSet;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFCount;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFCovariance;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFCovarianceSample;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFHistogramNumeric;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFMax;
@@ -366,6 +368,8 @@ public final class FunctionRegistry {
     registerGenericUDAF("variance", new GenericUDAFVariance());
     registerGenericUDAF("var_pop", new GenericUDAFVariance());
     registerGenericUDAF("var_samp", new GenericUDAFVarianceSample());
+    registerGenericUDAF("covar_pop", new GenericUDAFCovariance());
+    registerGenericUDAF("covar_samp", new GenericUDAFCovarianceSample());
 
     registerGenericUDAF("histogram_numeric", new GenericUDAFHistogramNumeric());
     registerGenericUDAF("percentile_approx", new GenericUDAFPercentileApprox());

Added: hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFCovariance.java
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFCovariance.java?rev=986163&view=auto
==============================================================================
--- hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFCovariance.java (added)
+++ hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFCovariance.java Tue Aug 17 01:00:56 2010
@@ -0,0 +1,359 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.udf.generic;
+
+import java.util.ArrayList;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.serde2.io.DoubleWritable;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.StructField;
+import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils;
+import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.util.StringUtils;
+
+/**
+ * Compute the covariance covar_pop(x, y), using the following one-pass method
+ * (ref. "Formulas for Robust, One-Pass Parallel Computation of Covariances and
+ *  Arbitrary-Order Statistical Moments", Philippe Pebay, Sandia Labs):
+ *
+ *  Incremental:
+ *   n : <count>
+ *   mx_n = mx_(n-1) + [x_n - mx_(n-1)]/n : <xavg>
+ *   my_n = my_(n-1) + [y_n - my_(n-1)]/n : <yavg>
+ *   c_n = c_(n-1) + (x_n - mx_(n-1))*(y_n - my_n) : <covariance * n>
+ *
+ *  Merge:
+ *   c_X = c_A + c_B + (mx_A - mx_B)*(my_A - my_B)*n_A*n_B/n_X
+ *
+ */
+@Description(name = "covariance,covar_pop",
+    value = "_FUNC_(x,y) - Returns the population covariance of a set of number pairs",
+    extended = "The function takes as arguments any pair of numeric types and returns a double.\n"
+        + "Any pair with a NULL is ignored. If the function is applied to an empty set, NULL\n"
+        + "will be returned. Otherwise, it computes the following:\n"
+        + "   (SUM(x*y)-SUM(x)*SUM(y)/COUNT(x,y))/COUNT(x,y)\n"
+        + "where neither x nor y is null.")
+public class GenericUDAFCovariance extends AbstractGenericUDAFResolver {
+
+  static final Log LOG = LogFactory.getLog(GenericUDAFCovariance.class.getName());
+
+  /**
+   * Validates the argument types of a call and creates the evaluator for it.
+   *
+   * @param parameters the types of the arguments the function is called with
+   * @return a new evaluator computing the population covariance
+   * @throws SemanticException if there are not exactly two arguments, or if
+   *           one of them is not of a numeric primitive type
+   */
+  @Override
+  public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters) throws SemanticException {
+    if (parameters.length != 2) {
+      throw new UDFArgumentTypeException(parameters.length - 1,
+          "Exactly two arguments are expected.");
+    }
+
+    // Report a non-primitive argument before looking at primitive categories.
+    for (int i = 0; i < parameters.length; i++) {
+      if (parameters[i].getCategory() != ObjectInspector.Category.PRIMITIVE) {
+        throw new UDFArgumentTypeException(i,
+            "Only primitive type arguments are accepted but "
+            + parameters[i].getTypeName() + " is passed.");
+      }
+    }
+
+    for (int i = 0; i < parameters.length; i++) {
+      checkNumeric(i, (PrimitiveTypeInfo) parameters[i]);
+    }
+    return new GenericUDAFCovarianceEvaluator();
+  }
+
+  /**
+   * Throws unless the given primitive type is one of the numeric types.
+   *
+   * @param argId the zero-based position of the argument, used for error reporting
+   * @param type the primitive type of the argument
+   * @throws UDFArgumentTypeException if the type is not numeric
+   */
+  private static void checkNumeric(int argId, PrimitiveTypeInfo type)
+      throws UDFArgumentTypeException {
+    switch (type.getPrimitiveCategory()) {
+    case BYTE:
+    case SHORT:
+    case INT:
+    case LONG:
+    case FLOAT:
+    case DOUBLE:
+      return;
+    default:
+      // Strings and booleans are rejected too, so the message must not claim
+      // that string arguments are supported.
+      throw new UDFArgumentTypeException(argId,
+          "Only numeric type arguments are accepted but "
+          + type.getTypeName() + " is passed.");
+    }
+  }
+
+  /**
+   * Evaluate the covariance using the algorithm described in
+   * http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance,
+   * presumably by  Pébay, Philippe (2008), in "Formulas for Robust,
+   * One-Pass Parallel Computation of Covariances and Arbitrary-Order
+   * Statistical Moments", Technical Report SAND2008-6212,
+   * Sandia National Laboratories,
+   * http://infoserve.sandia.gov/sand_doc/2008/086212.pdf
+   *
+   *  Incremental:
+   *   n : <count>
+   *   mx_n = mx_(n-1) + [x_n - mx_(n-1)]/n : <xavg>
+   *   my_n = my_(n-1) + [y_n - my_(n-1)]/n : <yavg>
+   *   c_n = c_(n-1) + (x_n - mx_(n-1))*(y_n - my_n) : <covariance * n>
+   *
+   *  Merge:
+   *   c_X = c_A + c_B + (mx_A - mx_B)*(my_A - my_B)*n_A*n_B/n_X
+   *
+   *  This one-pass algorithm is stable.
+   *
+   */
+  public static class GenericUDAFCovarianceEvaluator extends GenericUDAFEvaluator {
+
+    // For PARTIAL1 and COMPLETE
+    private PrimitiveObjectInspector xInputOI;
+    private PrimitiveObjectInspector yInputOI;
+
+    // For PARTIAL2 and FINAL
+    private StructObjectInspector soi;
+    private StructField countField;
+    private StructField xavgField;
+    private StructField yavgField;
+    private StructField covarField;
+    private LongObjectInspector countFieldOI;
+    private DoubleObjectInspector xavgFieldOI;
+    private DoubleObjectInspector yavgFieldOI;
+    private DoubleObjectInspector covarFieldOI;
+
+    // For PARTIAL1 and PARTIAL2
+    private Object[] partialResult;
+
+    // For FINAL and COMPLETE
+    private DoubleWritable result;
+
+    /**
+     * Sets up the input and output object inspectors for the given mode.
+     *
+     * In PARTIAL1 and COMPLETE the input is the pair of original arguments;
+     * in PARTIAL2 and FINAL it is the partial-aggregation struct. PARTIAL1 and
+     * PARTIAL2 emit that struct, FINAL and COMPLETE emit a double.
+     */
+    @Override
+    public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveException {
+      super.init(m, parameters);
+
+      // init input
+      if (mode == Mode.PARTIAL1 || mode == Mode.COMPLETE) {
+        assert (parameters.length == 2);
+        xInputOI = (PrimitiveObjectInspector) parameters[0];
+        yInputOI = (PrimitiveObjectInspector) parameters[1];
+      } else {
+        assert (parameters.length == 1);
+        soi = (StructObjectInspector) parameters[0];
+
+        countField = soi.getStructFieldRef("count");
+        xavgField = soi.getStructFieldRef("xavg");
+        yavgField = soi.getStructFieldRef("yavg");
+        covarField = soi.getStructFieldRef("covar");
+
+        countFieldOI =
+            (LongObjectInspector) countField.getFieldObjectInspector();
+        xavgFieldOI =
+            (DoubleObjectInspector) xavgField.getFieldObjectInspector();
+        yavgFieldOI =
+            (DoubleObjectInspector) yavgField.getFieldObjectInspector();
+        covarFieldOI =
+            (DoubleObjectInspector) covarField.getFieldObjectInspector();
+      }
+
+      // init output
+      if (mode == Mode.PARTIAL1 || mode == Mode.PARTIAL2) {
+        // The output of a partial aggregation is a struct containing
+        // a long count, two double averages, and a double covariance.
+
+        ArrayList<ObjectInspector> foi = new ArrayList<ObjectInspector>();
+
+        foi.add(PrimitiveObjectInspectorFactory.writableLongObjectInspector);
+        foi.add(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector);
+        foi.add(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector);
+        foi.add(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector);
+
+        ArrayList<String> fname = new ArrayList<String>();
+        fname.add("count");
+        fname.add("xavg");
+        fname.add("yavg");
+        fname.add("covar");
+
+        partialResult = new Object[4];
+        partialResult[0] = new LongWritable(0);
+        partialResult[1] = new DoubleWritable(0);
+        partialResult[2] = new DoubleWritable(0);
+        partialResult[3] = new DoubleWritable(0);
+
+        return ObjectInspectorFactory.getStandardStructObjectInspector(fname, foi);
+
+      } else {
+        setResult(new DoubleWritable(0));
+        return PrimitiveObjectInspectorFactory.writableDoubleObjectInspector;
+      }
+    }
+
+    /** Aggregation state: the running count, the two means and n times the covariance. */
+    static class StdAgg implements AggregationBuffer {
+      long count; // number n of elements
+      double xavg; // average of x elements
+      double yavg; // average of y elements
+      double covar; // n times the covariance
+    }
+
+    @Override
+    public AggregationBuffer getNewAggregationBuffer() throws HiveException {
+      StdAgg result = new StdAgg();
+      reset(result);
+      return result;
+    }
+
+    @Override
+    public void reset(AggregationBuffer agg) throws HiveException {
+      StdAgg myagg = (StdAgg) agg;
+      myagg.count = 0;
+      myagg.xavg = 0;
+      myagg.yavg = 0;
+      myagg.covar = 0;
+    }
+
+    /**
+     * Folds one (x, y) pair into the buffer; a pair with a NULL is ignored.
+     */
+    @Override
+    public void iterate(AggregationBuffer agg, Object[] parameters) throws HiveException {
+      assert (parameters.length == 2);
+      Object px = parameters[0];
+      Object py = parameters[1];
+      if (px != null && py != null) {
+        StdAgg myagg = (StdAgg) agg;
+        double vx = PrimitiveObjectInspectorUtils.getDouble(px, xInputOI);
+        double vy = PrimitiveObjectInspectorUtils.getDouble(py, yInputOI);
+        myagg.count++;
+        // Statement order matters: the covariance update needs the new mean
+        // of y but the previous mean of x, so xavg is refreshed last.
+        myagg.yavg = myagg.yavg + (vy - myagg.yavg) / myagg.count;
+        if (myagg.count > 1) {
+          myagg.covar += (vx - myagg.xavg) * (vy - myagg.yavg);
+        }
+        myagg.xavg = myagg.xavg + (vx - myagg.xavg) / myagg.count;
+      }
+    }
+
+    @Override
+    public Object terminatePartial(AggregationBuffer agg) throws HiveException {
+      StdAgg myagg = (StdAgg) agg;
+      ((LongWritable) partialResult[0]).set(myagg.count);
+      ((DoubleWritable) partialResult[1]).set(myagg.xavg);
+      ((DoubleWritable) partialResult[2]).set(myagg.yavg);
+      ((DoubleWritable) partialResult[3]).set(myagg.covar);
+      return partialResult;
+    }
+
+    /**
+     * Combines a partial aggregation (count, xavg, yavg, covar) into the buffer.
+     */
+    @Override
+    public void merge(AggregationBuffer agg, Object partial) throws HiveException {
+      if (partial == null) {
+        return;
+      }
+      StdAgg myagg = (StdAgg) agg;
+
+      Object partialCount = soi.getStructFieldData(partial, countField);
+      Object partialXAvg = soi.getStructFieldData(partial, xavgField);
+      Object partialYAvg = soi.getStructFieldData(partial, yavgField);
+      Object partialCovar = soi.getStructFieldData(partial, covarField);
+
+      long nA = myagg.count;
+      long nB = countFieldOI.get(partialCount);
+
+      if (nA == 0) {
+        // Just copy the information since there is nothing so far
+        myagg.count = nB;
+        myagg.xavg = xavgFieldOI.get(partialXAvg);
+        myagg.yavg = yavgFieldOI.get(partialYAvg);
+        myagg.covar = covarFieldOI.get(partialCovar);
+      } else if (nB != 0) {
+        // Merge the two partials
+        double xavgA = myagg.xavg;
+        double yavgA = myagg.yavg;
+        double xavgB = xavgFieldOI.get(partialXAvg);
+        double yavgB = yavgFieldOI.get(partialYAvg);
+        double covarB = covarFieldOI.get(partialCovar);
+
+        myagg.count += nB;
+        myagg.xavg = (xavgA * nA + xavgB * nB) / myagg.count;
+        myagg.yavg = (yavgA * nA + yavgB * nB) / myagg.count;
+        // Convert to double before multiplying: nA * nB overflows a long
+        // once both partial counts grow beyond roughly 3 * 10^9.
+        myagg.covar +=
+            covarB + (xavgA - xavgB) * (yavgA - yavgB) * ((double) nA * nB / myagg.count);
+      }
+    }
+
+    /**
+     * Returns the population covariance, or null if no pair was aggregated.
+     */
+    @Override
+    public Object terminate(AggregationBuffer agg) throws HiveException {
+      StdAgg myagg = (StdAgg) agg;
+
+      if (myagg.count == 0) { // SQL standard - return null for zero elements
+        return null;
+      } else {
+        getResult().set(myagg.covar / (myagg.count));
+        return getResult();
+      }
+    }
+
+    public void setResult(DoubleWritable result) {
+      this.result = result;
+    }
+
+    public DoubleWritable getResult() {
+      return result;
+    }
+  }
+
+}

Added: hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFCovarianceSample.java
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFCovarianceSample.java?rev=986163&view=auto
==============================================================================
--- hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFCovarianceSample.java (added)
+++ hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFCovarianceSample.java Tue Aug 17 01:00:56 2010
@@ -0,0 +1,90 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.udf.generic;
+
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+
+/**
+ * Compute the sample covariance by extending GenericUDAFCovariance and overriding
+ * the terminate() method of the evaluator.
+ *
+ */
+@Description(name = "covar_samp",
+    value = "_FUNC_(x,y) - Returns the sample covariance of a set of number pairs",
+    extended = "The function takes as arguments any pair of numeric types and returns a double.\n"
+        + "Any pair with a NULL is ignored. If the function is applied to an empty set, NULL\n"
+        + "will be returned. Otherwise, it computes the following:\n"
+        + "   (SUM(x*y)-SUM(x)*SUM(y)/COUNT(x,y))/(COUNT(x,y)-1)\n"
+        + "where neither x nor y is null.")
+public class GenericUDAFCovarianceSample extends GenericUDAFCovariance {
+
+  /**
+   * Validates the argument types of a call and creates the evaluator for it.
+   *
+   * The checks are delegated to {@link GenericUDAFCovariance#getEvaluator} so
+   * that covar_samp accepts exactly the same argument types as covar_pop and
+   * reports the same errors, instead of maintaining a second copy of them.
+   *
+   * @param parameters the types of the arguments the function is called with
+   * @return a new evaluator computing the sample covariance
+   * @throws SemanticException if the superclass rejects the argument types
+   */
+  @Override
+  public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters)
+      throws SemanticException {
+    // Invoked only for its validation; the population evaluator it returns
+    // is dropped in favour of the sample variant.
+    super.getEvaluator(parameters);
+    return new GenericUDAFCovarianceSampleEvaluator();
+  }
+
+  /**
+   * Compute the sample covariance by extending GenericUDAFCovarianceEvaluator and
+   * overriding the terminate() method of the evaluator.
+   */
+  public static class GenericUDAFCovarianceSampleEvaluator extends
+      GenericUDAFCovarianceEvaluator {
+
+    /**
+     * Returns covar / (count - 1), 0 for a single pair, or null if no pair
+     * was aggregated.
+     */
+    @Override
+    public Object terminate(AggregationBuffer agg) throws HiveException {
+      StdAgg myagg = (StdAgg) agg;
+
+      if (myagg.count == 0) { // SQL standard - return null for zero elements
+        return null;
+      } else {
+        if (myagg.count > 1) {
+          getResult().set(myagg.covar / (myagg.count - 1));
+        } else { // count - 1 would be 0 here; report 0 for a single pair
+          getResult().set(0);
+        }
+        return getResult();
+      }
+    }
+  }
+
+}

Added: hadoop/hive/trunk/ql/src/test/queries/clientpositive/udaf_covar_pop.q
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/test/queries/clientpositive/udaf_covar_pop.q?rev=986163&view=auto
==============================================================================
--- hadoop/hive/trunk/ql/src/test/queries/clientpositive/udaf_covar_pop.q (added)
+++ hadoop/hive/trunk/ql/src/test/queries/clientpositive/udaf_covar_pop.q Tue Aug 17 01:00:56 2010
@@ -0,0 +1,16 @@
+DROP TABLE covar_tab;
+CREATE TABLE covar_tab (a INT, b INT, c INT)
+ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
+STORED AS TEXTFILE;
+LOAD DATA LOCAL INPATH '../data/files/covar_tab.txt' OVERWRITE
+INTO TABLE covar_tab;
+
+DESCRIBE FUNCTION covar_pop;
+DESCRIBE FUNCTION EXTENDED covar_pop;
+SELECT covar_pop(b, c) FROM covar_tab WHERE a < 1;
+SELECT covar_pop(b, c) FROM covar_tab WHERE a < 3;
+SELECT covar_pop(b, c) FROM covar_tab WHERE a = 3;
+SELECT a, covar_pop(b, c) FROM covar_tab GROUP BY a ORDER BY a;
+SELECT covar_pop(b, c) FROM covar_tab;
+
+DROP TABLE covar_tab;

Added: hadoop/hive/trunk/ql/src/test/queries/clientpositive/udaf_covar_samp.q
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/test/queries/clientpositive/udaf_covar_samp.q?rev=986163&view=auto
==============================================================================
--- hadoop/hive/trunk/ql/src/test/queries/clientpositive/udaf_covar_samp.q (added)
+++ hadoop/hive/trunk/ql/src/test/queries/clientpositive/udaf_covar_samp.q Tue Aug 17 01:00:56 2010
@@ -0,0 +1,16 @@
+DROP TABLE covar_tab;
+CREATE TABLE covar_tab (a INT, b INT, c INT)
+ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
+STORED AS TEXTFILE;
+LOAD DATA LOCAL INPATH '../data/files/covar_tab.txt' OVERWRITE
+INTO TABLE covar_tab;
+
+DESCRIBE FUNCTION covar_samp;
+DESCRIBE FUNCTION EXTENDED covar_samp;
+SELECT covar_samp(b, c) FROM covar_tab WHERE a < 1;
+SELECT covar_samp(b, c) FROM covar_tab WHERE a < 3;
+SELECT covar_samp(b, c) FROM covar_tab WHERE a = 3;
+SELECT a, covar_samp(b, c) FROM covar_tab GROUP BY a ORDER BY a;
+SELECT covar_samp(b, c) FROM covar_tab;
+
+DROP TABLE covar_tab;

Modified: hadoop/hive/trunk/ql/src/test/results/clientpositive/show_functions.q.out
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/test/results/clientpositive/show_functions.q.out?rev=986163&r1=986162&r2=986163&view=diff
==============================================================================
--- hadoop/hive/trunk/ql/src/test/results/clientpositive/show_functions.q.out (original)
+++ hadoop/hive/trunk/ql/src/test/results/clientpositive/show_functions.q.out Tue Aug 17 01:00:56 2010
@@ -40,6 +40,8 @@ concat_ws
 conv
 cos
 count
+covar_pop
+covar_samp
 date_add
 date_sub
 datediff
@@ -166,6 +168,8 @@ concat_ws
 conv
 cos
 count
+covar_pop
+covar_samp
 PREHOOK: query: SHOW FUNCTIONS '.*e$'
 PREHOOK: type: SHOWFUNCTIONS
 POSTHOOK: query: SHOW FUNCTIONS '.*e$'

Added: hadoop/hive/trunk/ql/src/test/results/clientpositive/udaf_covar_pop.q.out
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/test/results/clientpositive/udaf_covar_pop.q.out?rev=986163&view=auto
==============================================================================
--- hadoop/hive/trunk/ql/src/test/results/clientpositive/udaf_covar_pop.q.out (added)
+++ hadoop/hive/trunk/ql/src/test/results/clientpositive/udaf_covar_pop.q.out Tue Aug 17 01:00:56 2010
@@ -0,0 +1,90 @@
+PREHOOK: query: DROP TABLE covar_tab
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE covar_tab
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: CREATE TABLE covar_tab (a INT, b INT, c INT)
+ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
+STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE covar_tab (a INT, b INT, c INT)
+ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
+STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@covar_tab
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/covar_tab.txt' OVERWRITE
+INTO TABLE covar_tab
+PREHOOK: type: LOAD
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/covar_tab.txt' OVERWRITE
+INTO TABLE covar_tab
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@covar_tab
+PREHOOK: query: DESCRIBE FUNCTION covar_pop
+PREHOOK: type: DESCFUNCTION
+POSTHOOK: query: DESCRIBE FUNCTION covar_pop
+POSTHOOK: type: DESCFUNCTION
+covar_pop(x,y) - Returns the population covariance of a set of number pairs
+PREHOOK: query: DESCRIBE FUNCTION EXTENDED covar_pop
+PREHOOK: type: DESCFUNCTION
+POSTHOOK: query: DESCRIBE FUNCTION EXTENDED covar_pop
+POSTHOOK: type: DESCFUNCTION
+covar_pop(x,y) - Returns the population covariance of a set of number pairs
+The function takes as arguments any pair of numeric types and returns a double.
+Any pair with a NULL is ignored. If the function is applied to an empty set, NULL
+will be returned. Otherwise, it computes the following:
+   (SUM(x*y)-SUM(x)*SUM(y)/COUNT(x,y))/COUNT(x,y)
+where neither x nor y is null.
+PREHOOK: query: SELECT covar_pop(b, c) FROM covar_tab WHERE a < 1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@covar_tab
+PREHOOK: Output: file:/tmp/hadoop/hive_2010-08-13_13-32-57_079_1117526904113027917/-mr-10000
+POSTHOOK: query: SELECT covar_pop(b, c) FROM covar_tab WHERE a < 1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@covar_tab
+POSTHOOK: Output: file:/tmp/hadoop/hive_2010-08-13_13-32-57_079_1117526904113027917/-mr-10000
+NULL
+PREHOOK: query: SELECT covar_pop(b, c) FROM covar_tab WHERE a < 3
+PREHOOK: type: QUERY
+PREHOOK: Input: default@covar_tab
+PREHOOK: Output: file:/tmp/hadoop/hive_2010-08-13_13-33-00_492_6431652878883191819/-mr-10000
+POSTHOOK: query: SELECT covar_pop(b, c) FROM covar_tab WHERE a < 3
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@covar_tab
+POSTHOOK: Output: file:/tmp/hadoop/hive_2010-08-13_13-33-00_492_6431652878883191819/-mr-10000
+NULL
+PREHOOK: query: SELECT covar_pop(b, c) FROM covar_tab WHERE a = 3
+PREHOOK: type: QUERY
+PREHOOK: Input: default@covar_tab
+PREHOOK: Output: file:/tmp/hadoop/hive_2010-08-13_13-33-03_895_3704658650575135394/-mr-10000
+POSTHOOK: query: SELECT covar_pop(b, c) FROM covar_tab WHERE a = 3
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@covar_tab
+POSTHOOK: Output: file:/tmp/hadoop/hive_2010-08-13_13-33-03_895_3704658650575135394/-mr-10000
+0.0
+PREHOOK: query: SELECT a, covar_pop(b, c) FROM covar_tab GROUP BY a ORDER BY a
+PREHOOK: type: QUERY
+PREHOOK: Input: default@covar_tab
+PREHOOK: Output: file:/tmp/hadoop/hive_2010-08-13_13-33-06_957_9138283116312967779/-mr-10000
+POSTHOOK: query: SELECT a, covar_pop(b, c) FROM covar_tab GROUP BY a ORDER BY a
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@covar_tab
+POSTHOOK: Output: file:/tmp/hadoop/hive_2010-08-13_13-33-06_957_9138283116312967779/-mr-10000
+1	NULL
+2	NULL
+3	0.0
+4	0.0
+5	0.0
+6	0.0
+PREHOOK: query: SELECT covar_pop(b, c) FROM covar_tab
+PREHOOK: type: QUERY
+PREHOOK: Input: default@covar_tab
+PREHOOK: Output: file:/tmp/hadoop/hive_2010-08-13_13-33-12_804_1964811148333306251/-mr-10000
+POSTHOOK: query: SELECT covar_pop(b, c) FROM covar_tab
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@covar_tab
+POSTHOOK: Output: file:/tmp/hadoop/hive_2010-08-13_13-33-12_804_1964811148333306251/-mr-10000
+3.624999999999999
+PREHOOK: query: DROP TABLE covar_tab
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE covar_tab
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Output: default@covar_tab

Added: hadoop/hive/trunk/ql/src/test/results/clientpositive/udaf_covar_samp.q.out
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/test/results/clientpositive/udaf_covar_samp.q.out?rev=986163&view=auto
==============================================================================
--- hadoop/hive/trunk/ql/src/test/results/clientpositive/udaf_covar_samp.q.out (added)
+++ hadoop/hive/trunk/ql/src/test/results/clientpositive/udaf_covar_samp.q.out Tue Aug 17 01:00:56 2010
@@ -0,0 +1,90 @@
+PREHOOK: query: DROP TABLE covar_tab
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE covar_tab
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: CREATE TABLE covar_tab (a INT, b INT, c INT)
+ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
+STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE covar_tab (a INT, b INT, c INT)
+ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
+STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@covar_tab
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/covar_tab.txt' OVERWRITE
+INTO TABLE covar_tab
+PREHOOK: type: LOAD
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/covar_tab.txt' OVERWRITE
+INTO TABLE covar_tab
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@covar_tab
+PREHOOK: query: DESCRIBE FUNCTION covar_samp
+PREHOOK: type: DESCFUNCTION
+POSTHOOK: query: DESCRIBE FUNCTION covar_samp
+POSTHOOK: type: DESCFUNCTION
+covar_samp(x,y) - Returns the sample covariance of a set of number pairs
+PREHOOK: query: DESCRIBE FUNCTION EXTENDED covar_samp
+PREHOOK: type: DESCFUNCTION
+POSTHOOK: query: DESCRIBE FUNCTION EXTENDED covar_samp
+POSTHOOK: type: DESCFUNCTION
+covar_samp(x,y) - Returns the sample covariance of a set of number pairs
+The function takes as arguments any pair of numeric types and returns a double.
+Any pair with a NULL is ignored. If the function is applied to an empty set, NULL
+will be returned. Otherwise, it computes the following:
+   (SUM(x*y)-SUM(x)*SUM(y)/COUNT(x,y))/(COUNT(x,y)-1)
+where neither x nor y is null.
+PREHOOK: query: SELECT covar_samp(b, c) FROM covar_tab WHERE a < 1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@covar_tab
+PREHOOK: Output: file:/tmp/hadoop/hive_2010-08-13_13-34-57_138_3229670210627723371/-mr-10000
+POSTHOOK: query: SELECT covar_samp(b, c) FROM covar_tab WHERE a < 1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@covar_tab
+POSTHOOK: Output: file:/tmp/hadoop/hive_2010-08-13_13-34-57_138_3229670210627723371/-mr-10000
+NULL
+PREHOOK: query: SELECT covar_samp(b, c) FROM covar_tab WHERE a < 3
+PREHOOK: type: QUERY
+PREHOOK: Input: default@covar_tab
+PREHOOK: Output: file:/tmp/hadoop/hive_2010-08-13_13-35-00_608_8390642785069332434/-mr-10000
+POSTHOOK: query: SELECT covar_samp(b, c) FROM covar_tab WHERE a < 3
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@covar_tab
+POSTHOOK: Output: file:/tmp/hadoop/hive_2010-08-13_13-35-00_608_8390642785069332434/-mr-10000
+NULL
+PREHOOK: query: SELECT covar_samp(b, c) FROM covar_tab WHERE a = 3
+PREHOOK: type: QUERY
+PREHOOK: Input: default@covar_tab
+PREHOOK: Output: file:/tmp/hadoop/hive_2010-08-13_13-35-04_028_8390943423795233918/-mr-10000
+POSTHOOK: query: SELECT covar_samp(b, c) FROM covar_tab WHERE a = 3
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@covar_tab
+POSTHOOK: Output: file:/tmp/hadoop/hive_2010-08-13_13-35-04_028_8390943423795233918/-mr-10000
+0.0
+PREHOOK: query: SELECT a, covar_samp(b, c) FROM covar_tab GROUP BY a ORDER BY a
+PREHOOK: type: QUERY
+PREHOOK: Input: default@covar_tab
+PREHOOK: Output: file:/tmp/hadoop/hive_2010-08-13_13-35-07_002_4813467946285728110/-mr-10000
+POSTHOOK: query: SELECT a, covar_samp(b, c) FROM covar_tab GROUP BY a ORDER BY a
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@covar_tab
+POSTHOOK: Output: file:/tmp/hadoop/hive_2010-08-13_13-35-07_002_4813467946285728110/-mr-10000
+1	NULL
+2	NULL
+3	0.0
+4	0.0
+5	0.0
+6	0.0
+PREHOOK: query: SELECT covar_samp(b, c) FROM covar_tab
+PREHOOK: type: QUERY
+PREHOOK: Input: default@covar_tab
+PREHOOK: Output: file:/tmp/hadoop/hive_2010-08-13_13-35-13_103_2218406934546016491/-mr-10000
+POSTHOOK: query: SELECT covar_samp(b, c) FROM covar_tab
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@covar_tab
+POSTHOOK: Output: file:/tmp/hadoop/hive_2010-08-13_13-35-13_103_2218406934546016491/-mr-10000
+4.833333333333332
+PREHOOK: query: DROP TABLE covar_tab
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE covar_tab
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Output: default@covar_tab