You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by na...@apache.org on 2010/10/22 06:57:08 UTC

svn commit: r1026210 - in /hadoop/hive/trunk: CHANGES.txt ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ListObjectsEqualComparer.java

Author: namit
Date: Fri Oct 22 04:57:08 2010
New Revision: 1026210

URL: http://svn.apache.org/viewvc?rev=1026210&view=rev
Log:
HIVE-1738. Optimize key comparison in groupby
(Siying Dong via namit)


Added:
    hadoop/hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ListObjectsEqualComparer.java
Modified:
    hadoop/hive/trunk/CHANGES.txt
    hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java

Modified: hadoop/hive/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/CHANGES.txt?rev=1026210&r1=1026209&r2=1026210&view=diff
==============================================================================
--- hadoop/hive/trunk/CHANGES.txt (original)
+++ hadoop/hive/trunk/CHANGES.txt Fri Oct 22 04:57:08 2010
@@ -199,6 +199,9 @@ Trunk -  Unreleased
     HIVE-1660. Change get_partitions_ps to pass partition filter to
     database (Paul Yang via namit)
 
+    HIVE-1738. Optimize key comparison in groupby
+    (Siying Dong via namit)
+
   OPTIMIZATIONS
 
   BUG FIXES

Modified: hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java?rev=1026210&r1=1026209&r2=1026210&view=diff
==============================================================================
--- hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java (original)
+++ hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java Fri Oct 22 04:57:08 2010
@@ -40,14 +40,15 @@ import org.apache.hadoop.hive.ql.plan.Gr
 import org.apache.hadoop.hive.ql.plan.api.OperatorType;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.AggregationBuffer;
+import org.apache.hadoop.hive.serde2.objectinspector.ListObjectsEqualComparer;
 import org.apache.hadoop.hive.serde2.lazy.LazyPrimitive;
 import org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyStringObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
+import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption;
 import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
-import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
 import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
 import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
 import org.apache.hadoop.io.Text;
@@ -117,6 +118,8 @@ public class GroupByOperator extends Ope
   // new Key ObjectInspectors are objectInspectors from the parent
   transient StructObjectInspector newKeyObjectInspector;
   transient StructObjectInspector currentKeyObjectInspector;
+  transient ListObjectsEqualComparer currentStructEqualComparer;
+  transient ListObjectsEqualComparer newKeyStructEqualComparer;
 
   /**
    * This is used to store the position and field names for variable length
@@ -245,7 +248,7 @@ public class GroupByOperator extends Ope
       aggregations = newAggregations();
       hashAggr = false;
     } else {
-      hashAggregations = new HashMap<KeyWrapper, AggregationBuffer[]>();
+      hashAggregations = new HashMap<KeyWrapper, AggregationBuffer[]>(256);
       aggregations = newAggregations();
       hashAggr = true;
       keyPositionsSize = new ArrayList<Integer>();
@@ -280,6 +283,8 @@ public class GroupByOperator extends Ope
     currentKeyObjectInspector = ObjectInspectorFactory
         .getStandardStructObjectInspector(keyNames, Arrays
         .asList(currentKeyObjectInspectors));
+    currentStructEqualComparer = new ListObjectsEqualComparer(currentKeyObjectInspectors, currentKeyObjectInspectors);
+    newKeyStructEqualComparer = new ListObjectsEqualComparer(currentKeyObjectInspectors, keyObjectInspectors);
 
     outputObjInspector = ObjectInspectorFactory
         .getStandardStructObjectInspector(fieldNames, objectInspectors);
@@ -634,15 +639,15 @@ public class GroupByOperator extends Ope
     public boolean equals(Object obj) {
       ArrayList<Object> copied_in_hashmap = ((KeyWrapper) obj).keys;
       if (!copy) {
-        return ObjectInspectorUtils.compare(copied_in_hashmap,
-            currentKeyObjectInspector, keys, newKeyObjectInspector) == 0;
+        return newKeyStructEqualComparer.areEqual(copied_in_hashmap, keys);
       } else {
-        return ObjectInspectorUtils.compare(copied_in_hashmap,
-            currentKeyObjectInspector, keys, currentKeyObjectInspector) == 0;
+        return currentStructEqualComparer.areEqual(copied_in_hashmap, keys);
       }
     }
   }
 
+
+
   KeyWrapper keyProber = new KeyWrapper();
 
   private void processHashAggr(Object row, ObjectInspector rowInspector,
@@ -705,8 +710,9 @@ public class GroupByOperator extends Ope
     // Prepare aggs for updating
     AggregationBuffer[] aggs = null;
     Object[][] lastInvoke = null;
-    boolean keysAreEqual = ObjectInspectorUtils.compare(newKeys,
-        newKeyObjectInspector, currentKeys, currentKeyObjectInspector) == 0;
+    boolean keysAreEqual = (currentKeys != null && newKeys != null)?
+      newKeyStructEqualComparer.areEqual(currentKeys, newKeys) : false;
+
 
     // Forward the current keys if needed for sort-based aggregation
     if (currentKeys != null && !keysAreEqual) {

Added: hadoop/hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ListObjectsEqualComparer.java
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ListObjectsEqualComparer.java?rev=1026210&view=auto
==============================================================================
--- hadoop/hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ListObjectsEqualComparer.java (added)
+++ hadoop/hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ListObjectsEqualComparer.java Fri Oct 22 04:57:08 2010
@@ -0,0 +1,180 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.serde2.objectinspector;
+
+import java.util.ArrayList;
+
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters.Converter;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.ByteObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
+
+/**
+ * Compare two lists of objects.
+ * The two lists are expected to have the same types. Type information for every object is
+ * passed when calling the constructor, to avoid the step of figuring out types from the
+ * ObjectInspector and determining how to compare the types when comparing.
+ * Also, for string and text elements, it performs slightly better than
+ * using ObjectInspectorUtils.compare() == 0, because instead of calling .compare()
+ * it calls .equalTo(), which compares sizes before the byte-by-byte comparison.
+ *
+ */
+public class ListObjectsEqualComparer {
+  enum CompareType {
+    // Now only string, text, int, long, byte and boolean comparisons
+    // are treated as special cases.
+    // For other types, we reuse ObjectInspectorUtils.compare()
+    COMPARE_STRING, COMPARE_TEXT, COMPARE_INT, COMPARE_LONG, COMPARE_BYTE,
+    COMPARE_BOOL, OTHER
+  }
+
+  class FieldComparer {
+    protected ObjectInspector oi0, oi1;
+    protected ObjectInspector compareOI;
+    protected CompareType compareType;
+    protected Converter converter0, converter1;
+    protected StringObjectInspector soi0, soi1;
+    protected IntObjectInspector ioi0, ioi1;
+    protected LongObjectInspector loi0, loi1;
+    protected ByteObjectInspector byoi0, byoi1;
+    protected BooleanObjectInspector boi0, boi1;
+
+    public FieldComparer(ObjectInspector oi0, ObjectInspector oi1) {
+      this.oi0 = oi0;
+      this.oi1 = oi1;
+
+      TypeInfo type0 = TypeInfoUtils.getTypeInfoFromObjectInspector(oi0);
+      TypeInfo type1 = TypeInfoUtils.getTypeInfoFromObjectInspector(oi1);
+      if (type0.equals(TypeInfoFactory.stringTypeInfo) &&
+          type1.equals(TypeInfoFactory.stringTypeInfo)) {
+        soi0 = (StringObjectInspector) oi0;
+        soi1 = (StringObjectInspector) oi1;
+        if (soi0.preferWritable() || soi1.preferWritable()) {
+          compareType = CompareType.COMPARE_TEXT;
+        } else {
+          compareType = CompareType.COMPARE_STRING;
+        }
+      } else if (type0.equals(TypeInfoFactory.intTypeInfo) &&
+          type1.equals(TypeInfoFactory.intTypeInfo)) {
+        compareType = CompareType.COMPARE_INT;
+        ioi0 = (IntObjectInspector) oi0;
+        ioi1 = (IntObjectInspector) oi1;
+      } else if (type0.equals(TypeInfoFactory.longTypeInfo) &&
+          type1.equals(TypeInfoFactory.longTypeInfo)) {
+        compareType = CompareType.COMPARE_LONG;
+        loi0 = (LongObjectInspector) oi0;
+        loi1 = (LongObjectInspector) oi1;
+      } else if (type0.equals(TypeInfoFactory.byteTypeInfo) &&
+          type1.equals(TypeInfoFactory.byteTypeInfo)) {
+        compareType = CompareType.COMPARE_BYTE;
+        byoi0 = (ByteObjectInspector) oi0;
+        byoi1 = (ByteObjectInspector) oi1;
+      } else if (type0.equals(TypeInfoFactory.booleanTypeInfo) &&
+          type1.equals(TypeInfoFactory.booleanTypeInfo)) {
+        compareType = CompareType.COMPARE_BOOL;
+        boi0 = (BooleanObjectInspector) oi0;
+        boi1 = (BooleanObjectInspector) oi1;
+      } else {
+        // We don't check compatibility of two object inspectors, but directly
+        // pass them into ObjectInspectorUtils.compare(), users of this class
+        // should make sure ObjectInspectorUtils.compare() doesn't throw exceptions
+        // and returns correct results.
+        compareType = CompareType.OTHER;
+      }
+    }
+
+    public boolean areEqual(Object o0, Object o1) {
+      if (o0 == null && o1 == null) {
+        return true;
+      } else if (o0 == null || o1 == null) {
+        return false;
+      }
+      switch (compareType) {
+      case COMPARE_TEXT:
+        return (soi0.getPrimitiveWritableObject(o0).equals(
+            soi1.getPrimitiveWritableObject(o1)));
+      case COMPARE_INT:
+        return (ioi0.get(o0) == ioi1.get(o1));
+      case COMPARE_LONG:
+        return (loi0.get(o0) == loi1.get(o1));
+      case COMPARE_BYTE:
+        return (byoi0.get(o0) == byoi1.get(o1));
+      case COMPARE_BOOL:
+        return (boi0.get(o0) == boi1.get(o1));
+      case COMPARE_STRING:
+        return (soi0.getPrimitiveJavaObject(o0).equals(
+            soi1.getPrimitiveJavaObject(o1)));
+      default:
+        return (ObjectInspectorUtils.compare(
+            o0, oi0, o1, oi1) == 0);
+      }
+    }
+  }
+
+  FieldComparer[] fieldComparers;
+  int numFields;
+
+  public ListObjectsEqualComparer(ObjectInspector[] oi0, ObjectInspector[] oi1) {
+    if (oi0.length != oi1.length) {
+      throw new RuntimeException("Sizes of two lists of object inspectors don't match.");
+    }
+    numFields = oi0.length;
+    fieldComparers = new FieldComparer[numFields];
+    for (int i = 0; i < oi0.length; i++) {
+      fieldComparers[i] = new FieldComparer(oi0[i], oi1[i]);
+    }
+  }
+
+
+  /**
+   * ol0 and ol1 should have an equal or smaller number of elements than the
+   * object inspectors passed to the constructor.
+   *
+   * @param ol0
+   * @param ol1
+   * @return True if the objects in ol0 and ol1 are all identical
+   */
+  public boolean areEqual(ArrayList<Object> ol0, ArrayList<Object> ol1) {
+    if (ol0.size() != numFields || ol1.size() != numFields) {
+      if (ol0.size() != ol1.size()) {
+        return false;
+      }
+      assert (ol0.size() <= numFields);
+      assert (ol1.size() <= numFields);
+      for (int i = 0; i < Math.min(ol0.size(), ol1.size()); i++) {
+        if (!fieldComparers[i].areEqual(ol0.get(i), ol1.get(i))) {
+          return false;
+        }
+      }
+      return true;
+    }
+
+    for (int i = 0; i < numFields; i++) {
+      if (!fieldComparers[i].areEqual(ol0.get(i), ol1.get(i))) {
+        return false;
+      }
+    }
+    return true;
+  }
+}