You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by na...@apache.org on 2010/11/04 22:33:36 UTC

svn commit: r1031264 - in /hive/trunk: ./ ql/src/java/org/apache/hadoop/hive/ql/exec/ serde/src/java/org/apache/hadoop/hive/serde2/columnar/ serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/

Author: namit
Date: Thu Nov  4 21:33:35 2010
New Revision: 1031264

URL: http://svn.apache.org/viewvc?rev=1031264&view=rev
Log:
HIVE-1751. Optimize ColumnarStructObjectInspector.getStructFieldData()
(Siying Dong via namit)


Modified:
    hive/trunk/CHANGES.txt
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeColumnEvaluator.java
    hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/columnar/ColumnarSerDe.java
    hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/columnar/ColumnarStruct.java
    hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ColumnarStructObjectInspector.java

Modified: hive/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hive/trunk/CHANGES.txt?rev=1031264&r1=1031263&r2=1031264&view=diff
==============================================================================
--- hive/trunk/CHANGES.txt (original)
+++ hive/trunk/CHANGES.txt Thu Nov  4 21:33:35 2010
@@ -217,6 +217,9 @@ Trunk -  Unreleased
     HIVE-1761. Support show locks for a particular table
     (namit via He Yongqiang)
 
+    HIVE-1751. Optimize ColumnarStructObjectInspector.getStructFieldData()
+    (Siying Dong via namit)
+
   OPTIMIZATIONS
 
   BUG FIXES

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeColumnEvaluator.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeColumnEvaluator.java?rev=1031264&r1=1031263&r2=1031264&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeColumnEvaluator.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeColumnEvaluator.java Thu Nov  4 21:33:35 2010
@@ -33,6 +33,9 @@ public class ExprNodeColumnEvaluator ext
 
   protected ExprNodeColumnDesc expr;
 
+  transient boolean simpleCase;
+  transient StructObjectInspector inspector;
+  transient StructField field;
   transient StructObjectInspector[] inspectors;
   transient StructField[] fields;
   transient boolean[] unionField;
@@ -47,40 +50,53 @@ public class ExprNodeColumnEvaluator ext
     // We need to support field names like KEY.0, VALUE.1 between
     // map-reduce boundary.
     String[] names = expr.getColumn().split("\\.");
-    inspectors = new StructObjectInspector[names.length];
-    fields = new StructField[names.length];
-    unionField = new boolean[names.length];
-    int unionIndex = -1;
-
-    for (int i = 0; i < names.length; i++) {
-      if (i == 0) {
-        inspectors[0] = (StructObjectInspector) rowInspector;
-      } else {
-        if (unionIndex != -1) {
-          inspectors[i] = (StructObjectInspector) (
-            (UnionObjectInspector)fields[i-1].getFieldObjectInspector()).
-            getObjectInspectors().get(unionIndex);
+    String[] unionfields = names[0].split("\\:");
+    if (names.length == 1 && unionfields.length == 1) {
+      simpleCase = true;
+      inspector = (StructObjectInspector) rowInspector;
+      field = inspector.getStructFieldRef(names[0]);
+      return field.getFieldObjectInspector();
+    }
+    else {
+      simpleCase = false;
+      inspectors = new StructObjectInspector[names.length];
+      fields = new StructField[names.length];
+      unionField = new boolean[names.length];
+      int unionIndex = -1;
+
+      for (int i = 0; i < names.length; i++) {
+        if (i == 0) {
+          inspectors[0] = (StructObjectInspector) rowInspector;
         } else {
-          inspectors[i] = (StructObjectInspector) fields[i - 1]
-            .getFieldObjectInspector();
+          if (unionIndex != -1) {
+            inspectors[i] = (StructObjectInspector) (
+              (UnionObjectInspector)fields[i-1].getFieldObjectInspector()).
+              getObjectInspectors().get(unionIndex);
+          } else {
+            inspectors[i] = (StructObjectInspector) fields[i - 1]
+              .getFieldObjectInspector();
+	  }
+        }
+	// to support names like _colx:1._coly
+        unionfields = names[i].split("\\:");
+        fields[i] = inspectors[i].getStructFieldRef(unionfields[0]);
+        if (unionfields.length > 1) {
+          unionIndex = Integer.parseInt(unionfields[1]);
+          unionField[i] = true;
+        } else {
+          unionIndex = -1;
+          unionField[i] = false;
         }
       }
-      // to support names like _colx:1._coly
-      String[] unionfields = names[i].split("\\:");
-      fields[i] = inspectors[i].getStructFieldRef(unionfields[0]);
-      if (unionfields.length > 1) {
-        unionIndex = Integer.parseInt(unionfields[1]);
-        unionField[i] = true;
-      } else {
-        unionIndex = -1;
-        unionField[i] = false;
-      }
+      return fields[names.length - 1].getFieldObjectInspector();
     }
-    return fields[names.length - 1].getFieldObjectInspector();
   }
 
   @Override
   public Object evaluate(Object row) throws HiveException {
+    if (simpleCase) {
+      return inspector.getStructFieldData(row, field);
+    }
     Object o = row;
     for (int i = 0; i < fields.length; i++) {
       o = inspectors[i].getStructFieldData(o, fields[i]);

Modified: hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/columnar/ColumnarSerDe.java
URL: http://svn.apache.org/viewvc/hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/columnar/ColumnarSerDe.java?rev=1031264&r1=1031263&r2=1031264&view=diff
==============================================================================
--- hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/columnar/ColumnarSerDe.java (original)
+++ hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/columnar/ColumnarSerDe.java Thu Nov  4 21:33:35 2010
@@ -94,7 +94,7 @@ public class ColumnarSerDe implements Se
 
     java.util.ArrayList<Integer> notSkipIDs = ColumnProjectionUtils.getReadColumnIDs(job);
 
-    cachedLazyStruct = new ColumnarStruct(cachedObjectInspector, notSkipIDs);
+    cachedLazyStruct = new ColumnarStruct(cachedObjectInspector, notSkipIDs, serdeParams.getNullSequence());
 
     int size = serdeParams.getColumnTypes().size();
     field = new BytesRefWritable[size];

Modified: hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/columnar/ColumnarStruct.java
URL: http://svn.apache.org/viewvc/hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/columnar/ColumnarStruct.java?rev=1031264&r1=1031263&r2=1031264&view=diff
==============================================================================
--- hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/columnar/ColumnarStruct.java (original)
+++ hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/columnar/ColumnarStruct.java Thu Nov  4 21:33:35 2010
@@ -38,49 +38,49 @@ import org.apache.hadoop.io.Text;
  * Object get parsed at its initialize time when call
  * {@link #init(BytesRefArrayWritable cols)}, while LazyStruct parse fields in a
  * lazy way.
- * 
+ *
  */
 public class ColumnarStruct {
 
-  /**
-   * The fields of the struct.
-   */
-  LazyObject[] fields;
-
   private static final Log LOG = LogFactory.getLog(ColumnarStruct.class);
 
   int[] prjColIDs = null; // list of projected column IDs
 
+  Text nullSequence;
+  int lengthNullSequence;
+
   /**
    * Construct a ColumnarStruct object with the TypeInfo. It creates the first
    * level object at the first place
-   * 
+   *
    * @param oi
    *          the ObjectInspector representing the type of this LazyStruct.
    */
   public ColumnarStruct(ObjectInspector oi) {
-    this(oi, null);
+    this(oi, null, null);
   }
 
   /**
    * Construct a ColumnarStruct object with the TypeInfo. It creates the first
    * level object at the first place
-   * 
+   *
    * @param oi
    *          the ObjectInspector representing the type of this LazyStruct.
    * @param notSkippedColumnIDs
    *          the column ids that should not be skipped
    */
   public ColumnarStruct(ObjectInspector oi,
-      ArrayList<Integer> notSkippedColumnIDs) {
+      ArrayList<Integer> notSkippedColumnIDs, Text nullSequence) {
     List<? extends StructField> fieldRefs = ((StructObjectInspector) oi)
         .getAllStructFieldRefs();
     int num = fieldRefs.size();
-    fields = new LazyObject[num];
-    cachedByteArrayRef = new ByteArrayRef[num];
-    rawBytesField = new BytesRefWritable[num];
-    fieldSkipped = new boolean[num];
-    inited = new boolean[num];
+
+    fieldInfoList = new FieldInfo[num];
+
+    if (nullSequence != null) {
+      this.nullSequence = nullSequence;
+      this.lengthNullSequence = nullSequence.getLength();
+    }
 
     // if no columns is set to be skipped, add all columns in
     // 'notSkippedColumnIDs'
@@ -91,15 +91,10 @@ public class ColumnarStruct {
     }
 
     for (int i = 0; i < num; i++) {
-      fields[i] = LazyFactory.createLazyObject(fieldRefs.get(i)
-          .getFieldObjectInspector());
-      cachedByteArrayRef[i] = new ByteArrayRef();
-      if (!notSkippedColumnIDs.contains(i)) {
-        fieldSkipped[i] = true;
-        inited[i] = true;
-      } else {
-        inited[i] = false;
-      }
+      fieldInfoList[i] = new FieldInfo(
+          LazyFactory.createLazyObject(fieldRefs.get(i)
+          .getFieldObjectInspector()),
+          !notSkippedColumnIDs.contains(i));
     }
 
     // maintain a list of non-NULL column IDs
@@ -117,73 +112,110 @@ public class ColumnarStruct {
 
   /**
    * Get one field out of the struct.
-   * 
+   *
    * If the field is a primitive field, return the actual object. Otherwise
    * return the LazyObject. This is because PrimitiveObjectInspector does not
    * have control over the object used by the user - the user simply directly
    * use the Object instead of going through Object
    * PrimitiveObjectInspector.get(Object).
-   * 
+   *
    * NOTE: separator and nullSequence has to be the same each time this method
    * is called. These two parameters are used only once to parse each record.
-   * 
+   *
    * @param fieldID
    *          The field ID
    * @param nullSequence
    *          The sequence for null value
    * @return The field as a LazyObject
    */
-  public Object getField(int fieldID, Text nullSequence) {
-    return uncheckedGetField(fieldID, nullSequence);
+  public Object getField(int fieldID) {
+    return fieldInfoList[fieldID].uncheckedGetField();
   }
 
-  /*
-   * use an array instead of only one object in case in future hive does not do
-   * the byte copy.
-   */
-  ByteArrayRef[] cachedByteArrayRef = null;
-  BytesRefWritable[] rawBytesField = null;
-  boolean[] inited = null;
-  boolean[] fieldSkipped = null;
-
-  /**
-   * Get the field out of the row without checking parsed. This is called by
-   * both getField and getFieldsAsList.
-   * 
-   * @param fieldID
-   *          The id of the field starting from 0.
-   * @param nullSequence
-   *          The sequence representing NULL value.
-   * @return The value of the field
-   */
-  protected Object uncheckedGetField(int fieldID, Text nullSequence) {
-    if (fieldSkipped[fieldID]) {
-      return null;
-    }
-    if (!inited[fieldID]) {
-      BytesRefWritable passedInField = rawBytesField[fieldID];
-      try {
-        cachedByteArrayRef[fieldID].setData(passedInField.getData());
-      } catch (IOException e) {
-        throw new RuntimeException(e);
+  class FieldInfo {
+    LazyObject field;
+    /*
+     * use an array instead of only one object in case in future hive does not do
+     * the byte copy.
+     */
+    ByteArrayRef cachedByteArrayRef;
+    BytesRefWritable rawBytesField;
+    boolean inited;
+    boolean fieldSkipped;
+
+    public FieldInfo(LazyObject lazyObject, boolean fieldSkipped) {
+      field = lazyObject;
+      cachedByteArrayRef = new ByteArrayRef();
+      if (fieldSkipped) {
+        this.fieldSkipped = true;
+        inited = true;
+      } else {
+        inited = false;
       }
-      fields[fieldID].init(cachedByteArrayRef[fieldID], passedInField
-          .getStart(), passedInField.getLength());
-      inited[fieldID] = true;
     }
 
-    byte[] data = cachedByteArrayRef[fieldID].getData();
-    int fieldLen = rawBytesField[fieldID].length;
+    /*
+     * ============================ [PERF] ===================================
+     * This function is called for every row. Setting up the selected/projected
+     * columns at the first call, and don't do that for the following calls.
+     * Ideally this should be done in the constructor where we don't need to
+     * branch in the function for each row.
+     * =========================================================================
+     */
+    public void init(BytesRefWritable col) {
+        if (col != null) {
+          rawBytesField= col;
+          inited = false;
+        } else {
+          // select columns that actually do not exist in the file.
+          fieldSkipped = true;
+        }
+    }
+
+    /**
+     * Get the field out of the row without checking parsed. This is called by
+     * both getField and getFieldsAsList.
+     *
+     * @param fieldID
+     *          The id of the field starting from 0.
+     * @param nullSequence
+     *          The sequence representing NULL value.
+     * @return The value of the field
+     */
+    protected Object uncheckedGetField() {
+      if (fieldSkipped) {
+        return null;
+      }
+      if (!inited) {
+        try {
+          cachedByteArrayRef.setData(rawBytesField.getData());
+        } catch (IOException e) {
+          throw new RuntimeException(e);
+        }
+        field.init(cachedByteArrayRef, rawBytesField
+            .getStart(), rawBytesField.getLength());
+        inited = true;
+      }
 
-    if (fieldLen == nullSequence.getLength()
-        && LazyUtils.compare(data, rawBytesField[fieldID].getStart(), fieldLen,
-        nullSequence.getBytes(), 0, nullSequence.getLength()) == 0) {
-      return null;
-    }
 
-    return fields[fieldID].getObject();
+      int fieldLen = rawBytesField.length;
+      if (fieldLen == lengthNullSequence) {
+        byte[] data = cachedByteArrayRef.getData();
+
+        if (LazyUtils.compare(data, rawBytesField.getStart(), fieldLen,
+            nullSequence.getBytes(), 0, lengthNullSequence) == 0) {
+          return null;
+        }
+      }
+
+      return field.getObject();
+
+    }
   }
 
+  FieldInfo[] fieldInfoList = null;
+
+
   /*
    * ============================ [PERF] ===================================
    * This function is called for every row. Setting up the selected/projected
@@ -196,11 +228,10 @@ public class ColumnarStruct {
     for (int i = 0; i < prjColIDs.length; ++i) {
       int fieldIndex = prjColIDs[i];
       if (fieldIndex < cols.size()) {
-        rawBytesField[fieldIndex] = cols.unCheckedGet(fieldIndex);
-        inited[fieldIndex] = false;
+        fieldInfoList[fieldIndex].init(cols.unCheckedGet(fieldIndex));
       } else {
         // select columns that actually do not exist in the file.
-        fieldSkipped[fieldIndex] = true;
+        fieldInfoList[fieldIndex].init(null);
       }
     }
   }
@@ -209,19 +240,19 @@ public class ColumnarStruct {
 
   /**
    * Get the values of the fields as an ArrayList.
-   * 
+   *
    * @param nullSequence
    *          The sequence for the NULL value
    * @return The values of the fields as an ArrayList.
    */
-  public ArrayList<Object> getFieldsAsList(Text nullSequence) {
+  public ArrayList<Object> getFieldsAsList() {
     if (cachedList == null) {
       cachedList = new ArrayList<Object>();
     } else {
       cachedList.clear();
     }
-    for (int i = 0; i < fields.length; i++) {
-      cachedList.add(uncheckedGetField(i, nullSequence));
+    for (int i = 0; i < fieldInfoList.length; i++) {
+      cachedList.add(fieldInfoList[i].uncheckedGetField());
     }
     return cachedList;
   }

Modified: hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ColumnarStructObjectInspector.java
URL: http://svn.apache.org/viewvc/hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ColumnarStructObjectInspector.java?rev=1031264&r1=1031263&r2=1031264&view=diff
==============================================================================
--- hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ColumnarStructObjectInspector.java (original)
+++ hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ColumnarStructObjectInspector.java Thu Nov  4 21:33:35 2010
@@ -29,10 +29,10 @@ import org.apache.hadoop.io.Text;
 /**
  * ColumnarStructObjectInspector works on struct data that is stored in
  * ColumnarStruct.
- * 
+ *
  * The names of the struct fields and the internal structure of the struct
  * fields are specified in the ctor of the ColumnarStructObjectInspector.
- * 
+ *
  * Always use the ObjectInspectorFactory to create new ObjectInspector objects,
  * instead of directly creating an instance of this class.
  */
@@ -144,7 +144,7 @@ class ColumnarStructObjectInspector exte
     int fieldID = f.getFieldID();
     assert (fieldID >= 0 && fieldID < fields.size());
 
-    return struct.getField(fieldID, nullSequence);
+    return struct.getField(fieldID);
   }
 
   @Override
@@ -153,6 +153,6 @@ class ColumnarStructObjectInspector exte
       return null;
     }
     ColumnarStruct struct = (ColumnarStruct) data;
-    return struct.getFieldsAsList(nullSequence);
+    return struct.getFieldsAsList();
   }
 }