Posted to commits@hive.apache.org by ha...@apache.org on 2013/07/02 00:19:35 UTC

svn commit: r1498723 - in /hive/branches/vectorization/ql/src: java/org/apache/hadoop/hive/ql/exec/vector/ java/org/apache/hadoop/hive/ql/io/ java/org/apache/hadoop/hive/ql/io/orc/ test/org/apache/hadoop/hive/ql/exec/vector/

Author: hashutosh
Date: Mon Jul  1 22:19:34 2013
New Revision: 1498723

URL: http://svn.apache.org/r1498723
Log:
HIVE-4544 : VectorizedRowBatchCtx::CreateVectorizedRowBatch should create only the projected columns and not all columns (Sarvesh Sakalanaga via Ashutosh Chauhan)
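
For context, the include list that init() now consumes is Hive's standard read-column projection configuration. A minimal caller-side sketch, assuming the ColumnProjectionUtils API on this branch (appendReadColumnIDs / getReadColumnIDs); the class name and column indexes are illustrative only:

    import java.util.Arrays;
    import java.util.List;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;

    // Illustrative only: shows where colsToInclude comes from.
    public class ProjectionSketch {
      public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Ask readers to materialize only columns 0 and 2; partition
        // columns are never part of this list.
        ColumnProjectionUtils.appendReadColumnIDs(conf, Arrays.asList(0, 2));
        // This is what init() stores into colsToInclude.
        List<Integer> cols = ColumnProjectionUtils.getReadColumnIDs(conf);
        System.out.println("columns to include: " + cols);
      }
    }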

Modified:
    hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatchCtx.java
    hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/io/VectorizedRCFileRecordReader.java
    hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcInputFormat.java
    hive/branches/vectorization/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizedRowBatchCtx.java

Modified: hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatchCtx.java
URL: http://svn.apache.org/viewvc/hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatchCtx.java?rev=1498723&r1=1498722&r2=1498723&view=diff
==============================================================================
--- hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatchCtx.java (original)
+++ hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatchCtx.java Mon Jul  1 22:19:34 2013
@@ -31,6 +31,7 @@ import org.apache.hadoop.hive.ql.io.Hive
 import org.apache.hadoop.hive.ql.io.IOPrepareCache;
 import org.apache.hadoop.hive.ql.metadata.HiveException;
 import org.apache.hadoop.hive.ql.plan.PartitionDesc;
+import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
 import org.apache.hadoop.hive.serde2.Deserializer;
 import org.apache.hadoop.hive.serde2.SerDeException;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
@@ -64,6 +65,10 @@ public class VectorizedRowBatchCtx {
   // Hash map of partition values. Key=TblColName value=PartitionValue
   private LinkedHashMap<String, String> partitionValues;
 
+  // Column projection list - List of column indexes to include. This
+  // list does not contain partition columns
+  private List<Integer> colsToInclude;
+
   /**
    * Constructor for VectorizedRowBatchCtx
    *
@@ -106,7 +111,7 @@ public class VectorizedRowBatchCtx {
    * @throws IllegalAccessException
    * @throws HiveException
    */
-  public void Init(Configuration hiveConf, FileSplit split) throws ClassNotFoundException,
+  public void init(Configuration hiveConf, FileSplit split) throws ClassNotFoundException,
       IOException,
       SerDeException,
       InstantiationException,
@@ -158,7 +163,16 @@ public class VectorizedRowBatchCtx {
       for (int i = 0; i < partKeys.length; i++) {
         String key = partKeys[i];
         partNames.add(key);
-        partitionValues.put(key, partSpec.get(key));
+        if (partSpec == null) {
+          // For a partitionless table, initialize partValue to an empty string.
+          // partSpec can be null even when the table has partition keys, if
+          // only one partition is selected and the partition key is not part
+          // of the projection/include list.
+          partitionValues.put(key, "");
+        } else {
+          partitionValues.put(key, partSpec.get(key));
+        }
+
         partObjectInspectors
             .add(PrimitiveObjectInspectorFactory.writableStringObjectInspector);
       }
@@ -179,6 +193,8 @@ public class VectorizedRowBatchCtx {
       rowOI = partRawRowObjectInspector;
       rawRowOI = partRawRowObjectInspector;
     }
+
+    colsToInclude = ColumnProjectionUtils.getReadColumnIDs(hiveConf);
   }
 
   /**
@@ -187,48 +203,54 @@ public class VectorizedRowBatchCtx {
    * @return VectorizedRowBatch
    * @throws HiveException
    */
-  public VectorizedRowBatch CreateVectorizedRowBatch() throws HiveException
+  public VectorizedRowBatch createVectorizedRowBatch() throws HiveException
   {
     List<? extends StructField> fieldRefs = rowOI.getAllStructFieldRefs();
     VectorizedRowBatch result = new VectorizedRowBatch(fieldRefs.size());
     for (int j = 0; j < fieldRefs.size(); j++) {
-      ObjectInspector foi = fieldRefs.get(j).getFieldObjectInspector();
-      switch (foi.getCategory()) {
-      case PRIMITIVE: {
-        PrimitiveObjectInspector poi = (PrimitiveObjectInspector) foi;
-        // Vectorization currently only supports the following data types:
-        // BOOLEAN, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE, STRING and TIMESTAMP
-        switch (poi.getPrimitiveCategory()) {
-        case BOOLEAN:
-        case BYTE:
-        case SHORT:
-        case INT:
-        case LONG:
-        case TIMESTAMP:
-          result.cols[j] = new LongColumnVector(VectorizedRowBatch.DEFAULT_SIZE);
-          break;
-        case FLOAT:
-        case DOUBLE:
-          result.cols[j] = new DoubleColumnVector(VectorizedRowBatch.DEFAULT_SIZE);
+      // Create the column vector only if the column is in the include list
+      // or is a partition column. Note that partition columns are never
+      // part of the include list.
+      if ((colsToInclude == null) || colsToInclude.contains(j)
+          || ((partitionValues != null) && (partitionValues.get(fieldRefs.get(j).getFieldName()) != null))) {
+        ObjectInspector foi = fieldRefs.get(j).getFieldObjectInspector();
+        switch (foi.getCategory()) {
+        case PRIMITIVE: {
+          PrimitiveObjectInspector poi = (PrimitiveObjectInspector) foi;
+          // Vectorization currently only supports the following data types:
+          // BOOLEAN, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE, STRING and TIMESTAMP
+          switch (poi.getPrimitiveCategory()) {
+          case BOOLEAN:
+          case BYTE:
+          case SHORT:
+          case INT:
+          case LONG:
+          case TIMESTAMP:
+            result.cols[j] = new LongColumnVector(VectorizedRowBatch.DEFAULT_SIZE);
+            break;
+          case FLOAT:
+          case DOUBLE:
+            result.cols[j] = new DoubleColumnVector(VectorizedRowBatch.DEFAULT_SIZE);
+            break;
+          case STRING:
+            result.cols[j] = new BytesColumnVector(VectorizedRowBatch.DEFAULT_SIZE);
+            break;
+          default:
+            throw new RuntimeException("Vectorization is not supported for datatype:"
+                + poi.getPrimitiveCategory());
+          }
           break;
-        case STRING:
-          result.cols[j] = new BytesColumnVector(VectorizedRowBatch.DEFAULT_SIZE);
-          break;
-        default:
-          throw new RuntimeException("Vectorization is not supported for datatype:"
-              + poi.getPrimitiveCategory());
         }
-        break;
-      }
-      case LIST:
-      case MAP:
-      case STRUCT:
-      case UNION:
-        throw new HiveException("Vectorization is not supported for datatype:"
-            + foi.getCategory());
-      default:
-        throw new HiveException("Unknown ObjectInspector category!");
+        case LIST:
+        case MAP:
+        case STRUCT:
+        case UNION:
+          throw new HiveException("Vectorization is not supported for datatype:"
+              + foi.getCategory());
+        default:
+          throw new HiveException("Unknown ObjectInspector category!");
 
+        }
       }
     }
     result.numCols = fieldRefs.size();
@@ -247,7 +269,7 @@ public class VectorizedRowBatchCtx {
    * @throws HiveException
    * @throws SerDeException
    */
-  public void AddRowToBatch(int rowIndex, Writable rowBlob, VectorizedRowBatch batch)
+  public void addRowToBatch(int rowIndex, Writable rowBlob, VectorizedRowBatch batch)
       throws HiveException, SerDeException
   {
     Object row = this.deserializer.deserialize(rowBlob);
@@ -263,7 +285,7 @@ public class VectorizedRowBatchCtx {
    *          Vectorized row batch which contains deserialized data
    * @throws SerDeException
    */
-  public void ConvertRowBatchBlobToVectorizedBatch(Object rowBlob, int rowsInBlob,
+  public void convertRowBatchBlobToVectorizedBatch(Object rowBlob, int rowsInBlob,
       VectorizedRowBatch batch)
       throws SerDeException {
 
@@ -275,7 +297,7 @@ public class VectorizedRowBatchCtx {
     }
   }
 
-  private int GetColIndexBasedOnColName(String colName) throws HiveException
+  private int getColIndexBasedOnColName(String colName) throws HiveException
   {
     List<? extends StructField> fieldRefs = rowOI.getAllStructFieldRefs();
     for (int i = 0; i < fieldRefs.size(); i++) {
@@ -292,14 +314,14 @@ public class VectorizedRowBatchCtx {
    * @param batch
    * @throws HiveException
    */
-  public void AddPartitionColsToBatch(VectorizedRowBatch batch) throws HiveException
+  public void addPartitionColsToBatch(VectorizedRowBatch batch) throws HiveException
   {
     int colIndex;
     String value;
     BytesColumnVector bcv;
     if (partitionValues != null) {
       for (String key : partitionValues.keySet()) {
-        colIndex = GetColIndexBasedOnColName(key);
+        colIndex = getColIndexBasedOnColName(key);
         value = partitionValues.get(key);
         bcv = (BytesColumnVector) batch.cols[colIndex];
         bcv.setRef(0, value.getBytes(), 0, value.length());
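
One consequence worth noting for consumers (a sketch, not part of the commit): createVectorizedRowBatch() now leaves cols[j] null for any column that is neither in the include list nor a partition column, so code that walks a batch has to skip the pruned slots:

    import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;

    // Illustrative consumer loop: with a projection in effect, cols[j]
    // stays null for pruned columns, so they must be skipped.
    public class BatchScanSketch {
      static void scan(VectorizedRowBatch batch) {
        for (int j = 0; j < batch.numCols; j++) {
          if (batch.cols[j] == null) {
            continue; // column pruned by the projection
          }
          // ... operate on batch.cols[j] ...
        }
      }
    }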

Modified: hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/io/VectorizedRCFileRecordReader.java
URL: http://svn.apache.org/viewvc/hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/io/VectorizedRCFileRecordReader.java?rev=1498723&r1=1498722&r2=1498723&view=diff
==============================================================================
--- hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/io/VectorizedRCFileRecordReader.java (original)
+++ hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/io/VectorizedRCFileRecordReader.java Mon Jul  1 22:19:34 2013
@@ -126,7 +126,7 @@ public class VectorizedRCFileRecordReade
     more = start < end;
     try {
       rbCtx = new VectorizedRowBatchCtx();
-      rbCtx.Init(conf, split);
+      rbCtx.init(conf, split);
     } catch (Exception e) {
       throw new RuntimeException(e);
     }
@@ -149,7 +149,7 @@ public class VectorizedRCFileRecordReade
   public VectorizedRowBatch createValue() {
     VectorizedRowBatch result = null;
     try {
-      result = rbCtx.CreateVectorizedRowBatch();
+      result = rbCtx.createVectorizedRowBatch();
     } catch (HiveException e) {
       new RuntimeException("Error creating a batch", e);
     }
@@ -181,13 +181,13 @@ public class VectorizedRCFileRecordReade
           // CombineHiveRecordReader and as this does not call CreateValue() for
           // each new RecordReader it creates, this check is required in next()
           if (addPartitionCols) {
-            rbCtx.AddPartitionColsToBatch(value);
+            rbCtx.addPartitionColsToBatch(value);
             addPartitionCols = false;
           }
           in.getCurrentRow(colsCache);
           // Currently RCFile reader does not support reading vectorized
           // data. Populating the batch by adding one row at a time.
-          rbCtx.AddRowToBatch(i, (Writable) colsCache, value);
+          rbCtx.addRowToBatch(i, (Writable) colsCache, value);
         } else {
           break;
         }
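
The addPartitionCols flag pattern above (shared with the ORC reader below), reduced to a minimal sketch with hypothetical class and field names: partition columns are attached lazily in next() because CombineHiveRecordReader reuses the value object across the record readers it chains without calling createValue() again:

    import java.io.IOException;

    import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
    import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx;
    import org.apache.hadoop.hive.ql.metadata.HiveException;
    import org.apache.hadoop.io.NullWritable;

    // Hypothetical reader skeleton, reduced to the partition-column guard.
    public class LazyPartitionColsSketch {
      private final VectorizedRowBatchCtx rbCtx = new VectorizedRowBatchCtx();
      // rbCtx.init(conf, split) would be called in the constructor; elided here.
      private boolean addPartitionCols = true;

      public boolean next(NullWritable key, VectorizedRowBatch value) throws IOException {
        // CombineHiveRecordReader does not call createValue() for each
        // reader it creates, so the partition columns are (re)attached on
        // the first next() call of each reader instead.
        if (addPartitionCols) {
          try {
            rbCtx.addPartitionColsToBatch(value);
          } catch (HiveException e) {
            throw new IOException(e);
          }
          addPartitionCols = false;
        }
        // ... populate the batch from the underlying file reader ...
        return false;
      }
    }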

Modified: hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcInputFormat.java
URL: http://svn.apache.org/viewvc/hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcInputFormat.java?rev=1498723&r1=1498722&r2=1498723&view=diff
==============================================================================
--- hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcInputFormat.java (original)
+++ hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcInputFormat.java Mon Jul  1 22:19:34 2013
@@ -66,7 +66,7 @@ public class VectorizedOrcInputFormat ex
 
       try {
         rbCtx = new VectorizedRowBatchCtx();
-        rbCtx.Init(conf, fileSplit);
+        rbCtx.init(conf, fileSplit);
       } catch (Exception e) {
         throw new RuntimeException(e);
       }
@@ -85,13 +85,13 @@ public class VectorizedOrcInputFormat ex
         // as this does not call CreateValue for each new RecordReader it creates, this check is
         // required in next()
         if (addPartitionCols) {
-          rbCtx.AddPartitionColsToBatch(value);
+          rbCtx.addPartitionColsToBatch(value);
           addPartitionCols = false;
         }
         reader.nextBatch(value);
-        rbCtx.ConvertRowBatchBlobToVectorizedBatch((Object) value, value.size, value);
+        rbCtx.convertRowBatchBlobToVectorizedBatch((Object) value, value.size, value);
       } catch (Exception e) {
-        new RuntimeException(e);
+        throw new RuntimeException(e);
       }
       progress = reader.getProgress();
       return true;
@@ -106,9 +106,9 @@ public class VectorizedOrcInputFormat ex
     public VectorizedRowBatch createValue() {
       VectorizedRowBatch result = null;
       try {
-        result = rbCtx.CreateVectorizedRowBatch();
+        result = rbCtx.createVectorizedRowBatch();
       } catch (HiveException e) {
-        new RuntimeException("Error creating a batch", e);
+        throw new RuntimeException("Error creating a batch", e);
       }
       return result;
     }
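
The two exception hunks above fix a genuine bug rather than style: a RuntimeException that is constructed but never thrown silently discards the failure (the same un-thrown pattern is still visible as unchanged context in createValue() of VectorizedRCFileRecordReader above). A self-contained illustration:

    // Illustrative only: why "new RuntimeException(e);" without "throw" is a bug.
    public class SwallowedExceptionSketch {
      static void mightFail() throws Exception {
        throw new Exception("boom");
      }

      public static void main(String[] args) {
        try {
          mightFail();
        } catch (Exception e) {
          new RuntimeException(e); // BUG: created but never thrown; the error is lost
        }
        System.out.println("still running: the failure above was swallowed");

        try {
          mightFail();
        } catch (Exception e) {
          throw new RuntimeException(e); // fixed form: the failure propagates
        }
      }
    }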

Modified: hive/branches/vectorization/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizedRowBatchCtx.java
URL: http://svn.apache.org/viewvc/hive/branches/vectorization/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizedRowBatchCtx.java?rev=1498723&r1=1498722&r2=1498723&view=diff
==============================================================================
--- hive/branches/vectorization/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizedRowBatchCtx.java (original)
+++ hive/branches/vectorization/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizedRowBatchCtx.java Mon Jul  1 22:19:34 2013
@@ -203,7 +203,7 @@ public class TestVectorizedRowBatchCtx {
 
     // Create the context
     VectorizedRowBatchCtx ctx = new VectorizedRowBatchCtx(oi, oi, serDe, null);
-    VectorizedRowBatch batch = ctx.CreateVectorizedRowBatch();
+    VectorizedRowBatch batch = ctx.createVectorizedRowBatch();
     VectorizedBatchUtil.SetNoNullFields(true, batch);
 
     // Iterate thru the rows and populate the batch
@@ -213,7 +213,7 @@ public class TestVectorizedRowBatchCtx {
       BytesRefArrayWritable cols = new BytesRefArrayWritable();
       reader.getCurrentRow(cols);
       cols.resetValid(colCount);
-      ctx.AddRowToBatch(i, cols, batch);
+      ctx.addRowToBatch(i, cols, batch);
     }
     reader.close();
     batch.size = 10;