Posted to commits@hive.apache.org by br...@apache.org on 2014/10/06 06:00:54 UTC

svn commit: r1629563 [7/33] - in /hive/branches/spark: ./ accumulo-handler/ beeline/ beeline/src/java/org/apache/hive/beeline/ bin/ bin/ext/ common/ common/src/java/org/apache/hadoop/hive/conf/ common/src/test/org/apache/hadoop/hive/common/type/ contri...

Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/HiveSplitGenerator.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/HiveSplitGenerator.java?rev=1629563&r1=1629562&r2=1629563&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/HiveSplitGenerator.java (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/HiveSplitGenerator.java Mon Oct  6 04:00:39 2014
@@ -38,8 +38,9 @@ import org.apache.hadoop.mapred.JobConf;
 import org.apache.hadoop.mapreduce.split.TezMapReduceSplitsGrouper;
 import org.apache.hadoop.util.ReflectionUtils;
 import org.apache.tez.common.TezUtils;
-import org.apache.tez.dag.api.VertexLocationHint;
 import org.apache.tez.dag.api.TaskLocationHint;
+import org.apache.tez.dag.api.VertexLocationHint;
+import org.apache.tez.dag.api.event.VertexStateUpdate;
 import org.apache.tez.mapreduce.hadoop.InputSplitInfoMem;
 import org.apache.tez.mapreduce.hadoop.MRInputHelpers;
 import org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRInputUserPayloadProto;
@@ -152,8 +153,21 @@ public class HiveSplitGenerator extends 
   public static Multimap<Integer, InputSplit> generateGroupedSplits(JobConf jobConf,
       Configuration conf, InputSplit[] splits, float waves, int availableSlots)
       throws Exception {
+    return generateGroupedSplits(jobConf, conf, splits, waves, availableSlots, null);
+  }
 
-    MapWork work = Utilities.getMapWork(jobConf);
+  public static Multimap<Integer, InputSplit> generateGroupedSplits(JobConf jobConf,
+      Configuration conf, InputSplit[] splits, float waves, int availableSlots,
+      String inputName) throws Exception {
+
+    MapWork work = null;
+    if (inputName != null) {
+      work = (MapWork) Utilities.getMergeWork(jobConf, inputName);
+      // work can still be null if there is no merge work for this input
+    }
+    if (work == null) {
+      work = Utilities.getMapWork(jobConf);
+    }
 
     Multimap<Integer, InputSplit> bucketSplitMultiMap =
         ArrayListMultimap.<Integer, InputSplit> create();
@@ -230,9 +244,14 @@ public class HiveSplitGenerator extends 
   }
 
   @Override
+  public void onVertexStateUpdated(VertexStateUpdate stateUpdate) {
+    pruner.processVertex(stateUpdate.getVertexName());
+  }
+
+  @Override
   public void handleInputInitializerEvent(List<InputInitializerEvent> events) throws Exception {
     for (InputInitializerEvent e : events) {
-      pruner.getQueue().put(e);
+      pruner.addEvent(e);
     }
   }
 }

Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MapRecordProcessor.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MapRecordProcessor.java?rev=1629563&r1=1629562&r2=1629563&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MapRecordProcessor.java (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MapRecordProcessor.java Mon Oct  6 04:00:39 2014
@@ -17,14 +17,20 @@
  */
 package org.apache.hadoop.hive.ql.exec.tez;
 
-import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.Map.Entry;
+import java.util.TreeMap;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hive.ql.exec.DummyStoreOperator;
 import org.apache.hadoop.hive.ql.exec.HashTableDummyOperator;
 import org.apache.hadoop.hive.ql.exec.MapOperator;
 import org.apache.hadoop.hive.ql.exec.MapredContext;
@@ -36,15 +42,17 @@ import org.apache.hadoop.hive.ql.exec.Ut
 import org.apache.hadoop.hive.ql.exec.mr.ExecMapper.ReportStats;
 import org.apache.hadoop.hive.ql.exec.mr.ExecMapperContext;
 import org.apache.hadoop.hive.ql.exec.tez.TezProcessor.TezKVOutputCollector;
+import org.apache.hadoop.hive.ql.exec.tez.tools.KeyValueInputMerger;
 import org.apache.hadoop.hive.ql.exec.vector.VectorMapOperator;
+import org.apache.hadoop.hive.ql.io.IOContext;
 import org.apache.hadoop.hive.ql.log.PerfLogger;
 import org.apache.hadoop.hive.ql.plan.MapWork;
 import org.apache.hadoop.hive.ql.plan.OperatorDesc;
-import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.mapred.JobConf;
-import org.apache.hadoop.util.StringUtils;
 import org.apache.tez.mapreduce.input.MRInputLegacy;
+import org.apache.tez.mapreduce.input.MultiMRInput;
 import org.apache.tez.mapreduce.processor.MRTaskReporter;
+import org.apache.tez.runtime.api.Input;
 import org.apache.tez.runtime.api.LogicalInput;
 import org.apache.tez.runtime.api.LogicalOutput;
 import org.apache.tez.runtime.api.ProcessorContext;
@@ -58,27 +66,61 @@ public class MapRecordProcessor extends 
 
 
   private MapOperator mapOp;
+  private final List<MapOperator> mergeMapOpList = new ArrayList<MapOperator>();
   public static final Log l4j = LogFactory.getLog(MapRecordProcessor.class);
-  private final ExecMapperContext execContext = new ExecMapperContext();
+  private MapRecordSource[] sources;
+  private final Map<String, MultiMRInput> multiMRInputMap = new HashMap<String, MultiMRInput>();
+  private int position = 0;
+  private boolean foundCachedMergeWork = false;
+  MRInputLegacy legacyMRInput = null;
+  private ExecMapperContext execContext = null;
   private boolean abort = false;
   protected static final String MAP_PLAN_KEY = "__MAP_PLAN__";
   private MapWork mapWork;
+  List<MapWork> mergeWorkList = null;
+  private static Map<Integer, DummyStoreOperator> connectOps =
+      new TreeMap<Integer, DummyStoreOperator>();
 
-  public MapRecordProcessor(JobConf jconf) {
+  public MapRecordProcessor(JobConf jconf) throws Exception {
     ObjectCache cache = ObjectCacheFactory.getCache(jconf);
+    execContext = new ExecMapperContext(jconf);
     execContext.setJc(jconf);
     // create map and fetch operators
     mapWork = (MapWork) cache.retrieve(MAP_PLAN_KEY);
     if (mapWork == null) {
       mapWork = Utilities.getMapWork(jconf);
       cache.cache(MAP_PLAN_KEY, mapWork);
-      l4j.info("Plan: "+mapWork);
+      l4j.debug("Plan: " + mapWork);
       for (String s: mapWork.getAliases()) {
-        l4j.info("Alias: "+s);
+        l4j.debug("Alias: " + s);
       }
     } else {
       Utilities.setMapWork(jconf, mapWork);
     }
+
+    String prefixes = jconf.get(DagUtils.TEZ_MERGE_WORK_FILE_PREFIXES);
+    if (prefixes != null) {
+      mergeWorkList = new ArrayList<MapWork>();
+      for (String prefix : prefixes.split(",")) {
+        MapWork mergeMapWork = (MapWork) cache.retrieve(prefix);
+        if (mergeMapWork != null) {
+          l4j.info("Found merge work in cache");
+          foundCachedMergeWork = true;
+          mergeWorkList.add(mergeMapWork);
+          continue;
+        }
+        if (foundCachedMergeWork) {
+          throw new Exception(
+              "Should find all work in cache else operator pipeline will be in non-deterministic state");
+        }
+
+        if ((prefix != null) && (prefix.isEmpty() == false)) {
+          mergeMapWork = (MapWork) Utilities.getMergeWork(jconf, prefix);
+          mergeWorkList.add(mergeMapWork);
+          cache.cache(prefix, mergeMapWork);
+        }
+      }
+    }
   }
 
   @Override
@@ -88,8 +130,8 @@ public class MapRecordProcessor extends 
     super.init(jconf, processorContext, mrReporter, inputs, outputs);
 
     //Update JobConf using MRInput, info like filename comes via this
-    MRInputLegacy mrInput = TezProcessor.getMRInput(inputs);
-    Configuration updatedConf = mrInput.getConfigUpdates();
+    legacyMRInput = getMRInput(inputs);
+    Configuration updatedConf = legacyMRInput.getConfigUpdates();
     if (updatedConf != null) {
       for (Entry<String, String> entry : updatedConf) {
         jconf.set(entry.getKey(), entry.getValue());
@@ -99,20 +141,52 @@ public class MapRecordProcessor extends 
     createOutputMap();
     // Start all the Outputs.
     for (Entry<String, LogicalOutput> outputEntry : outputs.entrySet()) {
-      l4j.info("Starting Output: " + outputEntry.getKey());
+      l4j.debug("Starting Output: " + outputEntry.getKey());
       outputEntry.getValue().start();
       ((TezKVOutputCollector) outMap.get(outputEntry.getKey())).initialize();
     }
 
     try {
+
       if (mapWork.getVectorMode()) {
         mapOp = new VectorMapOperator();
       } else {
         mapOp = new MapOperator();
       }
 
+      connectOps.clear();
+      if (mergeWorkList != null) {
+        MapOperator mergeMapOp = null;
+        for (MapWork mergeMapWork : mergeWorkList) {
+          processorContext.waitForAnyInputReady(Collections.singletonList((Input) (inputs
+              .get(mergeMapWork.getName()))));
+          if (mergeMapWork.getVectorMode()) {
+            mergeMapOp = new VectorMapOperator();
+          } else {
+            mergeMapOp = new MapOperator();
+          }
+
+          mergeMapOpList.add(mergeMapOp);
+          // initialize the merge operators first.
+          if (mergeMapOp != null) {
+            mergeMapOp.setConf(mergeMapWork);
+            l4j.info("Input name is " + mergeMapWork.getName());
+            jconf.set(Utilities.INPUT_NAME, mergeMapWork.getName());
+            mergeMapOp.setChildren(jconf);
+            if (foundCachedMergeWork == false) {
+              DummyStoreOperator dummyOp = getJoinParentOp(mergeMapOp);
+              connectOps.put(mergeMapWork.getTag(), dummyOp);
+            }
+            mergeMapOp.setExecContext(new ExecMapperContext(jconf));
+            mergeMapOp.initializeLocalWork(jconf);
+          }
+        }
+      }
+
       // initialize map operator
       mapOp.setConf(mapWork);
+      l4j.info("Main input name is " + mapWork.getName());
+      jconf.set(Utilities.INPUT_NAME, mapWork.getName());
       mapOp.setChildren(jconf);
       l4j.info(mapOp.dump(0));
 
@@ -121,12 +195,21 @@ public class MapRecordProcessor extends 
       ((TezContext) MapredContext.get()).setTezProcessorContext(processorContext);
       mapOp.setExecContext(execContext);
       mapOp.initializeLocalWork(jconf);
+
+      initializeMapRecordSources();
       mapOp.initialize(jconf, null);
+      if ((mergeMapOpList != null) && mergeMapOpList.isEmpty() == false) {
+        for (MapOperator mergeMapOp : mergeMapOpList) {
+          jconf.set(Utilities.INPUT_NAME, mergeMapOp.getConf().getName());
+          mergeMapOp.initialize(jconf, null);
+        }
+      }
 
       // Initialization isn't finished until all parents of all operators
       // are initialized. For broadcast joins that means initializing the
       // dummy parent operators as well.
       List<HashTableDummyOperator> dummyOps = mapWork.getDummyOps();
+      jconf.set(Utilities.INPUT_NAME, mapWork.getName());
       if (dummyOps != null) {
         for (Operator<? extends OperatorDesc> dummyOp : dummyOps){
           dummyOp.setExecContext(execContext);
@@ -151,54 +234,46 @@ public class MapRecordProcessor extends 
     perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.TEZ_INIT_OPERATORS);
   }
 
-  @Override
-  void run() throws IOException{
-
-    MRInputLegacy in = TezProcessor.getMRInput(inputs);
-    KeyValueReader reader = in.getReader();
+  private void initializeMapRecordSources() throws Exception {
+    int size = mergeMapOpList.size() + 1; // the +1 is for the main map operator itself
+    sources = new MapRecordSource[size];
+    KeyValueReader reader = legacyMRInput.getReader();
+    position = mapOp.getConf().getTag();
+    sources[position] = new MapRecordSource();
+    sources[position].init(jconf, mapOp, reader);
+    for (MapOperator mapOp : mergeMapOpList) {
+      int tag = mapOp.getConf().getTag();
+      sources[tag] = new MapRecordSource();
+      String inputName = mapOp.getConf().getName();
+      MultiMRInput multiMRInput = multiMRInputMap.get(inputName);
+      Collection<KeyValueReader> kvReaders = multiMRInput.getKeyValueReaders();
+      l4j.debug("There are " + kvReaders.size() + " key-value readers for input " + inputName);
+      List<KeyValueReader> kvReaderList = new ArrayList<KeyValueReader>(kvReaders);
+      reader = new KeyValueInputMerger(kvReaderList);
+      sources[tag].init(jconf, mapOp, reader);
+    }
+    ((TezContext) MapredContext.get()).setRecordSources(sources);
+  }
 
-    //process records until done
-    while(reader.next()){
-      //ignore the key for maps -  reader.getCurrentKey();
-      Object value = reader.getCurrentValue();
-      boolean needMore = processRow(value);
-      if(!needMore){
-        break;
+  private DummyStoreOperator getJoinParentOp(Operator<? extends OperatorDesc> mergeMapOp) {
+    for (Operator<? extends OperatorDesc> childOp : mergeMapOp.getChildOperators()) {
+      if ((childOp.getChildOperators() == null) || (childOp.getChildOperators().isEmpty())) {
+        return (DummyStoreOperator) childOp;
+      } else {
+        return getJoinParentOp(childOp);
       }
     }
+    return null;
   }
 
+  @Override
+  void run() throws Exception {
 
-  /**
-   * @param value  value to process
-   * @return true if it is not done and can take more inputs
-   */
-  private boolean processRow(Object value) {
-    // reset the execContext for each new row
-    execContext.resetRow();
-
-    try {
-      if (mapOp.getDone()) {
-        return false; //done
-      } else {
-        // Since there is no concept of a group, we don't invoke
-        // startGroup/endGroup for a mapper
-        mapOp.process((Writable)value);
-        if (isLogInfoEnabled) {
-          logProgress();
-        }
-      }
-    } catch (Throwable e) {
-      abort = true;
-      if (e instanceof OutOfMemoryError) {
-        // Don't create a new object if we are already out of memory
-        throw (OutOfMemoryError) e;
-      } else {
-        l4j.fatal(StringUtils.stringifyException(e));
-        throw new RuntimeException(e);
+    while (sources[position].pushRecord()) {
+      if (isLogInfoEnabled) {
+        logProgress();
       }
     }
-    return true; //give me more
   }
 
   @Override
@@ -214,6 +289,11 @@ public class MapRecordProcessor extends 
         return;
       }
       mapOp.close(abort);
+      if (mergeMapOpList.isEmpty() == false) {
+        for (MapOperator mergeMapOp : mergeMapOpList) {
+          mergeMapOp.close(abort);
+        }
+      }
 
       // Need to close the dummyOps as well. The operator pipeline
       // is not considered "closed/done" unless all operators are
@@ -242,4 +322,27 @@ public class MapRecordProcessor extends 
       MapredContext.close();
     }
   }
+
+  public static Map<Integer, DummyStoreOperator> getConnectOps() {
+    return connectOps;
+  }
+
+  private MRInputLegacy getMRInput(Map<String, LogicalInput> inputs) throws Exception {
+    // there should be only one MRInput
+    MRInputLegacy theMRInput = null;
+    l4j.info("The input names are: " + Arrays.toString(inputs.keySet().toArray()));
+    for (Entry<String, LogicalInput> inp : inputs.entrySet()) {
+      if (inp.getValue() instanceof MRInputLegacy) {
+        if (theMRInput != null) {
+          throw new IllegalArgumentException("Only one MRInput is expected");
+        }
+        // a better logic would be to find the alias
+        theMRInput = (MRInputLegacy) inp.getValue();
+      } else if (inp.getValue() instanceof MultiMRInput) {
+        multiMRInputMap.put(inp.getKey(), (MultiMRInput) inp.getValue());
+      }
+    }
+    theMRInput.init();
+    return theMRInput;
+  }
 }

Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MergeFileRecordProcessor.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MergeFileRecordProcessor.java?rev=1629563&r1=1629562&r2=1629563&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MergeFileRecordProcessor.java (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MergeFileRecordProcessor.java Mon Oct  6 04:00:39 2014
@@ -20,6 +20,7 @@ package org.apache.hadoop.hive.ql.exec.t
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hive.conf.HiveConf;
 import org.apache.hadoop.hive.ql.exec.MapredContext;
 import org.apache.hadoop.hive.ql.exec.ObjectCacheFactory;
 import org.apache.hadoop.hive.ql.exec.Operator;
@@ -40,7 +41,9 @@ import org.apache.tez.runtime.api.Logica
 import org.apache.tez.runtime.api.ProcessorContext;
 import org.apache.tez.runtime.library.api.KeyValueReader;
 
+import java.io.IOException;
 import java.util.Map;
+import java.util.Map.Entry;
 
 /**
  * Record processor for fast merging of files.
@@ -51,11 +54,12 @@ public class MergeFileRecordProcessor ex
       .getLog(MergeFileRecordProcessor.class);
 
   protected Operator<? extends OperatorDesc> mergeOp;
-  private final ExecMapperContext execContext = new ExecMapperContext();
+  private ExecMapperContext execContext = null;
   protected static final String MAP_PLAN_KEY = "__MAP_PLAN__";
   private MergeFileWork mfWork;
+  MRInputLegacy mrInput = null;
   private boolean abort = false;
-  private Object[] row = new Object[2];
+  private final Object[] row = new Object[2];
 
   @Override
   void init(JobConf jconf, ProcessorContext processorContext,
@@ -63,16 +67,16 @@ public class MergeFileRecordProcessor ex
       Map<String, LogicalOutput> outputs) throws Exception {
     perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.TEZ_INIT_OPERATORS);
     super.init(jconf, processorContext, mrReporter, inputs, outputs);
+    execContext = new ExecMapperContext(jconf);
 
     //Update JobConf using MRInput, info like filename comes via this
-    MRInputLegacy mrInput = TezProcessor.getMRInput(inputs);
+    mrInput = getMRInput(inputs);
     Configuration updatedConf = mrInput.getConfigUpdates();
     if (updatedConf != null) {
       for (Map.Entry<String, String> entry : updatedConf) {
         jconf.set(entry.getKey(), entry.getValue());
       }
     }
-
     createOutputMap();
     // Start all the Outputs.
     for (Map.Entry<String, LogicalOutput> outputEntry : outputs.entrySet()) {
@@ -127,8 +131,7 @@ public class MergeFileRecordProcessor ex
 
   @Override
   void run() throws Exception {
-    MRInputLegacy in = TezProcessor.getMRInput(inputs);
-    KeyValueReader reader = in.getReader();
+    KeyValueReader reader = mrInput.getReader();
 
     //process records until done
     while (reader.next()) {
@@ -205,4 +208,23 @@ public class MergeFileRecordProcessor ex
     return true; //give me more
   }
 
+  private MRInputLegacy getMRInput(Map<String, LogicalInput> inputs) throws Exception {
+    // there should be only one MRInput
+    MRInputLegacy theMRInput = null;
+    for (Entry<String, LogicalInput> inp : inputs.entrySet()) {
+      if (inp.getValue() instanceof MRInputLegacy) {
+        if (theMRInput != null) {
+          throw new IllegalArgumentException("Only one MRInput is expected");
+        }
+        // a better logic would be to find the alias
+        theMRInput = (MRInputLegacy) inp.getValue();
+      } else {
+        throw new IOException("Expecting only one input of type MRInputLegacy. Found type: "
+            + inp.getClass().getCanonicalName());
+      }
+    }
+    theMRInput.init();
+
+    return theMRInput;
+  }
 }

Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MergeFileTezProcessor.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MergeFileTezProcessor.java?rev=1629563&r1=1629562&r2=1629563&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MergeFileTezProcessor.java (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MergeFileTezProcessor.java Mon Oct  6 04:00:39 2014
@@ -39,12 +39,6 @@ public class MergeFileTezProcessor exten
   public void run(Map<String, LogicalInput> inputs,
       Map<String, LogicalOutput> outputs) throws Exception {
     rproc = new MergeFileRecordProcessor();
-    MRInputLegacy mrInput = getMRInput(inputs);
-    try {
-      mrInput.init();
-    } catch (IOException e) {
-      throw new RuntimeException("Failed while initializing MRInput", e);
-    }
     initializeAndRunProcessor(inputs, outputs);
   }
 }

Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/RecordProcessor.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/RecordProcessor.java?rev=1629563&r1=1629562&r2=1629563&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/RecordProcessor.java (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/RecordProcessor.java Mon Oct  6 04:00:39 2014
@@ -115,8 +115,7 @@ public abstract class RecordProcessor  {
    */
   protected void logCloseInfo() {
     long used_memory = memoryMXBean.getHeapMemoryUsage().getUsed();
-    l4j.info("ExecMapper: processed " + numRows + " rows: used memory = "
-        + used_memory);
+    l4j.info("TezProcessor: processed " + numRows + " rows/groups: used memory = " + used_memory);
   }
 
   /**
@@ -126,8 +125,7 @@ public abstract class RecordProcessor  {
     numRows++;
     if (numRows == nextUpdateCntr) {
       long used_memory = memoryMXBean.getHeapMemoryUsage().getUsed();
-      l4j.info("ExecMapper: processing " + numRows
-          + " rows: used memory = " + used_memory);
+      l4j.info("TezProcessor: processing " + numRows + " rows/groups: used memory = " + used_memory);
       nextUpdateCntr = getNextUpdateRecordCounter(numRows);
     }
   }

Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ReduceRecordProcessor.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ReduceRecordProcessor.java?rev=1629563&r1=1629562&r2=1629563&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ReduceRecordProcessor.java (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ReduceRecordProcessor.java Mon Oct  6 04:00:39 2014
@@ -17,9 +17,7 @@
  */
 package org.apache.hadoop.hive.ql.exec.tez;
 
-import java.io.IOException;
 import java.util.ArrayList;
-import java.util.Arrays;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
@@ -35,31 +33,13 @@ import org.apache.hadoop.hive.ql.exec.Op
 import org.apache.hadoop.hive.ql.exec.OperatorUtils;
 import org.apache.hadoop.hive.ql.exec.Utilities;
 import org.apache.hadoop.hive.ql.exec.mr.ExecMapper.ReportStats;
-import org.apache.hadoop.hive.ql.exec.mr.ExecMapperContext;
 import org.apache.hadoop.hive.ql.exec.tez.TezProcessor.TezKVOutputCollector;
-import org.apache.hadoop.hive.ql.exec.tez.tools.InputMerger;
-import org.apache.hadoop.hive.ql.exec.vector.VectorizedBatchUtil;
-import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
-import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpressionWriter;
-import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpressionWriterFactory;
 import org.apache.hadoop.hive.ql.log.PerfLogger;
-import org.apache.hadoop.hive.ql.metadata.HiveException;
 import org.apache.hadoop.hive.ql.plan.OperatorDesc;
 import org.apache.hadoop.hive.ql.plan.ReduceWork;
 import org.apache.hadoop.hive.ql.plan.TableDesc;
-import org.apache.hadoop.hive.serde2.Deserializer;
-import org.apache.hadoop.hive.serde2.SerDe;
-import org.apache.hadoop.hive.serde2.SerDeException;
-import org.apache.hadoop.hive.serde2.SerDeUtils;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
-import org.apache.hadoop.hive.serde2.objectinspector.StructField;
-import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
-import org.apache.hadoop.io.BytesWritable;
-import org.apache.hadoop.io.DataOutputBuffer;
 import org.apache.hadoop.mapred.JobConf;
-import org.apache.hadoop.util.ReflectionUtils;
-import org.apache.hadoop.util.StringUtils;
 import org.apache.tez.mapreduce.processor.MRTaskReporter;
 import org.apache.tez.runtime.api.Input;
 import org.apache.tez.runtime.api.LogicalInput;
@@ -76,39 +56,16 @@ public class ReduceRecordProcessor  exte
   private static final String REDUCE_PLAN_KEY = "__REDUCE_PLAN__";
 
   public static final Log l4j = LogFactory.getLog(ReduceRecordProcessor.class);
-  private final ExecMapperContext execContext = new ExecMapperContext();
-  private boolean abort = false;
-  private Deserializer inputKeyDeserializer;
-
-  // Input value serde needs to be an array to support different SerDe
-  // for different tags
-  private final SerDe[] inputValueDeserializer = new SerDe[Byte.MAX_VALUE];
 
-  TableDesc keyTableDesc;
-  TableDesc[] valueTableDesc;
+  private ReduceWork redWork;
 
-  ObjectInspector[] rowObjectInspector;
   private Operator<?> reducer;
-  private boolean isTagged = false;
-
-  private Object keyObject = null;
-  private BytesWritable groupKey;
-
-  private ReduceWork redWork;
 
-  private boolean vectorized = false;
+  private ReduceRecordSource[] sources;
 
-  List<Object> row = new ArrayList<Object>(Utilities.reduceFieldNameList.size());
+  private final byte position = 0;
 
-  private DataOutputBuffer buffer;
-  private VectorizedRowBatch[] batches;
-  // number of columns pertaining to keys in a vectorized row batch
-  private int keysColumnOffset;
-  private final int BATCH_SIZE = VectorizedRowBatch.DEFAULT_SIZE;
-  private StructObjectInspector keyStructInspector;
-  private StructObjectInspector[] valueStructInspectors;
-  /* this is only used in the error code path */
-  private List<VectorExpressionWriter>[] valueStringWriters;
+  private boolean abort;
 
   @Override
   void init(JobConf jconf, ProcessorContext processorContext, MRTaskReporter mrReporter,
@@ -118,10 +75,6 @@ public class ReduceRecordProcessor  exte
 
     ObjectCache cache = ObjectCacheFactory.getCache(jconf);
 
-    rowObjectInspector = new ObjectInspector[Byte.MAX_VALUE];
-    ObjectInspector[] valueObjectInspector = new ObjectInspector[Byte.MAX_VALUE];
-    ObjectInspector keyObjectInspector;
-
     redWork = (ReduceWork) cache.retrieve(REDUCE_PLAN_KEY);
     if (redWork == null) {
       redWork = Utilities.getReduceWork(jconf);
@@ -131,95 +84,36 @@ public class ReduceRecordProcessor  exte
     }
 
     reducer = redWork.getReducer();
-    reducer.setParentOperators(null); // clear out any parents as reducer is the
-    // root
-    isTagged = redWork.getNeedsTagging();
-    vectorized = redWork.getVectorMode();
+    reducer.getParentOperators().clear();
+    reducer.setParentOperators(null); // clear out any parents as reducer is the root
 
-    try {
-      keyTableDesc = redWork.getKeyDesc();
-      inputKeyDeserializer = ReflectionUtils.newInstance(keyTableDesc
-          .getDeserializerClass(), null);
-      SerDeUtils.initializeSerDe(inputKeyDeserializer, null, keyTableDesc.getProperties(), null);
-      keyObjectInspector = inputKeyDeserializer.getObjectInspector();
-      reducer.setGroupKeyObjectInspector(keyObjectInspector);
-      valueTableDesc = new TableDesc[redWork.getTagToValueDesc().size()];
-
-      if(vectorized) {
-        final int maxTags = redWork.getTagToValueDesc().size();
-        keyStructInspector = (StructObjectInspector)keyObjectInspector;
-        batches = new VectorizedRowBatch[maxTags];
-        valueStructInspectors = new StructObjectInspector[maxTags];
-        valueStringWriters = new List[maxTags];
-        keysColumnOffset = keyStructInspector.getAllStructFieldRefs().size();
-        buffer = new DataOutputBuffer();
-      }
+    int numTags = redWork.getTagToValueDesc().size();
 
-      for (int tag = 0; tag < redWork.getTagToValueDesc().size(); tag++) {
-        // We should initialize the SerDe with the TypeInfo when available.
-        valueTableDesc[tag] = redWork.getTagToValueDesc().get(tag);
-        inputValueDeserializer[tag] = (SerDe) ReflectionUtils.newInstance(
-            valueTableDesc[tag].getDeserializerClass(), null);
-        SerDeUtils.initializeSerDe(inputValueDeserializer[tag], null,
-                                   valueTableDesc[tag].getProperties(), null);
-        valueObjectInspector[tag] = inputValueDeserializer[tag]
-            .getObjectInspector();
-
-        ArrayList<ObjectInspector> ois = new ArrayList<ObjectInspector>();
-
-        if(vectorized) {
-          /* vectorization only works with struct object inspectors */
-          valueStructInspectors[tag] = (StructObjectInspector)valueObjectInspector[tag];
-
-          batches[tag] = VectorizedBatchUtil.constructVectorizedRowBatch(keyStructInspector,
-              valueStructInspectors[tag]);
-          final int totalColumns = keysColumnOffset +
-              valueStructInspectors[tag].getAllStructFieldRefs().size();
-          valueStringWriters[tag] = new ArrayList<VectorExpressionWriter>(totalColumns);
-          valueStringWriters[tag].addAll(Arrays
-              .asList(VectorExpressionWriterFactory
-                  .genVectorStructExpressionWritables(keyStructInspector)));
-          valueStringWriters[tag].addAll(Arrays
-              .asList(VectorExpressionWriterFactory
-                  .genVectorStructExpressionWritables(valueStructInspectors[tag])));
-
-          /*
-           * The row object inspector used by ReduceWork needs to be a **standard**
-           * struct object inspector, not just any struct object inspector.
-           */
-          ArrayList<String> colNames = new ArrayList<String>();
-          List<? extends StructField> fields = keyStructInspector.getAllStructFieldRefs();
-          for (StructField field: fields) {
-            colNames.add(Utilities.ReduceField.KEY.toString() + "." + field.getFieldName());
-            ois.add(field.getFieldObjectInspector());
-          }
-          fields = valueStructInspectors[tag].getAllStructFieldRefs();
-          for (StructField field: fields) {
-            colNames.add(Utilities.ReduceField.VALUE.toString() + "." + field.getFieldName());
-            ois.add(field.getFieldObjectInspector());
-          }
-          rowObjectInspector[tag] = ObjectInspectorFactory
-                  .getStandardStructObjectInspector(colNames, ois);
-        } else {
-          ois.add(keyObjectInspector);
-          ois.add(valueObjectInspector[tag]);
-          rowObjectInspector[tag] = ObjectInspectorFactory
-                  .getStandardStructObjectInspector(Utilities.reduceFieldNameList, ois);
-        }
+    ObjectInspector[] ois = new ObjectInspector[numTags];
+    sources = new ReduceRecordSource[numTags];
 
-      }
-    } catch (Exception e) {
-      throw new RuntimeException(e);
+    for (int tag = 0; tag < redWork.getTagToValueDesc().size(); tag++) {
+      TableDesc keyTableDesc = redWork.getKeyDesc();
+      TableDesc valueTableDesc = redWork.getTagToValueDesc().get(tag);
+      KeyValuesReader reader =
+          (KeyValuesReader) inputs.get(redWork.getTagToInput().get(tag)).getReader();
+
+      sources[tag] = new ReduceRecordSource();
+      sources[tag].init(jconf, reducer, redWork.getVectorMode(), keyTableDesc, valueTableDesc,
+          reader, tag == position, (byte) tag,
+          redWork.getScratchColumnVectorTypes());
+      ois[tag] = sources[tag].getObjectInspector();
     }
 
     MapredContext.init(false, new JobConf(jconf));
     ((TezContext) MapredContext.get()).setInputs(inputs);
     ((TezContext) MapredContext.get()).setTezProcessorContext(processorContext);
+    ((TezContext) MapredContext.get()).setRecordSources(sources);
 
     // initialize reduce operator tree
     try {
       l4j.info(reducer.dump(0));
-      reducer.initialize(jconf, rowObjectInspector);
+      reducer.initialize(jconf, ois);
 
       // Initialization isn't finished until all parents of all operators
       // are initialized. For broadcast joins that means initializing the
@@ -227,7 +121,6 @@ public class ReduceRecordProcessor  exte
       List<HashTableDummyOperator> dummyOps = redWork.getDummyOps();
       if (dummyOps != null) {
         for (Operator<? extends OperatorDesc> dummyOp : dummyOps){
-          dummyOp.setExecContext(execContext);
           dummyOp.initialize(jconf, null);
         }
       }
@@ -271,28 +164,12 @@ public class ReduceRecordProcessor  exte
       ((TezKVOutputCollector) outMap.get(outputEntry.getKey())).initialize();
     }
 
-    KeyValuesReader kvsReader;
-    try {
-      if(shuffleInputs.size() == 1){
-        //no merging of inputs required
-        kvsReader = (KeyValuesReader) shuffleInputs.get(0).getReader();
-      }else {
-        //get a sort merged input
-        kvsReader = new InputMerger(shuffleInputs);
-      }
-    } catch (Exception e) {
-      throw new IOException(e);
-    }
-
-    while(kvsReader.next()){
-      Object key = kvsReader.getCurrentKey();
-      Iterable<Object> values = kvsReader.getCurrentValues();
-      boolean needMore = processRows(key, values);
-      if(!needMore){
-        break;
+    // run the operator pipeline
+    while (sources[position].pushRecord()) {
+      if (isLogInfoEnabled) {
+        logProgress();
       }
     }
-
   }
 
   /**
@@ -302,209 +179,22 @@ public class ReduceRecordProcessor  exte
    */
   private List<LogicalInput> getShuffleInputs(Map<String, LogicalInput> inputs) {
     //the reduce plan inputs have tags, add all inputs that have tags
-    Map<Integer, String> tag2input = redWork.getTagToInput();
+    Map<Integer, String> tagToinput = redWork.getTagToInput();
     ArrayList<LogicalInput> shuffleInputs = new ArrayList<LogicalInput>();
-    for(String inpStr : tag2input.values()){
+    for(String inpStr : tagToinput.values()){
+      if (inputs.get(inpStr) == null) {
+        throw new AssertionError("Cound not find input: " + inpStr);
+      }
       shuffleInputs.add(inputs.get(inpStr));
     }
     return shuffleInputs;
   }
 
-  /**
-   * @param key
-   * @param values
-   * @return true if it is not done and can take more inputs
-   */
-  private boolean processRows(Object key, Iterable<Object> values) {
-    if(reducer.getDone()){
-      //done - no more records needed
-      return false;
-    }
-
-    // reset the execContext for each new row
-    execContext.resetRow();
-
-    try {
-      BytesWritable keyWritable = (BytesWritable) key;
-      byte tag = 0;
-
-      if (isTagged) {
-        // remove the tag from key coming out of reducer
-        // and store it in separate variable.
-        int size = keyWritable.getLength() - 1;
-        tag = keyWritable.getBytes()[size];
-        keyWritable.setSize(size);
-      }
-
-      //Set the key, check if this is a new group or same group
-      if (!keyWritable.equals(this.groupKey)) {
-        // If a operator wants to do some work at the beginning of a group
-        if (groupKey == null) { // the first group
-          this.groupKey = new BytesWritable();
-        } else {
-          // If a operator wants to do some work at the end of a group
-          if(isLogTraceEnabled) {
-            l4j.trace("End Group");
-          }
-          reducer.endGroup();
-        }
-
-        try {
-          this.keyObject = inputKeyDeserializer.deserialize(keyWritable);
-        } catch (Exception e) {
-          throw new HiveException(
-              "Hive Runtime Error: Unable to deserialize reduce input key from "
-              + Utilities.formatBinaryString(keyWritable.getBytes(), 0,
-              keyWritable.getLength()) + " with properties "
-              + keyTableDesc.getProperties(), e);
-        }
-        groupKey.set(keyWritable.getBytes(), 0, keyWritable.getLength());
-        if (isLogTraceEnabled) {
-          l4j.trace("Start Group");
-        }
-        reducer.setGroupKeyObject(keyObject);
-        reducer.startGroup();
-      }
-      /* this.keyObject passed via reference */
-      if(vectorized) {
-        return processVectors(values, tag);
-      } else {
-        return processKeyValues(values, tag);
-      }
-    } catch (Throwable e) {
-      abort = true;
-      if (e instanceof OutOfMemoryError) {
-        // Don't create a new object if we are already out of memory
-        throw (OutOfMemoryError) e;
-      } else {
-        l4j.fatal(StringUtils.stringifyException(e));
-        throw new RuntimeException(e);
-      }
-    }
-  }
-
-  private Object deserializeValue(BytesWritable valueWritable, byte tag) throws HiveException {
-    try {
-      return inputValueDeserializer[tag].deserialize(valueWritable);
-    } catch (SerDeException e) {
-      throw new HiveException(
-          "Hive Runtime Error: Unable to deserialize reduce input value (tag="
-              + tag
-              + ") from "
-              + Utilities.formatBinaryString(valueWritable.getBytes(), 0,
-                  valueWritable.getLength()) + " with properties "
-              + valueTableDesc[tag].getProperties(), e);
-    }
-  }
-
-  /**
-   * @param values
-   * @return true if it is not done and can take more inputs
-   */
-  private boolean processKeyValues(Iterable<Object> values, byte tag) throws HiveException {
-
-    for (Object value : values) {
-      BytesWritable valueWritable = (BytesWritable) value;
-
-      row.clear();
-      row.add(this.keyObject);
-      row.add(deserializeValue(valueWritable, tag));
-
-      try {
-        reducer.processOp(row, tag);
-      } catch (Exception e) {
-        String rowString = null;
-        try {
-          rowString = SerDeUtils.getJSONString(row, rowObjectInspector[tag]);
-        } catch (Exception e2) {
-          rowString = "[Error getting row data with exception "
-              + StringUtils.stringifyException(e2) + " ]";
-        }
-        throw new HiveException("Hive Runtime Error while processing row (tag="
-            + tag + ") " + rowString, e);
-      }
-      if (isLogInfoEnabled) {
-        logProgress();
-      }
-    }
-    return true; //give me more
-  }
-
-  /**
-   * @param values
-   * @return true if it is not done and can take more inputs
-   */
-  private boolean processVectors(Iterable<Object> values, byte tag) throws HiveException {
-    VectorizedRowBatch batch = batches[tag];
-    batch.reset();
-
-    /* deserialize key into columns */
-    VectorizedBatchUtil.addRowToBatchFrom(keyObject, keyStructInspector,
-        0, 0, batch, buffer);
-    for(int i = 0; i < keysColumnOffset; i++) {
-      VectorizedBatchUtil.setRepeatingColumn(batch, i);
-    }
-
-    int rowIdx = 0;
-    try {
-      for (Object value : values) {
-        /* deserialize value into columns */
-        BytesWritable valueWritable = (BytesWritable) value;
-        Object valueObj = deserializeValue(valueWritable, tag);
-
-        VectorizedBatchUtil.addRowToBatchFrom(valueObj, valueStructInspectors[tag],
-            rowIdx, keysColumnOffset, batch, buffer);
-        rowIdx++;
-        if (rowIdx >= BATCH_SIZE) {
-          VectorizedBatchUtil.setBatchSize(batch, rowIdx);
-          reducer.processOp(batch, tag);
-          rowIdx = 0;
-          if (isLogInfoEnabled) {
-            logProgress();
-          }
-        }
-      }
-      if (rowIdx > 0) {
-        VectorizedBatchUtil.setBatchSize(batch, rowIdx);
-        reducer.processOp(batch, tag);
-      }
-      if (isLogInfoEnabled) {
-        logProgress();
-      }
-    } catch (Exception e) {
-      String rowString = null;
-      try {
-        /* batch.toString depends on this */
-        batch.setValueWriters(valueStringWriters[tag]
-            .toArray(new VectorExpressionWriter[0]));
-        rowString = batch.toString();
-      } catch (Exception e2) {
-        rowString = "[Error getting row data with exception "
-            + StringUtils.stringifyException(e2) + " ]";
-      }
-      throw new HiveException("Hive Runtime Error while processing vector batch (tag="
-          + tag + ") " + rowString, e);
-    }
-    return true; // give me more
-  }
-
   @Override
   void close(){
-    // check if there are IOExceptions
-    if (!abort) {
-      abort = execContext.getIoCxt().getIOExceptions();
-    }
-
     try {
-      if (groupKey != null) {
-        // If a operator wants to do some work at the end of a group
-        if(isLogTraceEnabled) {
-          l4j.trace("End Group");
-        }
-        reducer.endGroup();
-      }
-      if (isLogInfoEnabled) {
-        logCloseInfo();
+      for (ReduceRecordSource rs: sources) {
+        abort = abort && rs.close();
       }
 
       reducer.close(abort);

Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/TezContext.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/TezContext.java?rev=1629563&r1=1629562&r2=1629563&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/TezContext.java (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/TezContext.java Mon Oct  6 04:00:39 2014
@@ -37,6 +37,8 @@ public class TezContext extends MapredCo
 
   private ProcessorContext processorContext;
 
+  private RecordSource[] sources;
+
   public TezContext(boolean isMap, JobConf jobConf) {
     super(isMap, jobConf);
   }
@@ -70,4 +72,12 @@ public class TezContext extends MapredCo
   public ProcessorContext getTezProcessorContext() {
     return processorContext;
   }
+
+  public RecordSource[] getRecordSources() {
+    return sources;
+  }
+
+  public void setRecordSources(RecordSource[] sources) {
+    this.sources = sources;
+  }
 }

Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/TezJobMonitor.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/TezJobMonitor.java?rev=1629563&r1=1629562&r2=1629563&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/TezJobMonitor.java (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/TezJobMonitor.java Mon Oct  6 04:00:39 2014
@@ -78,7 +78,7 @@ public class TezJobMonitor {
         try {
           for (TezSessionState s: TezSessionState.getOpenSessions()) {
             System.err.println("Shutting down tez session.");
-            TezSessionPoolManager.getInstance().close(s);
+            TezSessionPoolManager.getInstance().close(s, false);
           }
         } catch (Exception e) {
           // ignore
@@ -113,6 +113,7 @@ public class TezJobMonitor {
     String lastReport = null;
     Set<StatusGetOpts> opts = new HashSet<StatusGetOpts>();
     Heartbeater heartbeater = new Heartbeater(txnMgr, conf);
+    long startTime = 0;
 
     shutdownList.add(dagClient);
 
@@ -145,6 +146,7 @@ public class TezJobMonitor {
               for (String s: progressMap.keySet()) {
                 perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.TEZ_RUN_VERTEX + s);
               }
+              startTime = System.currentTimeMillis();
               running = true;
             }
 
@@ -152,7 +154,8 @@ public class TezJobMonitor {
             break;
           case SUCCEEDED:
             lastReport = printStatus(progressMap, lastReport, console);
-            console.printInfo("Status: Finished successfully");
+            double duration = (System.currentTimeMillis() - startTime)/1000.0;
+            console.printInfo("Status: Finished successfully in " + String.format("%.2f seconds", duration));
             running = false;
             done = true;
             break;

Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/TezProcessor.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/TezProcessor.java?rev=1629563&r1=1629562&r2=1629563&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/TezProcessor.java (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/TezProcessor.java Mon Oct  6 04:00:39 2014
@@ -17,6 +17,14 @@
  */
 package org.apache.hadoop.hive.ql.exec.tez;
 
+import java.io.IOException;
+import java.text.NumberFormat;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
@@ -26,6 +34,7 @@ import org.apache.hadoop.mapred.OutputCo
 import org.apache.hadoop.util.StringUtils;
 import org.apache.tez.common.TezUtils;
 import org.apache.tez.mapreduce.input.MRInputLegacy;
+import org.apache.tez.mapreduce.input.MultiMRInput;
 import org.apache.tez.mapreduce.processor.MRTaskReporter;
 import org.apache.tez.runtime.api.AbstractLogicalIOProcessor;
 import org.apache.tez.runtime.api.Event;
@@ -34,11 +43,6 @@ import org.apache.tez.runtime.api.Logica
 import org.apache.tez.runtime.api.ProcessorContext;
 import org.apache.tez.runtime.library.api.KeyValueWriter;
 
-import java.io.IOException;
-import java.text.NumberFormat;
-import java.util.List;
-import java.util.Map;
-
 /**
  * Hive processor for Tez that forms the vertices in Tez and processes the data.
  * Does what ExecMapper and ExecReducer does for hive in MR framework.
@@ -90,7 +94,8 @@ public class TezProcessor extends Abstra
     perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.TEZ_INITIALIZE_PROCESSOR);
     Configuration conf = TezUtils.createConfFromUserPayload(getContext().getUserPayload());
     this.jobConf = new JobConf(conf);
-    setupMRLegacyConfigs(getContext());
+    this.processorContext = getContext();
+    setupMRLegacyConfigs(processorContext);
     perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.TEZ_INITIALIZE_PROCESSOR);
   }
 
@@ -130,12 +135,6 @@ public class TezProcessor extends Abstra
 
       if (isMap) {
         rproc = new MapRecordProcessor(jobConf);
-        MRInputLegacy mrInput = getMRInput(inputs);
-        try {
-          mrInput.init();
-        } catch (IOException e) {
-          throw new RuntimeException("Failed while initializing MRInput", e);
-        }
       } else {
         rproc = new ReduceRecordProcessor();
       }
@@ -148,6 +147,7 @@ public class TezProcessor extends Abstra
       throws Exception {
     Throwable originalThrowable = null;
     try {
+      // Outputs will be started later by the individual Processors.
       TezCacheAccess cacheAccess = TezCacheAccess.createInstance(jobConf);
       // Start the actual Inputs. After MRInput initialization.
       for (Map.Entry<String, LogicalInput> inputEntry : inputs.entrySet()) {
@@ -155,13 +155,10 @@ public class TezProcessor extends Abstra
           LOG.info("Input: " + inputEntry.getKey() + " is not cached");
           inputEntry.getValue().start();
         } else {
-          LOG.info("Input: " + inputEntry.getKey() +
-              " is already cached. Skipping start");
+          LOG.info("Input: " + inputEntry.getKey() + " is already cached. Skipping start");
         }
       }
 
-      // Outputs will be started later by the individual Processors.
-
       MRTaskReporter mrReporter = new MRTaskReporter(getContext());
       rproc.init(jobConf, getContext(), mrReporter, inputs, outputs);
       rproc.run();
@@ -214,19 +211,4 @@ public class TezProcessor extends Abstra
       writer.write(key, value);
     }
   }
-
-  static  MRInputLegacy getMRInput(Map<String, LogicalInput> inputs) {
-    //there should be only one MRInput
-    MRInputLegacy theMRInput = null;
-    for(LogicalInput inp : inputs.values()){
-      if(inp instanceof MRInputLegacy){
-        if(theMRInput != null){
-          throw new IllegalArgumentException("Only one MRInput is expected");
-        }
-        //a better logic would be to find the alias
-        theMRInput = (MRInputLegacy)inp;
-      }
-    }
-    return theMRInput;
-  }
 }

Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/TezSessionPoolManager.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/TezSessionPoolManager.java?rev=1629563&r1=1629562&r2=1629563&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/TezSessionPoolManager.java (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/TezSessionPoolManager.java Mon Oct  6 04:00:39 2014
@@ -168,10 +168,10 @@ public class TezSessionPoolManager {
     // session in the SessionState
   }
 
-  public void close(TezSessionState tezSessionState) throws Exception {
+  public void close(TezSessionState tezSessionState, boolean keepTmpDir) throws Exception {
     LOG.info("Closing tez session default? " + tezSessionState.isDefault());
     if (!tezSessionState.isDefault()) {
-      tezSessionState.close(false);
+      tezSessionState.close(keepTmpDir);
     }
   }
 
@@ -262,19 +262,24 @@ public class TezSessionPoolManager {
     }
 
     if (session != null) {
-      close(session);
+      close(session, false);
     }
 
     return getSession(conf, doOpen, forceCreate);
   }
 
-  public void closeAndOpen(TezSessionState sessionState, HiveConf conf)
+  public void closeAndOpen(TezSessionState sessionState, HiveConf conf, boolean keepTmpDir)
       throws Exception {
+    closeAndOpen(sessionState, conf, null, keepTmpDir);
+  }
+
+  public void closeAndOpen(TezSessionState sessionState, HiveConf conf,
+      String[] additionalFiles, boolean keepTmpDir) throws Exception {
     HiveConf sessionConf = sessionState.getConf();
     if (sessionConf != null && sessionConf.get("tez.queue.name") != null) {
       conf.set("tez.queue.name", sessionConf.get("tez.queue.name"));
     }
-    close(sessionState);
-    sessionState.open(conf);
+    close(sessionState, keepTmpDir);
+    sessionState.open(conf, additionalFiles);
   }
 }

Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/TezTask.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/TezTask.java?rev=1629563&r1=1629562&r2=1629563&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/TezTask.java (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/TezTask.java Mon Oct  6 04:00:39 2014
@@ -37,6 +37,7 @@ import org.apache.hadoop.hive.ql.exec.Ut
 import org.apache.hadoop.hive.ql.log.PerfLogger;
 import org.apache.hadoop.hive.ql.plan.BaseWork;
 import org.apache.hadoop.hive.ql.plan.MapWork;
+import org.apache.hadoop.hive.ql.plan.MergeJoinWork;
 import org.apache.hadoop.hive.ql.plan.OperatorDesc;
 import org.apache.hadoop.hive.ql.plan.ReduceWork;
 import org.apache.hadoop.hive.ql.plan.TezEdgeProperty;
@@ -55,6 +56,7 @@ import org.apache.tez.dag.api.DAG;
 import org.apache.tez.dag.api.Edge;
 import org.apache.tez.dag.api.GroupInputEdge;
 import org.apache.tez.dag.api.SessionNotRunning;
+import org.apache.tez.dag.api.TezUncheckedException;
 import org.apache.tez.dag.api.Vertex;
 import org.apache.tez.dag.api.VertexGroup;
 import org.apache.tez.dag.api.client.DAGClient;
@@ -124,14 +126,11 @@ public class TezTask extends Task<TezWor
       // create the tez tmp dir
       scratchDir = utils.createTezDir(scratchDir, conf);
 
-      if (!session.isOpen()) {
-        // can happen if the user sets the tez flag after the session was
-        // established
-        LOG.info("Tez session hasn't been created yet. Opening session");
-        session.open(conf, inputOutputJars);
-      } else {
-        session.refreshLocalResourcesFromConf(conf);
-      }
+      Map<String,LocalResource> inputOutputLocalResources =
+          getExtraLocalResources(jobConf, scratchDir, inputOutputJars);
+
+      // Ensure the session is open and has the necessary local resources
+      updateSession(session, jobConf, scratchDir, inputOutputJars, inputOutputLocalResources);
 
       List<LocalResource> additionalLr = session.getLocalizedResources();
 
@@ -153,8 +152,12 @@ public class TezTask extends Task<TezWor
       // next we translate the TezWork to a Tez DAG
       DAG dag = build(jobConf, work, scratchDir, appJarLr, additionalLr, ctx);
 
+      // Add the extra resources to the dag
+      addExtraResourcesToDag(session, dag, inputOutputJars, inputOutputLocalResources);
+
       // submit will send the job to the cluster and start executing
-      client = submit(jobConf, dag, scratchDir, appJarLr, session, additionalLr);
+      client = submit(jobConf, dag, scratchDir, appJarLr, session,
+          additionalLr, inputOutputJars, inputOutputLocalResources);
 
       // finally monitor will print progress until the job is done
       TezJobMonitor monitor = new TezJobMonitor();
@@ -195,6 +198,63 @@ public class TezTask extends Task<TezWor
     return rc;
   }
 
+  /**
+   * Converts the list of jars into local resources
+   */
+  Map<String,LocalResource> getExtraLocalResources(JobConf jobConf, Path scratchDir,
+      String[] inputOutputJars) throws Exception {
+    final Map<String,LocalResource> resources = new HashMap<String,LocalResource>();
+    final List<LocalResource> localResources = utils.localizeTempFiles(
+        scratchDir.toString(), jobConf, inputOutputJars);
+    if (null != localResources) {
+      for (LocalResource lr : localResources) {
+        resources.put(utils.getBaseName(lr), lr);
+      }
+    }
+    return resources;
+  }
+
+  /**
+   * Ensures that the Tez Session is open and the AM has all necessary jars configured.
+   */
+  void updateSession(TezSessionState session,
+      JobConf jobConf, Path scratchDir, String[] inputOutputJars,
+      Map<String,LocalResource> extraResources) throws Exception {
+    final boolean missingLocalResources = !session
+        .hasResources(inputOutputJars);
+
+    if (!session.isOpen()) {
+      // can happen if the user sets the tez flag after the session was
+      // established
+      LOG.info("Tez session hasn't been created yet. Opening session");
+      session.open(conf, inputOutputJars);
+    } else {
+      LOG.info("Session is already open");
+
+      // Ensure the open session has the necessary resources (StorageHandler)
+      if (missingLocalResources) {
+        LOG.info("Tez session missing resources," +
+            " adding additional necessary resources");
+        session.getSession().addAppMasterLocalFiles(extraResources);
+      }
+
+      session.refreshLocalResourcesFromConf(conf);
+    }
+  }
+
+  /**
+   * Adds any necessary resources that must be localized in each vertex to the DAG.
+   */
+  void addExtraResourcesToDag(TezSessionState session, DAG dag,
+      String[] inputOutputJars,
+      Map<String,LocalResource> inputOutputLocalResources) throws Exception {
+    if (!session.hasResources(inputOutputJars)) {
+      if (null != inputOutputLocalResources) {
+        dag.addTaskLocalFiles(inputOutputLocalResources);
+      }
+    }
+  }
+
   DAG build(JobConf conf, TezWork work, Path scratchDir,
       LocalResource appJarLr, List<LocalResource> additionalLr, Context ctx)
       throws Exception {
@@ -254,15 +314,16 @@ public class TezTask extends Task<TezWor
         for (BaseWork v: children) {
           // finally we can create the grouped edge
           GroupInputEdge e = utils.createEdge(group, parentConf,
-               workToVertex.get(v), work.getEdgeProperty(w, v));
+               workToVertex.get(v), work.getEdgeProperty(w, v), work.getVertexType(v));
 
           dag.addEdge(e);
         }
       } else {
         // Regular vertices
         JobConf wxConf = utils.initializeVertexConf(conf, ctx, w);
-        Vertex wx = utils.createVertex(wxConf, w, scratchDir, appJarLr,
-          additionalLr, fs, ctx, !isFinal, work);
+        Vertex wx =
+            utils.createVertex(wxConf, w, scratchDir, appJarLr, additionalLr, fs, ctx, !isFinal,
+                work, work.getVertexType(w));
         dag.addVertex(wx);
         utils.addCredentials(w, dag);
         perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.TEZ_CREATE_VERTEX + w.getName());
@@ -276,7 +337,7 @@ public class TezTask extends Task<TezWor
 
           TezEdgeProperty edgeProp = work.getEdgeProperty(w, v);
 
-          e = utils.createEdge(wxConf, wx, workToVertex.get(v), edgeProp);
+          e = utils.createEdge(wxConf, wx, workToVertex.get(v), edgeProp, work.getVertexType(v));
           dag.addEdge(e);
         }
       }
@@ -287,7 +348,8 @@ public class TezTask extends Task<TezWor
 
   DAGClient submit(JobConf conf, DAG dag, Path scratchDir,
       LocalResource appJarLr, TezSessionState sessionState,
-      List<LocalResource> additionalLr)
+      List<LocalResource> additionalLr, String[] inputOutputJars,
+      Map<String,LocalResource> inputOutputLocalResources)
       throws Exception {
 
     perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.TEZ_SUBMIT_DAG);
@@ -308,7 +370,7 @@ public class TezTask extends Task<TezWor
       console.printInfo("Tez session was closed. Reopening...");
 
       // close the old one, but keep the tmp files around
-      TezSessionPoolManager.getInstance().closeAndOpen(sessionState, this.conf);
+      TezSessionPoolManager.getInstance().closeAndOpen(sessionState, this.conf, inputOutputJars, true);
       console.printInfo("Session re-established.");
 
       dagClient = sessionState.getSession().submitDAG(dag);
@@ -326,6 +388,9 @@ public class TezTask extends Task<TezWor
     try {
       List<BaseWork> ws = work.getAllWork();
       for (BaseWork w: ws) {
+        if (w instanceof MergeJoinWork) {
+          w = ((MergeJoinWork) w).getMainWork();
+        }
         for (Operator<?> op: w.getAllOperators()) {
           op.jobClose(conf, rc == 0);
         }
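
For readers following the new getExtraLocalResources/updateSession/addExtraResourcesToDag
methods above: the overall pattern is to localize the extra input/output jars once, key them
by base name, and register them only when the session or DAG does not already have them.
A minimal standalone sketch of that pattern in plain Java follows; Resource and Session are
hypothetical stand-ins, not the Hive/Tez API.

    import java.util.HashMap;
    import java.util.Map;

    final class ExtraResourceSketch {

      // Hypothetical stand-ins for LocalResource and TezSessionState.
      static final class Resource { final String path; Resource(String path) { this.path = path; } }
      interface Session {
        boolean hasResources(String[] jars);
        void addResources(Map<String, Resource> resources);
      }

      /** Keys each jar by its base name so duplicate paths collapse to one resource. */
      static Map<String, Resource> toResourceMap(String[] jars) {
        Map<String, Resource> resources = new HashMap<>();
        if (jars == null) {
          return resources;
        }
        for (String jar : jars) {
          String baseName = jar.substring(jar.lastIndexOf('/') + 1);
          resources.put(baseName, new Resource(jar));
        }
        return resources;
      }

      /** Registers the extra jars only when the session is missing them. */
      static void ensureResources(Session session, String[] jars) {
        if (!session.hasResources(jars)) {
          session.addResources(toResourceMap(jars));
        }
      }
    }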

Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/tools/TezMergedLogicalInput.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/tools/TezMergedLogicalInput.java?rev=1629563&r1=1629562&r2=1629563&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/tools/TezMergedLogicalInput.java (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/tools/TezMergedLogicalInput.java Mon Oct  6 04:00:39 2014
@@ -40,7 +40,7 @@ public class TezMergedLogicalInput exten
  
   @Override
   public Reader getReader() throws Exception {
-    return new InputMerger(getInputs());
+    return new KeyValuesInputMerger(getInputs());
   }
 
   @Override

Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorColumnAssignFactory.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorColumnAssignFactory.java?rev=1629563&r1=1629562&r2=1629563&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorColumnAssignFactory.java (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorColumnAssignFactory.java Mon Oct  6 04:00:39 2014
@@ -24,7 +24,9 @@ import java.util.List;
 import java.util.Map;
 
 import org.apache.hadoop.hive.common.type.Decimal128;
+import org.apache.hadoop.hive.common.type.HiveChar;
 import org.apache.hadoop.hive.common.type.HiveDecimal;
+import org.apache.hadoop.hive.common.type.HiveVarchar;
 import org.apache.hadoop.hive.ql.metadata.HiveException;
 import org.apache.hadoop.hive.serde2.io.ByteWritable;
 import org.apache.hadoop.hive.serde2.io.DateWritable;
@@ -404,8 +406,7 @@ public class VectorColumnAssignFactory {
           public void assignObjectValue(Object val, int destIndex) throws HiveException {
             if (val == null) {
               assignNull(destIndex);
-            }
-            else {
+            } else {
               Text bw = (Text) val;
               byte[] bytes = bw.getBytes();
               assignBytes(bytes, 0, bw.getLength(), destIndex);
@@ -413,6 +414,35 @@ public class VectorColumnAssignFactory {
           }
         }.init(outputBatch, (BytesColumnVector) destCol);
         break;
+      case VARCHAR:
+        outVCA = new VectorBytesColumnAssign() {
+          @Override
+          public void assignObjectValue(Object val, int destIndex) throws HiveException {
+            if (val == null) {
+              assignNull(destIndex);
+            } else {
+              HiveVarchar hiveVarchar = (HiveVarchar) val;
+              byte[] bytes = hiveVarchar.getValue().getBytes();
+              assignBytes(bytes, 0, bytes.length, destIndex);
+            }
+          }
+        }.init(outputBatch, (BytesColumnVector) destCol);
+        break;
+      case CHAR:
+        outVCA = new VectorBytesColumnAssign() {
+          @Override
+          public void assignObjectValue(Object val, int destIndex) throws HiveException {
+            if (val == null) {
+              assignNull(destIndex);
+            } else {
+              // We store CHAR type stripped of pads.
+              HiveChar hiveChar = (HiveChar) val;
+              byte[] bytes = hiveChar.getStrippedValue().getBytes();
+              assignBytes(bytes, 0, bytes.length, destIndex);
+            }
+          }
+        }.init(outputBatch, (BytesColumnVector) destCol);
+        break;
       default:
         throw new HiveException("Incompatible Bytes vector column and primitive category " +
             category);
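
The new VARCHAR and CHAR cases differ only in how the bytes are obtained: VARCHAR stores its
value as-is, while CHAR is stored with trailing pad spaces removed (getStrippedValue). A rough
standalone illustration of the stripping step, not the HiveChar implementation:

    import java.nio.charset.StandardCharsets;

    final class CharPadSketch {

      /** Returns the bytes of a fixed-length CHAR value with trailing pad spaces removed. */
      static byte[] strippedCharBytes(String paddedValue) {
        int end = paddedValue.length();
        while (end > 0 && paddedValue.charAt(end - 1) == ' ') {
          end--;
        }
        return paddedValue.substring(0, end).getBytes(StandardCharsets.UTF_8);
      }

      public static void main(String[] args) {
        // A CHAR(5) value "ab   " is stored as the two bytes of "ab".
        System.out.println(strippedCharBytes("ab   ").length); // prints 2
      }
    }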

Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorExpressionDescriptor.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorExpressionDescriptor.java?rev=1629563&r1=1629562&r2=1629563&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorExpressionDescriptor.java (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorExpressionDescriptor.java Mon Oct  6 04:00:39 2014
@@ -67,6 +67,7 @@ public class VectorExpressionDescriptor 
     DATE                    (0x040),
     TIMESTAMP               (0x080),
     DATETIME_FAMILY         (DATE.value | TIMESTAMP.value),
+    INT_TIMESTAMP_FAMILY    (INT_FAMILY.value | TIMESTAMP.value),
     INT_DATETIME_FAMILY     (INT_FAMILY.value | DATETIME_FAMILY.value),
     STRING_DATETIME_FAMILY  (STRING_FAMILY.value | DATETIME_FAMILY.value),
     ALL_FAMILY              (0xFFF);
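
Each ArgumentType above is a bit flag, and a *_FAMILY entry is the OR of its members, so a
membership test reduces to a bitwise AND. A small sketch of that check; the DATE and TIMESTAMP
values match the enum above, while the INT_FAMILY value here is only an illustrative placeholder:

    final class ArgTypeFamilySketch {

      static final int INT_FAMILY = 0x001; // assumed placeholder bit for the sketch
      static final int DATE = 0x040;
      static final int TIMESTAMP = 0x080;
      static final int INT_TIMESTAMP_FAMILY = INT_FAMILY | TIMESTAMP;

      /** A concrete type belongs to a family when its bit is set in the family mask. */
      static boolean isMember(int family, int type) {
        return (family & type) != 0;
      }

      public static void main(String[] args) {
        System.out.println(isMember(INT_TIMESTAMP_FAMILY, TIMESTAMP)); // true
        System.out.println(isMember(INT_TIMESTAMP_FAMILY, DATE));      // false
      }
    }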

Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorExtractOperator.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorExtractOperator.java?rev=1629563&r1=1629562&r2=1629563&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorExtractOperator.java (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorExtractOperator.java Mon Oct  6 04:00:39 2014
@@ -19,6 +19,7 @@
 package org.apache.hadoop.hive.ql.exec.vector;
 
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.List;
 
 import org.apache.hadoop.conf.Configuration;
@@ -45,7 +46,8 @@ public class VectorExtractOperator exten
   private int keyColCount;
   private int valueColCount;
   
-  private transient int [] projectedColumns = null;
+  private transient VectorizedRowBatch outputBatch;
+  private transient int remainingColCount;
 
   public VectorExtractOperator(VectorizationContext vContext, OperatorDesc conf)
       throws HiveException {
@@ -57,26 +59,25 @@ public class VectorExtractOperator exten
     super();
   }
 
-  private StructObjectInspector makeStandardStructObjectInspector(StructObjectInspector structObjectInspector) {
-    List<? extends StructField> fields = structObjectInspector.getAllStructFieldRefs();
+  @Override
+  protected void initializeOp(Configuration hconf) throws HiveException {
+    StructObjectInspector structInputObjInspector = (StructObjectInspector) inputObjInspectors[0];
+    List<? extends StructField> fields = structInputObjInspector.getAllStructFieldRefs();
     ArrayList<ObjectInspector> ois = new ArrayList<ObjectInspector>();
     ArrayList<String> colNames = new ArrayList<String>();
-    for (StructField field: fields) {
-      colNames.add(field.getFieldName());
+    for (int i = keyColCount; i < fields.size(); i++) {
+      StructField field = fields.get(i);
+      String fieldName = field.getFieldName();
+
+      // Remove "VALUE." prefix.
+      int dotIndex = fieldName.indexOf(".");
+      colNames.add(fieldName.substring(dotIndex + 1));
       ois.add(field.getFieldObjectInspector());
     }
-    return ObjectInspectorFactory
+    outputObjInspector = ObjectInspectorFactory
               .getStandardStructObjectInspector(colNames, ois);
-    }
- 
-  @Override
-  protected void initializeOp(Configuration hconf) throws HiveException {
-    outputObjInspector = inputObjInspectors[0];
-    LOG.info("VectorExtractOperator class of outputObjInspector is " + outputObjInspector.getClass().getName());
-    projectedColumns = new int [valueColCount];
-    for (int i = 0; i < valueColCount; i++) {
-      projectedColumns[i] = keyColCount + i;
-    }
+    remainingColCount = fields.size() - keyColCount;
+    outputBatch = new VectorizedRowBatch(remainingColCount);
     initializeChildren(hconf);
   }
 
@@ -86,20 +87,16 @@ public class VectorExtractOperator exten
   }
   
   @Override
-  // Evaluate vectorized batches of rows and forward them.
+  // Remove the key columns and forward the values (and scratch columns).
   public void processOp(Object row, int tag) throws HiveException {
-    VectorizedRowBatch vrg = (VectorizedRowBatch) row;
+    VectorizedRowBatch inputBatch = (VectorizedRowBatch) row;
+
+    // Copy references to the input columns array starting after the keys...
+    for (int i = 0; i < remainingColCount; i++) {
+      outputBatch.cols[i] = inputBatch.cols[keyColCount + i];
+    }
+    outputBatch.size = inputBatch.size;
 
-    // Project away the key columns...
-    int[] originalProjections = vrg.projectedColumns;
-    int originalProjectionSize = vrg.projectionSize;
-    vrg.projectionSize = valueColCount;
-    vrg.projectedColumns = this.projectedColumns;
-
-    forward(vrg, outputObjInspector);
-
-    // Revert the projected columns back, because vrg will be re-used.
-    vrg.projectionSize = originalProjectionSize;
-    vrg.projectedColumns = originalProjections;
+    forward(outputBatch, outputObjInspector);
   }
 }
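
The rewritten VectorExtractOperator no longer swaps the projection arrays of the incoming batch;
it builds a smaller output batch whose columns are references to the value columns (everything
after the first keyColCount columns) and strips the reduce-side "VALUE." prefix from field names.
A simplified sketch of those two steps, using plain arrays instead of VectorizedRowBatch:

    final class DropKeyColumnsSketch {

      /** Copies references to the columns that follow the first keyColCount key columns. */
      static Object[] valueColumns(Object[] allColumns, int keyColCount) {
        Object[] values = new Object[allColumns.length - keyColCount];
        for (int i = 0; i < values.length; i++) {
          values[i] = allColumns[keyColCount + i]; // reference copy only, no row data is moved
        }
        return values;
      }

      /** Strips a reduce-side prefix such as "VALUE." from a field name. */
      static String stripPrefix(String fieldName) {
        int dotIndex = fieldName.indexOf('.');
        return fieldName.substring(dotIndex + 1); // "VALUE._col0" -> "_col0"; unchanged if no dot
      }
    }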

Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorFileSinkOperator.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorFileSinkOperator.java?rev=1629563&r1=1629562&r2=1629563&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorFileSinkOperator.java (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorFileSinkOperator.java Mon Oct  6 04:00:39 2014
@@ -18,8 +18,6 @@
 
 package org.apache.hadoop.hive.ql.exec.vector;
 
-import java.io.IOException;
-
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
 import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpressionWriter;
@@ -27,16 +25,7 @@ import org.apache.hadoop.hive.ql.exec.ve
 import org.apache.hadoop.hive.ql.metadata.HiveException;
 import org.apache.hadoop.hive.ql.plan.FileSinkDesc;
 import org.apache.hadoop.hive.ql.plan.OperatorDesc;
-import org.apache.hadoop.hive.common.StatsSetupConst;
-import org.apache.hadoop.hive.serde2.SerDeException;
-import org.apache.hadoop.hive.serde2.SerDeStats;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption;
 import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
-import org.apache.hadoop.io.ObjectWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.Writable;
 
 /**
  * File Sink operator implementation.
@@ -69,113 +58,10 @@ public class VectorFileSinkOperator exte
 
   @Override
   public void processOp(Object data, int tag) throws HiveException {
-
     VectorizedRowBatch vrg = (VectorizedRowBatch)data;
-
-    Writable [] records = null;
-    boolean vectorizedSerde = false;
-    try {
-      if (serializer instanceof VectorizedSerde) {
-        recordValue = ((VectorizedSerde) serializer).serializeVector(vrg,
-            inputObjInspectors[0]);
-        records = (Writable[]) ((ObjectWritable) recordValue).get();
-        vectorizedSerde = true;
-      }
-    } catch (SerDeException e1) {
-      throw new HiveException(e1);
-    }
-
     for (int i = 0; i < vrg.size; i++) {
-      Writable row = null;
-      if (vectorizedSerde) {
-        row = records[i];
-      } else {
-        if (vrg.valueWriters == null) {
-          vrg.setValueWriters(this.valueWriters);
-        }
-        try {
-          row = serializer.serialize(getRowObject(vrg, i), inputObjInspectors[0]);
-        } catch (SerDeException ex) {
-          throw new HiveException(ex);
-        }
-      }
-    /* Create list bucketing sub-directory only if stored-as-directories is on. */
-    String lbDirName = null;
-    lbDirName = (lbCtx == null) ? null : generateListBucketingDirName(row);
-
-    FSPaths fpaths;
-
-    if (!bDynParts && !filesCreated) {
-      if (lbDirName != null) {
-        FSPaths fsp2 = lookupListBucketingPaths(lbDirName);
-      } else {
-        createBucketFiles(fsp);
-      }
-    }
-
-    try {
-      updateProgress();
-
-      // if DP is enabled, get the final output writers and prepare the real output row
-      assert inputObjInspectors[0].getCategory() == ObjectInspector.Category.STRUCT : "input object inspector is not struct";
-
-      if (bDynParts) {
-        // copy the DP column values from the input row to dpVals
-        dpVals.clear();
-        dpWritables.clear();
-        ObjectInspectorUtils.partialCopyToStandardObject(dpWritables, row, dpStartCol, numDynParts,
-            (StructObjectInspector) inputObjInspectors[0], ObjectInspectorCopyOption.WRITABLE);
-        // get a set of RecordWriter based on the DP column values
-        // pass the null value along to the escaping process to determine what the dir should be
-        for (Object o : dpWritables) {
-          if (o == null || o.toString().length() == 0) {
-            dpVals.add(dpCtx.getDefaultPartitionName());
-          } else {
-            dpVals.add(o.toString());
-          }
-        }
-        fpaths = getDynOutPaths(dpVals, lbDirName);
-
-      } else {
-        if (lbDirName != null) {
-          fpaths = lookupListBucketingPaths(lbDirName);
-        } else {
-          fpaths = fsp;
-        }
-      }
-
-      rowOutWriters = fpaths.getOutWriters();
-      // check if all record writers implement statistics. if atleast one RW
-      // doesn't implement stats interface we will fallback to conventional way
-      // of gathering stats
-      isCollectRWStats = areAllTrue(statsFromRecordWriter);
-      if (conf.isGatherStats() && !isCollectRWStats) {
-        if (statsCollectRawDataSize) {
-          SerDeStats stats = serializer.getSerDeStats();
-          if (stats != null) {
-            fpaths.getStat().addToStat(StatsSetupConst.RAW_DATA_SIZE, stats.getRawDataSize());
-          }
-        }
-        fpaths.getStat().addToStat(StatsSetupConst.ROW_COUNT, 1);
-      }
-
-
-      if (row_count != null) {
-        row_count.set(row_count.get() + 1);
-      }
-
-      if (!multiFileSpray) {
-        rowOutWriters[0].write(row);
-      } else {
-        int keyHashCode = 0;
-        key.setHashCode(keyHashCode);
-        int bucketNum = prtner.getBucket(key, null, totalFiles);
-        int idx = bucketMap.get(bucketNum);
-        rowOutWriters[idx].write(row);
-      }
-    } catch (IOException e) {
-      throw new HiveException(e);
-    }
+      Object[] row = getRowObject(vrg, i);
+      super.processOp(row, tag);
     }
   }
 
@@ -187,7 +73,7 @@ public class VectorFileSinkOperator exte
     }
     for (int i = 0; i < vrg.projectionSize; i++) {
       ColumnVector vectorColumn = vrg.cols[vrg.projectedColumns[i]];
-      singleRow[i] = vrg.valueWriters[i].writeValue(vectorColumn, batchIndex);
+      singleRow[i] = valueWriters[i].writeValue(vectorColumn, batchIndex);
     }
     return singleRow;
   }
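
The simplified VectorFileSinkOperator.processOp no longer duplicates the file-sink write path:
it materializes each row of the vectorized batch as an Object[] (via getRowObject) and hands it
to the row-mode parent. A minimal sketch of that delegation pattern, with a hypothetical
column-major batch in place of VectorizedRowBatch:

    // Hypothetical row-mode sink; the "vectorized" subclass only converts batch rows.
    abstract class RowSinkSketch {
      abstract void writeRow(Object[] row);

      /** Turns a column-major batch of `size` rows into rows and reuses the row-mode path. */
      void processBatch(Object[][] columns, int size) {
        for (int r = 0; r < size; r++) {
          Object[] row = new Object[columns.length];
          for (int c = 0; c < columns.length; c++) {
            row[c] = columns[c][r];
          }
          writeRow(row);
        }
      }
    }

    final class VectorSinkDemo {
      public static void main(String[] args) {
        RowSinkSketch sink = new RowSinkSketch() {
          @Override void writeRow(Object[] row) { System.out.println(java.util.Arrays.toString(row)); }
        };
        Object[][] cols = { {1, 2, 3}, {"a", "b", "c"} }; // 2 columns, 3 rows
        sink.processBatch(cols, 3);
      }
    }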

Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorGroupByOperator.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorGroupByOperator.java?rev=1629563&r1=1629562&r2=1629563&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorGroupByOperator.java (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorGroupByOperator.java Mon Oct  6 04:00:39 2014
@@ -653,6 +653,21 @@ public class VectorGroupByOperator exten
   /**
    * Sorted reduce group batch processing mode. Each input VectorizedRowBatch will have the
    * same key.  On endGroup (or close), the intermediate values are flushed.
+   *
+   * We build the output rows one-at-a-time in the output vectorized row batch (outputBatch)
+   * in 2 steps:
+   *
+   *   1) Just after startGroup, we copy the group key to the next position in the output batch,
+   *      but don't increment the size in the batch (yet).  This is done with the copyGroupKey
+   *      method of VectorGroupKeyHelper.  The next position is outputBatch.size.
+   *
+   *      We know the same key is used for the whole batch (i.e. repeating) since that is how
+   *      vectorized reduce-shuffle feeds the batches to us.
+   *
+   *   2) Later at endGroup after reduce-shuffle has fed us all the input batches for the group,
+   *      we fill in the aggregation columns in outputBatch at outputBatch.size.  Our method 
+   *      writeGroupRow does this and finally increments outputBatch.size.
+   *
    */
   private class ProcessingModeGroupBatches extends ProcessingModeBase {
 

Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorGroupKeyHelper.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorGroupKeyHelper.java?rev=1629563&r1=1629562&r2=1629563&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorGroupKeyHelper.java (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorGroupKeyHelper.java Mon Oct  6 04:00:39 2014
@@ -42,19 +42,38 @@ public class VectorGroupKeyHelper extend
     finishAdding();
   }
 
+  /*
+   * This helper method copies the group keys from one vectorized row batch to another,
+   * but does not increment outputBatch.size (i.e. the next output position).
+   * 
+   * It was designed for VectorGroupByOperator's sorted reduce group batch processing mode
+   * to copy the group keys at startGroup.
+   */
   public void copyGroupKey(VectorizedRowBatch inputBatch, VectorizedRowBatch outputBatch,
           DataOutputBuffer buffer) throws HiveException {
-    // Grab the key at index 0.  We don't care about selected or repeating since all keys in the input batch are the same.
     for(int i = 0; i< longIndices.length; ++i) {
       int keyIndex = longIndices[i];
       LongColumnVector inputColumnVector = (LongColumnVector) inputBatch.cols[keyIndex];
       LongColumnVector outputColumnVector = (LongColumnVector) outputBatch.cols[keyIndex];
+
+      // This vectorized code pattern says: 
+      //    If the input batch has no nulls at all (noNulls is true) OR
+      //    the input row is NOT NULL, copy the value.
+      //
+      //    Otherwise, we have a NULL input value.  The standard way to mark a NULL in the
+      //    output batch is: turn off noNulls indicating there is at least one NULL in the batch
+      //    and mark that row as NULL.
+      //
+      //    When a vectorized row batch is reset, noNulls is set to true and the isNull array
+      //    is zeroed.
+      //
+      // We grab the key at index 0.  We don't care about selected or repeating since all keys
+      // in the input batch are supposed to be the same.
+      //
       if (inputColumnVector.noNulls || !inputColumnVector.isNull[0]) {
         outputColumnVector.vector[outputBatch.size] = inputColumnVector.vector[0];
-      } else if (inputColumnVector.noNulls ){
-        outputColumnVector.noNulls = false;
-        outputColumnVector.isNull[outputBatch.size] = true;
       } else {
+        outputColumnVector.noNulls = false;
         outputColumnVector.isNull[outputBatch.size] = true;
       }
     }
@@ -64,10 +83,8 @@ public class VectorGroupKeyHelper extend
       DoubleColumnVector outputColumnVector = (DoubleColumnVector) outputBatch.cols[keyIndex];
       if (inputColumnVector.noNulls || !inputColumnVector.isNull[0]) {
         outputColumnVector.vector[outputBatch.size] = inputColumnVector.vector[0];
-      } else if (inputColumnVector.noNulls ){
-        outputColumnVector.noNulls = false;
-        outputColumnVector.isNull[outputBatch.size] = true;
       } else {
+        outputColumnVector.noNulls = false;
         outputColumnVector.isNull[outputBatch.size] = true;
       }
     }
@@ -85,10 +102,8 @@ public class VectorGroupKeyHelper extend
           throw new IllegalStateException("bad write", ioe);
         }
         outputColumnVector.setRef(outputBatch.size, buffer.getData(), start, length);
-      } else if (inputColumnVector.noNulls ){
-        outputColumnVector.noNulls = false;
-        outputColumnVector.isNull[outputBatch.size] = true;
       } else {
+        outputColumnVector.noNulls = false;
         outputColumnVector.isNull[outputBatch.size] = true;
       }
     }
@@ -98,10 +113,8 @@ public class VectorGroupKeyHelper extend
       DecimalColumnVector outputColumnVector = (DecimalColumnVector) outputBatch.cols[keyIndex];
       if (inputColumnVector.noNulls || !inputColumnVector.isNull[0]) {
         outputColumnVector.vector[outputBatch.size] = inputColumnVector.vector[0];
-      } else if (inputColumnVector.noNulls ){
-        outputColumnVector.noNulls = false;
-        outputColumnVector.isNull[outputBatch.size] = true;
       } else {
+        outputColumnVector.noNulls = false;
         outputColumnVector.isNull[outputBatch.size] = true;
       }
     }
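
The repeated if/else above applies the same NULL-propagation pattern to each column type: copy
the value when the input batch has no NULLs at all or row 0 is not NULL; otherwise flip noNulls
off in the output and mark the output row NULL. A minimal standalone sketch of that pattern for
a single long column, with a simplified stand-in for LongColumnVector:

    final class NullAwareKeyCopySketch {

      /** Simplified stand-in for one long column of a vectorized row batch. */
      static final class LongColumn {
        long[] vector = new long[1024];
        boolean[] isNull = new boolean[1024];
        boolean noNulls = true; // reset() sets noNulls=true and zeroes isNull
      }

      /** Copies the (repeating) key at input row 0 into output row outIndex, propagating NULL. */
      static void copyKey(LongColumn in, LongColumn out, int outIndex) {
        if (in.noNulls || !in.isNull[0]) {
          out.vector[outIndex] = in.vector[0];
        } else {
          out.noNulls = false;          // at least one NULL now exists in the output batch
          out.isNull[outIndex] = true;  // mark this row NULL
        }
      }
    }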

Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorHashKeyWrapper.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorHashKeyWrapper.java?rev=1629563&r1=1629562&r2=1629563&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorHashKeyWrapper.java (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorHashKeyWrapper.java Mon Oct  6 04:00:39 2014
@@ -36,6 +36,12 @@ import org.apache.hadoop.hive.serde2.obj
  */
 public class VectorHashKeyWrapper extends KeyWrapper {
 
+  private static final int[] EMPTY_INT_ARRAY = new int[0];
+  private static final long[] EMPTY_LONG_ARRAY = new long[0];
+  private static final double[] EMPTY_DOUBLE_ARRAY = new double[0];
+  private static final byte[][] EMPTY_BYTES_ARRAY = new byte[0][];
+  private static final Decimal128[] EMPTY_DECIMAL_ARRAY = new Decimal128[0];
+
   private long[] longValues;
   private double[] doubleValues;
 
@@ -50,15 +56,21 @@ public class VectorHashKeyWrapper extend
 
   public VectorHashKeyWrapper(int longValuesCount, int doubleValuesCount,
           int byteValuesCount, int decimalValuesCount) {
-    longValues = new long[longValuesCount];
-    doubleValues = new double[doubleValuesCount];
-    decimalValues = new Decimal128[decimalValuesCount];
+    longValues = longValuesCount > 0 ? new long[longValuesCount] : EMPTY_LONG_ARRAY;
+    doubleValues = doubleValuesCount > 0 ? new double[doubleValuesCount] : EMPTY_DOUBLE_ARRAY;
+    decimalValues = decimalValuesCount > 0 ? new Decimal128[decimalValuesCount] : EMPTY_DECIMAL_ARRAY;
     for(int i = 0; i < decimalValuesCount; ++i) {
       decimalValues[i] = new Decimal128();
     }
-    byteValues = new byte[byteValuesCount][];
-    byteStarts = new int[byteValuesCount];
-    byteLengths = new int[byteValuesCount];
+    if (byteValuesCount > 0) {
+      byteValues = new byte[byteValuesCount][];
+      byteStarts = new int[byteValuesCount];
+      byteLengths = new int[byteValuesCount];
+    } else {
+      byteValues = EMPTY_BYTES_ARRAY;
+      byteStarts = EMPTY_INT_ARRAY;
+      byteLengths = EMPTY_INT_ARRAY;
+    }
     isNull = new boolean[longValuesCount + doubleValuesCount + byteValuesCount + decimalValuesCount];
     hashcode = 0;
   }
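
The EMPTY_* constants added above avoid allocating a fresh zero-length array for every key
wrapper whenever a value count is zero; since zero-length arrays are effectively immutable, one
shared instance is safe. A small sketch of the idiom for one array type:

    final class EmptyArraySketch {

      // One shared zero-length array instead of "new long[0]" per instance.
      private static final long[] EMPTY_LONG_ARRAY = new long[0];

      private final long[] longValues;

      EmptyArraySketch(int longValuesCount) {
        longValues = longValuesCount > 0 ? new long[longValuesCount] : EMPTY_LONG_ARRAY;
      }

      int count() {
        return longValues.length;
      }
    }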