You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by li...@apache.org on 2016/11/07 03:25:27 UTC

hive git commit: HIVE-15054: Hive insertion query execution fails on Hive on Spark (Aihua Xu via Rui Li)

Repository: hive
Updated Branches:
  refs/heads/master 876ecdc6e -> 0951c9c64


HIVE-15054: Hive insertion query execution fails on Hive on Spark (Aihua Xu via Rui Li)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/0951c9c6
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/0951c9c6
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/0951c9c6

Branch: refs/heads/master
Commit: 0951c9c6443fda41e9e4ab5f8302a043f564a5d8
Parents: 876ecdc
Author: Aihua Xu <ax...@cloudera.com>
Authored: Mon Nov 7 11:25:17 2016 +0800
Committer: Rui Li <sh...@cn.ibm.com>
Committed: Mon Nov 7 11:25:17 2016 +0800

----------------------------------------------------------------------
 .../hive/ql/exec/spark/HivePairFlatMapFunction.java       | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/0951c9c6/ql/src/java/org/apache/hadoop/hive/ql/exec/spark/HivePairFlatMapFunction.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/spark/HivePairFlatMapFunction.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/spark/HivePairFlatMapFunction.java
index 7df626b..4f1b7d7 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/spark/HivePairFlatMapFunction.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/spark/HivePairFlatMapFunction.java
@@ -70,10 +70,14 @@ public abstract class HivePairFlatMapFunction<T, K, V> implements PairFlatMapFun
       taskAttemptIdBuilder.append("r_");
     }
 
-    // Spark's task attempt id is incremented by the Spark context rather than per task, which may
-    // introduce unstable qtest output; since no non-Hive feature depends on this, we always set it to 0 here.
+    // Hive requires this TaskAttemptId to be unique. MR's TaskAttemptId is composed
+    // of "attempt_timestamp_jobNum_m/r_taskNum_attemptNum". The counterpart for
+    // Spark should be "attempt_timestamp_stageNum_m/r_partitionId_attemptNum".
+    // When there are multiple attempts for a task, Hive relies on the partitionId
+    // to determine whether the data is duplicated when collecting the final outputs
+    // (see org.apache.hadoop.hive.ql.exec.Utilities.removeTempOrDuplicateFiles)
     taskAttemptIdBuilder.append(taskIdFormat.format(TaskContext.get().partitionId()))
-      .append("_0");
+      .append("_").append(TaskContext.get().attemptNumber());
 
     String taskAttemptIdStr = taskAttemptIdBuilder.toString();
     jobConf.set("mapred.task.id", taskAttemptIdStr);