You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by li...@apache.org on 2016/11/07 03:25:27 UTC
hive git commit: HIVE-15054: Hive insertion query execution fails on Hive on Spark (Aihua Xu via Rui Li)
Repository: hive
Updated Branches:
refs/heads/master 876ecdc6e -> 0951c9c64
HIVE-15054: Hive insertion query execution fails on Hive on Spark (Aihua Xu via Rui Li)
Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/0951c9c6
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/0951c9c6
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/0951c9c6
Branch: refs/heads/master
Commit: 0951c9c6443fda41e9e4ab5f8302a043f564a5d8
Parents: 876ecdc
Author: Aihua Xu <ax...@cloudera.com>
Authored: Mon Nov 7 11:25:17 2016 +0800
Committer: Rui Li <sh...@cn.ibm.com>
Committed: Mon Nov 7 11:25:17 2016 +0800
----------------------------------------------------------------------
.../hive/ql/exec/spark/HivePairFlatMapFunction.java | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hive/blob/0951c9c6/ql/src/java/org/apache/hadoop/hive/ql/exec/spark/HivePairFlatMapFunction.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/spark/HivePairFlatMapFunction.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/spark/HivePairFlatMapFunction.java
index 7df626b..4f1b7d7 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/spark/HivePairFlatMapFunction.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/spark/HivePairFlatMapFunction.java
@@ -70,10 +70,14 @@ public abstract class HivePairFlatMapFunction<T, K, V> implements PairFlatMapFun
taskAttemptIdBuilder.append("r_");
}
- // Spark task attempt id is increased by Spark context instead of task, which may introduce
- // unstable qtest output, since non Hive features depends on this, we always set it to 0 here.
+ // Hive requires this TaskAttemptId to be unique. MR's TaskAttemptId is composed
+ // of "attempt_timestamp_jobNum_m/r_taskNum_attemptNum". The counterpart for
+ // Spark should be "attempt_timestamp_stageNum_m/r_partitionId_attemptNum".
+ // When there're multiple attempts for a task, Hive will rely on the partitionId
+ // to figure out if the data are duplicate or not when collecting the final outputs
+ // (see org.apache.hadoop.hive.ql.exec.Utils.removeTempOrDuplicateFiles)
taskAttemptIdBuilder.append(taskIdFormat.format(TaskContext.get().partitionId()))
- .append("_0");
+ .append("_").append(TaskContext.get().attemptNumber());
String taskAttemptIdStr = taskAttemptIdBuilder.toString();
jobConf.set("mapred.task.id", taskAttemptIdStr);