You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by rx...@apache.org on 2016/03/15 06:26:00 UTC
spark git commit: [SPARK-13661][SQL] avoid the copy in HashedRelation
Repository: spark
Updated Branches:
refs/heads/master e76679a81 -> 9256840cb
[SPARK-13661][SQL] avoid the copy in HashedRelation
## What changes were proposed in this pull request?
Avoid the copy in HashedRelation, since most HashedRelations are built from an Array[Row]; the copy() is added to LeftSemiJoinHash instead, which builds its relation directly from an iterator. This helps reduce the memory consumption of broadcast joins.
## How was this patch tested?
Existing tests.
Author: Davies Liu <da...@databricks.com>
Closes #11666 from davies/remove_copy.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9256840c
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9256840c
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9256840c
Branch: refs/heads/master
Commit: 9256840cb631cad50852b2b218a1ac71b567084a
Parents: e76679a
Author: Davies Liu <da...@databricks.com>
Authored: Mon Mar 14 22:25:57 2016 -0700
Committer: Reynold Xin <rx...@databricks.com>
Committed: Mon Mar 14 22:25:57 2016 -0700
----------------------------------------------------------------------
.../spark/sql/execution/joins/HashedRelation.scala | 11 ++++++++---
.../spark/sql/execution/joins/LeftSemiJoinHash.scala | 2 +-
2 files changed, 9 insertions(+), 4 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/9256840c/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala
index 6235897..0b0f59c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala
@@ -156,6 +156,11 @@ private[joins] class UniqueKeyHashedRelation(
private[execution] object HashedRelation {
+ /**
+ * Create a HashedRelation from an Iterator of InternalRow.
+ *
+ * Note: The caller should make sure that these InternalRow are different objects.
+ */
def apply(
input: Iterator[InternalRow],
keyGenerator: Projection,
@@ -188,7 +193,7 @@ private[execution] object HashedRelation {
keyIsUnique = false
existingMatchList
}
- matchList += currentRow.copy()
+ matchList += currentRow
}
}
@@ -438,7 +443,7 @@ private[joins] object UnsafeHashedRelation {
} else {
existingMatchList
}
- matchList += unsafeRow.copy()
+ matchList += unsafeRow
}
}
@@ -622,7 +627,7 @@ private[joins] object LongHashedRelation {
keyIsUnique = false
existingMatchList
}
- matchList += unsafeRow.copy()
+ matchList += unsafeRow
}
}
http://git-wip-us.apache.org/repos/asf/spark/blob/9256840c/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/LeftSemiJoinHash.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/LeftSemiJoinHash.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/LeftSemiJoinHash.scala
index 242ed61..14389e4 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/LeftSemiJoinHash.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/LeftSemiJoinHash.scala
@@ -47,7 +47,7 @@ case class LeftSemiJoinHash(
val numOutputRows = longMetric("numOutputRows")
right.execute().zipPartitions(left.execute()) { (buildIter, streamIter) =>
- val hashRelation = HashedRelation(buildIter, rightKeyGenerator)
+ val hashRelation = HashedRelation(buildIter.map(_.copy()), rightKeyGenerator)
hashSemiJoin(streamIter, hashRelation, numOutputRows)
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org