Posted to commits@spark.apache.org by we...@apache.org on 2016/09/06 08:06:03 UTC
spark git commit: [SPARK-17356][SQL] Fix out of memory issue when generating JSON for TreeNode
Repository: spark
Updated Branches:
refs/heads/master c0ae6bc6e -> 6f13aa7df
[SPARK-17356][SQL] Fix out of memory issue when generating JSON for TreeNode
## What changes were proposed in this pull request?
The class `org.apache.spark.sql.types.Metadata` is widely used in MLlib to store ML attributes. `Metadata` is commonly stored in the `Alias` expression:
```
case class Alias(child: Expression, name: String)(
    val exprId: ExprId = NamedExpression.newExprId,
    val qualifier: Option[String] = None,
    val explicitMetadata: Option[Metadata] = None,
    override val isGenerated: java.lang.Boolean = false)
```
`Metadata` can have a large memory footprint when the number of attributes is large (on the scale of millions). When `toJSON` is called on an `Alias` expression, the `Metadata` is also converted to a large JSON string.
If a plan contains many such `Alias` expressions, calling `toJSON` may trigger an OutOfMemoryError, since converting all the `Metadata` references to JSON requires a huge amount of memory.
With this PR, we skip converting `Metadata` contents when doing the JSON conversion. For a reproducer of the OOM and an analysis, see JIRA https://issues.apache.org/jira/browse/SPARK-17356.
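A minimal sketch of such a reproducer (assuming a Spark 2.x shell where `spark` and `spark.implicits._` are available; the metadata key and sizes are illustrative, not the exact code from the JIRA):
```
import spark.implicits._
import org.apache.spark.sql.types.MetadataBuilder

// Attach a large Metadata payload to a column, standing in for MLlib's ML attributes.
val bigMeta = new MetadataBuilder()
  .putDoubleArray("values", Array.fill(1000000)(1.0))
  .build()

val df = spark.range(10).select($"id".as("id", bigMeta))
// Before this fix, every Alias serialized its full Metadata, so a plan with many
// such columns could exhaust the heap here:
val json = df.queryExecution.analyzed.toJSON
```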
## How was this patch tested?
Existing tests.
Author: Sean Zhong <se...@databricks.com>
Closes #14915 from clockfly/json_oom.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6f13aa7d
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6f13aa7d
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6f13aa7d
Branch: refs/heads/master
Commit: 6f13aa7dfee12b1b301bd10a1050549008ecc67e
Parents: c0ae6bc
Author: Sean Zhong <se...@databricks.com>
Authored: Tue Sep 6 16:05:50 2016 +0800
Committer: Wenchen Fan <we...@databricks.com>
Committed: Tue Sep 6 16:05:50 2016 +0800
----------------------------------------------------------------------
.../org/apache/spark/sql/catalyst/trees/TreeNode.scala | 4 +++-
.../src/test/scala/org/apache/spark/sql/QueryTest.scala | 10 +++++++++-
2 files changed, 12 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/6f13aa7d/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala
index 037f8cb..893af51 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala
@@ -618,7 +618,9 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] extends Product {
case s: String => JString(s)
case u: UUID => JString(u.toString)
case dt: DataType => dt.jsonValue
- case m: Metadata => m.jsonValue
+ // SPARK-17356: In MLlib usage, Metadata may store a huge vector of data; transforming
+ // it to JSON may trigger an OutOfMemoryError.
+ case m: Metadata => Metadata.empty.jsonValue
case s: StorageLevel =>
("useDisk" -> s.useDisk) ~ ("useMemory" -> s.useMemory) ~ ("useOffHeap" -> s.useOffHeap) ~
("deserialized" -> s.deserialized) ~ ("replication" -> s.replication)
http://git-wip-us.apache.org/repos/asf/spark/blob/6f13aa7d/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala
index c7af402..d361f61 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala
@@ -33,7 +33,7 @@ import org.apache.spark.sql.execution.aggregate.TypedAggregateExpression
import org.apache.spark.sql.execution.columnar.InMemoryRelation
import org.apache.spark.sql.execution.datasources.LogicalRelation
import org.apache.spark.sql.execution.streaming.MemoryPlan
-import org.apache.spark.sql.types.ObjectType
+import org.apache.spark.sql.types.{Metadata, ObjectType}
abstract class QueryTest extends PlanTest {
@@ -274,6 +274,14 @@ abstract class QueryTest extends PlanTest {
val normalized1 = logicalPlan.transformAllExpressions {
case udf: ScalaUDF => udf.copy(function = null)
case gen: UserDefinedGenerator => gen.copy(function = null)
+ // After SPARK-17356: the JSON representation no longer includes the Metadata. We need to
+ // remove the Metadata from the normalized plan so that we can compare this plan with the
+ // JSON-deserialized plan.
+ case a @ Alias(child, name) if a.explicitMetadata.isDefined =>
+ Alias(child, name)(a.exprId, a.qualifier, Some(Metadata.empty), a.isGenerated)
+ case a: AttributeReference if a.metadata != Metadata.empty =>
+ AttributeReference(a.name, a.dataType, a.nullable, Metadata.empty)(a.exprId, a.qualifier,
+ a.isGenerated)
}
// RDDs/data are not serializable to JSON, so we need to collect LogicalPlans that contains
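For context, a hedged sketch of why this normalization is needed (it uses catalyst-internal expression APIs, so it is illustrative rather than user-facing; the metadata key is made up). After the fix, a JSON round trip produces `Alias` and `AttributeReference` nodes carrying `Metadata.empty`, so the original plan must be stripped of metadata the same way before the two plans are compared:
```
import org.apache.spark.sql.catalyst.expressions.{Alias, Literal}
import org.apache.spark.sql.types.{Metadata, MetadataBuilder}

val meta = new MetadataBuilder().putString("ml_attr", "label").build() // hypothetical key
val original = Alias(Literal(1), "x")(explicitMetadata = Some(meta))

// toJSON now emits empty metadata for this Alias, so a plan deserialized from JSON
// will not carry `meta`. Normalizing the original to Metadata.empty, exactly as the
// test change above does, makes the two plans comparable:
val normalized = Alias(original.child, original.name)(
  original.exprId, original.qualifier, Some(Metadata.empty), original.isGenerated)
```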