You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by me...@apache.org on 2015/05/28 21:03:50 UTC
spark git commit: [SPARK-7911] [MLLIB] A workaround for VectorUDT
serialize (or deserialize) being called multiple times
Repository: spark
Updated Branches:
refs/heads/master 000df2f0d -> 530efe3e8
[SPARK-7911] [MLLIB] A workaround for VectorUDT serialize (or deserialize) being called multiple times
~~A PythonUDT shouldn't be serialized into external Scala types in PythonRDD. I'm not sure whether this should fix one of the bugs related to SQL UDT/UDF in PySpark.~~
The fix above didn't work, so I added a workaround instead. If a Python UDF is applied to a Python UDT, the workaround passes the Python SQL types through as inputs. This is still incorrect, but at least it doesn't throw exceptions on the Scala side. davies harsha2010
Author: Xiangrui Meng <me...@databricks.com>
Closes #6442 from mengxr/SPARK-7903 and squashes the following commits:
c257d2a [Xiangrui Meng] add a workaround for VectorUDT
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/530efe3e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/530efe3e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/530efe3e
Branch: refs/heads/master
Commit: 530efe3e80c62b25c869b85167e00330eb1ddea6
Parents: 000df2f
Author: Xiangrui Meng <me...@databricks.com>
Authored: Thu May 28 12:03:46 2015 -0700
Committer: Xiangrui Meng <me...@databricks.com>
Committed: Thu May 28 12:03:46 2015 -0700
----------------------------------------------------------------------
.../org/apache/spark/mllib/linalg/Vectors.scala | 19 ++++++++++++++-----
1 file changed, 14 insertions(+), 5 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/530efe3e/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
index f6bcdf8..2ffa497 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
@@ -176,27 +176,31 @@ private[spark] class VectorUDT extends UserDefinedType[Vector] {
}
override def serialize(obj: Any): Row = {
- val row = new GenericMutableRow(4)
obj match {
case SparseVector(size, indices, values) =>
+ val row = new GenericMutableRow(4)
row.setByte(0, 0)
row.setInt(1, size)
row.update(2, indices.toSeq)
row.update(3, values.toSeq)
+ row
case DenseVector(values) =>
+ val row = new GenericMutableRow(4)
row.setByte(0, 1)
row.setNullAt(1)
row.setNullAt(2)
row.update(3, values.toSeq)
+ row
+ // TODO: There are bugs in UDT serialization because we don't have a clear separation between
+ // TODO: internal SQL types and language specific types (including UDT). UDT serialize and
+ // TODO: deserialize may get called twice. See SPARK-7186.
+ case row: Row =>
+ row
}
- row
}
override def deserialize(datum: Any): Vector = {
datum match {
- // TODO: something wrong with UDT serialization
- case v: Vector =>
- v
case row: Row =>
require(row.length == 4,
s"VectorUDT.deserialize given row with length ${row.length} but requires length == 4")
@@ -211,6 +215,11 @@ private[spark] class VectorUDT extends UserDefinedType[Vector] {
val values = row.getAs[Iterable[Double]](3).toArray
new DenseVector(values)
}
+ // TODO: There are bugs in UDT serialization because we don't have a clear separation between
+ // TODO: internal SQL types and language specific types (including UDT). UDT serialize and
+ // TODO: deserialize may get called twice. See SPARK-7186.
+ case v: Vector =>
+ v
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org