You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by me...@apache.org on 2015/05/08 00:45:44 UTC
spark git commit: [SPARK-6948] [MLLIB] compress vectors in VectorAssembler
Repository: spark
Updated Branches:
refs/heads/master 658a478d3 -> e43803b8f
[SPARK-6948] [MLLIB] compress vectors in VectorAssembler
The compression is based on storage: the assembled vector is returned in whichever format (dense or sparse) uses less storage. cc @brkyvz
Author: Xiangrui Meng <me...@databricks.com>
Closes #5985 from mengxr/SPARK-6948 and squashes the following commits:
df56a00 [Xiangrui Meng] update python tests
6d90d45 [Xiangrui Meng] compress vectors in VectorAssembler
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e43803b8
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e43803b8
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e43803b8
Branch: refs/heads/master
Commit: e43803b8f477b2c8d28836ac163cb54328d13f1a
Parents: 658a478
Author: Xiangrui Meng <me...@databricks.com>
Authored: Thu May 7 15:45:37 2015 -0700
Committer: Xiangrui Meng <me...@databricks.com>
Committed: Thu May 7 15:45:37 2015 -0700
----------------------------------------------------------------------
.../org/apache/spark/ml/feature/VectorAssembler.scala | 2 +-
.../apache/spark/ml/feature/VectorAssemblerSuite.scala | 10 +++++++++-
python/pyspark/ml/feature.py | 6 +++---
3 files changed, 13 insertions(+), 5 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/e43803b8/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala
index b5a69ce..796758a 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala
@@ -102,6 +102,6 @@ object VectorAssembler {
case o =>
throw new SparkException(s"$o of type ${o.getClass.getName} is not supported.")
}
- Vectors.sparse(cur, indices.result(), values.result())
+ Vectors.sparse(cur, indices.result(), values.result()).compressed
}
}
http://git-wip-us.apache.org/repos/asf/spark/blob/e43803b8/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala
index 57d0278..0db2760 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala
@@ -20,7 +20,7 @@ package org.apache.spark.ml.feature
import org.scalatest.FunSuite
import org.apache.spark.SparkException
-import org.apache.spark.mllib.linalg.{Vector, Vectors}
+import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{Row, SQLContext}
@@ -48,6 +48,14 @@ class VectorAssemblerSuite extends FunSuite with MLlibTestSparkContext {
}
}
+ test("assemble should compress vectors") {
+ import org.apache.spark.ml.feature.VectorAssembler.assemble
+ val v1 = assemble(0.0, 0.0, 0.0, Vectors.dense(4.0))
+ assert(v1.isInstanceOf[SparseVector])
+ val v2 = assemble(1.0, 2.0, 3.0, Vectors.sparse(1, Array(0), Array(4.0)))
+ assert(v2.isInstanceOf[DenseVector])
+ }
+
test("VectorAssembler") {
val df = sqlContext.createDataFrame(Seq(
(0, 0.0, Vectors.dense(1.0, 2.0), "a", Vectors.sparse(2, Array(1), Array(3.0)), 10L)
http://git-wip-us.apache.org/repos/asf/spark/blob/e43803b8/python/pyspark/ml/feature.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 8a0fddd..705a368 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -121,12 +121,12 @@ class VectorAssembler(JavaTransformer, HasInputCols, HasOutputCol):
>>> df = sc.parallelize([Row(a=1, b=0, c=3)]).toDF()
>>> vecAssembler = VectorAssembler(inputCols=["a", "b", "c"], outputCol="features")
>>> vecAssembler.transform(df).head().features
- SparseVector(3, {0: 1.0, 2: 3.0})
+ DenseVector([1.0, 0.0, 3.0])
>>> vecAssembler.setParams(outputCol="freqs").transform(df).head().freqs
- SparseVector(3, {0: 1.0, 2: 3.0})
+ DenseVector([1.0, 0.0, 3.0])
>>> params = {vecAssembler.inputCols: ["b", "a"], vecAssembler.outputCol: "vector"}
>>> vecAssembler.transform(df, params).head().vector
- SparseVector(2, {1: 1.0})
+ DenseVector([0.0, 1.0])
"""
_java_class = "org.apache.spark.ml.feature.VectorAssembler"
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org