You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by jk...@apache.org on 2015/06/21 21:05:00 UTC
spark git commit: [SPARK-7604] [MLLIB] Python API for PCA and PCAModel
Repository: spark
Updated Branches:
refs/heads/master a1e3649c8 -> 32e3cdaa6
[SPARK-7604] [MLLIB] Python API for PCA and PCAModel
Python API for PCA and PCAModel
Author: Yanbo Liang <yb...@gmail.com>
Closes #6315 from yanboliang/spark-7604 and squashes the following commits:
1d58734 [Yanbo Liang] remove transform() in PCAModel, use default behavior
4d9d121 [Yanbo Liang] Python API for PCA and PCAModel
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/32e3cdaa
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/32e3cdaa
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/32e3cdaa
Branch: refs/heads/master
Commit: 32e3cdaa647722671adcb5068bd5ffbf2f157806
Parents: a1e3649
Author: Yanbo Liang <yb...@gmail.com>
Authored: Sun Jun 21 12:04:20 2015 -0700
Committer: Joseph K. Bradley <jo...@databricks.com>
Committed: Sun Jun 21 12:04:20 2015 -0700
----------------------------------------------------------------------
.../spark/mllib/api/python/PythonMLLibAPI.scala | 10 ++++++
python/pyspark/mllib/feature.py | 35 ++++++++++++++++++++
2 files changed, 45 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/32e3cdaa/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
index 2897865..634d56d 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -520,6 +520,16 @@ private[python] class PythonMLLibAPI extends Serializable {
}
/**
+ * Java stub for PCA.fit(): fits a PCA with `k` components on `data`. Returns a
+ * handle to the resulting PCAModel instead of the content of the Java object.
+ * Extra care needs to be taken in the Python code to ensure it gets freed on
+ * exit; see the Py4J documentation.
+ */
+ def fitPCA(k: Int, data: JavaRDD[Vector]): PCAModel = {
+ new PCA(k).fit(data.rdd)
+ }
+
+ /**
* Java stub for IDF.fit(). This stub returns a
* handle to the Java object instead of the content of the Java object.
* Extra care needs to be taken in the Python code to ensure it gets freed on
http://git-wip-us.apache.org/repos/asf/spark/blob/32e3cdaa/python/pyspark/mllib/feature.py
----------------------------------------------------------------------
diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py
index cf5fdf2..334f5b8 100644
--- a/python/pyspark/mllib/feature.py
+++ b/python/pyspark/mllib/feature.py
@@ -252,6 +252,41 @@ class ChiSqSelector(object):
return ChiSqSelectorModel(jmodel)
+class PCAModel(JavaVectorTransformer):
+ """
+ Model fitted by :class:`PCA` that can project vectors to a low-dimensional space using PCA.
+ """
+
+
+class PCA(object):
+ """
+ A feature transformer that projects vectors to a low-dimensional space using PCA.
+
+ >>> data = [Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),
+ ... Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),
+ ... Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0])]
+ >>> model = PCA(2).fit(sc.parallelize(data))
+ >>> pcArray = model.transform(Vectors.sparse(5, [(1, 1.0), (3, 7.0)])).toArray()
+ >>> pcArray[0]
+ 1.648...
+ >>> pcArray[1]
+ -4.013...
+ """
+ def __init__(self, k):
+ """
+ :param k: number of principal components (coerced with int()).
+ """
+ self.k = int(k)
+
+ def fit(self, data):
+ """
+ Computes a :class:`PCAModel` that contains the principal components of the input vectors.
+ :param data: RDD of source vectors
+ """
+ jmodel = callMLlibFunc("fitPCA", self.k, data)
+ return PCAModel(jmodel)
+
+
class HashingTF(object):
"""
.. note:: Experimental
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org