You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by me...@apache.org on 2015/06/06 23:52:31 UTC
spark git commit: [SPARK-7639] [PYSPARK] [MLLIB] Python API for KernelDensity

Repository: spark
Updated Branches:
  refs/heads/master 16fc49617 -> 5aa804f3c


[SPARK-7639] [PYSPARK] [MLLIB] Python API for KernelDensity

Python API for KernelDensity

Author: MechCoder <ma...@gmail.com>

Closes #6387 from MechCoder/spark-7639 and squashes the following commits:

17abc62 [MechCoder] add tests
2de6540 [MechCoder] style tests
bf4acc0 [MechCoder] Added doctests
84359d5 [MechCoder] [SPARK-7639] Python API for KernelDensity


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5aa804f3
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5aa804f3
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5aa804f3

Branch: refs/heads/master
Commit: 5aa804f3c6485670937a658ce8207c2317c6a506
Parents: 16fc496
Author: MechCoder <ma...@gmail.com>
Authored: Sat Jun 6 14:52:14 2015 -0700
Committer: Xiangrui Meng <me...@databricks.com>
Committed: Sat Jun 6 14:52:14 2015 -0700

----------------------------------------------------------------------
 .../spark/mllib/api/python/PythonMLLibAPI.scala | 12 +++-
 python/pyspark/mllib/stat/KernelDensity.py      | 61 ++++++++++++++++++++
 python/pyspark/mllib/stat/__init__.py           |  3 +-
 python/run-tests                                |  1 +
 4 files changed, 75 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/5aa804f3/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
index 16f3131..8f66bc8 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -43,7 +43,8 @@ import org.apache.spark.mllib.regression._
 import org.apache.spark.mllib.stat.correlation.CorrelationNames
 import org.apache.spark.mllib.stat.distribution.MultivariateGaussian
 import org.apache.spark.mllib.stat.test.ChiSqTestResult
-import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}
+import org.apache.spark.mllib.stat.{
+  KernelDensity, MultivariateStatisticalSummary, Statistics}
 import org.apache.spark.mllib.tree.configuration.{Algo, BoostingStrategy, Strategy}
 import org.apache.spark.mllib.tree.impurity._
 import org.apache.spark.mllib.tree.loss.Losses
@@ -945,6 +946,15 @@ private[python] class PythonMLLibAPI extends Serializable {
       r => (r.getSeq(0).toArray[Any], r.getSeq(1).toArray[Any])))
   }
 
+  /**
+   * Java stub for the estimate method of KernelDensity
+   */
+  def estimateKernelDensity(
+      sample: JavaRDD[Double],
+      bandwidth: Double, points: java.util.ArrayList[Double]): Array[Double] = {
+    return new KernelDensity().setSample(sample).setBandwidth(bandwidth).estimate(
+      points.asScala.toArray)
+  }
 
 }
 

http://git-wip-us.apache.org/repos/asf/spark/blob/5aa804f3/python/pyspark/mllib/stat/KernelDensity.py
----------------------------------------------------------------------
diff --git a/python/pyspark/mllib/stat/KernelDensity.py b/python/pyspark/mllib/stat/KernelDensity.py
new file mode 100644
index 0000000..7da9219
--- /dev/null
+++ b/python/pyspark/mllib/stat/KernelDensity.py
@@ -0,0 +1,61 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import sys
+
+if sys.version > '3':
+    xrange = range
+
+import numpy as np
+
+from pyspark.mllib.common import callMLlibFunc
+from pyspark.rdd import RDD
+
+
+class KernelDensity(object):
+    """
+    .. note:: Experimental
+
+    Estimate probability density at required points given an RDD of samples
+    from the population.
+
+    >>> kd = KernelDensity()
+    >>> sample = sc.parallelize([0.0, 1.0])
+    >>> kd.setSample(sample)
+    >>> kd.estimate([0.0, 1.0])
+    array([ 0.12938758,  0.12938758])
+    """
+    def __init__(self):
+        self._bandwidth = 1.0
+        self._sample = None
+
+    def setBandwidth(self, bandwidth):
+        """Set bandwidth of each sample. Defaults to 1.0"""
+        self._bandwidth = bandwidth
+
+    def setSample(self, sample):
+        """Set sample points from the population. Should be an RDD"""
+        if not isinstance(sample, RDD):
+            raise TypeError("samples should be a RDD, received %s" % type(sample))
+        self._sample = sample
+
+    def estimate(self, points):
+        """Estimate the probability density at points"""
+        points = list(points)
+        densities = callMLlibFunc(
+            "estimateKernelDensity", self._sample, self._bandwidth, points)
+        return np.asarray(densities)

http://git-wip-us.apache.org/repos/asf/spark/blob/5aa804f3/python/pyspark/mllib/stat/__init__.py
----------------------------------------------------------------------
diff --git a/python/pyspark/mllib/stat/__init__.py b/python/pyspark/mllib/stat/__init__.py
index e3e1285..c8a721d 100644
--- a/python/pyspark/mllib/stat/__init__.py
+++ b/python/pyspark/mllib/stat/__init__.py
@@ -22,6 +22,7 @@ Python package for statistical functions in MLlib.
 from pyspark.mllib.stat._statistics import *
 from pyspark.mllib.stat.distribution import MultivariateGaussian
 from pyspark.mllib.stat.test import ChiSqTestResult
+from pyspark.mllib.stat.KernelDensity import KernelDensity
 
 __all__ = ["Statistics", "MultivariateStatisticalSummary", "ChiSqTestResult",
-           "MultivariateGaussian"]
+           "MultivariateGaussian", "KernelDensity"]

http://git-wip-us.apache.org/repos/asf/spark/blob/5aa804f3/python/run-tests
----------------------------------------------------------------------
diff --git a/python/run-tests b/python/run-tests
index 17dda3e..4468fdb 100755
--- a/python/run-tests
+++ b/python/run-tests
@@ -93,6 +93,7 @@ function run_mllib_tests() {
     run_test "pyspark.mllib.recommendation"
     run_test "pyspark.mllib.regression"
     run_test "pyspark.mllib.stat._statistics"
+    run_test "pyspark.mllib.stat.KernelDensity"
     run_test "pyspark.mllib.tree"
     run_test "pyspark.mllib.util"
     run_test "pyspark.mllib.tests"


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org