You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by pw...@apache.org on 2014/01/14 08:08:35 UTC

[2/8] git commit: Update some Python MLlib parameters to use camelCase, and tweak docs

Update some Python MLlib parameters to use camelCase, and tweak docs

We've used camel case in other Spark methods so it felt reasonable to
keep using it here and make the code match Scala/Java as much as
possible. Note that parameter names matter in Python because it allows
passing optional parameters by name.


Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/4c28a2ba
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/4c28a2ba
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/4c28a2ba

Branch: refs/heads/master
Commit: 4c28a2bad8a6d64ee69213eede440837636fe58b
Parents: 9a0dfdf
Author: Matei Zaharia <ma...@databricks.com>
Authored: Fri Jan 10 00:12:43 2014 -0800
Committer: Matei Zaharia <ma...@databricks.com>
Committed: Sat Jan 11 22:30:48 2014 -0800

----------------------------------------------------------------------
 docs/mllib-guide.md                    |  9 +++++++++
 python/pyspark/mllib/classification.py | 14 +++++++-------
 python/pyspark/mllib/regression.py     | 28 ++++++++++++++--------------
 3 files changed, 30 insertions(+), 21 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/4c28a2ba/docs/mllib-guide.md
----------------------------------------------------------------------
diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md
index c977bc4..1a5c640 100644
--- a/docs/mllib-guide.md
+++ b/docs/mllib-guide.md
@@ -21,6 +21,8 @@ depends on native Fortran routines. You may need to install the
 if it is not already present on your nodes. MLlib will throw a linking error if it cannot 
 detect these libraries automatically.
 
+To use MLlib in Python, you will also need [NumPy](http://www.numpy.org) version 1.7 or newer.
+
 # Binary Classification
 
 Binary classification is a supervised learning problem in which we want to
@@ -316,6 +318,13 @@ other signals), you can use the trainImplicit method to get better results.
 val model = ALS.trainImplicit(ratings, 1, 20, 0.01)
 {% endhighlight %}
 
+# Using MLlib in Java
+
+All of MLlib's methods use Java-friendly types, so you can import and call them there the same
+way you do in Scala. The only caveat is that the methods take Scala RDD objects, while the
+Spark Java API uses a separate `JavaRDD` class. You can convert a Java RDD to a Scala one by
+calling `.rdd()` on your `JavaRDD` object.
+
 # Using MLlib in Python
 The following examples can be tested in the PySpark shell.
 

http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/4c28a2ba/python/pyspark/mllib/classification.py
----------------------------------------------------------------------
diff --git a/python/pyspark/mllib/classification.py b/python/pyspark/mllib/classification.py
index 03ff5a5..19b90df 100644
--- a/python/pyspark/mllib/classification.py
+++ b/python/pyspark/mllib/classification.py
@@ -44,13 +44,13 @@ class LogisticRegressionModel(LinearModel):
 class LogisticRegressionWithSGD(object):
     @classmethod
     def train(cls, data, iterations=100, step=1.0,
-              mini_batch_fraction=1.0, initial_weights=None):
+              miniBatchFraction=1.0, initialWeights=None):
         """Train a logistic regression model on the given data."""
         sc = data.context
         return _regression_train_wrapper(sc, lambda d, i:
                 sc._jvm.PythonMLLibAPI().trainLogisticRegressionModelWithSGD(d._jrdd,
-                        iterations, step, mini_batch_fraction, i),
-                LogisticRegressionModel, data, initial_weights)
+                        iterations, step, miniBatchFraction, i),
+                LogisticRegressionModel, data, initialWeights)
 
 class SVMModel(LinearModel):
     """A support vector machine.
@@ -67,14 +67,14 @@ class SVMModel(LinearModel):
 
 class SVMWithSGD(object):
     @classmethod
-    def train(cls, data, iterations=100, step=1.0, reg_param=1.0,
-              mini_batch_fraction=1.0, initial_weights=None):
+    def train(cls, data, iterations=100, step=1.0, regParam=1.0,
+              miniBatchFraction=1.0, initialWeights=None):
         """Train a support vector machine on the given data."""
         sc = data.context
         return _regression_train_wrapper(sc, lambda d, i:
                 sc._jvm.PythonMLLibAPI().trainSVMModelWithSGD(d._jrdd,
-                        iterations, step, reg_param, mini_batch_fraction, i),
-                SVMModel, data, initial_weights)
+                        iterations, step, regParam, miniBatchFraction, i),
+                SVMModel, data, initialWeights)
 
 class NaiveBayesModel(object):
     """

http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/4c28a2ba/python/pyspark/mllib/regression.py
----------------------------------------------------------------------
diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py
index e90b728..7656db0 100644
--- a/python/pyspark/mllib/regression.py
+++ b/python/pyspark/mllib/regression.py
@@ -47,57 +47,57 @@ class LinearRegressionModel(LinearRegressionModelBase):
     """A linear regression model derived from a least-squares fit.
 
     >>> data = array([0.0, 0.0, 1.0, 1.0, 3.0, 2.0, 2.0, 3.0]).reshape(4,2)
-    >>> lrm = LinearRegressionWithSGD.train(sc.parallelize(data), initial_weights=array([1.0]))
+    >>> lrm = LinearRegressionWithSGD.train(sc.parallelize(data), initialWeights=array([1.0]))
     """
 
 class LinearRegressionWithSGD(object):
     @classmethod
     def train(cls, data, iterations=100, step=1.0,
-              mini_batch_fraction=1.0, initial_weights=None):
+              miniBatchFraction=1.0, initialWeights=None):
         """Train a linear regression model on the given data."""
         sc = data.context
         return _regression_train_wrapper(sc, lambda d, i:
                 sc._jvm.PythonMLLibAPI().trainLinearRegressionModelWithSGD(
-                        d._jrdd, iterations, step, mini_batch_fraction, i),
-                LinearRegressionModel, data, initial_weights)
+                        d._jrdd, iterations, step, miniBatchFraction, i),
+                LinearRegressionModel, data, initialWeights)
 
 class LassoModel(LinearRegressionModelBase):
     """A linear regression model derived from a least-squares fit with an
     l_1 penalty term.
 
     >>> data = array([0.0, 0.0, 1.0, 1.0, 3.0, 2.0, 2.0, 3.0]).reshape(4,2)
-    >>> lrm = LassoWithSGD.train(sc.parallelize(data), initial_weights=array([1.0]))
+    >>> lrm = LassoWithSGD.train(sc.parallelize(data), initialWeights=array([1.0]))
     """
 
 class LassoWithSGD(object):
     @classmethod
-    def train(cls, data, iterations=100, step=1.0, reg_param=1.0,
-              mini_batch_fraction=1.0, initial_weights=None):
+    def train(cls, data, iterations=100, step=1.0, regParam=1.0,
+              miniBatchFraction=1.0, initialWeights=None):
         """Train a Lasso regression model on the given data."""
         sc = data.context
         return _regression_train_wrapper(sc, lambda d, i:
                 sc._jvm.PythonMLLibAPI().trainLassoModelWithSGD(d._jrdd,
-                        iterations, step, reg_param, mini_batch_fraction, i),
-                LassoModel, data, initial_weights)
+                        iterations, step, regParam, miniBatchFraction, i),
+                LassoModel, data, initialWeights)
 
 class RidgeRegressionModel(LinearRegressionModelBase):
     """A linear regression model derived from a least-squares fit with an
     l_2 penalty term.
 
     >>> data = array([0.0, 0.0, 1.0, 1.0, 3.0, 2.0, 2.0, 3.0]).reshape(4,2)
-    >>> lrm = RidgeRegressionWithSGD.train(sc.parallelize(data), initial_weights=array([1.0]))
+    >>> lrm = RidgeRegressionWithSGD.train(sc.parallelize(data), initialWeights=array([1.0]))
     """
 
 class RidgeRegressionWithSGD(object):
     @classmethod
-    def train(cls, data, iterations=100, step=1.0, reg_param=1.0,
-              mini_batch_fraction=1.0, initial_weights=None):
+    def train(cls, data, iterations=100, step=1.0, regParam=1.0,
+              miniBatchFraction=1.0, initialWeights=None):
         """Train a ridge regression model on the given data."""
         sc = data.context
         return _regression_train_wrapper(sc, lambda d, i:
                 sc._jvm.PythonMLLibAPI().trainRidgeModelWithSGD(d._jrdd,
-                        iterations, step, reg_param, mini_batch_fraction, i),
-                RidgeRegressionModel, data, initial_weights)
+                        iterations, step, regParam, miniBatchFraction, i),
+                RidgeRegressionModel, data, initialWeights)
 
 def _test():
     import doctest