You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by pw...@apache.org on 2014/01/08 07:32:31 UTC
[5/6] git commit: Fixed merge conflict
Fixed merge conflict
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/46cb980a
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/46cb980a
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/46cb980a
Branch: refs/heads/master
Commit: 46cb980a5f5ba55f341b8bcce4f0d1e5213989d3
Parents: 150089d b2e690f
Author: Hossein Falaki <fa...@gmail.com>
Authored: Tue Jan 7 21:28:26 2014 -0800
Committer: Hossein Falaki <fa...@gmail.com>
Committed: Tue Jan 7 21:28:26 2014 -0800
----------------------------------------------------------------------
.gitignore | 3 +
README.md | 20 +-
assembly/lib/PY4J_LICENSE.txt | 27 -
assembly/lib/PY4J_VERSION.txt | 1 -
assembly/lib/net/sf/py4j/py4j/0.7/py4j-0.7.jar | Bin 103286 -> 0 bytes
assembly/lib/net/sf/py4j/py4j/0.7/py4j-0.7.pom | 9 -
.../net/sf/py4j/py4j/maven-metadata-local.xml | 12 -
assembly/pom.xml | 20 +-
assembly/src/main/assembly/assembly.xml | 11 +-
bin/compute-classpath.cmd | 2 +-
bin/compute-classpath.sh | 2 +-
bin/pyspark | 70 ++
bin/pyspark.cmd | 23 +
bin/pyspark2.cmd | 55 +
bin/run-example | 91 ++
bin/run-example.cmd | 23 +
bin/run-example2.cmd | 61 ++
bin/slaves.sh | 91 --
bin/spark-class | 154 +++
bin/spark-class.cmd | 23 +
bin/spark-class2.cmd | 85 ++
bin/spark-config.sh | 36 -
bin/spark-daemon.sh | 183 ----
bin/spark-daemons.sh | 35 -
bin/spark-shell | 102 ++
bin/spark-shell.cmd | 23 +
bin/start-all.sh | 34 -
bin/start-master.sh | 52 -
bin/start-slave.sh | 35 -
bin/start-slaves.sh | 48 -
bin/stop-all.sh | 32 -
bin/stop-master.sh | 27 -
bin/stop-slaves.sh | 35 -
conf/spark-env.sh.template | 2 +-
core/pom.xml | 422 ++++----
.../apache/spark/network/netty/FileClient.java | 37 +-
.../netty/FileClientChannelInitializer.java | 8 +-
.../spark/network/netty/FileClientHandler.java | 12 +-
.../apache/spark/network/netty/FileServer.java | 37 +-
.../netty/FileServerChannelInitializer.java | 7 +-
.../spark/network/netty/FileServerHandler.java | 22 +-
.../spark/network/netty/PathResolver.java | 52 +-
.../org/apache/spark/log4j-defaults.properties | 8 +
.../scala/org/apache/spark/Accumulators.scala | 8 +-
.../scala/org/apache/spark/HttpServer.scala | 1 +
.../main/scala/org/apache/spark/Logging.scala | 43 +-
.../org/apache/spark/MapOutputTracker.scala | 11 +-
.../scala/org/apache/spark/Partitioner.scala | 4 +-
.../main/scala/org/apache/spark/SparkConf.scala | 193 ++++
.../scala/org/apache/spark/SparkContext.scala | 278 +++--
.../main/scala/org/apache/spark/SparkEnv.scala | 54 +-
.../org/apache/spark/api/java/JavaPairRDD.scala | 36 +
.../org/apache/spark/api/java/JavaRDDLike.scala | 11 +
.../spark/api/java/JavaSparkContext.scala | 41 +-
.../org/apache/spark/api/python/PythonRDD.scala | 4 +-
.../org/apache/spark/broadcast/Broadcast.scala | 8 +-
.../spark/broadcast/BroadcastFactory.scala | 4 +-
.../apache/spark/broadcast/HttpBroadcast.scala | 43 +-
.../spark/broadcast/TorrentBroadcast.scala | 45 +-
.../spark/deploy/FaultToleranceTest.scala | 4 +-
.../apache/spark/deploy/LocalSparkCluster.scala | 7 +-
.../apache/spark/deploy/SparkHadoopUtil.scala | 14 +-
.../org/apache/spark/deploy/client/Client.scala | 21 +-
.../apache/spark/deploy/client/TestClient.scala | 10 +-
.../org/apache/spark/deploy/master/Master.scala | 41 +-
.../spark/deploy/master/MasterArguments.scala | 11 +-
.../deploy/master/SparkZooKeeperSession.scala | 7 +-
.../master/ZooKeeperLeaderElectionAgent.scala | 9 +-
.../master/ZooKeeperPersistenceEngine.scala | 8 +-
.../spark/deploy/master/ui/MasterWebUI.scala | 2 +-
.../org/apache/spark/deploy/worker/Worker.scala | 34 +-
.../spark/deploy/worker/ui/WorkerWebUI.scala | 6 +-
.../executor/CoarseGrainedExecutorBackend.scala | 6 +-
.../org/apache/spark/executor/Executor.scala | 47 +-
.../org/apache/spark/io/CompressionCodec.scala | 19 +-
.../apache/spark/metrics/MetricsConfig.scala | 1 -
.../apache/spark/metrics/MetricsSystem.scala | 11 +-
.../spark/network/ConnectionManager.scala | 22 +-
.../org/apache/spark/network/ReceiverTest.scala | 12 +-
.../org/apache/spark/network/SenderTest.scala | 16 +-
.../spark/network/netty/ShuffleCopier.scala | 10 +-
.../org/apache/spark/rdd/CheckpointRDD.scala | 5 +-
.../org/apache/spark/rdd/CoGroupedRDD.scala | 2 +-
.../org/apache/spark/rdd/PairRDDFunctions.scala | 42 +
.../spark/rdd/PartitionerAwareUnionRDD.scala | 110 ++
.../main/scala/org/apache/spark/rdd/RDD.scala | 19 +-
.../apache/spark/rdd/RDDCheckpointData.scala | 2 +-
.../org/apache/spark/rdd/ShuffledRDD.scala | 2 +-
.../org/apache/spark/rdd/SubtractedRDD.scala | 2 +-
.../apache/spark/scheduler/DAGScheduler.scala | 3 +-
.../spark/scheduler/InputFormatInfo.scala | 14 +-
.../org/apache/spark/scheduler/ResultTask.scala | 4 +-
.../spark/scheduler/SchedulableBuilder.scala | 6 +-
.../spark/scheduler/SchedulerBackend.scala | 3 -
.../apache/spark/scheduler/ShuffleMapTask.scala | 6 +-
.../spark/scheduler/TaskResultGetter.scala | 3 +-
.../spark/scheduler/TaskSchedulerImpl.scala | 25 +-
.../apache/spark/scheduler/TaskSetManager.scala | 23 +-
.../cluster/CoarseGrainedSchedulerBackend.scala | 20 +-
.../cluster/SimrSchedulerBackend.scala | 4 +-
.../cluster/SparkDeploySchedulerBackend.scala | 8 +-
.../mesos/CoarseMesosSchedulerBackend.scala | 18 +-
.../cluster/mesos/MesosSchedulerBackend.scala | 12 +-
.../spark/scheduler/local/LocalBackend.scala | 3 +-
.../spark/serializer/JavaSerializer.scala | 3 +-
.../spark/serializer/KryoSerializer.scala | 14 +-
.../apache/spark/serializer/Serializer.scala | 3 +
.../spark/serializer/SerializerManager.scala | 23 +-
.../spark/storage/BlockFetcherIterator.scala | 4 +-
.../org/apache/spark/storage/BlockManager.scala | 58 +-
.../spark/storage/BlockManagerMaster.scala | 11 +-
.../spark/storage/BlockManagerMasterActor.scala | 16 +-
.../spark/storage/BlockManagerWorker.scala | 3 -
.../spark/storage/BlockMessageArray.scala | 2 -
.../spark/storage/BlockObjectWriter.scala | 5 +-
.../apache/spark/storage/DiskBlockManager.scala | 2 +-
.../spark/storage/ShuffleBlockManager.scala | 10 +-
.../spark/storage/StoragePerfTester.scala | 2 +-
.../apache/spark/storage/ThreadingTest.scala | 8 +-
.../scala/org/apache/spark/ui/SparkUI.scala | 4 +-
.../apache/spark/ui/UIWorkloadGenerator.scala | 21 +-
.../org/apache/spark/ui/env/EnvironmentUI.scala | 15 +-
.../spark/ui/jobs/JobProgressListener.scala | 4 +-
.../scala/org/apache/spark/util/AkkaUtils.scala | 45 +-
.../org/apache/spark/util/MetadataCleaner.scala | 35 +-
.../spark/util/SerializableHyperLogLog.scala | 50 +
.../org/apache/spark/util/SizeEstimator.scala | 14 +-
.../scala/org/apache/spark/util/Utils.scala | 25 +-
core/src/test/resources/spark.conf | 8 +
.../test/resources/uncommons-maths-1.2.2.jar | Bin 49019 -> 0 bytes
.../org/apache/spark/CheckpointSuite.scala | 363 ++++---
.../scala/org/apache/spark/DriverSuite.scala | 8 +-
.../org/apache/spark/FileServerSuite.scala | 108 +-
.../scala/org/apache/spark/JavaAPISuite.java | 32 +
.../apache/spark/MapOutputTrackerSuite.scala | 16 +-
.../org/apache/spark/SharedSparkContext.scala | 4 +-
.../scala/org/apache/spark/SparkConfSuite.scala | 110 ++
.../deploy/worker/ExecutorRunnerTest.scala | 4 +-
.../apache/spark/io/CompressionCodecSuite.scala | 8 +-
.../spark/metrics/MetricsSystemSuite.scala | 8 +-
.../spark/rdd/PairRDDFunctionsSuite.scala | 34 +
.../scala/org/apache/spark/rdd/RDDSuite.scala | 40 +
.../spark/scheduler/ClusterSchedulerSuite.scala | 2 +-
.../spark/scheduler/DAGSchedulerSuite.scala | 23 +-
.../apache/spark/scheduler/JobLoggerSuite.scala | 2 +-
.../spark/scheduler/TaskResultGetterSuite.scala | 6 +-
.../spark/scheduler/TaskSetManagerSuite.scala | 4 +-
.../spark/serializer/KryoSerializerSuite.scala | 33 +-
.../spark/storage/BlockManagerSuite.scala | 97 +-
.../spark/storage/DiskBlockManagerSuite.scala | 18 +-
.../apache/spark/util/SizeEstimatorSuite.scala | 2 +-
data/kmeans_data.txt | 6 +
data/lr_data.txt | 1000 ++++++++++++++++++
data/pagerank_data.txt | 6 +
docs/_config.yml | 2 +-
docs/bagel-programming-guide.md | 4 +-
docs/building-with-maven.md | 14 +-
docs/configuration.md | 89 +-
docs/css/bootstrap.min.css | 2 +-
docs/index.md | 10 +-
docs/java-programming-guide.md | 4 +-
docs/job-scheduling.md | 21 +-
docs/mllib-guide.md | 6 +-
docs/monitoring.md | 3 +-
docs/python-programming-guide.md | 43 +-
docs/quick-start.md | 66 +-
docs/running-on-mesos.md | 19 +-
docs/running-on-yarn.md | 15 +-
docs/scala-programming-guide.md | 18 +-
docs/spark-debugger.md | 2 +-
docs/spark-standalone.md | 35 +-
docs/streaming-programming-guide.md | 8 +-
docs/tuning.md | 21 +-
ec2/spark_ec2.py | 2 +-
examples/pom.xml | 3 +
.../org/apache/spark/examples/JavaHdfsLR.java | 31 +-
.../org/apache/spark/examples/JavaKMeans.java | 25 +-
.../org/apache/spark/examples/JavaLogQuery.java | 22 +-
.../org/apache/spark/examples/JavaPageRank.java | 13 +-
.../org/apache/spark/examples/JavaSparkPi.java | 12 +-
.../java/org/apache/spark/examples/JavaTC.java | 18 +-
.../apache/spark/examples/JavaWordCount.java | 12 +-
.../apache/spark/mllib/examples/JavaALS.java | 20 +-
.../apache/spark/mllib/examples/JavaKMeans.java | 18 +-
.../org/apache/spark/mllib/examples/JavaLR.java | 21 +-
.../streaming/examples/JavaFlumeEventCount.java | 8 +-
.../streaming/examples/JavaKafkaWordCount.java | 21 +-
.../examples/JavaNetworkWordCount.java | 18 +-
.../streaming/examples/JavaQueueStream.java | 13 +-
.../apache/spark/examples/BroadcastTest.scala | 2 +-
.../spark/examples/ExceptionHandlingTest.scala | 2 +-
.../org/apache/spark/examples/GroupByTest.scala | 2 +-
.../org/apache/spark/examples/HBaseTest.scala | 2 +-
.../org/apache/spark/examples/HdfsTest.scala | 2 +-
.../org/apache/spark/examples/LogQuery.scala | 2 +-
.../spark/examples/MultiBroadcastTest.scala | 2 +-
.../examples/SimpleSkewedGroupByTest.scala | 2 +-
.../spark/examples/SkewedGroupByTest.scala | 2 +-
.../org/apache/spark/examples/SparkALS.scala | 2 +-
.../org/apache/spark/examples/SparkHdfsLR.scala | 2 +-
.../org/apache/spark/examples/SparkKMeans.scala | 2 +-
.../org/apache/spark/examples/SparkLR.scala | 2 +-
.../apache/spark/examples/SparkPageRank.scala | 2 +-
.../org/apache/spark/examples/SparkPi.scala | 2 +-
.../org/apache/spark/examples/SparkTC.scala | 2 +-
.../examples/bagel/WikipediaPageRank.scala | 10 +-
.../bagel/WikipediaPageRankStandalone.scala | 8 +-
.../streaming/examples/ActorWordCount.scala | 9 +-
.../streaming/examples/FlumeEventCount.scala | 2 +-
.../streaming/examples/HdfsWordCount.scala | 4 +-
.../streaming/examples/KafkaWordCount.scala | 4 +-
.../streaming/examples/MQTTWordCount.scala | 6 +-
.../streaming/examples/NetworkWordCount.scala | 4 +-
.../spark/streaming/examples/QueueStream.scala | 2 +-
.../streaming/examples/RawNetworkGrep.scala | 2 +-
.../examples/StatefulNetworkWordCount.scala | 4 +-
.../streaming/examples/TwitterAlgebirdCMS.scala | 2 +-
.../streaming/examples/TwitterAlgebirdHLL.scala | 2 +-
.../streaming/examples/TwitterPopularTags.scala | 2 +-
.../streaming/examples/ZeroMQWordCount.scala | 6 +-
.../clickstream/PageViewGenerator.scala | 4 +-
.../examples/clickstream/PageViewStream.scala | 6 +-
kmeans_data.txt | 6 -
lr_data.txt | 1000 ------------------
make-distribution.sh | 27 +-
.../spark/mllib/api/python/PythonMLLibAPI.scala | 30 +
.../spark/mllib/classification/NaiveBayes.scala | 119 +++
.../spark/mllib/optimization/Gradient.scala | 4 +-
.../apache/spark/mllib/recommendation/ALS.scala | 13 +-
.../MatrixFactorizationModel.scala | 38 +-
.../mllib/classification/NaiveBayesSuite.scala | 108 ++
.../optimization/GradientDescentSuite.scala | 116 ++
.../spark/mllib/recommendation/ALSSuite.scala | 33 +-
new-yarn/pom.xml | 161 ---
.../spark/deploy/yarn/ApplicationMaster.scala | 446 --------
.../yarn/ApplicationMasterArguments.scala | 94 --
.../org/apache/spark/deploy/yarn/Client.scala | 521 ---------
.../spark/deploy/yarn/ClientArguments.scala | 149 ---
.../yarn/ClientDistributedCacheManager.scala | 228 ----
.../spark/deploy/yarn/WorkerLauncher.scala | 222 ----
.../spark/deploy/yarn/WorkerRunnable.scala | 209 ----
.../deploy/yarn/YarnAllocationHandler.scala | 687 ------------
.../spark/deploy/yarn/YarnSparkHadoopUtil.scala | 43 -
.../cluster/YarnClientClusterScheduler.scala | 48 -
.../cluster/YarnClientSchedulerBackend.scala | 110 --
.../cluster/YarnClusterScheduler.scala | 56 -
.../ClientDistributedCacheManagerSuite.scala | 220 ----
pagerank_data.txt | 6 -
pom.xml | 69 +-
project/SparkBuild.scala | 52 +-
project/build.properties | 1 -
pyspark | 70 --
pyspark.cmd | 23 -
pyspark2.cmd | 55 -
python/epydoc.conf | 2 +-
python/lib/py4j-0.8.1-src.zip | Bin 0 -> 37662 bytes
python/lib/py4j0.7.egg | Bin 191756 -> 0 bytes
python/pyspark/__init__.py | 34 +-
python/pyspark/broadcast.py | 11 +
python/pyspark/conf.py | 171 +++
python/pyspark/context.py | 59 +-
python/pyspark/java_gateway.py | 3 +-
python/pyspark/mllib/_common.py | 25 +
python/pyspark/mllib/recommendation.py | 12 +-
python/pyspark/rdd.py | 66 +-
python/pyspark/shell.py | 2 +-
python/run-tests | 3 +-
repl-bin/src/deb/bin/run | 3 +-
repl/pom.xml | 1 -
.../org/apache/spark/repl/SparkILoop.scala | 19 +-
.../org/apache/spark/repl/SparkIMain.scala | 7 +-
run-example | 91 --
run-example.cmd | 23 -
run-example2.cmd | 61 --
sbin/slaves.sh | 91 ++
sbin/spark-config.sh | 36 +
sbin/spark-daemon.sh | 183 ++++
sbin/spark-daemons.sh | 35 +
sbin/spark-executor | 23 +
sbin/start-all.sh | 34 +
sbin/start-master.sh | 52 +
sbin/start-slave.sh | 35 +
sbin/start-slaves.sh | 48 +
sbin/stop-all.sh | 32 +
sbin/stop-master.sh | 27 +
sbin/stop-slaves.sh | 35 +
sbt/sbt | 82 +-
sbt/sbt-launch-0.11.3-2.jar | Bin 1096763 -> 0 bytes
sbt/sbt.cmd | 25 -
spark-class | 149 ---
spark-class.cmd | 23 -
spark-class2.cmd | 85 --
spark-executor | 22 -
spark-shell | 102 --
spark-shell.cmd | 22 -
.../org/apache/spark/streaming/Checkpoint.scala | 18 +-
.../org/apache/spark/streaming/DStream.scala | 4 +-
.../apache/spark/streaming/DStreamGraph.scala | 1 -
.../spark/streaming/PairDStreamFunctions.scala | 13 +-
.../spark/streaming/StreamingContext.scala | 63 +-
.../streaming/api/java/JavaPairDStream.scala | 18 +-
.../api/java/JavaStreamingContext.scala | 32 +-
.../streaming/dstream/NetworkInputDStream.scala | 8 +-
.../streaming/dstream/ShuffledDStream.scala | 9 +-
.../streaming/dstream/WindowedDStream.scala | 16 +-
.../streaming/scheduler/JobGenerator.scala | 10 +-
.../streaming/scheduler/JobScheduler.scala | 6 +-
.../streaming/util/MasterFailureTest.scala | 3 -
.../spark/streaming/util/RawTextSender.scala | 4 +-
.../apache/spark/streaming/JavaAPISuite.java | 10 +-
.../spark/streaming/BasicOperationsSuite.scala | 8 +-
.../spark/streaming/CheckpointSuite.scala | 15 +-
.../spark/streaming/InputStreamsSuite.scala | 18 +-
.../apache/spark/streaming/TestSuiteBase.scala | 34 +-
.../spark/streaming/WindowOperationsSuite.scala | 5 +-
yarn/README.md | 12 +
yarn/alpha/pom.xml | 32 +
.../spark/deploy/yarn/ApplicationMaster.scala | 464 ++++++++
.../org/apache/spark/deploy/yarn/Client.scala | 509 +++++++++
.../spark/deploy/yarn/WorkerLauncher.scala | 250 +++++
.../spark/deploy/yarn/WorkerRunnable.scala | 236 +++++
.../deploy/yarn/YarnAllocationHandler.scala | 680 ++++++++++++
.../yarn/ApplicationMasterArguments.scala | 94 ++
.../spark/deploy/yarn/ClientArguments.scala | 150 +++
.../yarn/ClientDistributedCacheManager.scala | 228 ++++
.../spark/deploy/yarn/YarnSparkHadoopUtil.scala | 43 +
.../cluster/YarnClientClusterScheduler.scala | 48 +
.../cluster/YarnClientSchedulerBackend.scala | 110 ++
.../cluster/YarnClusterScheduler.scala | 56 +
.../ClientDistributedCacheManagerSuite.scala | 220 ++++
yarn/pom.xml | 84 +-
.../spark/deploy/yarn/ApplicationMaster.scala | 477 ---------
.../yarn/ApplicationMasterArguments.scala | 94 --
.../org/apache/spark/deploy/yarn/Client.scala | 503 ---------
.../spark/deploy/yarn/ClientArguments.scala | 146 ---
.../yarn/ClientDistributedCacheManager.scala | 228 ----
.../spark/deploy/yarn/WorkerLauncher.scala | 243 -----
.../spark/deploy/yarn/WorkerRunnable.scala | 235 ----
.../deploy/yarn/YarnAllocationHandler.scala | 673 ------------
.../spark/deploy/yarn/YarnSparkHadoopUtil.scala | 43 -
.../cluster/YarnClientClusterScheduler.scala | 48 -
.../cluster/YarnClientSchedulerBackend.scala | 110 --
.../cluster/YarnClusterScheduler.scala | 59 --
.../ClientDistributedCacheManagerSuite.scala | 220 ----
yarn/stable/pom.xml | 32 +
.../spark/deploy/yarn/ApplicationMaster.scala | 432 ++++++++
.../org/apache/spark/deploy/yarn/Client.scala | 525 +++++++++
.../spark/deploy/yarn/WorkerLauncher.scala | 230 ++++
.../spark/deploy/yarn/WorkerRunnable.scala | 210 ++++
.../deploy/yarn/YarnAllocationHandler.scala | 695 ++++++++++++
350 files changed, 11425 insertions(+), 10524 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/46cb980a/docs/mllib-guide.md
----------------------------------------------------------------------
diff --cc docs/mllib-guide.md
index 5f3b676,95537ef..45ee166
--- a/docs/mllib-guide.md
+++ b/docs/mllib-guide.md
@@@ -42,10 -39,57 +42,6 @@@ underlying gradient descent primitive (
parameter (*regParam*) along with various parameters associated with gradient
descent (*stepSize*, *numIterations*, *miniBatchFraction*).
-
-
-
-
-The following code snippet illustrates how to load a sample dataset, execute a
-training algorithm on this training data using a static method in the algorithm
-object, and make predictions with the resulting model to compute the training
-error.
-
-{% highlight scala %}
-import org.apache.spark.SparkContext
-import org.apache.spark.mllib.classification.SVMWithSGD
-import org.apache.spark.mllib.regression.LabeledPoint
-
-// Load and parse the data file
-val data = sc.textFile("mllib/data/sample_svm_data.txt")
-val parsedData = data.map { line =>
- val parts = line.split(' ')
- LabeledPoint(parts(0).toDouble, parts.tail.map(x => x.toDouble).toArray)
-}
-
-// Run training algorithm
-val numIterations = 20
-val model = SVMWithSGD.train(parsedData, numIterations)
-
-// Evaluate model on training examples and compute training error
-val labelAndPreds = parsedData.map { point =>
- val prediction = model.predict(point.features)
- (point.label, prediction)
-}
-val trainErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / parsedData.count
-println("trainError = " + trainErr)
-{% endhighlight %}
-
-The `SVMWithSGD.train()` method by default performs L2 regularization with the
-regularization parameter set to 1.0. If we want to configure this algorithm, we
-can customize `SVMWithSGD` further by creating a new object directly and
-calling setter methods. All other MLlib algorithms support customization in
-this way as well. For example, the following code produces an L1 regularized
-variant of SVMs with regularization parameter set to 0.1, and runs the training
-algorithm for 200 iterations.
-
-{% highlight scala %}
-import org.apache.spark.mllib.optimization.L1Updater
-
-val svmAlg = new SVMWithSGD()
-svmAlg.optimizer.setNumIterations(200)
- .setRegParam(0.1)
- .setUpdater(new L1Updater)
-val modelL1 = svmAlg.run(parsedData)
-{% endhighlight %}
-
-Both of the code snippets above can be executed in `bin/spark-shell` to generate a
-classifier for the provided dataset.
-
Available algorithms for binary classification:
* [SVMWithSGD](api/mllib/index.html#org.apache.spark.mllib.classification.SVMWithSGD)
@@@ -166,269 -210,3 +162,269 @@@ at each iteration
Available algorithms for gradient descent:
* [GradientDescent](api/mllib/index.html#org.apache.spark.mllib.optimization.GradientDescent)
+
+# Using MLLib in Scala
+
+The following code snippets can be executed in `spark-shell`.
+
+## Binary Classification
+
+The following code snippet illustrates how to load a sample dataset, execute a
+training algorithm on this training data using a static method in the algorithm
+object, and make predictions with the resulting model to compute the training
+error.
+
+{% highlight scala %}
+import org.apache.spark.SparkContext
+import org.apache.spark.mllib.classification.SVMWithSGD
+import org.apache.spark.mllib.regression.LabeledPoint
+
+// Load and parse the data file
+val data = sc.textFile("mllib/data/sample_svm_data.txt")
+val parsedData = data.map { line =>
+ val parts = line.split(' ')
+ LabeledPoint(parts(0).toDouble, parts.tail.map(x => x.toDouble).toArray)
+}
+
+// Run training algorithm to build the model
+val numIterations = 20
+val model = SVMWithSGD.train(parsedData, numIterations)
+
+// Evaluate model on training examples and compute training error
+val labelAndPreds = parsedData.map { point =>
+ val prediction = model.predict(point.features)
+ (point.label, prediction)
+}
+val trainErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / parsedData.count
+println("Training Error = " + trainErr)
+{% endhighlight %}
+
+
+The `SVMWithSGD.train()` method by default performs L2 regularization with the
+regularization parameter set to 1.0. If we want to configure this algorithm, we
+can customize `SVMWithSGD` further by creating a new object directly and
+calling setter methods. All other MLlib algorithms support customization in
+this way as well. For example, the following code produces an L1 regularized
+variant of SVMs with regularization parameter set to 0.1, and runs the training
+algorithm for 200 iterations.
+
+{% highlight scala %}
+import org.apache.spark.mllib.optimization.L1Updater
+
+val svmAlg = new SVMWithSGD()
+svmAlg.optimizer.setNumIterations(200)
+ .setRegParam(0.1)
+ .setUpdater(new L1Updater)
+val modelL1 = svmAlg.run(parsedData)
+{% endhighlight %}
+
+## Linear Regression
+The following example demonstrates how to load training data, parse it as an RDD of LabeledPoint. The
+example then uses LinearRegressionWithSGD to build a simple linear model to predict label values. We
+compute the Mean Squared Error at the end to evaluate
+[goodness of fit](http://en.wikipedia.org/wiki/Goodness_of_fit).
+
+{% highlight scala %}
+import org.apache.spark.mllib.regression.LinearRegressionWithSGD
+import org.apache.spark.mllib.regression.LabeledPoint
+
+// Load and parse the data
+val data = sc.textFile("mllib/data/ridge-data/lpsa.data")
+val parsedData = data.map { line =>
+ val parts = line.split(',')
+ LabeledPoint(parts(0).toDouble, parts(1).split(' ').map(x => x.toDouble).toArray)
+}
+
+// Building the model
+val numIterations = 20
+val model = LinearRegressionWithSGD.train(parsedData, numIterations)
+
+// Evaluate model on training examples and compute training error
+val valuesAndPreds = parsedData.map { point =>
+ val prediction = model.predict(point.features)
+ (point.label, prediction)
+}
+val MSE = valuesAndPreds.map{ case(v, p) => math.pow((v - p), 2)}.reduce(_ + _)/valuesAndPreds.count
+println("training Mean Squared Error = " + MSE)
+{% endhighlight %}
+
+
+Similarly you can use RidgeRegressionWithSGD and LassoWithSGD and compare training
+[Mean Squared Errors](http://en.wikipedia.org/wiki/Mean_squared_error).
+
+## Clustering
+In the following example after loading and parsing data, we use the KMeans object to cluster the data
+into two clusters. The number of desired clusters is passed to the algorithm. We then compute Within
+Set Sum of Squared Error (WSSSE). You can reduce this error measure by increasing *k*. In fact the
+optimal *k* is usually one where there is an "elbow" in the WSSSE graph.
+
+{% highlight scala %}
+import org.apache.spark.mllib.clustering.KMeans
+
+// Load and parse the data
+val data = sc.textFile("kmeans_data.txt")
+val parsedData = data.map( _.split(' ').map(_.toDouble))
+
+// Cluster the data into two classes using KMeans
+val numIterations = 20
+val numClusters = 2
+val clusters = KMeans.train(parsedData, numClusters, numIterations)
+
+// Evaluate clustering by computing Within Set Sum of Squared Errors
+val WSSSE = clusters.computeCost(parsedData)
+println("Within Set Sum of Squared Errors = " + WSSSE)
+{% endhighlight %}
+
+
+## Collaborative Filtering
+In the following example we load rating data. Each row consists of a user, a product and a rating.
+We use the default ALS.train() method which assumes ratings are explicit. We evaluate the recommendation
+model by measuring the Mean Squared Error of rating prediction.
+
+{% highlight scala %}
+import org.apache.spark.mllib.recommendation.ALS
+import org.apache.spark.mllib.recommendation.Rating
+
+// Load and parse the data
+val data = sc.textFile("mllib/data/als/test.data")
+val ratings = data.map(_.split(',') match {
+ case Array(user, item, rate) => Rating(user.toInt, item.toInt, rate.toDouble)
+})
+
+// Build the recommendation model using ALS
+val numIterations = 20
+val model = ALS.train(ratings, 1, 20, 0.01)
+
+// Evaluate the model on rating data
+val usersProducts = ratings.map{ case Rating(user, product, rate) => (user, product)}
+val predictions = model.predict(usersProducts).map{
+ case Rating(user, product, rate) => ((user, product), rate)
+}
+val ratesAndPreds = ratings.map{
+ case Rating(user, product, rate) => ((user, product), rate)
+}.join(predictions)
+val MSE = ratesAndPreds.map{
+ case ((user, product), (r1, r2)) => math.pow((r1- r2), 2)
+}.reduce(_ + _)/ratesAndPreds.count
+println("Mean Squared Error = " + MSE)
+{% endhighlight %}
+
+If the rating matrix is derived from other sources of information (i.e., it is inferred from
+other signals), you can use the trainImplicit method to get better results.
+
+{% highlight scala %}
+val model = ALS.trainImplicit(ratings, 1, 20, 0.01)
+{% endhighlight %}
+
+# Using MLLib in Python
+The following examples can be tested in the PySpark shell.
+
+## Binary Classification
+The following example shows how to load a sample dataset, build Logistic Regression model,
+and make predictions with the resulting model to compute the training error.
+
+{% highlight python %}
+from pyspark.mllib.classification import LogisticRegressionWithSGD
+from numpy import array
+
+# Load and parse the data
+data = sc.textFile("mllib/data/sample_svm_data.txt")
+parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))
+model = LogisticRegressionWithSGD.train(sc, parsedData)
+
+# Make predictions on the training data
+labelsAndPreds = parsedData.map(lambda point: (int(point.item(0)),
+        model.predict(point.take(range(1, point.size)))))
+
+# Evaluate the model on training data
+trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(parsedData.count())
+print("Training Error = " + str(trainErr))
+{% endhighlight %}
+
+## Linear Regression
+The following example demonstrates how to load training data, parse it as an RDD of LabeledPoint. The
+example then uses LinearRegressionWithSGD to build a simple linear model to predict label values. We
+compute the Mean Squared Error at the end to evaluate
+[goodness of fit](http://en.wikipedia.org/wiki/Goodness_of_fit).
+
+{% highlight python %}
+from pyspark.mllib.regression import LinearRegressionWithSGD
+from numpy import array
+
+# Load and parse the data
+data = sc.textFile("mllib/data/ridge-data/lpsa.data")
+parsedData = data.map(lambda line: array([float(x) for x in line.replace(',', ' ').split(' ')]))
+
+# Build the model
+model = LinearRegressionWithSGD.train(sc, parsedData)
+
+# Evaluate the model on training data
+valuesAndPreds = parsedData.map(lambda point: (point.item(0),
+ model.predict(point.take(range(1, point.size)))))
+MSE = valuesAndPreds.map(lambda (v, p): (v - p)**2).reduce(lambda x, y: x + y)/valuesAndPreds.count()
+print("Mean Squared Error = " + str(MSE))
+{% endhighlight %}
+
+
+## Clustering
+In the following example after loading and parsing data, we use the KMeans object to cluster the data
+into two clusters. The number of desired clusters is passed to the algorithm. We then compute Within
+Set Sum of Squared Error (WSSSE). You can reduce this error measure by increasing *k*. In fact the
+optimal *k* is usually one where there is an "elbow" in the WSSSE graph.
+
+{% highlight python %}
+from pyspark.mllib.clustering import KMeans
+from numpy import array
+from math import sqrt
+
+# Load and parse the data
+data = sc.textFile("kmeans_data.txt")
+parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))
+
+# Build the model (cluster the data)
+clusters = KMeans.train(sc, parsedData, 2, maxIterations=10,
+ runs=30, initialization_mode="random")
+
+# Evaluate clustering by computing Within Set Sum of Squared Errors
+def error(point):
+ center = clusters.centers[clusters.predict(point)]
+ return sqrt(sum([x**2 for x in (point - center)]))
+
+WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
+print("Within Set Sum of Squared Error = " + str(WSSSE))
+{% endhighlight %}
+
+Similarly you can use RidgeRegressionWithSGD and LassoWithSGD and compare training Mean Squared
+Errors.
+
+## Collaborative Filtering
+In the following example we load rating data. Each row consists of a user, a product and a rating.
+We use the default ALS.train() method which assumes ratings are explicit. We evaluate the
+recommendation model by measuring the Mean Squared Error of rating prediction.
+
+{% highlight python %}
+from pyspark.mllib.recommendation import ALS
+from numpy import array
+
+# Load and parse the data
+data = sc.textFile("mllib/data/als/test.data")
+ratings = data.map(lambda line: array([float(x) for x in line.split(',')]))
+
+# Build the recommendation model using Alternating Least Squares
+model = ALS.train(sc, ratings, 1, 20)
+
+# Evaluate the model on training data
+testdata = ratings.map(lambda p: (int(p[0]), int(p[1])))
+predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
+ratesAndPreds = ratings.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
+MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).reduce(lambda x, y: x + y)/ratesAndPreds.count()
+print("Mean Squared Error = " + str(MSE))
+{% endhighlight %}
+
+If the rating matrix is derived from other sources of information (i.e., it is inferred from other
+signals), you can use the trainImplicit method to get better results.
+
+{% highlight python %}
+# Build the recommendation model using Alternating Least Squares based on implicit ratings
+model = ALS.trainImplicit(sc, ratings, 1, 20)
- {% endhighlight %}
++{% endhighlight %}