You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by ma...@apache.org on 2014/01/22 23:01:44 UTC
[01/50] git commit: new main file
Updated Branches:
refs/heads/master 749f84282 -> d009b17d1
new main file
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/1a21ba29
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/1a21ba29
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/1a21ba29
Branch: refs/heads/master
Commit: 1a21ba29672074692be6c006a8938bfa86330a19
Parents: 6c3674c
Author: Reza Zadeh <ri...@gmail.com>
Authored: Thu Dec 26 18:09:33 2013 -0500
Committer: Reza Zadeh <ri...@gmail.com>
Committed: Thu Dec 26 18:09:33 2013 -0500
----------------------------------------------------------------------
.../apache/spark/mllib/linalg/sparsesvd.scala | 29 +++++++++++++-------
1 file changed, 19 insertions(+), 10 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/1a21ba29/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala
index f9b9a04..be8ccff 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala
@@ -110,22 +110,31 @@ object SVD {
(retU, retS, retV)
}
+
def main(args: Array[String]) {
- if (args.length < 4) {
- println("Usage: KMeans <master> <input_file> <k> <max_iterations> [<runs>]")
+ if (args.length < 8) {
+ println("Usage: SVD <master> <matrix_file> <m> <n> <minimum_singular_value> <output_U_file> <output_S_file> <output_V_file>")
System.exit(1)
}
- val (master, inputFile, k, iters) = (args(0), args(1), args(2).toInt, args(3).toInt)
- val runs = if (args.length >= 5) args(4).toInt else 1
- val sc = new SparkContext(master, "KMeans")
- val data = sc.textFile(inputFile).map(line => line.split(' ').map(_.toDouble)).cache()
- println("Cost: ")
+ val (master, inputFile, m, n, min_svalue, output_u, output_s, output_v) =
+ (args(0), args(1), args(2).toInt, args(3).toInt, args(4).toDouble, args(5), args(6), args(7))
+
+ val sc = new SparkContext(master, "SVD")
+
+ val rawdata = sc.textFile(inputFile)
+ val data = rawdata.map { line =>
+ val parts = line.split(',')
+ ((parts(0).toInt, parts(1).toInt), parts(2).toDouble)
+ }
+
+ val (u, s, v) = SVD.sparseSVD(data, m, n, min_svalue)
+ println("Computed " + s.size + " singular values and vectors")
+ u.saveAsText(output_u)
+ s.saveAsText(output_s)
+ v.saveAsText(output_v)
System.exit(0)
- //val data = sc.makeRDD(Array.tabulate(m,n){ (a,b)=> ((a+1,b+1),a*b%37) }.flatten )
}
}
-
-
[08/50] git commit: large scale considerations
Posted by ma...@apache.org.
large scale considerations
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/ae5102ac
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/ae5102ac
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/ae5102ac
Branch: refs/heads/master
Commit: ae5102acc08f9cbe07caba9b95d59f928bc0e16a
Parents: 642ab5c
Author: Reza Zadeh <ri...@gmail.com>
Authored: Fri Dec 27 04:15:13 2013 -0500
Committer: Reza Zadeh <ri...@gmail.com>
Committed: Fri Dec 27 04:15:13 2013 -0500
----------------------------------------------------------------------
.../src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/ae5102ac/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala
index edf715d..0ab05de 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala
@@ -58,8 +58,8 @@ object SVD {
n: Int,
min_svalue: Double)
: ( RDD[((Int, Int), Double)],
- RDD[((Int, Int), Double)],
- RDD[((Int, Int), Double)]) =
+ RDD[((Int, Int), Double)],
+ RDD[((Int, Int), Double)]) =
{
if (m < n || m <= 0 || n <= 0) {
throw new IllegalArgumentException("Expecting a tall and skinny matrix")
[12/50] git commit: doc tweaks
Posted by ma...@apache.org.
doc tweaks
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/b941b6f7
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/b941b6f7
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/b941b6f7
Branch: refs/heads/master
Commit: b941b6f7b0131b4382b09740d56916574901fd55
Parents: 185c882
Author: Reza Zadeh <ri...@gmail.com>
Authored: Wed Jan 1 20:01:13 2014 -0800
Committer: Reza Zadeh <ri...@gmail.com>
Committed: Wed Jan 1 20:01:13 2014 -0800
----------------------------------------------------------------------
docs/mllib-guide.md | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/b941b6f7/docs/mllib-guide.md
----------------------------------------------------------------------
diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md
index 8c86369..08d6d74 100644
--- a/docs/mllib-guide.md
+++ b/docs/mllib-guide.md
@@ -231,12 +231,12 @@ Only singular vectors associated with singular values
greater or equal to MIN_SVALUE are recovered. If there are k
such values, then the dimensions of the return will be:
-S is k x k and diagonal, holding the singular values on diagonal
-U is m x k and satisfies U^T*U = eye(k)
-V is n x k and satisfies V^TV = eye(k)
+* *S* is *k x k* and diagonal, holding the singular values on diagonal.
+* *U* is *m x k* and satisfies U^T*U = eye(k).
+* *V* is *n x k* and satisfies V^TV = eye(k).
All input and output is expected in sparse matrix format, 1-indexed
-as tuples of the form ((i,j),value) all in RDDs
+as tuples of the form ((i,j),value) all in RDDs. Below is example usage.
{% highlight scala %}
[42/50] git commit: replace this.type with SVD
Posted by ma...@apache.org.
replace this.type with SVD
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/eb2d8c43
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/eb2d8c43
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/eb2d8c43
Branch: refs/heads/master
Commit: eb2d8c431f3fa3a5634fe94ef85ed78a08393a25
Parents: cb13b15
Author: Reza Zadeh <ri...@gmail.com>
Authored: Fri Jan 17 13:57:27 2014 -0800
Committer: Reza Zadeh <ri...@gmail.com>
Committed: Fri Jan 17 13:57:27 2014 -0800
----------------------------------------------------------------------
mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/eb2d8c43/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
index ba7a0fd..83fcb01 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
@@ -33,7 +33,7 @@ class SVD {
/**
* Set the number of top-k singular vectors to return
*/
- def setK(k: Int): this.type = {
+ def setK(k: Int): SVD = {
this.k = k
this
}
[34/50] git commit: documentation for sparsematrix
Posted by ma...@apache.org.
documentation for sparsematrix
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/4f38b6fa
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/4f38b6fa
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/4f38b6fa
Branch: refs/heads/master
Commit: 4f38b6fab5bf633a205b9039db9d4a26ed28ec89
Parents: 7d7490b
Author: Reza Zadeh <ri...@gmail.com>
Authored: Tue Jan 7 17:19:28 2014 -0800
Committer: Reza Zadeh <ri...@gmail.com>
Committed: Tue Jan 7 17:19:28 2014 -0800
----------------------------------------------------------------------
docs/mllib-guide.md | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/4f38b6fa/docs/mllib-guide.md
----------------------------------------------------------------------
diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md
index 653848b..44e6c8f 100644
--- a/docs/mllib-guide.md
+++ b/docs/mllib-guide.md
@@ -228,8 +228,8 @@ from which we recover S and V.
Then we compute U via easy matrix multiplication
as *U = A * V * S^-1*
-Only singular vectors associated with singular values
-greater or equal to MIN_SVALUE are recovered. If there are k
+Only singular vectors associated with largest k singular values
+are recovered. If there are k
such values, then the dimensions of the return will be:
* *S* is *k x k* and diagonal, holding the singular values on diagonal.
@@ -237,7 +237,8 @@ such values, then the dimensions of the return will be:
* *V* is *n x k* and satisfies V^TV = eye(k).
All input and output is expected in sparse matrix format, 1-indexed
-as tuples of the form ((i,j),value) all in RDDs. Below is example usage.
+as tuples of the form ((i,j),value) all in
+SparseMatrix RDDs. Below is example usage.
{% highlight scala %}
[46/50] git commit: make example 0-indexed
Posted by ma...@apache.org.
make example 0-indexed
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/4e967577
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/4e967577
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/4e967577
Branch: refs/heads/master
Commit: 4e96757793e7aee165381f80a60b3f46f60c9ebc
Parents: 5c639d7
Author: Reza Zadeh <ri...@gmail.com>
Authored: Fri Jan 17 14:33:03 2014 -0800
Committer: Reza Zadeh <ri...@gmail.com>
Committed: Fri Jan 17 14:33:03 2014 -0800
----------------------------------------------------------------------
.../src/main/scala/org/apache/spark/examples/mllib/SparkSVD.scala | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/4e967577/examples/src/main/scala/org/apache/spark/examples/mllib/SparkSVD.scala
----------------------------------------------------------------------
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/SparkSVD.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/SparkSVD.scala
index 50e5f5b..19676fc 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/SparkSVD.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/SparkSVD.scala
@@ -43,7 +43,7 @@ object SparkSVD {
// Load and parse the data file
val data = sc.textFile(args(1)).map { line =>
val parts = line.split(',')
- MatrixEntry(parts(0).toInt, parts(1).toInt, parts(2).toDouble)
+ MatrixEntry(parts(0).toInt - 1, parts(1).toInt - 1, parts(2).toDouble)
}
val m = args(2).toInt
val n = args(3).toInt
[39/50] git commit: Merge remote-tracking branch 'upstream/master'
into sparsesvd
Posted by ma...@apache.org.
Merge remote-tracking branch 'upstream/master' into sparsesvd
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/845e568f
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/845e568f
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/845e568f
Branch: refs/heads/master
Commit: 845e568fada0550e632e7381748c5a9ebbe53e16
Parents: f324d53 fdaabdc
Author: Reza Zadeh <ri...@gmail.com>
Authored: Mon Jan 13 23:52:34 2014 -0800
Committer: Reza Zadeh <ri...@gmail.com>
Committed: Mon Jan 13 23:52:34 2014 -0800
----------------------------------------------------------------------
.../org/apache/spark/bagel/BagelSuite.scala | 1 -
bin/compute-classpath.sh | 2 +
.../scala/org/apache/spark/Accumulators.scala | 6 +-
.../scala/org/apache/spark/Aggregator.scala | 14 +-
.../scala/org/apache/spark/CacheManager.scala | 4 +-
.../scala/org/apache/spark/FutureAction.scala | 8 +-
.../scala/org/apache/spark/HttpFileServer.scala | 6 +-
.../apache/spark/InterruptibleIterator.scala | 2 +-
.../main/scala/org/apache/spark/Logging.scala | 4 +-
.../org/apache/spark/MapOutputTracker.scala | 12 +-
.../scala/org/apache/spark/Partitioner.scala | 4 +-
.../scala/org/apache/spark/SparkContext.scala | 75 +-
.../main/scala/org/apache/spark/SparkEnv.scala | 10 -
.../org/apache/spark/SparkHadoopWriter.scala | 15 +-
.../org/apache/spark/api/python/PythonRDD.scala | 3 +-
.../org/apache/spark/broadcast/Broadcast.scala | 1 +
.../spark/broadcast/BroadcastFactory.scala | 2 +-
.../apache/spark/broadcast/HttpBroadcast.scala | 5 +-
.../spark/broadcast/TorrentBroadcast.scala | 12 +-
.../scala/org/apache/spark/deploy/Client.scala | 3 +-
.../spark/deploy/worker/CommandUtils.scala | 3 +-
.../executor/CoarseGrainedExecutorBackend.scala | 1 -
.../org/apache/spark/executor/Executor.scala | 4 +-
.../org/apache/spark/executor/TaskMetrics.scala | 10 +
.../apache/spark/network/BufferMessage.scala | 2 +-
.../org/apache/spark/network/Connection.scala | 6 +-
.../org/apache/spark/network/Message.scala | 6 +-
.../spark/network/netty/ShuffleSender.scala | 2 +-
.../main/scala/org/apache/spark/package.scala | 3 +
.../org/apache/spark/rdd/CoGroupedRDD.scala | 3 +
.../org/apache/spark/rdd/CoalescedRDD.scala | 10 +-
.../scala/org/apache/spark/rdd/HadoopRDD.scala | 32 +-
.../org/apache/spark/rdd/NewHadoopRDD.scala | 24 +-
.../org/apache/spark/rdd/PairRDDFunctions.scala | 12 +-
.../scala/org/apache/spark/rdd/PipedRDD.scala | 5 +-
.../main/scala/org/apache/spark/rdd/RDD.scala | 9 +-
.../apache/spark/scheduler/DAGScheduler.scala | 4 +-
.../spark/scheduler/InputFormatInfo.scala | 8 +-
.../scala/org/apache/spark/scheduler/Pool.scala | 8 +-
.../spark/scheduler/SchedulingAlgorithm.scala | 11 +-
.../apache/spark/scheduler/SparkListener.scala | 16 +-
.../spark/scheduler/SparkListenerBus.scala | 17 +-
.../org/apache/spark/scheduler/Stage.scala | 2 +-
.../org/apache/spark/scheduler/TaskResult.scala | 2 +-
.../spark/scheduler/TaskResultGetter.scala | 2 +-
.../spark/scheduler/TaskSchedulerImpl.scala | 5 +-
.../apache/spark/scheduler/TaskSetManager.scala | 14 +-
.../cluster/CoarseGrainedSchedulerBackend.scala | 2 +-
.../mesos/CoarseMesosSchedulerBackend.scala | 2 +-
.../cluster/mesos/MesosSchedulerBackend.scala | 6 +-
.../org/apache/spark/storage/BlockManager.scala | 7 +-
.../spark/storage/BlockManagerWorker.scala | 20 +-
.../org/apache/spark/storage/BlockMessage.scala | 2 +-
.../spark/storage/BlockMessageArray.scala | 2 +-
.../spark/storage/BlockObjectWriter.scala | 4 +-
.../org/apache/spark/storage/MemoryStore.scala | 2 +-
.../spark/storage/ShuffleBlockManager.scala | 2 +-
.../org/apache/spark/storage/StorageLevel.scala | 6 +-
.../apache/spark/ui/jobs/ExecutorSummary.scala | 2 +
.../apache/spark/ui/jobs/ExecutorTable.scala | 4 +
.../spark/ui/jobs/JobProgressListener.scala | 14 +
.../org/apache/spark/ui/jobs/StagePage.scala | 53 +-
.../org/apache/spark/ui/jobs/StageTable.scala | 2 +-
.../org/apache/spark/util/ClosureCleaner.scala | 10 +-
.../apache/spark/util/CompletionIterator.scala | 11 +-
.../org/apache/spark/util/MetadataCleaner.scala | 8 +-
.../spark/util/RateLimitedOutputStream.scala | 79 --
.../org/apache/spark/util/SizeEstimator.scala | 10 +-
.../scala/org/apache/spark/util/Utils.scala | 36 +-
.../scala/org/apache/spark/util/Vector.scala | 12 +-
.../spark/util/collection/AppendOnlyMap.scala | 2 +-
.../apache/spark/util/collection/BitSet.scala | 87 +-
.../util/collection/ExternalAppendOnlyMap.scala | 72 +-
.../spark/util/collection/OpenHashSet.scala | 23 +-
.../org/apache/spark/LocalSparkContext.scala | 1 -
.../apache/spark/MapOutputTrackerSuite.scala | 1 -
.../spark/scheduler/ClusterSchedulerSuite.scala | 9 +-
.../spark/scheduler/DAGSchedulerSuite.scala | 3 +-
.../apache/spark/scheduler/JobLoggerSuite.scala | 7 +-
.../spark/storage/BlockManagerSuite.scala | 4 -
.../apache/spark/util/ClosureCleanerSuite.scala | 14 +-
.../util/RateLimitedOutputStreamSuite.scala | 40 -
.../collection/ExternalAppendOnlyMapSuite.scala | 77 +-
docs/_config.yml | 2 +-
docs/_layouts/global.html | 8 +-
docs/_plugins/copy_api_dirs.rb | 2 +-
docs/api.md | 1 +
docs/bagel-programming-guide.md | 10 +-
docs/configuration.md | 13 +-
docs/graphx-programming-guide.md | 1003 ++++++++++++++++++
docs/img/data_parallel_vs_graph_parallel.png | Bin 0 -> 432725 bytes
docs/img/edge-cut.png | Bin 0 -> 12563 bytes
docs/img/edge_cut_vs_vertex_cut.png | Bin 0 -> 79745 bytes
docs/img/graph_analytics_pipeline.png | Bin 0 -> 427220 bytes
docs/img/graph_parallel.png | Bin 0 -> 92288 bytes
docs/img/graphx_figures.pptx | Bin 0 -> 1123363 bytes
docs/img/graphx_logo.png | Bin 0 -> 40324 bytes
docs/img/graphx_performance_comparison.png | Bin 0 -> 166343 bytes
docs/img/property_graph.png | Bin 0 -> 225151 bytes
docs/img/tables_and_graphs.png | Bin 0 -> 166265 bytes
docs/img/triplet.png | Bin 0 -> 31489 bytes
docs/img/vertex-cut.png | Bin 0 -> 12246 bytes
docs/img/vertex_routing_edge_tables.png | Bin 0 -> 570007 bytes
docs/index.md | 4 +-
docs/mllib-guide.md | 19 +-
docs/python-programming-guide.md | 8 +-
docs/streaming-programming-guide.md | 6 +-
.../org/apache/spark/examples/LocalALS.scala | 8 +-
.../org/apache/spark/examples/LocalFileLR.scala | 2 +-
.../org/apache/spark/examples/LocalKMeans.scala | 2 +-
.../org/apache/spark/examples/SparkALS.scala | 6 +-
.../org/apache/spark/examples/SparkHdfsLR.scala | 2 +-
.../org/apache/spark/examples/SparkKMeans.scala | 12 +-
.../examples/graphx/LiveJournalPageRank.scala | 49 +
.../streaming/examples/RawNetworkGrep.scala | 2 +-
.../examples/RecoverableNetworkWordCount.scala | 2 +-
.../streaming/examples/TwitterAlgebirdCMS.scala | 4 +-
.../streaming/examples/TwitterAlgebirdHLL.scala | 4 +-
.../streaming/examples/TwitterPopularTags.scala | 4 +-
.../clickstream/PageViewGenerator.scala | 2 +-
.../examples/clickstream/PageViewStream.scala | 2 +-
.../spark/streaming/flume/FlumeUtils.scala | 4 +-
.../spark/streaming/kafka/KafkaUtils.scala | 6 +-
.../apache/spark/streaming/mqtt/MQTTUtils.scala | 4 +-
.../spark/streaming/mqtt/MQTTStreamSuite.scala | 2 +-
.../spark/streaming/twitter/TwitterUtils.scala | 7 +-
.../streaming/twitter/TwitterStreamSuite.scala | 2 +-
.../spark/streaming/zeromq/ZeroMQUtils.scala | 3 +-
graphx/data/followers.txt | 8 +
graphx/data/users.txt | 7 +
graphx/pom.xml | 67 ++
.../scala/org/apache/spark/graphx/Edge.scala | 45 +
.../org/apache/spark/graphx/EdgeDirection.scala | 44 +
.../scala/org/apache/spark/graphx/EdgeRDD.scala | 102 ++
.../org/apache/spark/graphx/EdgeTriplet.scala | 49 +
.../scala/org/apache/spark/graphx/Graph.scala | 405 +++++++
.../spark/graphx/GraphKryoRegistrator.scala | 31 +
.../org/apache/spark/graphx/GraphLoader.scala | 72 ++
.../org/apache/spark/graphx/GraphOps.scala | 301 ++++++
.../apache/spark/graphx/PartitionStrategy.scala | 103 ++
.../scala/org/apache/spark/graphx/Pregel.scala | 139 +++
.../org/apache/spark/graphx/VertexRDD.scala | 347 ++++++
.../spark/graphx/impl/EdgePartition.scala | 220 ++++
.../graphx/impl/EdgePartitionBuilder.scala | 45 +
.../spark/graphx/impl/EdgeTripletIterator.scala | 42 +
.../apache/spark/graphx/impl/GraphImpl.scala | 379 +++++++
.../spark/graphx/impl/MessageToPartition.scala | 98 ++
.../graphx/impl/ReplicatedVertexView.scala | 195 ++++
.../apache/spark/graphx/impl/RoutingTable.scala | 65 ++
.../apache/spark/graphx/impl/Serializers.scala | 395 +++++++
.../spark/graphx/impl/VertexPartition.scala | 261 +++++
.../org/apache/spark/graphx/impl/package.scala | 7 +
.../org/apache/spark/graphx/lib/Analytics.scala | 136 +++
.../spark/graphx/lib/ConnectedComponents.scala | 38 +
.../org/apache/spark/graphx/lib/PageRank.scala | 147 +++
.../apache/spark/graphx/lib/SVDPlusPlus.scala | 138 +++
.../lib/StronglyConnectedComponents.scala | 94 ++
.../apache/spark/graphx/lib/TriangleCount.scala | 76 ++
.../scala/org/apache/spark/graphx/package.scala | 18 +
.../spark/graphx/util/BytecodeUtils.scala | 117 ++
.../spark/graphx/util/GraphGenerators.scala | 218 ++++
.../collection/PrimitiveKeyOpenHashMap.scala | 153 +++
graphx/src/test/resources/log4j.properties | 28 +
.../org/apache/spark/graphx/GraphOpsSuite.scala | 66 ++
.../org/apache/spark/graphx/GraphSuite.scala | 273 +++++
.../apache/spark/graphx/LocalSparkContext.scala | 28 +
.../org/apache/spark/graphx/PregelSuite.scala | 41 +
.../apache/spark/graphx/SerializerSuite.scala | 183 ++++
.../apache/spark/graphx/VertexRDDSuite.scala | 85 ++
.../spark/graphx/impl/EdgePartitionSuite.scala | 76 ++
.../graphx/impl/VertexPartitionSuite.scala | 113 ++
.../graphx/lib/ConnectedComponentsSuite.scala | 113 ++
.../apache/spark/graphx/lib/PageRankSuite.scala | 119 +++
.../spark/graphx/lib/SVDPlusPlusSuite.scala | 31 +
.../lib/StronglyConnectedComponentsSuite.scala | 57 +
.../spark/graphx/lib/TriangleCountSuite.scala | 70 ++
.../spark/graphx/util/BytecodeUtilsSuite.scala | 93 ++
mllib/data/sample_naive_bayes_data.txt | 6 +
.../spark/mllib/api/python/PythonMLLibAPI.scala | 46 +-
.../classification/LogisticRegression.scala | 4 +-
.../spark/mllib/classification/NaiveBayes.scala | 65 +-
.../apache/spark/mllib/classification/SVM.scala | 2 +
.../apache/spark/mllib/recommendation/ALS.scala | 15 +-
.../spark/mllib/regression/LabeledPoint.scala | 6 +-
.../apache/spark/mllib/regression/Lasso.scala | 4 +-
.../mllib/regression/LinearRegression.scala | 2 +
.../mllib/regression/RidgeRegression.scala | 4 +-
.../classification/JavaNaiveBayesSuite.java | 72 ++
pom.xml | 5 +-
project/SparkBuild.scala | 21 +-
python/pyspark/mllib/_common.py | 2 +-
python/pyspark/mllib/classification.py | 77 +-
python/pyspark/mllib/clustering.py | 11 +-
python/pyspark/mllib/recommendation.py | 10 +-
python/pyspark/mllib/regression.py | 35 +-
python/pyspark/worker.py | 4 +
python/run-tests | 5 +
.../scala/org/apache/spark/repl/ReplSuite.scala | 2 -
.../org/apache/spark/streaming/Checkpoint.scala | 6 +-
.../apache/spark/streaming/ContextWaiter.scala | 28 +
.../org/apache/spark/streaming/DStream.scala | 741 -------------
.../spark/streaming/DStreamCheckpointData.scala | 128 ---
.../apache/spark/streaming/DStreamGraph.scala | 12 +-
.../spark/streaming/PairDStreamFunctions.scala | 621 -----------
.../spark/streaming/StreamingContext.scala | 117 +-
.../spark/streaming/api/java/JavaDStream.scala | 3 +-
.../streaming/api/java/JavaDStreamLike.scala | 30 +-
.../streaming/api/java/JavaPairDStream.scala | 1 +
.../api/java/JavaStreamingContext.scala | 28 +-
.../spark/streaming/dstream/DStream.scala | 775 ++++++++++++++
.../dstream/DStreamCheckpointData.scala | 126 +++
.../streaming/dstream/FileInputDStream.scala | 86 +-
.../streaming/dstream/FilteredDStream.scala | 2 +-
.../dstream/FlatMapValuedDStream.scala | 2 +-
.../streaming/dstream/FlatMappedDStream.scala | 2 +-
.../streaming/dstream/ForEachDStream.scala | 2 +-
.../streaming/dstream/GlommedDStream.scala | 2 +-
.../spark/streaming/dstream/InputDStream.scala | 4 +-
.../dstream/MapPartitionedDStream.scala | 2 +-
.../streaming/dstream/MapValuedDStream.scala | 2 +-
.../spark/streaming/dstream/MappedDStream.scala | 2 +-
.../streaming/dstream/NetworkInputDStream.scala | 2 +-
.../dstream/PairDStreamFunctions.scala | 622 +++++++++++
.../dstream/ReducedWindowedDStream.scala | 2 +-
.../streaming/dstream/ShuffledDStream.scala | 2 +-
.../spark/streaming/dstream/StateDStream.scala | 10 +-
.../streaming/dstream/TransformedDStream.scala | 2 +-
.../spark/streaming/dstream/UnionDStream.scala | 3 +-
.../streaming/dstream/WindowedDStream.scala | 17 +-
.../apache/spark/streaming/scheduler/Job.scala | 9 +-
.../streaming/scheduler/JobGenerator.scala | 81 +-
.../streaming/scheduler/JobScheduler.scala | 141 ++-
.../spark/streaming/scheduler/JobSet.scala | 18 +-
.../scheduler/NetworkInputTracker.scala | 40 +-
.../streaming/scheduler/StreamingListener.scala | 3 +-
.../scheduler/StreamingListenerBus.scala | 23 +-
.../org/apache/spark/streaming/util/Clock.scala | 4 +-
.../streaming/util/MasterFailureTest.scala | 4 +-
.../util/RateLimitedOutputStream.scala | 79 ++
.../spark/streaming/util/RawTextHelper.scala | 2 +-
.../spark/streaming/util/RawTextSender.scala | 13 +-
.../spark/streaming/util/RecurringTimer.scala | 13 +-
.../streaming/LocalJavaStreamingContext.java | 2 -
.../spark/streaming/BasicOperationsSuite.scala | 79 +-
.../spark/streaming/CheckpointSuite.scala | 25 +-
.../spark/streaming/StreamingContextSuite.scala | 219 ++++
.../streaming/StreamingListenerSuite.scala | 1 +
.../apache/spark/streaming/TestSuiteBase.scala | 10 +-
.../spark/streaming/WindowOperationsSuite.scala | 15 +
.../util/RateLimitedOutputStreamSuite.scala | 40 +
.../tools/JavaAPICompletenessChecker.scala | 50 +-
.../org/apache/spark/deploy/yarn/Client.scala | 7 +-
.../spark/deploy/yarn/WorkerLauncher.scala | 8 +-
.../spark/deploy/yarn/WorkerRunnable.scala | 7 +-
.../yarn/ClientDistributedCacheManager.scala | 10 +-
.../ClientDistributedCacheManagerSuite.scala | 2 +-
.../org/apache/spark/deploy/yarn/Client.scala | 3 +-
257 files changed, 10443 insertions(+), 2434 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/845e568f/docs/mllib-guide.md
----------------------------------------------------------------------
diff --cc docs/mllib-guide.md
index 21d0464,1a5c640..a140ecb
--- a/docs/mllib-guide.md
+++ b/docs/mllib-guide.md
@@@ -426,55 -435,5 +435,55 @@@ signals), you can use the trainImplicit
{% highlight python %}
# Build the recommendation model using Alternating Least Squares based on implicit ratings
- model = ALS.trainImplicit(sc, ratings, 1, 20)
+ model = ALS.trainImplicit(ratings, 1, 20)
{% endhighlight %}
+
+
+# Singular Value Decomposition
+Singular Value Decomposition for Tall and Skinny matrices.
+Given an *m x n* matrix *A*, we can compute matrices *U, S, V* such that
+
+*A = U * S * V^T*
+
+There is no restriction on m, but we require n^2 doubles to fit in memory.
+Further, n should be less than m.
+
+The decomposition is computed by first computing *A^TA = V S^2 V^T*,
+computing svd locally on that (since n x n is small),
+from which we recover S and V.
+Then we compute U via easy matrix multiplication
+as *U = A * V * S^-1*
+
+Only singular vectors associated with largest k singular values
+are recovered. If there are k
+such values, then the dimensions of the return will be:
+
+* *S* is *k x k* and diagonal, holding the singular values on diagonal.
+* *U* is *m x k* and satisfies U^T*U = eye(k).
+* *V* is *n x k* and satisfies V^TV = eye(k).
+
+All input and output is expected in sparse matrix format, 1-indexed
+as tuples of the form ((i,j),value) all in
+SparseMatrix RDDs. Below is example usage.
+
+{% highlight scala %}
+
+import org.apache.spark.SparkContext
+import org.apache.spark.mllib.linalg.SVD
+import org.apache.spark.mllib.linalg.SparseMatrix
+import org.apache.spark.mllib.linalg.MatrixEntry
+
+// Load and parse the data file
+val data = sc.textFile("mllib/data/als/test.data").map { line =>
+ val parts = line.split(',')
+ MatrixEntry(parts(0).toInt, parts(1).toInt, parts(2).toDouble)
+}
+val m = 4
+val n = 4
+val k = 1
+
+// recover largest singular vector
+val decomposed = SVD.sparseSVD(SparseMatrix(data, m, n), k)
+val s = decomposed.S.data
+
+println("singular values = " + s.toArray.mkString)
[35/50] git commit: fix example
Posted by ma...@apache.org.
fix example
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/cf5bd4ab
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/cf5bd4ab
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/cf5bd4ab
Branch: refs/heads/master
Commit: cf5bd4ab2e9db72d3d9164053523e9e872d85b94
Parents: 4f38b6f
Author: Reza Zadeh <ri...@gmail.com>
Authored: Thu Jan 9 22:39:41 2014 -0800
Committer: Reza Zadeh <ri...@gmail.com>
Committed: Thu Jan 9 22:39:41 2014 -0800
----------------------------------------------------------------------
examples/src/main/scala/org/apache/spark/examples/SparkSVD.scala | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/cf5bd4ab/examples/src/main/scala/org/apache/spark/examples/SparkSVD.scala
----------------------------------------------------------------------
diff --git a/examples/src/main/scala/org/apache/spark/examples/SparkSVD.scala b/examples/src/main/scala/org/apache/spark/examples/SparkSVD.scala
index 4b9e674..d9c672f 100644
--- a/examples/src/main/scala/org/apache/spark/examples/SparkSVD.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/SparkSVD.scala
@@ -33,8 +33,8 @@ import org.apache.spark.mllib.linalg.SparseMatrix
*/
object SparkSVD {
def main(args: Array[String]) {
- if (args.length < 3) {
- System.err.println("Usage: SVD <master> <file>")
+ if (args.length != 2) {
+ System.err.println("Usage: SparkSVD <master> <file>")
System.exit(1)
}
val sc = new SparkContext(args(0), "SVD",
[28/50] git commit: fix tests
Posted by ma...@apache.org.
fix tests
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/8bfcce1a
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/8bfcce1a
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/8bfcce1a
Branch: refs/heads/master
Commit: 8bfcce1ad81348a5eac3e3d332ddc293380c041a
Parents: 35adc72
Author: Reza Zadeh <ri...@gmail.com>
Authored: Sat Jan 4 11:52:42 2014 -0800
Committer: Reza Zadeh <ri...@gmail.com>
Committed: Sat Jan 4 11:52:42 2014 -0800
----------------------------------------------------------------------
.../org/apache/spark/mllib/linalg/SVD.scala | 28 +++++++++++++-------
.../apache/spark/mllib/linalg/SVDSuite.scala | 28 ++++++++++++--------
2 files changed, 36 insertions(+), 20 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/8bfcce1a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
index 9703e84..d1ee6c6 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
@@ -31,7 +31,7 @@ import org.jblas.{DoubleMatrix, Singular, MatrixFunctions}
* @param m number of rows
* @param n number of columns
*/
-class GradientDescent(var data: RDD[MatrixEntry], var m: Int, var n: Int) {
+class SVD(var data: RDD[MatrixEntry], var m: Int, var n: Int) {
private var k: Int = 1
/**
@@ -65,6 +65,13 @@ class GradientDescent(var data: RDD[MatrixEntry], var m: Int, var n: Int) {
this.n = n
this
}
+
+ /**
+ * Compute SVD using the current set parameters
+ */
+ def computeSVD() : SVDecomposedMatrix = {
+ SVD.sparseSVD(data, m, n, k)
+ }
}
@@ -169,33 +176,36 @@ object SVD {
SVDecomposedMatrix(retU, retS, retV)
}
-/*
+
def main(args: Array[String]) {
if (args.length < 8) {
- println("Usage: SVD <master> <matrix_file> <m> <n>
- <minimum_singular_value> <output_U_file> <output_S_file> <output_V_file>")
+ println("Usage: SVD <master> <matrix_file> <m> <n>" +
+ "<k> <output_U_file> <output_S_file> <output_V_file>")
System.exit(1)
}
- val (master, inputFile, m, n, min_svalue, output_u, output_s, output_v) =
+ val (master, inputFile, m, n, k, output_u, output_s, output_v) =
(args(0), args(1), args(2).toInt, args(3).toInt,
- args(4).toDouble, args(5), args(6), args(7))
+ args(4).toInt, args(5), args(6), args(7))
val sc = new SparkContext(master, "SVD")
val rawdata = sc.textFile(inputFile)
val data = rawdata.map { line =>
val parts = line.split(',')
- ((parts(0).toInt, parts(1).toInt), parts(2).toDouble)
+ MatrixEntry(parts(0).toInt, parts(1).toInt, parts(2).toDouble)
}
- val (u, s, v) = SVD.sparseSVD(data, m, n, min_svalue)
+ val decomposed = SVD.sparseSVD(data, m, n, k)
+ val u = decomposed.U
+ val s = decomposed.S
+ val v = decomposed.V
+
println("Computed " + s.toArray.length + " singular values and vectors")
u.saveAsTextFile(output_u)
s.saveAsTextFile(output_s)
v.saveAsTextFile(output_v)
System.exit(0)
}
-*/
}
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/8bfcce1a/mllib/src/test/scala/org/apache/spark/mllib/linalg/SVDSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/SVDSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/SVDSuite.scala
index dc4e923..a83d9d0 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/SVDSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/SVDSuite.scala
@@ -87,15 +87,18 @@ class SVDSuite extends FunSuite with BeforeAndAfterAll {
// check multiplication guarantee
assertMatrixEquals(retu.mmul(rets).mmul(retv.transpose), densea)
}
-/*
+
test("rank one matrix svd") {
val m = 10
val n = 3
- val data = sc.makeRDD(Array.tabulate(m,n){ (a,b)=>
- ((a+1,b+1), 1.0) }.flatten )
- val min_svalue = 1.0e-4
+ val data = sc.makeRDD(Array.tabulate(m, n){ (a,b) =>
+ MatrixEntry(a + 1, b + 1, 1.0) }.flatten )
+ val k = 1
- val (u, s, v) = SVD.sparseSVD(data, m, n, min_svalue)
+ val decomposed = SVD.sparseSVD(data, m, n, k)
+ val u = decomposed.U
+ val s = decomposed.S
+ val v = decomposed.V
val retrank = s.toArray.length
assert(retrank == 1, "rank returned not one")
@@ -116,15 +119,18 @@ class SVDSuite extends FunSuite with BeforeAndAfterAll {
assertMatrixEquals(retu.mmul(rets).mmul(retv.transpose), densea)
}
- test("truncated with min singular value") {
+ test("truncated with k") {
val m = 10
val n = 3
- val data = sc.makeRDD(Array.tabulate(m,n){ (a,b)=>
- ((a+1,b+1), (a+2).toDouble*(b+1)/(1+a+b)) }.flatten )
+ val data = sc.makeRDD(Array.tabulate(m,n){ (a, b) =>
+ MatrixEntry(a + 1, b + 1, (a + 2).toDouble*(b + 1)/(1 + a + b)) }.flatten )
- val min_svalue = 5.0 // only one svalue above this
+ val k = 1 // only one svalue above this
- val (u, s, v) = SVD.sparseSVD(data, m, n, min_svalue)
+ val decomposed = SVD.sparseSVD(data, m, n, k)
+ val u = decomposed.U
+ val s = decomposed.S
+ val v = decomposed.V
val retrank = s.toArray.length
val densea = getDenseMatrix(data, m, n)
@@ -140,5 +146,5 @@ class SVDSuite extends FunSuite with BeforeAndAfterAll {
assertMatrixEquals(retu, svd(0).getColumn(0))
assertMatrixEquals(rets, DoubleMatrix.diag(svd(1).getRow(0)))
assertMatrixEquals(retv, svd(2).getColumn(0))
- }*/
+ }
}
[23/50] git commit: start using matrixentry
Posted by ma...@apache.org.
start using matrixentry
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/7f631dd2
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/7f631dd2
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/7f631dd2
Branch: refs/heads/master
Commit: 7f631dd2a9e2467871167da1514be9863485a96f
Parents: 6bcdb76
Author: Reza Zadeh <ri...@gmail.com>
Authored: Fri Jan 3 22:17:24 2014 -0800
Committer: Reza Zadeh <ri...@gmail.com>
Committed: Fri Jan 3 22:17:24 2014 -0800
----------------------------------------------------------------------
.../org/apache/spark/mllib/linalg/SVD.scala | 23 ++++++++++++--------
1 file changed, 14 insertions(+), 9 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/7f631dd2/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
index 2198e6a..08af2c8 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
@@ -23,6 +23,8 @@ import org.apache.spark.rdd.RDD
import org.jblas.{DoubleMatrix, Singular, MatrixFunctions}
+import org.apache.spark.linalg.MatrixEntry
+
/**
* Top-level methods for calling Singular Value Decomposition
* NOTE: All matrices are in 1-indexed sparse format RDD[((int, int), value)]
@@ -60,13 +62,13 @@ object SVD {
* @return Three sparse matrices: U, S, V such that A = USV^T
*/
def sparseSVD(
- data: RDD[((Int, Int), Double)],
+ data: RDD[MatrixEntry],
m: Int,
n: Int,
min_svalue: Double)
- : ( RDD[((Int, Int), Double)],
- RDD[((Int, Int), Double)],
- RDD[((Int, Int), Double)]) =
+ : ( RDD[MatrixEntry],
+ RDD[MatrixEntry],
+ RDD[MatrixEntry]) =
{
if (m < n || m <= 0 || n <= 0) {
throw new IllegalArgumentException("Expecting a tall and skinny matrix")
@@ -78,7 +80,7 @@ object SVD {
// Compute A^T A, assuming rows are sparse enough to fit in memory
val rows = data.map(entry =>
- (entry._1._1, (entry._1._2, entry._2))).groupByKey().cache()
+ (entry.i, (entry.j, entry.mval))).groupByKey().cache()
val emits = rows.flatMap{ case (rowind, cols) =>
cols.flatMap{ case (colind1, mval1) =>
cols.map{ case (colind2, mval2) =>
@@ -106,9 +108,9 @@ object SVD {
// prepare V for returning
val retV = sc.makeRDD(
Array.tabulate(V.rows, sigma.length){ (i,j) =>
- ((i+1, j+1), V.get(i,j)) }.flatten)
+ MatrixEntry(i+1, j+1, V.get(i,j)) }.flatten)
- val retS = sc.makeRDD(Array.tabulate(sigma.length){x=>((x+1,x+1),sigma(x))})
+ val retS = sc.makeRDD(Array.tabulate(sigma.length){x=>MatrixEntry(x+1,x+1,sigma(x))})
// Compute U as U = A V S^-1
// turn V S^-1 into an RDD as a sparse matrix and cache it
@@ -120,6 +122,7 @@ object SVD {
val bRows = vsirdd.map(entry => (entry._1._1, (entry._1._2, entry._2)))
val retU = aCols.join(bRows).map( {case (key, ( (rowInd, rowVal), (colInd, colVal)) )
=> ((rowInd, colInd), rowVal*colVal)}).reduceByKey(_+_)
+ .map( case (row, col, mval) => MatrixEntry(row, col, mval))
(retU, retS, retV)
}
@@ -127,11 +130,13 @@ object SVD {
def main(args: Array[String]) {
if (args.length < 8) {
- println("Usage: SVD <master> <matrix_file> <m> <n> <minimum_singular_value> <output_U_file> <output_S_file> <output_V_file>")
+ println("Usage: SVD <master> <matrix_file> <m> <n>
+ <minimum_singular_value> <output_U_file> <output_S_file> <output_V_file>")
System.exit(1)
}
val (master, inputFile, m, n, min_svalue, output_u, output_s, output_v) =
- (args(0), args(1), args(2).toInt, args(3).toInt, args(4).toDouble, args(5), args(6), args(7))
+ (args(0), args(1), args(2).toInt, args(3).toInt,
+ args(4).toDouble, args(5), args(6), args(7))
val sc = new SparkContext(master, "SVD")
[15/50] git commit: remove accidental changes to ec2 script
Posted by ma...@apache.org.
remove accidental changes to ec2 script
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/0c3797dd
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/0c3797dd
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/0c3797dd
Branch: refs/heads/master
Commit: 0c3797dd15c1323d046e4eae36c2914470c8701e
Parents: 53ccf65
Author: Reza Zadeh <ri...@gmail.com>
Authored: Wed Jan 1 20:05:03 2014 -0800
Committer: Reza Zadeh <ri...@gmail.com>
Committed: Wed Jan 1 20:05:03 2014 -0800
----------------------------------------------------------------------
ec2/spark_ec2.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/0c3797dd/ec2/spark_ec2.py
----------------------------------------------------------------------
diff --git a/ec2/spark_ec2.py b/ec2/spark_ec2.py
index 5e8b381..ac309cc 100755
--- a/ec2/spark_ec2.py
+++ b/ec2/spark_ec2.py
@@ -636,7 +636,7 @@ def get_partition(total, num_partitions, current_partitions):
def real_main():
(opts, action, cluster_name) = parse_args()
try:
- conn = ec2.connect_to_region(opts.region,aws_access_key_id="AKIAI2EGAQ7GYNL4LRAA", aws_secret_access_key="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")
+ conn = ec2.connect_to_region(opts.region)
except Exception as e:
print >> stderr, (e)
sys.exit(1)
[09/50] git commit: Merge remote-tracking branch 'upstream/master' into sparsesvd
Posted by ma...@apache.org.
Merge remote-tracking branch 'upstream/master' into sparsesvd
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/7c04b313
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/7c04b313
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/7c04b313
Branch: refs/heads/master
Commit: 7c04b3134a4257f9aea736caecbcec1f451c2443
Parents: ae5102a c1d928a
Author: Reza Zadeh <ri...@gmail.com>
Authored: Wed Jan 1 18:12:35 2014 -0800
Committer: Reza Zadeh <ri...@gmail.com>
Committed: Wed Jan 1 18:12:35 2014 -0800
----------------------------------------------------------------------
core/pom.xml | 422 ++++++++++---------
.../apache/spark/network/netty/FileClient.java | 32 +-
.../netty/FileClientChannelInitializer.java | 6 +-
.../spark/network/netty/FileClientHandler.java | 12 +-
.../apache/spark/network/netty/FileServer.java | 29 +-
.../netty/FileServerChannelInitializer.java | 3 +-
.../spark/network/netty/FileServerHandler.java | 18 +-
.../org/apache/spark/default-log4j.properties | 8 +
.../scala/org/apache/spark/HttpServer.scala | 1 +
.../main/scala/org/apache/spark/Logging.scala | 41 +-
.../scala/org/apache/spark/SparkContext.scala | 26 +-
.../scala/org/apache/spark/TaskEndReason.scala | 2 -
.../org/apache/spark/api/java/JavaPairRDD.scala | 36 ++
.../org/apache/spark/api/java/JavaRDDLike.scala | 11 +
.../spark/api/java/JavaSparkContext.scala | 15 +-
.../spark/deploy/worker/ui/WorkerWebUI.scala | 6 +-
.../org/apache/spark/executor/Executor.scala | 2 -
.../apache/spark/metrics/MetricsConfig.scala | 1 -
.../apache/spark/metrics/MetricsSystem.scala | 1 -
.../org/apache/spark/rdd/CheckpointRDD.scala | 32 +-
.../org/apache/spark/rdd/PairRDDFunctions.scala | 42 ++
.../main/scala/org/apache/spark/rdd/RDD.scala | 16 +-
.../apache/spark/rdd/RDDCheckpointData.scala | 15 +-
.../apache/spark/scheduler/DAGScheduler.scala | 44 +-
.../org/apache/spark/scheduler/JobLogger.scala | 6 +-
.../scala/org/apache/spark/scheduler/Pool.scala | 4 -
.../apache/spark/scheduler/Schedulable.scala | 1 -
.../apache/spark/scheduler/SparkListener.scala | 20 +-
.../spark/scheduler/SparkListenerBus.scala | 2 +-
.../spark/scheduler/TaskSchedulerImpl.scala | 7 -
.../apache/spark/scheduler/TaskSetManager.scala | 10 -
.../spark/storage/BlockManagerMasterActor.scala | 2 -
.../spark/storage/BlockManagerWorker.scala | 3 -
.../spark/storage/BlockMessageArray.scala | 2 -
.../spark/ui/jobs/JobProgressListener.scala | 11 +-
.../spark/util/SerializableHyperLogLog.scala | 50 +++
.../org/apache/spark/CheckpointSuite.scala | 2 -
.../scala/org/apache/spark/JavaAPISuite.java | 36 +-
.../spark/rdd/PairRDDFunctionsSuite.scala | 34 ++
.../scala/org/apache/spark/rdd/RDDSuite.scala | 13 +
.../apache/spark/scheduler/JobLoggerSuite.scala | 2 +-
.../spark/scheduler/SparkListenerSuite.scala | 2 +-
.../spark/serializer/KryoSerializerSuite.scala | 4 +
pom.xml | 7 +-
project/SparkBuild.scala | 5 +-
python/pyspark/context.py | 9 +-
python/pyspark/tests.py | 4 +-
spark-class | 9 +-
spark-class2.cmd | 2 +-
.../org/apache/spark/streaming/Checkpoint.scala | 52 ++-
.../org/apache/spark/streaming/DStream.scala | 2 -
.../apache/spark/streaming/DStreamGraph.scala | 1 -
.../spark/streaming/StreamingContext.scala | 34 +-
.../api/java/JavaStreamingContext.scala | 13 +-
.../streaming/dstream/FileInputDStream.scala | 153 ++++---
.../streaming/dstream/NetworkInputDStream.scala | 2 -
.../streaming/scheduler/JobGenerator.scala | 67 ++-
.../streaming/scheduler/JobScheduler.scala | 2 -
.../streaming/util/MasterFailureTest.scala | 3 -
.../spark/streaming/CheckpointSuite.scala | 44 +-
.../spark/streaming/InputStreamsSuite.scala | 2 +-
61 files changed, 889 insertions(+), 554 deletions(-)
----------------------------------------------------------------------
[48/50] git commit: rename to MatrixSVD
Posted by ma...@apache.org.
rename to MatrixSVD
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/fa329983
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/fa329983
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/fa329983
Branch: refs/heads/master
Commit: fa3299835bd52faf766929987e1aa4686730e2b4
Parents: caf97a2
Author: Reza Zadeh <ri...@gmail.com>
Authored: Fri Jan 17 14:39:30 2014 -0800
Committer: Reza Zadeh <ri...@gmail.com>
Committed: Fri Jan 17 14:39:30 2014 -0800
----------------------------------------------------------------------
.../main/scala/org/apache/spark/mllib/linalg/MatrixSVD.scala | 6 +++---
mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala | 2 +-
2 files changed, 4 insertions(+), 4 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/fa329983/mllib/src/main/scala/org/apache/spark/mllib/linalg/MatrixSVD.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/MatrixSVD.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/MatrixSVD.scala
index 6220035..319f82b 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/MatrixSVD.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/MatrixSVD.scala
@@ -24,6 +24,6 @@ package org.apache.spark.mllib.linalg
* @param S such that A = USV^T
* @param V such that A = USV^T
*/
-case class SVDecomposedMatrix(val U: SparseMatrix,
- val S: SparseMatrix,
- val V: SparseMatrix)
+case class MatrixSVD(val U: SparseMatrix,
+ val S: SparseMatrix,
+ val V: SparseMatrix)
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/fa329983/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
index 91c6220..cab98b3 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
@@ -150,7 +150,7 @@ object SVD {
.map{ case ((row, col), mval) => MatrixEntry(row, col, mval)}
val retU = SparseMatrix(retUdata, m, sigma.length)
- SVDecomposedMatrix(retU, retS, retV)
+ MatrixSVD(retU, retS, retV)
}
[19/50] git commit: Merge remote-tracking branch 'upstream/master' into sparsesvd
Posted by ma...@apache.org.
Merge remote-tracking branch 'upstream/master' into sparsesvd
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/61405785
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/61405785
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/61405785
Branch: refs/heads/master
Commit: 61405785bc561b55681100fc3ef7e15ae8c4b113
Parents: 2612164 3713f81
Author: Reza Zadeh <ri...@gmail.com>
Authored: Thu Jan 2 01:50:30 2014 -0800
Committer: Reza Zadeh <ri...@gmail.com>
Committed: Thu Jan 2 01:50:30 2014 -0800
----------------------------------------------------------------------
.../scala/org/apache/spark/Accumulators.scala | 8 +-
.../org/apache/spark/MapOutputTracker.scala | 11 +-
.../scala/org/apache/spark/Partitioner.scala | 4 +-
.../main/scala/org/apache/spark/SparkConf.scala | 189 +++++++++++++++
.../scala/org/apache/spark/SparkContext.scala | 241 +++++++++++--------
.../main/scala/org/apache/spark/SparkEnv.scala | 54 +++--
.../spark/api/java/JavaSparkContext.scala | 21 +-
.../org/apache/spark/api/python/PythonRDD.scala | 4 +-
.../org/apache/spark/broadcast/Broadcast.scala | 8 +-
.../spark/broadcast/BroadcastFactory.scala | 4 +-
.../apache/spark/broadcast/HttpBroadcast.scala | 43 ++--
.../spark/broadcast/TorrentBroadcast.scala | 45 ++--
.../spark/deploy/FaultToleranceTest.scala | 4 +-
.../apache/spark/deploy/LocalSparkCluster.scala | 7 +-
.../apache/spark/deploy/SparkHadoopUtil.scala | 14 +-
.../org/apache/spark/deploy/client/Client.scala | 13 +-
.../apache/spark/deploy/client/TestClient.scala | 10 +-
.../org/apache/spark/deploy/master/Master.scala | 41 ++--
.../spark/deploy/master/MasterArguments.scala | 11 +-
.../deploy/master/SparkZooKeeperSession.scala | 7 +-
.../master/ZooKeeperLeaderElectionAgent.scala | 9 +-
.../master/ZooKeeperPersistenceEngine.scala | 8 +-
.../spark/deploy/master/ui/MasterWebUI.scala | 2 +-
.../org/apache/spark/deploy/worker/Worker.scala | 34 ++-
.../spark/deploy/worker/ui/WorkerWebUI.scala | 6 +-
.../executor/CoarseGrainedExecutorBackend.scala | 6 +-
.../org/apache/spark/executor/Executor.scala | 21 +-
.../org/apache/spark/io/CompressionCodec.scala | 19 +-
.../apache/spark/metrics/MetricsSystem.scala | 10 +-
.../spark/network/ConnectionManager.scala | 22 +-
.../org/apache/spark/network/ReceiverTest.scala | 12 +-
.../org/apache/spark/network/SenderTest.scala | 16 +-
.../spark/network/netty/ShuffleCopier.scala | 10 +-
.../org/apache/spark/rdd/CheckpointRDD.scala | 5 +-
.../org/apache/spark/rdd/CoGroupedRDD.scala | 2 +-
.../main/scala/org/apache/spark/rdd/RDD.scala | 1 +
.../org/apache/spark/rdd/ShuffledRDD.scala | 2 +-
.../org/apache/spark/rdd/SubtractedRDD.scala | 2 +-
.../apache/spark/scheduler/DAGScheduler.scala | 3 +-
.../spark/scheduler/InputFormatInfo.scala | 14 +-
.../org/apache/spark/scheduler/ResultTask.scala | 4 +-
.../spark/scheduler/SchedulableBuilder.scala | 6 +-
.../spark/scheduler/SchedulerBackend.scala | 3 -
.../apache/spark/scheduler/ShuffleMapTask.scala | 6 +-
.../spark/scheduler/TaskResultGetter.scala | 3 +-
.../spark/scheduler/TaskSchedulerImpl.scala | 20 +-
.../apache/spark/scheduler/TaskSetManager.scala | 18 +-
.../cluster/CoarseGrainedSchedulerBackend.scala | 20 +-
.../cluster/SimrSchedulerBackend.scala | 4 +-
.../cluster/SparkDeploySchedulerBackend.scala | 8 +-
.../mesos/CoarseMesosSchedulerBackend.scala | 14 +-
.../cluster/mesos/MesosSchedulerBackend.scala | 8 +-
.../spark/scheduler/local/LocalBackend.scala | 3 +-
.../spark/serializer/JavaSerializer.scala | 3 +-
.../spark/serializer/KryoSerializer.scala | 14 +-
.../spark/serializer/SerializerManager.scala | 12 +-
.../spark/storage/BlockFetcherIterator.scala | 4 +-
.../org/apache/spark/storage/BlockManager.scala | 58 ++---
.../spark/storage/BlockManagerMaster.scala | 11 +-
.../spark/storage/BlockManagerMasterActor.scala | 14 +-
.../spark/storage/BlockObjectWriter.scala | 5 +-
.../apache/spark/storage/DiskBlockManager.scala | 2 +-
.../spark/storage/ShuffleBlockManager.scala | 10 +-
.../spark/storage/StoragePerfTester.scala | 2 +-
.../apache/spark/storage/ThreadingTest.scala | 8 +-
.../scala/org/apache/spark/ui/SparkUI.scala | 4 +-
.../apache/spark/ui/UIWorkloadGenerator.scala | 17 +-
.../org/apache/spark/ui/env/EnvironmentUI.scala | 15 +-
.../spark/ui/jobs/JobProgressListener.scala | 4 +-
.../scala/org/apache/spark/util/AkkaUtils.scala | 25 +-
.../org/apache/spark/util/MetadataCleaner.scala | 35 ++-
.../org/apache/spark/util/SizeEstimator.scala | 14 +-
.../scala/org/apache/spark/util/Utils.scala | 25 +-
core/src/test/resources/spark.conf | 8 +
.../apache/spark/MapOutputTrackerSuite.scala | 16 +-
.../org/apache/spark/SharedSparkContext.scala | 4 +-
.../scala/org/apache/spark/SparkConfSuite.scala | 110 +++++++++
.../apache/spark/io/CompressionCodecSuite.scala | 8 +-
.../spark/metrics/MetricsSystemSuite.scala | 8 +-
.../spark/scheduler/ClusterSchedulerSuite.scala | 2 +-
.../spark/scheduler/DAGSchedulerSuite.scala | 23 +-
.../apache/spark/scheduler/JobLoggerSuite.scala | 2 +-
.../spark/scheduler/TaskResultGetterSuite.scala | 6 +-
.../spark/scheduler/TaskSetManagerSuite.scala | 4 +-
.../spark/serializer/KryoSerializerSuite.scala | 29 +--
.../spark/storage/BlockManagerSuite.scala | 97 ++++----
.../spark/storage/DiskBlockManagerSuite.scala | 18 +-
.../apache/spark/util/SizeEstimatorSuite.scala | 2 +-
docs/_config.yml | 2 +-
docs/configuration.md | 71 ++++--
docs/css/bootstrap.min.css | 2 +-
docs/job-scheduling.md | 21 +-
docs/monitoring.md | 3 +-
docs/python-programming-guide.md | 15 +-
docs/quick-start.md | 52 ++--
docs/running-on-mesos.md | 19 +-
docs/scala-programming-guide.md | 4 +-
docs/spark-standalone.md | 15 +-
docs/streaming-programming-guide.md | 4 +-
docs/tuning.md | 21 +-
.../examples/bagel/WikipediaPageRank.scala | 10 +-
.../bagel/WikipediaPageRankStandalone.scala | 8 +-
.../streaming/examples/ActorWordCount.scala | 3 +-
.../apache/spark/mllib/recommendation/ALS.scala | 13 +-
.../spark/deploy/yarn/ApplicationMaster.scala | 52 ++--
.../org/apache/spark/deploy/yarn/Client.scala | 40 +--
.../spark/deploy/yarn/ClientArguments.scala | 2 +-
.../spark/deploy/yarn/WorkerLauncher.scala | 4 +-
.../deploy/yarn/YarnAllocationHandler.scala | 4 +-
.../cluster/YarnClientSchedulerBackend.scala | 4 +-
project/SparkBuild.scala | 1 +
python/epydoc.conf | 2 +-
python/pyspark/__init__.py | 32 ++-
python/pyspark/broadcast.py | 11 +
python/pyspark/conf.py | 171 +++++++++++++
python/pyspark/context.py | 59 +++--
python/pyspark/java_gateway.py | 1 +
python/run-tests | 3 +-
.../org/apache/spark/repl/SparkILoop.scala | 17 +-
.../org/apache/spark/repl/SparkIMain.scala | 7 +-
.../org/apache/spark/streaming/Checkpoint.scala | 18 +-
.../org/apache/spark/streaming/DStream.scala | 2 +-
.../spark/streaming/StreamingContext.scala | 55 +++--
.../api/java/JavaStreamingContext.scala | 9 +
.../streaming/dstream/NetworkInputDStream.scala | 6 +-
.../streaming/scheduler/JobGenerator.scala | 8 +-
.../streaming/scheduler/JobScheduler.scala | 4 +-
.../spark/streaming/util/RawTextSender.scala | 4 +-
.../apache/spark/streaming/JavaAPISuite.java | 10 +-
.../spark/streaming/BasicOperationsSuite.scala | 8 +-
.../spark/streaming/CheckpointSuite.scala | 15 +-
.../spark/streaming/InputStreamsSuite.scala | 18 +-
.../apache/spark/streaming/TestSuiteBase.scala | 34 +--
.../spark/streaming/WindowOperationsSuite.scala | 1 -
.../spark/deploy/yarn/ApplicationMaster.scala | 64 ++---
.../org/apache/spark/deploy/yarn/Client.scala | 52 ++--
.../spark/deploy/yarn/ClientArguments.scala | 2 +-
.../spark/deploy/yarn/WorkerLauncher.scala | 4 +-
.../deploy/yarn/YarnAllocationHandler.scala | 2 +-
.../cluster/YarnClientSchedulerBackend.scala | 4 +-
140 files changed, 1731 insertions(+), 941 deletions(-)
----------------------------------------------------------------------
[45/50] git commit: 0index docs
Posted by ma...@apache.org.
0index docs
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/5c639d70
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/5c639d70
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/5c639d70
Branch: refs/heads/master
Commit: 5c639d70df3da48bb52841aa57074ec151bb61cf
Parents: c9b4845
Author: Reza Zadeh <ri...@gmail.com>
Authored: Fri Jan 17 14:31:39 2014 -0800
Committer: Reza Zadeh <ri...@gmail.com>
Committed: Fri Jan 17 14:31:39 2014 -0800
----------------------------------------------------------------------
docs/mllib-guide.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/5c639d70/docs/mllib-guide.md
----------------------------------------------------------------------
diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md
index 89ac64a..5be8ce1 100644
--- a/docs/mllib-guide.md
+++ b/docs/mllib-guide.md
@@ -463,7 +463,7 @@ such values, then the dimensions of the return will be:
* *U* is *m x k* and satisfies U^T*U = eye(k).
* *V* is *n x k* and satisfies V^TV = eye(k).
-All input and output is expected in sparse matrix format, 1-indexed
+All input and output is expected in sparse matrix format, 0-indexed
as tuples of the form ((i,j),value) all in
SparseMatrix RDDs. Below is example usage.
[07/50] git commit: initial large scale testing begin
Posted by ma...@apache.org.
initial large scale testing begin
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/642ab5c1
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/642ab5c1
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/642ab5c1
Branch: refs/heads/master
Commit: 642ab5c1e1ba98833265447447702c3c39fb2d40
Parents: 3369c2d
Author: Reza Zadeh <ri...@gmail.com>
Authored: Fri Dec 27 01:51:19 2013 -0500
Committer: Reza Zadeh <ri...@gmail.com>
Committed: Fri Dec 27 01:51:19 2013 -0500
----------------------------------------------------------------------
ec2/spark_ec2.py | 12 +-----------
.../scala/org/apache/spark/mllib/linalg/sparsesvd.scala | 8 ++++----
2 files changed, 5 insertions(+), 15 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/642ab5c1/ec2/spark_ec2.py
----------------------------------------------------------------------
diff --git a/ec2/spark_ec2.py b/ec2/spark_ec2.py
index a2b0e7e..5e8b381 100755
--- a/ec2/spark_ec2.py
+++ b/ec2/spark_ec2.py
@@ -113,16 +113,6 @@ def parse_args():
# Boto config check
# http://boto.cloudhackers.com/en/latest/boto_config_tut.html
home_dir = os.getenv('HOME')
- if home_dir == None or not os.path.isfile(home_dir + '/.boto'):
- if not os.path.isfile('/etc/boto.cfg'):
- if os.getenv('AWS_ACCESS_KEY_ID') == None:
- print >> stderr, ("ERROR: The environment variable AWS_ACCESS_KEY_ID " +
- "must be set")
- sys.exit(1)
- if os.getenv('AWS_SECRET_ACCESS_KEY') == None:
- print >> stderr, ("ERROR: The environment variable AWS_SECRET_ACCESS_KEY " +
- "must be set")
- sys.exit(1)
return (opts, action, cluster_name)
@@ -646,7 +636,7 @@ def get_partition(total, num_partitions, current_partitions):
def real_main():
(opts, action, cluster_name) = parse_args()
try:
- conn = ec2.connect_to_region(opts.region)
+ conn = ec2.connect_to_region(opts.region,aws_access_key_id="AKIAI2EGAQ7GYNL4LRAA", aws_secret_access_key="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")
except Exception as e:
print >> stderr, (e)
sys.exit(1)
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/642ab5c1/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala
index 1c9f67e..edf715d 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala
@@ -32,8 +32,8 @@ import org.jblas.{DoubleMatrix, Singular, MatrixFunctions}
* There is no restriction on m, but we require n^2 doubles to fit in memory.
* Further, n should be less than m.
*
- * This is computed by first computing A'A = V S^2 V',
- * computing locally on that (since n x n is small),
+ * The decomposition is computed by first computing A'A = V S^2 V',
+ * computing svd locally on that (since n x n is small),
* from which we recover S and V.
* Then we compute U via easy matrix multiplication
* as U = A * V * S^-1
@@ -43,8 +43,8 @@ import org.jblas.{DoubleMatrix, Singular, MatrixFunctions}
* such values, then the dimensions of the return will be:
*
* S is k x k and diagonal, holding the singular values on diagonal
- * U is m x k and satisfies U'U = eye(k,k)
- * V is n x k and satisfies V'V = eye(k,k)
+ * U is m x k and satisfies U'U = eye(k)
+ * V is n x k and satisfies V'V = eye(k)
*
* All input and output is expected in sparse matrix format, 1-indexed
* as tuples of the form ((i,j),value) all in RDDs
[43/50] git commit: add rename computeSVD
Posted by ma...@apache.org.
add rename computeSVD
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/dbec69bb
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/dbec69bb
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/dbec69bb
Branch: refs/heads/master
Commit: dbec69bbf40db65563b754f2802a384de0c568e5
Parents: eb2d8c4
Author: Reza Zadeh <ri...@gmail.com>
Authored: Fri Jan 17 13:59:05 2014 -0800
Committer: Reza Zadeh <ri...@gmail.com>
Committed: Fri Jan 17 13:59:05 2014 -0800
----------------------------------------------------------------------
mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/dbec69bb/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
index 83fcb01..91c6220 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
@@ -41,7 +41,7 @@ class SVD {
/**
* Compute SVD using the current set parameters
*/
- def computeSVD(matrix: SparseMatrix) : SVDecomposedMatrix = {
+ def compute(matrix: SparseMatrix) : SVDecomposedMatrix = {
SVD.sparseSVD(matrix, k)
}
}
[17/50] git commit: javadoc for sparsesvd
Posted by ma...@apache.org.
javadoc for sparsesvd
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/915d53f8
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/915d53f8
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/915d53f8
Branch: refs/heads/master
Commit: 915d53f8acb1f7ab14894b1255eb334b0812d9d3
Parents: c868d71
Author: Reza Zadeh <ri...@gmail.com>
Authored: Wed Jan 1 20:20:16 2014 -0800
Committer: Reza Zadeh <ri...@gmail.com>
Committed: Wed Jan 1 20:20:16 2014 -0800
----------------------------------------------------------------------
.../scala/org/apache/spark/mllib/linalg/sparsesvd.scala | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/915d53f8/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala
index 83b2178..19173fd 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala
@@ -24,6 +24,7 @@ import org.apache.spark.rdd.RDD
import org.jblas.{DoubleMatrix, Singular, MatrixFunctions}
+object SVD {
/**
* Singular Value Decomposition for Tall and Skinny matrices.
* Given an m x n matrix A, this will compute matrices U, S, V such that
@@ -48,10 +49,13 @@ import org.jblas.{DoubleMatrix, Singular, MatrixFunctions}
*
* All input and output is expected in sparse matrix format, 1-indexed
* as tuples of the form ((i,j),value) all in RDDs
+ *
+ * @param data RDD Matrix in sparse 1-index format ((int, int), value)
+ * @param m number of rows
+ * @param n number of columns
+ * @param min_svalue Recover singular values greater or equal to min_svalue
+ * @return Three sparse matrices: U, S, V such that A = USV^T
*/
-
-
-object SVD {
def sparseSVD(
data: RDD[((Int, Int), Double)],
m: Int,
[41/50] git commit: use 0-indexing
Posted by ma...@apache.org.
use 0-indexing
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/cb13b15a
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/cb13b15a
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/cb13b15a
Branch: refs/heads/master
Commit: cb13b15a60ce8eb55b2d2971a57ac8d4bd5c7574
Parents: d28bf41
Author: Reza Zadeh <ri...@gmail.com>
Authored: Fri Jan 17 13:55:42 2014 -0800
Committer: Reza Zadeh <ri...@gmail.com>
Committed: Fri Jan 17 13:55:42 2014 -0800
----------------------------------------------------------------------
docs/mllib-guide.md | 4 +--
.../apache/spark/mllib/linalg/MatrixEntry.scala | 4 +--
.../apache/spark/mllib/linalg/MatrixSVD.scala | 29 ++++++++++++++++++++
.../org/apache/spark/mllib/linalg/SVD.scala | 12 ++++----
.../spark/mllib/linalg/SVDecomposedMatrix.scala | 29 --------------------
.../apache/spark/mllib/linalg/SVDSuite.scala | 8 +++---
6 files changed, 43 insertions(+), 43 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/cb13b15a/docs/mllib-guide.md
----------------------------------------------------------------------
diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md
index 26350ce..89ac64a 100644
--- a/docs/mllib-guide.md
+++ b/docs/mllib-guide.md
@@ -476,8 +476,8 @@ import org.apache.spark.mllib.linalg.MatrixEntry
// Load and parse the data file
val data = sc.textFile("mllib/data/als/test.data").map { line =>
- val parts = line.split(',')
- MatrixEntry(parts(0).toInt, parts(1).toInt, parts(2).toDouble)
+ val parts = line.split(',')
+ MatrixEntry(parts(0).toInt, parts(1).toInt, parts(2).toDouble)
}
val m = 4
val n = 4
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/cb13b15a/mllib/src/main/scala/org/apache/spark/mllib/linalg/MatrixEntry.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/MatrixEntry.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/MatrixEntry.scala
index c7f2aba..416996f 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/MatrixEntry.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/MatrixEntry.scala
@@ -20,8 +20,8 @@ package org.apache.spark.mllib.linalg
/**
* Class that represents an entry in a sparse matrix of doubles.
*
- * @param i row index (1 indexing used)
- * @param j column index (1 indexing used)
+ * @param i row index (0 indexing used)
+ * @param j column index (0 indexing used)
* @param mval value of entry in matrix
*/
case class MatrixEntry(val i: Int, val j: Int, val mval: Double)
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/cb13b15a/mllib/src/main/scala/org/apache/spark/mllib/linalg/MatrixSVD.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/MatrixSVD.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/MatrixSVD.scala
new file mode 100644
index 0000000..6220035
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/MatrixSVD.scala
@@ -0,0 +1,29 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.linalg
+
+/**
+ * Class that represents the SV decomposition of a matrix
+ *
+ * @param U such that A = USV^T
+ * @param S such that A = USV^T
+ * @param V such that A = USV^T
+ */
+case class SVDecomposedMatrix(val U: SparseMatrix,
+ val S: SparseMatrix,
+ val V: SparseMatrix)
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/cb13b15a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
index 6590e8f..ba7a0fd 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
@@ -49,7 +49,7 @@ class SVD {
/**
* Top-level methods for calling Singular Value Decomposition
- * NOTE: All matrices are in 1-indexed sparse format RDD[((int, int), value)]
+ * NOTE: All matrices are in 0-indexed sparse format RDD[((int, int), value)]
*/
object SVD {
/**
@@ -73,7 +73,7 @@ object SVD {
* U is m x k and satisfies U'U = eye(k)
* V is n x k and satisfies V'V = eye(k)
*
- * All input and output is expected in sparse matrix format, 1-indexed
+ * All input and output is expected in sparse matrix format, 0-indexed
* as tuples of the form ((i,j),value) all in RDDs using the
* SparseMatrix class
*
@@ -110,7 +110,7 @@ object SVD {
// Construct jblas A^T A locally
val ata = DoubleMatrix.zeros(n, n)
for (entry <- emits.toArray) {
- ata.put(entry._1._1 - 1, entry._1._2 - 1, entry._2)
+ ata.put(entry._1._1, entry._1._2, entry._2)
}
// Since A^T A is small, we can compute its SVD directly
@@ -129,18 +129,18 @@ object SVD {
// prepare V for returning
val retVdata = sc.makeRDD(
Array.tabulate(V.rows, sigma.length){ (i,j) =>
- MatrixEntry(i + 1, j + 1, V.get(i,j)) }.flatten)
+ MatrixEntry(i, j, V.get(i,j)) }.flatten)
val retV = SparseMatrix(retVdata, V.rows, sigma.length)
val retSdata = sc.makeRDD(Array.tabulate(sigma.length){
- x => MatrixEntry(x + 1, x + 1, sigma(x))})
+ x => MatrixEntry(x, x, sigma(x))})
val retS = SparseMatrix(retSdata, sigma.length, sigma.length)
// Compute U as U = A V S^-1
// turn V S^-1 into an RDD as a sparse matrix
val vsirdd = sc.makeRDD(Array.tabulate(V.rows, sigma.length)
- { (i,j) => ((i + 1, j + 1), V.get(i,j) / sigma(j)) }.flatten)
+ { (i,j) => ((i, j), V.get(i,j) / sigma(j)) }.flatten)
// Multiply A by VS^-1
val aCols = data.map(entry => (entry.j, (entry.i, entry.mval)))
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/cb13b15a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVDecomposedMatrix.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVDecomposedMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVDecomposedMatrix.scala
deleted file mode 100644
index 6220035..0000000
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVDecomposedMatrix.scala
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.mllib.linalg
-
-/**
- * Class that represents the SV decomposition of a matrix
- *
- * @param U such that A = USV^T
- * @param S such that A = USV^T
- * @param V such that A = USV^T
- */
-case class SVDecomposedMatrix(val U: SparseMatrix,
- val S: SparseMatrix,
- val V: SparseMatrix)
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/cb13b15a/mllib/src/test/scala/org/apache/spark/mllib/linalg/SVDSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/SVDSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/SVDSuite.scala
index f239e85..12b3801 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/SVDSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/SVDSuite.scala
@@ -50,7 +50,7 @@ class SVDSuite extends FunSuite with BeforeAndAfterAll {
val m = matrix.m
val n = matrix.n
val ret = DoubleMatrix.zeros(m, n)
- matrix.data.toArray.map(x => ret.put(x.i - 1, x.j - 1, x.mval))
+ matrix.data.toArray.map(x => ret.put(x.i, x.j, x.mval))
ret
}
@@ -68,7 +68,7 @@ class SVDSuite extends FunSuite with BeforeAndAfterAll {
val m = 10
val n = 3
val data = sc.makeRDD(Array.tabulate(m,n){ (a, b) =>
- MatrixEntry(a + 1, b + 1, (a + 2).toDouble * (b + 1) / (1 + a + b)) }.flatten )
+ MatrixEntry(a, b, (a + 2).toDouble * (b + 1) / (1 + a + b)) }.flatten )
val a = SparseMatrix(data, m, n)
@@ -97,7 +97,7 @@ class SVDSuite extends FunSuite with BeforeAndAfterAll {
val m = 10
val n = 3
val data = sc.makeRDD(Array.tabulate(m, n){ (a,b) =>
- MatrixEntry(a + 1, b + 1, 1.0) }.flatten )
+ MatrixEntry(a, b, 1.0) }.flatten )
val k = 1
val a = SparseMatrix(data, m, n)
@@ -130,7 +130,7 @@ class SVDSuite extends FunSuite with BeforeAndAfterAll {
val m = 10
val n = 3
val data = sc.makeRDD(Array.tabulate(m,n){ (a, b) =>
- MatrixEntry(a + 1, b + 1, (a + 2).toDouble * (b + 1)/(1 + a + b)) }.flatten )
+ MatrixEntry(a, b, (a + 2).toDouble * (b + 1)/(1 + a + b)) }.flatten )
val a = SparseMatrix(data, m, n)
val k = 1 // only one svalue above this
[38/50] git commit: Merge remote-tracking branch 'upstream/master'
into sparsesvd
Posted by ma...@apache.org.
Merge remote-tracking branch 'upstream/master' into sparsesvd
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/f324d535
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/f324d535
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/f324d535
Branch: refs/heads/master
Commit: f324d5355514b1c7ae85019b476046bb64b5593e
Parents: 1afdeae ee6e7f9
Author: Reza Zadeh <ri...@gmail.com>
Authored: Sat Jan 11 13:27:15 2014 -0800
Committer: Reza Zadeh <ri...@gmail.com>
Committed: Sat Jan 11 13:27:15 2014 -0800
----------------------------------------------------------------------
conf/log4j.properties.template | 4 +-
.../org/apache/spark/log4j-defaults.properties | 4 +-
.../scala/org/apache/spark/Aggregator.scala | 61 ++--
.../scala/org/apache/spark/SparkContext.scala | 88 ++---
.../main/scala/org/apache/spark/SparkEnv.scala | 6 +-
.../deploy/master/ui/ApplicationPage.scala | 4 +-
.../org/apache/spark/executor/Executor.scala | 5 +
.../org/apache/spark/rdd/CoGroupedRDD.scala | 88 +++--
.../scala/org/apache/spark/rdd/HadoopRDD.scala | 29 +-
.../org/apache/spark/rdd/NewHadoopRDD.scala | 20 +-
.../org/apache/spark/rdd/PairRDDFunctions.scala | 7 +-
.../apache/spark/scheduler/DAGScheduler.scala | 4 +-
.../org/apache/spark/storage/BlockId.scala | 12 +-
.../org/apache/spark/storage/BlockManager.scala | 4 +-
.../spark/storage/BlockObjectWriter.scala | 4 +
.../apache/spark/storage/DiskBlockManager.scala | 11 +-
.../spark/storage/ShuffleBlockManager.scala | 2 +-
.../org/apache/spark/ui/jobs/StagePage.scala | 2 +-
.../org/apache/spark/util/AppendOnlyMap.scala | 237 -------------
.../org/apache/spark/util/MetadataCleaner.scala | 2 +-
.../apache/spark/util/TimeStampedHashMap.scala | 17 +-
.../scala/org/apache/spark/util/Utils.scala | 28 +-
.../scala/org/apache/spark/util/Vector.scala | 8 +
.../spark/util/collection/AppendOnlyMap.scala | 297 ++++++++++++++++
.../util/collection/ExternalAppendOnlyMap.scala | 350 +++++++++++++++++++
.../collection/SizeTrackingAppendOnlyMap.scala | 101 ++++++
.../apache/spark/util/AppendOnlyMapSuite.scala | 154 --------
.../util/SizeTrackingAppendOnlyMapSuite.scala | 120 +++++++
.../org/apache/spark/util/VectorSuite.scala | 44 +++
.../util/collection/AppendOnlyMapSuite.scala | 198 +++++++++++
.../collection/ExternalAppendOnlyMapSuite.scala | 230 ++++++++++++
docs/configuration.md | 25 +-
docs/running-on-yarn.md | 15 +-
ec2/spark_ec2.py | 12 +-
.../streaming/examples/JavaFlumeEventCount.java | 2 +
.../streaming/examples/JavaKafkaWordCount.java | 2 +
.../examples/JavaNetworkWordCount.java | 11 +-
.../streaming/examples/JavaQueueStream.java | 2 +
.../streaming/examples/ActorWordCount.scala | 12 +-
.../streaming/examples/FlumeEventCount.scala | 4 +-
.../streaming/examples/HdfsWordCount.scala | 3 +-
.../streaming/examples/KafkaWordCount.scala | 5 +-
.../streaming/examples/MQTTWordCount.scala | 8 +-
.../streaming/examples/NetworkWordCount.scala | 7 +-
.../spark/streaming/examples/QueueStream.scala | 8 +-
.../streaming/examples/RawNetworkGrep.scala | 5 +-
.../examples/RecoverableNetworkWordCount.scala | 118 +++++++
.../examples/StatefulNetworkWordCount.scala | 2 +
.../streaming/examples/StreamingExamples.scala | 21 ++
.../streaming/examples/TwitterAlgebirdCMS.scala | 10 +-
.../streaming/examples/TwitterAlgebirdHLL.scala | 9 +-
.../streaming/examples/TwitterPopularTags.scala | 2 +
.../streaming/examples/ZeroMQWordCount.scala | 1 +
.../examples/clickstream/PageViewStream.scala | 4 +-
external/kafka/pom.xml | 4 +-
.../spark/streaming/mqtt/MQTTInputDStream.scala | 2 +-
project/SparkBuild.scala | 18 +-
.../org/apache/spark/streaming/Checkpoint.scala | 188 ++++++----
.../org/apache/spark/streaming/DStream.scala | 17 +-
.../spark/streaming/DStreamCheckpointData.scala | 106 +++---
.../apache/spark/streaming/DStreamGraph.scala | 38 +-
.../spark/streaming/StreamingContext.scala | 75 +++-
.../api/java/JavaStreamingContext.scala | 96 ++++-
.../streaming/dstream/FileInputDStream.scala | 42 ++-
.../streaming/scheduler/JobGenerator.scala | 31 +-
.../streaming/util/MasterFailureTest.scala | 55 +--
.../apache/spark/streaming/JavaAPISuite.java | 29 +-
.../spark/streaming/BasicOperationsSuite.scala | 2 +-
.../spark/streaming/CheckpointSuite.scala | 10 +-
.../spark/deploy/yarn/ApplicationMaster.scala | 13 +-
.../spark/deploy/yarn/WorkerLauncher.scala | 28 +-
.../cluster/YarnClientSchedulerBackend.scala | 50 +--
.../spark/deploy/yarn/ApplicationMaster.scala | 13 +-
.../org/apache/spark/deploy/yarn/Client.scala | 1 +
.../spark/deploy/yarn/WorkerLauncher.scala | 28 +-
75 files changed, 2458 insertions(+), 817 deletions(-)
----------------------------------------------------------------------
[11/50] git commit: tweaks to docs
Posted by ma...@apache.org.
tweaks to docs
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/185c8826
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/185c8826
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/185c8826
Branch: refs/heads/master
Commit: 185c882606112a49c1d7359cc1de00bd273c3050
Parents: dd0d3f0
Author: Reza Zadeh <ri...@gmail.com>
Authored: Wed Jan 1 19:53:14 2014 -0800
Committer: Reza Zadeh <ri...@gmail.com>
Committed: Wed Jan 1 19:53:14 2014 -0800
----------------------------------------------------------------------
.../scala/org/apache/spark/mllib/linalg/sparsesvd.scala | 9 ++++-----
1 file changed, 4 insertions(+), 5 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/185c8826/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala
index 0ab05de..83b2178 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala
@@ -65,12 +65,10 @@ object SVD {
throw new IllegalArgumentException("Expecting a tall and skinny matrix")
}
- if (min_svalue < 1.0e-9) {
- throw new IllegalArgumentException("Minimum singular value must be greater than 1e-9")
+ if (min_svalue < 1.0e-8) {
+ throw new IllegalArgumentException("Minimum singular value requested must be greater than 1e-8")
}
- val sc = data.sparkContext
-
// Compute A^T A, assuming rows are sparse enough to fit in memory
val rows = data.map(entry =>
(entry._1._1, (entry._1._2, entry._2))).groupByKey().cache()
@@ -80,7 +78,6 @@ object SVD {
((colind1, colind2), mval1*mval2) } }
}.reduceByKey(_+_)
-
// Construct jblas A^T A locally
val ata = DoubleMatrix.zeros(n, n)
for(entry <- emits.toArray) {
@@ -97,6 +94,8 @@ object SVD {
throw new Exception("All singular values are smaller than min_svalue: " + min_svalue)
}
+ val sc = data.sparkContext
+
// prepare V for returning
val retV = sc.makeRDD(
Array.tabulate(V.rows, sigma.length){ (i,j) =>
[47/50] git commit: Merge remote-tracking branch 'upstream/master'
into sparsesvd
Posted by ma...@apache.org.
Merge remote-tracking branch 'upstream/master' into sparsesvd
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/caf97a25
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/caf97a25
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/caf97a25
Branch: refs/heads/master
Commit: caf97a25a2bd70ef5164c3ce0e8b59a8e39eb288
Parents: 4e96757 d749d47
Author: Reza Zadeh <ri...@gmail.com>
Authored: Fri Jan 17 14:34:03 2014 -0800
Committer: Reza Zadeh <ri...@gmail.com>
Committed: Fri Jan 17 14:34:03 2014 -0800
----------------------------------------------------------------------
assembly/pom.xml | 116 +++++-
assembly/src/deb/RELEASE | 2 +
assembly/src/deb/control/control | 8 +
bin/spark-class2.cmd | 2 +-
bin/spark-shell.cmd | 4 +-
.../scala/org/apache/spark/Accumulators.scala | 40 +-
.../scala/org/apache/spark/Aggregator.scala | 25 +-
.../main/scala/org/apache/spark/SparkConf.scala | 20 +-
.../scala/org/apache/spark/SparkContext.scala | 4 +-
.../spark/api/java/JavaSparkContext.scala | 127 +++++-
.../org/apache/spark/api/java/package.scala | 23 ++
.../org/apache/spark/broadcast/Broadcast.scala | 33 +-
.../org/apache/spark/broadcast/package.scala | 25 ++
.../spark/deploy/worker/CommandUtils.scala | 17 +
.../spark/deploy/worker/DriverWrapper.scala | 17 +
.../org/apache/spark/deploy/worker/Worker.scala | 5 +-
.../spark/deploy/worker/WorkerWatcher.scala | 17 +
.../org/apache/spark/rdd/CoGroupedRDD.scala | 3 +-
.../apache/spark/scheduler/TaskSetManager.scala | 2 +-
.../org/apache/spark/storage/BlockManager.scala | 2 +-
.../org/apache/spark/DistributedSuite.scala | 17 +
.../scala/org/apache/spark/SparkConfSuite.scala | 17 +
.../spark/deploy/worker/DriverRunnerTest.scala | 17 +
.../deploy/worker/WorkerWatcherSuite.scala | 17 +
.../collection/ExternalAppendOnlyMapSuite.scala | 17 +
docs/building-with-maven.md | 6 +-
docs/configuration.md | 2 +-
docs/graphx-programming-guide.md | 315 +++++++++-----
docs/mllib-guide.md | 3 +-
docs/python-programming-guide.md | 4 +-
docs/streaming-programming-guide.md | 32 +-
docs/tuning.md | 3 +-
examples/pom.xml | 6 +
.../streaming/examples/NetworkWordCount.scala | 3 +-
.../streaming/examples/StreamingExamples.scala | 17 +
.../spark/streaming/flume/FlumeUtils.scala | 1 -
.../flume/src/test/resources/log4j.properties | 2 +-
.../streaming/flume/FlumeStreamSuite.scala | 2 +-
.../spark/streaming/kafka/KafkaUtils.scala | 4 +-
.../kafka/src/test/resources/log4j.properties | 2 +-
.../streaming/kafka/KafkaStreamSuite.scala | 1 +
.../apache/spark/streaming/mqtt/MQTTUtils.scala | 4 +-
.../mqtt/src/test/resources/log4j.properties | 2 +-
.../spark/streaming/mqtt/MQTTStreamSuite.scala | 1 +
.../spark/streaming/twitter/TwitterUtils.scala | 4 +-
.../twitter/src/test/resources/log4j.properties | 2 +-
.../streaming/twitter/TwitterStreamSuite.scala | 1 +
.../zeromq/src/test/resources/log4j.properties | 2 +-
.../streaming/zeromq/ZeroMQStreamSuite.scala | 1 +
graphx/pom.xml | 1 -
.../scala/org/apache/spark/graphx/Edge.scala | 25 +-
.../org/apache/spark/graphx/EdgeDirection.scala | 17 +
.../scala/org/apache/spark/graphx/EdgeRDD.scala | 21 +-
.../org/apache/spark/graphx/EdgeTriplet.scala | 21 +-
.../scala/org/apache/spark/graphx/Graph.scala | 35 +-
.../spark/graphx/GraphKryoRegistrator.scala | 19 +-
.../org/apache/spark/graphx/GraphLoader.scala | 17 +
.../org/apache/spark/graphx/GraphOps.scala | 49 ++-
.../apache/spark/graphx/PartitionStrategy.scala | 31 +-
.../scala/org/apache/spark/graphx/Pregel.scala | 25 +-
.../org/apache/spark/graphx/VertexRDD.scala | 42 +-
.../spark/graphx/impl/EdgePartition.scala | 33 +-
.../graphx/impl/EdgePartitionBuilder.scala | 27 +-
.../spark/graphx/impl/EdgeTripletIterator.scala | 19 +-
.../apache/spark/graphx/impl/GraphImpl.scala | 49 ++-
.../spark/graphx/impl/MessageToPartition.scala | 29 +-
.../graphx/impl/ReplicatedVertexView.scala | 47 ++-
.../apache/spark/graphx/impl/RoutingTable.scala | 33 +-
.../apache/spark/graphx/impl/Serializers.scala | 27 +-
.../spark/graphx/impl/VertexPartition.scala | 61 ++-
.../org/apache/spark/graphx/impl/package.scala | 19 +-
.../org/apache/spark/graphx/lib/Analytics.scala | 17 +
.../spark/graphx/lib/ConnectedComponents.scala | 21 +-
.../org/apache/spark/graphx/lib/PageRank.scala | 21 +-
.../apache/spark/graphx/lib/SVDPlusPlus.scala | 29 +-
.../lib/StronglyConnectedComponents.scala | 23 +-
.../apache/spark/graphx/lib/TriangleCount.scala | 19 +-
.../scala/org/apache/spark/graphx/package.scala | 21 +-
.../spark/graphx/util/BytecodeUtils.scala | 17 +
.../spark/graphx/util/GraphGenerators.scala | 29 +-
graphx/src/test/resources/als-test.data | 16 +
.../org/apache/spark/graphx/GraphOpsSuite.scala | 27 +-
.../org/apache/spark/graphx/GraphSuite.scala | 45 +-
.../apache/spark/graphx/LocalSparkContext.scala | 17 +
.../org/apache/spark/graphx/PregelSuite.scala | 25 +-
.../apache/spark/graphx/SerializerSuite.scala | 35 +-
.../apache/spark/graphx/VertexRDDSuite.scala | 17 +
.../spark/graphx/impl/EdgePartitionSuite.scala | 19 +-
.../graphx/impl/VertexPartitionSuite.scala | 17 +
.../graphx/lib/ConnectedComponentsSuite.scala | 19 +-
.../apache/spark/graphx/lib/PageRankSuite.scala | 17 +
.../spark/graphx/lib/SVDPlusPlusSuite.scala | 27 +-
.../lib/StronglyConnectedComponentsSuite.scala | 17 +
.../spark/graphx/lib/TriangleCountSuite.scala | 17 +
.../spark/graphx/util/BytecodeUtilsSuite.scala | 17 +
.../spark/mllib/api/python/PythonMLLibAPI.scala | 4 +-
.../apache/spark/mllib/classification/SVM.scala | 2 -
.../spark/mllib/clustering/KMeansModel.scala | 4 +-
.../mllib/regression/LinearRegression.scala | 2 +-
.../mllib/regression/RidgeRegression.scala | 6 +-
.../spark/mllib/util/LinearDataGenerator.scala | 4 +-
.../spark/mllib/util/MFDataGenerator.scala | 17 +-
.../org/apache/spark/mllib/util/MLUtils.scala | 2 +-
.../spark/mllib/util/SVMDataGenerator.scala | 2 +-
.../classification/JavaNaiveBayesSuite.java | 17 +
.../LogisticRegressionSuite.scala | 6 +-
.../spark/mllib/classification/SVMSuite.scala | 9 +-
.../spark/mllib/clustering/KMeansSuite.scala | 3 -
.../spark/mllib/recommendation/ALSSuite.scala | 1 -
.../spark/mllib/regression/LassoSuite.scala | 6 +-
.../regression/LinearRegressionSuite.scala | 5 +-
.../mllib/regression/RidgeRegressionSuite.scala | 3 -
pom.xml | 10 -
repl-bin/pom.xml | 184 ---------
repl-bin/src/deb/bin/run | 57 ---
repl-bin/src/deb/bin/spark-executor | 22 -
repl-bin/src/deb/bin/spark-shell | 21 -
repl-bin/src/deb/control/control | 8 -
.../scala/org/apache/spark/repl/ReplSuite.scala | 17 +
.../org/apache/spark/streaming/Checkpoint.scala | 10 +-
.../apache/spark/streaming/ContextWaiter.scala | 17 +
.../apache/spark/streaming/DStreamGraph.scala | 2 +
.../spark/streaming/StreamingContext.scala | 40 +-
.../api/java/JavaStreamingContext.scala | 13 +-
.../spark/streaming/dstream/DStream.scala | 20 +-
.../dstream/DStreamCheckpointData.scala | 26 +-
.../spark/streaming/dstream/InputDStream.scala | 24 +-
.../streaming/dstream/NetworkInputDStream.scala | 9 +-
.../dstream/PairDStreamFunctions.scala | 5 +
.../org/apache/spark/streaming/package.scala | 38 ++
.../streaming/util/MasterFailureTest.scala | 2 +-
.../util/RateLimitedOutputStream.scala | 1 +
.../spark/streaming/util/RawTextHelper.scala | 1 +
.../spark/streaming/util/RawTextSender.scala | 1 +
.../apache/spark/streaming/JavaAPISuite.java | 6 +-
.../apache/spark/streaming/JavaTestUtils.scala | 3 +-
.../spark/streaming/BasicOperationsSuite.scala | 1 -
.../spark/streaming/CheckpointSuite.scala | 30 +-
.../spark/streaming/InputStreamsSuite.scala | 8 +-
.../spark/streaming/StreamingContextSuite.scala | 1 -
.../apache/spark/streaming/TestSuiteBase.scala | 7 +-
.../org/apache/spark/deploy/yarn/Client.scala | 371 +----------------
.../spark/deploy/yarn/WorkerRunnable.scala | 126 +-----
.../apache/spark/deploy/yarn/ClientBase.scala | 410 +++++++++++++++++++
.../spark/deploy/yarn/WorkerRunnableUtil.scala | 176 ++++++++
.../org/apache/spark/deploy/yarn/Client.scala | 365 +----------------
.../spark/deploy/yarn/WorkerRunnable.scala | 130 +-----
147 files changed, 2621 insertions(+), 1807 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/caf97a25/docs/mllib-guide.md
----------------------------------------------------------------------
[31/50] git commit: use SparseMatrix everywhere
Posted by ma...@apache.org.
use SparseMatrix everywhere
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/06c0f762
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/06c0f762
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/06c0f762
Branch: refs/heads/master
Commit: 06c0f7628a213a08ef5adeab903160b806680acf
Parents: cdff9fc
Author: Reza Zadeh <ri...@gmail.com>
Authored: Sat Jan 4 14:28:07 2014 -0800
Committer: Reza Zadeh <ri...@gmail.com>
Committed: Sat Jan 4 14:28:07 2014 -0800
----------------------------------------------------------------------
.../org/apache/spark/examples/SparkSVD.scala | 9 +--
.../org/apache/spark/mllib/linalg/SVD.scala | 67 +++++++-------------
.../spark/mllib/linalg/SVDecomposedMatrix.scala | 8 +--
.../spark/mllib/linalg/SparseMatrix.scala | 30 +++++++++
.../apache/spark/mllib/linalg/SVDSuite.scala | 50 +++++++++------
5 files changed, 89 insertions(+), 75 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/06c0f762/examples/src/main/scala/org/apache/spark/examples/SparkSVD.scala
----------------------------------------------------------------------
diff --git a/examples/src/main/scala/org/apache/spark/examples/SparkSVD.scala b/examples/src/main/scala/org/apache/spark/examples/SparkSVD.scala
index 5590ee7..4b9e674 100644
--- a/examples/src/main/scala/org/apache/spark/examples/SparkSVD.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/SparkSVD.scala
@@ -20,6 +20,7 @@ package org.apache.spark.examples
import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.SVD
import org.apache.spark.mllib.linalg.MatrixEntry
+import org.apache.spark.mllib.linalg.SparseMatrix
/**
* Compute SVD of an example matrix
@@ -48,10 +49,10 @@ object SparkSVD {
val n = 4
// recover largest singular vector
- val decomposed = SVD.sparseSVD(data, m, n, 1)
- val u = decomposed.U
- val s = decomposed.S
- val v = decomposed.V
+ val decomposed = SVD.sparseSVD(SparseMatrix(data, m, n), 1)
+ val u = decomposed.U.data
+ val s = decomposed.S.data
+ val v = decomposed.V.data
println("singular values = " + s.toArray.mkString)
}
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/06c0f762/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
index 31990b0..a8efdc7 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
@@ -26,11 +26,8 @@ import org.jblas.{DoubleMatrix, Singular, MatrixFunctions}
/**
* Class used to obtain singular value decompositions
- * @param data Matrix in sparse matrix format
- * @param m number of rows
- * @param n number of columns
*/
-class SVD(var data: RDD[MatrixEntry], var m: Int, var n: Int) {
+class SVD {
private var k: Int = 1
/**
@@ -41,35 +38,11 @@ class SVD(var data: RDD[MatrixEntry], var m: Int, var n: Int) {
this
}
- /**
- * Set matrix to be used for SVD
- */
- def setDatadata(data: RDD[MatrixEntry]): this.type = {
- this.data = data
- this
- }
-
- /**
- * Set dimensions of matrix: rows
- */
- def setNumRows(m: Int): this.type = {
- this.m = m
- this
- }
-
- /**
- * Set dimensions of matrix: columns
- */
- def setNumCols(n: Int): this.type = {
- this.n = n
- this
- }
-
/**
* Compute SVD using the current set parameters
*/
- def computeSVD() : SVDecomposedMatrix = {
- SVD.sparseSVD(data, m, n, k)
+ def computeSVD(matrix: SparseMatrix) : SVDecomposedMatrix = {
+ SVD.sparseSVD(matrix, k)
}
}
@@ -103,19 +76,19 @@ object SVD {
* All input and output is expected in sparse matrix format, 1-indexed
* as tuples of the form ((i,j),value) all in RDDs
*
- * @param data RDD Matrix in sparse 1-index format ((int, int), value)
- * @param m number of rows
- * @param n number of columns
+ * @param matrix sparse matrix to factorize
* @param k Recover k singular values and vectors
* @return Three sparse matrices: U, S, V such that A = USV^T
*/
def sparseSVD(
- data: RDD[MatrixEntry],
- m: Int,
- n: Int,
+ matrix: SparseMatrix,
k: Int)
: SVDecomposedMatrix =
{
+ val data = matrix.data
+ val m = matrix.m
+ val n = matrix.n
+
if (m < n || m <= 0 || n <= 0) {
throw new IllegalArgumentException("Expecting a tall and skinny matrix")
}
@@ -153,13 +126,16 @@ object SVD {
val sc = data.sparkContext
// prepare V for returning
- val retV = sc.makeRDD(
+ val retVdata = sc.makeRDD(
Array.tabulate(V.rows, sigma.length){ (i,j) =>
MatrixEntry(i + 1, j + 1, V.get(i,j)) }.flatten)
-
- val retS = sc.makeRDD(Array.tabulate(sigma.length){
+ val retV = SparseMatrix(retVdata, V.rows, sigma.length)
+
+ val retSdata = sc.makeRDD(Array.tabulate(sigma.length){
x => MatrixEntry(x + 1, x + 1, sigma(x))})
+ val retS = SparseMatrix(retSdata, sigma.length, sigma.length)
+
// Compute U as U = A V S^-1
// turn V S^-1 into an RDD as a sparse matrix
val vsirdd = sc.makeRDD(Array.tabulate(V.rows, sigma.length)
@@ -168,10 +144,11 @@ object SVD {
// Multiply A by VS^-1
val aCols = data.map(entry => (entry.j, (entry.i, entry.mval)))
val bRows = vsirdd.map(entry => (entry._1._1, (entry._1._2, entry._2)))
- val retU = aCols.join(bRows).map( {case (key, ( (rowInd, rowVal), (colInd, colVal)) )
+ val retUdata = aCols.join(bRows).map( {case (key, ( (rowInd, rowVal), (colInd, colVal)) )
=> ((rowInd, colInd), rowVal*colVal)}).reduceByKey(_+_)
.map{ case ((row, col), mval) => MatrixEntry(row, col, mval)}
-
+ val retU = SparseMatrix(retUdata, m, sigma.length)
+
SVDecomposedMatrix(retU, retS, retV)
}
@@ -195,10 +172,10 @@ object SVD {
MatrixEntry(parts(0).toInt, parts(1).toInt, parts(2).toDouble)
}
- val decomposed = SVD.sparseSVD(data, m, n, k)
- val u = decomposed.U
- val s = decomposed.S
- val v = decomposed.V
+ val decomposed = SVD.sparseSVD(SparseMatrix(data, m, n), k)
+ val u = decomposed.U.data
+ val s = decomposed.S.data
+ val v = decomposed.V.data
println("Computed " + s.toArray.length + " singular values and vectors")
u.saveAsTextFile(output_u)
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/06c0f762/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVDecomposedMatrix.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVDecomposedMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVDecomposedMatrix.scala
index e0bcdab..6220035 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVDecomposedMatrix.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVDecomposedMatrix.scala
@@ -17,8 +17,6 @@
package org.apache.spark.mllib.linalg
-import org.apache.spark.rdd.RDD
-
/**
* Class that represents the SV decomposition of a matrix
*
@@ -26,6 +24,6 @@ import org.apache.spark.rdd.RDD
* @param S such that A = USV^T
* @param V such that A = USV^T
*/
-case class SVDecomposedMatrix(val U: RDD[MatrixEntry],
- val S: RDD[MatrixEntry],
- val V: RDD[MatrixEntry])
+case class SVDecomposedMatrix(val U: SparseMatrix,
+ val S: SparseMatrix,
+ val V: SparseMatrix)
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/06c0f762/mllib/src/main/scala/org/apache/spark/mllib/linalg/SparseMatrix.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SparseMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SparseMatrix.scala
new file mode 100644
index 0000000..cbd1a2a
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SparseMatrix.scala
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.linalg
+
+import org.apache.spark.rdd.RDD
+
+
+/**
+ * Class that represents a sparse matrix
+ *
+ * @param data RDD of nonzero entries
+ * @param m number of rows
+ * @param n number of columns
+ */
+case class SparseMatrix(val data: RDD[MatrixEntry], val m: Int, val n: Int)
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/06c0f762/mllib/src/test/scala/org/apache/spark/mllib/linalg/SVDSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/SVDSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/SVDSuite.scala
index 4126e81..f239e85 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/SVDSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/SVDSuite.scala
@@ -45,9 +45,12 @@ class SVDSuite extends FunSuite with BeforeAndAfterAll {
val EPSILON = 1e-4
// Return jblas matrix from sparse matrix RDD
- def getDenseMatrix(matrix:RDD[MatrixEntry], m:Int, n:Int) : DoubleMatrix = {
+ def getDenseMatrix(matrix:SparseMatrix) : DoubleMatrix = {
+ val data = matrix.data
+ val m = matrix.m
+ val n = matrix.n
val ret = DoubleMatrix.zeros(m, n)
- matrix.toArray.map(x => ret.put(x.i - 1, x.j - 1, x.mval))
+ matrix.data.toArray.map(x => ret.put(x.i - 1, x.j - 1, x.mval))
ret
}
@@ -67,24 +70,26 @@ class SVDSuite extends FunSuite with BeforeAndAfterAll {
val data = sc.makeRDD(Array.tabulate(m,n){ (a, b) =>
MatrixEntry(a + 1, b + 1, (a + 2).toDouble * (b + 1) / (1 + a + b)) }.flatten )
- val decomposed = SVD.sparseSVD(data, m, n, n)
+ val a = SparseMatrix(data, m, n)
+
+ val decomposed = SVD.sparseSVD(a, n)
val u = decomposed.U
val v = decomposed.V
- val s = decomposed.S
+ val s = decomposed.S
- val densea = getDenseMatrix(data, m, n)
+ val densea = getDenseMatrix(a)
val svd = Singular.sparseSVD(densea)
- val retu = getDenseMatrix(u, m, n)
- val rets = getDenseMatrix(s, n, n)
- val retv = getDenseMatrix(v, n, n)
+ val retu = getDenseMatrix(u)
+ val rets = getDenseMatrix(s)
+ val retv = getDenseMatrix(v)
// check individual decomposition
assertMatrixEquals(retu, svd(0))
assertMatrixEquals(rets, DoubleMatrix.diag(svd(1)))
assertMatrixEquals(retv, svd(2))
- // check multiplication guarantee
+ // check multiplication guarantee
assertMatrixEquals(retu.mmul(rets).mmul(retv.transpose), densea)
}
@@ -95,20 +100,22 @@ class SVDSuite extends FunSuite with BeforeAndAfterAll {
MatrixEntry(a + 1, b + 1, 1.0) }.flatten )
val k = 1
- val decomposed = SVD.sparseSVD(data, m, n, k)
+ val a = SparseMatrix(data, m, n)
+
+ val decomposed = SVD.sparseSVD(a, k)
val u = decomposed.U
val s = decomposed.S
val v = decomposed.V
- val retrank = s.toArray.length
+ val retrank = s.data.toArray.length
assert(retrank == 1, "rank returned not one")
- val densea = getDenseMatrix(data, m, n)
+ val densea = getDenseMatrix(a)
val svd = Singular.sparseSVD(densea)
- val retu = getDenseMatrix(u, m, retrank)
- val rets = getDenseMatrix(s, retrank, retrank)
- val retv = getDenseMatrix(v, n, retrank)
+ val retu = getDenseMatrix(u)
+ val rets = getDenseMatrix(s)
+ val retv = getDenseMatrix(v)
// check individual decomposition
assertMatrixEquals(retu, svd(0).getColumn(0))
@@ -124,21 +131,22 @@ class SVDSuite extends FunSuite with BeforeAndAfterAll {
val n = 3
val data = sc.makeRDD(Array.tabulate(m,n){ (a, b) =>
MatrixEntry(a + 1, b + 1, (a + 2).toDouble * (b + 1)/(1 + a + b)) }.flatten )
+ val a = SparseMatrix(data, m, n)
val k = 1 // only one svalue above this
- val decomposed = SVD.sparseSVD(data, m, n, k)
+ val decomposed = SVD.sparseSVD(a, k)
val u = decomposed.U
val s = decomposed.S
val v = decomposed.V
- val retrank = s.toArray.length
+ val retrank = s.data.toArray.length
- val densea = getDenseMatrix(data, m, n)
+ val densea = getDenseMatrix(a)
val svd = Singular.sparseSVD(densea)
- val retu = getDenseMatrix(u, m, retrank)
- val rets = getDenseMatrix(s, retrank, retrank)
- val retv = getDenseMatrix(v, n, retrank)
+ val retu = getDenseMatrix(u)
+ val rets = getDenseMatrix(s)
+ val retv = getDenseMatrix(v)
assert(retrank == 1, "rank returned not one")
[25/50] git commit: using decomposed matrix struct now
Posted by ma...@apache.org.
using decomposed matrix struct now
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/26a74f0c
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/26a74f0c
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/26a74f0c
Branch: refs/heads/master
Commit: 26a74f0c4131d506384b94a913b8c6e1a30be9a4
Parents: d2d5e5e
Author: Reza Zadeh <ri...@gmail.com>
Authored: Sat Jan 4 00:38:53 2014 -0800
Committer: Reza Zadeh <ri...@gmail.com>
Committed: Sat Jan 4 00:38:53 2014 -0800
----------------------------------------------------------------------
.../scala/org/apache/spark/mllib/linalg/SVD.scala | 14 ++++++--------
.../spark/mllib/linalg/SVDecomposedMatrix.scala | 2 --
.../org/apache/spark/mllib/linalg/SVDSuite.scala | 17 ++++++++++-------
3 files changed, 16 insertions(+), 17 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/26a74f0c/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
index 08af2c8..ac9178e 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
@@ -23,7 +23,6 @@ import org.apache.spark.rdd.RDD
import org.jblas.{DoubleMatrix, Singular, MatrixFunctions}
-import org.apache.spark.linalg.MatrixEntry
/**
* Top-level methods for calling Singular Value Decomposition
@@ -66,9 +65,7 @@ object SVD {
m: Int,
n: Int,
min_svalue: Double)
- : ( RDD[MatrixEntry],
- RDD[MatrixEntry],
- RDD[MatrixEntry]) =
+ : SVDecomposedMatrix =
{
if (m < n || m <= 0 || n <= 0) {
throw new IllegalArgumentException("Expecting a tall and skinny matrix")
@@ -118,16 +115,16 @@ object SVD {
{ (i,j) => ((i+1, j+1), V.get(i,j)/sigma(j)) }.flatten).cache()
// Multiply A by VS^-1
- val aCols = data.map(entry => (entry._1._2, (entry._1._1, entry._2)))
+ val aCols = data.map(entry => (entry.j, (entry.i, entry.mval)))
val bRows = vsirdd.map(entry => (entry._1._1, (entry._1._2, entry._2)))
val retU = aCols.join(bRows).map( {case (key, ( (rowInd, rowVal), (colInd, colVal)) )
=> ((rowInd, colInd), rowVal*colVal)}).reduceByKey(_+_)
- .map( case (row, col, mval) => MatrixEntry(row, col, mval))
+ .map{ case ((row, col), mval) => MatrixEntry(row, col, mval)}
- (retU, retS, retV)
+ SVDecomposedMatrix(retU, retS, retV)
}
-
+/*
def main(args: Array[String]) {
if (args.length < 8) {
println("Usage: SVD <master> <matrix_file> <m> <n>
@@ -153,6 +150,7 @@ object SVD {
v.saveAsTextFile(output_v)
System.exit(0)
}
+*/
}
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/26a74f0c/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVDecomposedMatrix.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVDecomposedMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVDecomposedMatrix.scala
index c3ec428..e0bcdab 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVDecomposedMatrix.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVDecomposedMatrix.scala
@@ -19,8 +19,6 @@ package org.apache.spark.mllib.linalg
import org.apache.spark.rdd.RDD
-import org.apache.spark.linalg.MatrixEntry
-
/**
* Class that represents the SV decomposition of a matrix
*
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/26a74f0c/mllib/src/test/scala/org/apache/spark/mllib/linalg/SVDSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/SVDSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/SVDSuite.scala
index 726650a..71749ff 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/SVDSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/SVDSuite.scala
@@ -45,9 +45,9 @@ class SVDSuite extends FunSuite with BeforeAndAfterAll {
val EPSILON = 1e-4
// Return jblas matrix from sparse matrix RDD
- def getDenseMatrix(matrix:RDD[((Int, Int), Double)], m:Int, n:Int) : DoubleMatrix = {
+ def getDenseMatrix(matrix:RDD[MatrixEntry], m:Int, n:Int) : DoubleMatrix = {
val ret = DoubleMatrix.zeros(m, n)
- matrix.toArray.map(x => ret.put(x._1._1-1, x._1._2-1, x._2))
+ matrix.toArray.map(x => ret.put(x.i-1, x.j-1, x.mval))
ret
}
@@ -65,11 +65,14 @@ class SVDSuite extends FunSuite with BeforeAndAfterAll {
val m = 10
val n = 3
val data = sc.makeRDD(Array.tabulate(m,n){ (a,b)=>
- ((a+1,b+1), (a+2).toDouble*(b+1)/(1+a+b)) }.flatten )
+ MatrixEntry(a+1,b+1, (a+2).toDouble*(b+1)/(1+a+b)) }.flatten )
val min_svalue = 1.0e-8
- val (u, s, v) = SVD.sparseSVD(data, m, n, min_svalue)
-
+ val decomposed = SVD.sparseSVD(data, m, n, min_svalue)
+ val u = decomposed.U
+ val v = decomposed.V
+ val s = decomposed.S
+
val densea = getDenseMatrix(data, m, n)
val svd = Singular.sparseSVD(densea)
@@ -85,7 +88,7 @@ class SVDSuite extends FunSuite with BeforeAndAfterAll {
// check multiplication guarantee
assertMatrixEquals(retu.mmul(rets).mmul(retv.transpose), densea)
}
-
+/*
test("rank one matrix svd") {
val m = 10
val n = 3
@@ -138,5 +141,5 @@ class SVDSuite extends FunSuite with BeforeAndAfterAll {
assertMatrixEquals(retu, svd(0).getColumn(0))
assertMatrixEquals(rets, DoubleMatrix.diag(svd(1).getRow(0)))
assertMatrixEquals(retv, svd(2).getColumn(0))
- }
+ }*/
}
[40/50] git commit: changes from PR
Posted by ma...@apache.org.
changes from PR
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/d28bf418
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/d28bf418
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/d28bf418
Branch: refs/heads/master
Commit: d28bf4182758f08862d5838c918756801a9d7327
Parents: 845e568
Author: Reza Zadeh <ri...@gmail.com>
Authored: Fri Jan 17 13:39:40 2014 -0800
Committer: Reza Zadeh <ri...@gmail.com>
Committed: Fri Jan 17 13:39:40 2014 -0800
----------------------------------------------------------------------
docs/mllib-guide.md | 5 +-
.../org/apache/spark/examples/SparkSVD.scala | 59 --------------------
.../apache/spark/examples/mllib/SparkSVD.scala | 59 ++++++++++++++++++++
3 files changed, 62 insertions(+), 61 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/d28bf418/docs/mllib-guide.md
----------------------------------------------------------------------
diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md
index a140ecb..26350ce 100644
--- a/docs/mllib-guide.md
+++ b/docs/mllib-guide.md
@@ -445,11 +445,12 @@ Given an *m x n* matrix *A*, we can compute matrices *U, S, V* such that
*A = U * S * V^T*
-There is no restriction on m, but we require n^2 doubles to fit in memory.
+There is no restriction on m, but we require n^2 doubles to
+fit in memory locally on one machine.
Further, n should be less than m.
The decomposition is computed by first computing *A^TA = V S^2 V^T*,
-computing svd locally on that (since n x n is small),
+computing SVD locally on that (since n x n is small),
from which we recover S and V.
Then we compute U via easy matrix multiplication
as *U = A * V * S^-1*
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/d28bf418/examples/src/main/scala/org/apache/spark/examples/SparkSVD.scala
----------------------------------------------------------------------
diff --git a/examples/src/main/scala/org/apache/spark/examples/SparkSVD.scala b/examples/src/main/scala/org/apache/spark/examples/SparkSVD.scala
deleted file mode 100644
index ce7c1c4..0000000
--- a/examples/src/main/scala/org/apache/spark/examples/SparkSVD.scala
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.examples
-
-import org.apache.spark.SparkContext
-import org.apache.spark.mllib.linalg.SVD
-import org.apache.spark.mllib.linalg.MatrixEntry
-import org.apache.spark.mllib.linalg.SparseMatrix
-
-/**
- * Compute SVD of an example matrix
- * Input file should be comma separated, 1 indexed of the form
- * i,j,value
- * Where i is the row, j the column, and value is the matrix entry
- *
- * For example input file, see:
- * mllib/data/als/test.data (example is 4 x 4)
- */
-object SparkSVD {
- def main(args: Array[String]) {
- if (args.length != 4) {
- System.err.println("Usage: SparkSVD <master> <file> m n")
- System.exit(1)
- }
- val sc = new SparkContext(args(0), "SVD",
- System.getenv("SPARK_HOME"), Seq(System.getenv("SPARK_EXAMPLES_JAR")))
-
- // Load and parse the data file
- val data = sc.textFile(args(1)).map { line =>
- val parts = line.split(',')
- MatrixEntry(parts(0).toInt, parts(1).toInt, parts(2).toDouble)
- }
- val m = args(2).toInt
- val n = args(3).toInt
-
- // recover largest singular vector
- val decomposed = SVD.sparseSVD(SparseMatrix(data, m, n), 1)
- val u = decomposed.U.data
- val s = decomposed.S.data
- val v = decomposed.V.data
-
- println("singular values = " + s.toArray.mkString)
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/d28bf418/examples/src/main/scala/org/apache/spark/examples/mllib/SparkSVD.scala
----------------------------------------------------------------------
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/SparkSVD.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/SparkSVD.scala
new file mode 100644
index 0000000..50e5f5b
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/SparkSVD.scala
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.mllib
+
+import org.apache.spark.SparkContext
+import org.apache.spark.mllib.linalg.SVD
+import org.apache.spark.mllib.linalg.MatrixEntry
+import org.apache.spark.mllib.linalg.SparseMatrix
+
+/**
+ * Compute SVD of an example matrix
+ * Input file should be comma separated, 1 indexed of the form
+ * i,j,value
+ * Where i is the row, j the column, and value is the matrix entry
+ *
+ * For example input file, see:
+ * mllib/data/als/test.data (example is 4 x 4)
+ */
+object SparkSVD {
+ def main(args: Array[String]) {
+ if (args.length != 4) {
+ System.err.println("Usage: SparkSVD <master> <file> m n")
+ System.exit(1)
+ }
+ val sc = new SparkContext(args(0), "SVD",
+ System.getenv("SPARK_HOME"), Seq(System.getenv("SPARK_EXAMPLES_JAR")))
+
+ // Load and parse the data file
+ val data = sc.textFile(args(1)).map { line =>
+ val parts = line.split(',')
+ MatrixEntry(parts(0).toInt, parts(1).toInt, parts(2).toDouble)
+ }
+ val m = args(2).toInt
+ val n = args(3).toInt
+
+ // recover largest singular vector
+ val decomposed = SVD.sparseSVD(SparseMatrix(data, m, n), 1)
+ val u = decomposed.U.data
+ val s = decomposed.S.data
+ val v = decomposed.V.data
+
+ println("singular values = " + s.toArray.mkString)
+ }
+}
[02/50] git commit: Main method added for svd
Posted by ma...@apache.org.
Main method added for svd
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/fe1a132d
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/fe1a132d
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/fe1a132d
Branch: refs/heads/master
Commit: fe1a132d403e301a674da6f4af7163df210ec2af
Parents: 1a21ba2
Author: Reza Zadeh <ri...@gmail.com>
Authored: Thu Dec 26 18:13:21 2013 -0500
Committer: Reza Zadeh <ri...@gmail.com>
Committed: Thu Dec 26 18:13:21 2013 -0500
----------------------------------------------------------------------
.../main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/fe1a132d/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala
index be8ccff..7bb393d 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala
@@ -128,10 +128,10 @@ object SVD {
}
val (u, s, v) = SVD.sparseSVD(data, m, n, min_svalue)
- println("Computed " + s.size + " singular values and vectors")
- u.saveAsText(output_u)
- s.saveAsText(output_s)
- v.saveAsText(output_v)
+ println("Computed " + s.toArray.length + " singular values and vectors")
+ u.saveAsTextFile(output_u)
+ s.saveAsTextFile(output_s)
+ v.saveAsTextFile(output_v)
System.exit(0)
}
}
[21/50] git commit: New matrix entry file
Posted by ma...@apache.org.
New matrix entry file
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/b059a2a0
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/b059a2a0
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/b059a2a0
Branch: refs/heads/master
Commit: b059a2a00c1e4a46dacbd63cacbfe0a06f3c61fa
Parents: e617ae2
Author: Reza Zadeh <ri...@gmail.com>
Authored: Fri Jan 3 21:54:57 2014 -0800
Committer: Reza Zadeh <ri...@gmail.com>
Committed: Fri Jan 3 21:54:57 2014 -0800
----------------------------------------------------------------------
.../apache/spark/mllib/linalg/MatrixEntry.scala | 27 ++++++++++++++++++++
1 file changed, 27 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/b059a2a0/mllib/src/main/scala/org/apache/spark/mllib/linalg/MatrixEntry.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/MatrixEntry.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/MatrixEntry.scala
new file mode 100644
index 0000000..c7f2aba
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/MatrixEntry.scala
@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.linalg
+
+/**
+ * Class that represents an entry in a sparse matrix of doubles.
+ *
+ * @param i row index (1 indexing used)
+ * @param j column index (1 indexing used)
+ * @param mval value of entry in matrix
+ */
+case class MatrixEntry(val i: Int, val j: Int, val mval: Double)
[37/50] git commit: add dimension parameters to example
Posted by ma...@apache.org.
add dimension parameters to example
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/1afdeaeb
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/1afdeaeb
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/1afdeaeb
Branch: refs/heads/master
Commit: 1afdeaeb2f436084a6fbe8d73690f148f7b462c4
Parents: 21c8a54
Author: Reza Zadeh <ri...@gmail.com>
Authored: Fri Jan 10 21:30:54 2014 -0800
Committer: Reza Zadeh <ri...@gmail.com>
Committed: Fri Jan 10 21:30:54 2014 -0800
----------------------------------------------------------------------
.../main/scala/org/apache/spark/examples/SparkSVD.scala | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/1afdeaeb/examples/src/main/scala/org/apache/spark/examples/SparkSVD.scala
----------------------------------------------------------------------
diff --git a/examples/src/main/scala/org/apache/spark/examples/SparkSVD.scala b/examples/src/main/scala/org/apache/spark/examples/SparkSVD.scala
index d9c672f..ce7c1c4 100644
--- a/examples/src/main/scala/org/apache/spark/examples/SparkSVD.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/SparkSVD.scala
@@ -29,12 +29,12 @@ import org.apache.spark.mllib.linalg.SparseMatrix
* Where i is the column, j the row, and value is the matrix entry
*
* For example input file, see:
- * mllib/data/als/test.data
+ * mllib/data/als/test.data (example is 4 x 4)
*/
object SparkSVD {
def main(args: Array[String]) {
- if (args.length != 2) {
- System.err.println("Usage: SparkSVD <master> <file>")
+ if (args.length != 4) {
+ System.err.println("Usage: SparkSVD <master> <file> m n")
System.exit(1)
}
val sc = new SparkContext(args(0), "SVD",
@@ -45,8 +45,8 @@ object SparkSVD {
val parts = line.split(',')
MatrixEntry(parts(0).toInt, parts(1).toInt, parts(2).toDouble)
}
- val m = 4
- val n = 4
+ val m = args(2).toInt
+ val n = args(3).toInt
// recover largest singular vector
val decomposed = SVD.sparseSVD(SparseMatrix(data, m, n), 1)
[03/50] git commit: full rank matrix test added
Posted by ma...@apache.org.
full rank matrix test added
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/16de5268
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/16de5268
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/16de5268
Branch: refs/heads/master
Commit: 16de5268e3652498b47b0600fe5cf9cf10d0dd83
Parents: fe1a132
Author: Reza Zadeh <ri...@gmail.com>
Authored: Thu Dec 26 23:21:57 2013 -0500
Committer: Reza Zadeh <ri...@gmail.com>
Committed: Thu Dec 26 23:21:57 2013 -0500
----------------------------------------------------------------------
.../scala/org/apache/spark/mllib/linalg/sparsesvd.scala | 10 +++++++++-
1 file changed, 9 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/16de5268/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala
index 7bb393d..2ce0df1 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala
@@ -61,6 +61,14 @@ object SVD {
RDD[((Int, Int), Double)],
RDD[((Int, Int), Double)]) =
{
+ if (m < n) {
+ throw new IllegalArgumentException("Expecting a tall and skinny matrix")
+ }
+
+ if (min_svalue < 1.0e-9) {
+ throw new IllegalArgumentException("Minimum singular value must be greater than 1e-9")
+ }
+
val sc = data.sparkContext
// Compute A^T A, assuming rows are sparse enough to fit in memory
@@ -86,7 +94,7 @@ object SVD {
// threshold s values
if(sigma.isEmpty) {
- // TODO: return empty
+ throw new Exception("All singular values are smaller than min_svalue: " + min_svalue)
}
// prepare V for returning
[13/50] git commit: doc tweak
Posted by ma...@apache.org.
doc tweak
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/97dc5278
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/97dc5278
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/97dc5278
Branch: refs/heads/master
Commit: 97dc527849b836703811acdbd6767685585099df
Parents: b941b6f
Author: Reza Zadeh <ri...@gmail.com>
Authored: Wed Jan 1 20:02:37 2014 -0800
Committer: Reza Zadeh <ri...@gmail.com>
Committed: Wed Jan 1 20:02:37 2014 -0800
----------------------------------------------------------------------
docs/mllib-guide.md | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/97dc5278/docs/mllib-guide.md
----------------------------------------------------------------------
diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md
index 08d6d74..8c490eb 100644
--- a/docs/mllib-guide.md
+++ b/docs/mllib-guide.md
@@ -215,17 +215,18 @@ Available algorithms for gradient descent:
# Singular Value Decomposition
Singular Value Decomposition for Tall and Skinny matrices.
-Given an m x n matrix A, this will compute matrices U, S, V such that
-A = U * S * V^T
+Given an *m x n* matrix *A*, this will compute matrices *U, S, V* such that
+
+*A = U * S * V^T*
There is no restriction on m, but we require n^2 doubles to fit in memory.
Further, n should be less than m.
-The decomposition is computed by first computing A^TA = V S^2 V^T,
+The decomposition is computed by first computing *A^TA = V S^2 V^T*,
computing svd locally on that (since n x n is small),
from which we recover S and V.
Then we compute U via easy matrix multiplication
-as U = A * V * S^-1
+as *U = A * V * S^-1*
Only singular vectors associated with singular values
greater or equal to MIN_SVALUE are recovered. If there are k
[30/50] git commit: prettify
Posted by ma...@apache.org.
prettify
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/cdff9fc8
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/cdff9fc8
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/cdff9fc8
Branch: refs/heads/master
Commit: cdff9fc858b9b83eb1119ec2a6d1d3c9a66f47a9
Parents: e9bd6cb
Author: Reza Zadeh <ri...@gmail.com>
Authored: Sat Jan 4 12:44:04 2014 -0800
Committer: Reza Zadeh <ri...@gmail.com>
Committed: Sat Jan 4 12:44:04 2014 -0800
----------------------------------------------------------------------
.../org/apache/spark/mllib/linalg/SVD.scala | 9 +++---
.../apache/spark/mllib/linalg/SVDSuite.scala | 34 ++++++++++----------
2 files changed, 22 insertions(+), 21 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/cdff9fc8/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
index e58b8e8..31990b0 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
@@ -136,7 +136,7 @@ object SVD {
// Construct jblas A^T A locally
val ata = DoubleMatrix.zeros(n, n)
for (entry <- emits.toArray) {
- ata.put(entry._1._1-1, entry._1._2-1, entry._2)
+ ata.put(entry._1._1 - 1, entry._1._2 - 1, entry._2)
}
// Since A^T A is small, we can compute its SVD directly
@@ -158,12 +158,12 @@ object SVD {
MatrixEntry(i + 1, j + 1, V.get(i,j)) }.flatten)
val retS = sc.makeRDD(Array.tabulate(sigma.length){
- x => MatrixEntry(x + 1,x + 1, sigma(x))})
+ x => MatrixEntry(x + 1, x + 1, sigma(x))})
// Compute U as U = A V S^-1
// turn V S^-1 into an RDD as a sparse matrix
val vsirdd = sc.makeRDD(Array.tabulate(V.rows, sigma.length)
- { (i,j) => ((i + 1, j + 1), V.get(i,j)/sigma(j)) }.flatten)
+ { (i,j) => ((i + 1, j + 1), V.get(i,j) / sigma(j)) }.flatten)
// Multiply A by VS^-1
val aCols = data.map(entry => (entry.j, (entry.i, entry.mval)))
@@ -178,10 +178,11 @@ object SVD {
def main(args: Array[String]) {
if (args.length < 8) {
- println("Usage: SVD <master> <matrix_file> <m> <n>" +
+ println("Usage: SVD <master> <matrix_file> <m> <n> " +
"<k> <output_U_file> <output_S_file> <output_V_file>")
System.exit(1)
}
+
val (master, inputFile, m, n, k, output_u, output_s, output_v) =
(args(0), args(1), args(2).toInt, args(3).toInt,
args(4).toInt, args(5), args(6), args(7))
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/cdff9fc8/mllib/src/test/scala/org/apache/spark/mllib/linalg/SVDSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/SVDSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/SVDSuite.scala
index a83d9d0..4126e81 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/SVDSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/SVDSuite.scala
@@ -47,25 +47,25 @@ class SVDSuite extends FunSuite with BeforeAndAfterAll {
// Return jblas matrix from sparse matrix RDD
def getDenseMatrix(matrix:RDD[MatrixEntry], m:Int, n:Int) : DoubleMatrix = {
val ret = DoubleMatrix.zeros(m, n)
- matrix.toArray.map(x => ret.put(x.i-1, x.j-1, x.mval))
+ matrix.toArray.map(x => ret.put(x.i - 1, x.j - 1, x.mval))
ret
}
def assertMatrixEquals(a:DoubleMatrix, b:DoubleMatrix) {
assert(a.rows == b.rows && a.columns == b.columns, "dimension mismatch")
val diff = DoubleMatrix.zeros(a.rows, a.columns)
- Array.tabulate(a.rows, a.columns){(i,j) =>
- diff.put(i,j,
- Math.min(Math.abs(a.get(i,j)-b.get(i,j)),
- Math.abs(a.get(i,j)+b.get(i,j)))) }
+ Array.tabulate(a.rows, a.columns){(i, j) =>
+ diff.put(i, j,
+ Math.min(Math.abs(a.get(i, j) - b.get(i, j)),
+ Math.abs(a.get(i, j) + b.get(i, j)))) }
assert(diff.norm1 < EPSILON, "matrix mismatch: " + diff.norm1)
}
test("full rank matrix svd") {
val m = 10
val n = 3
- val data = sc.makeRDD(Array.tabulate(m,n){ (a,b)=>
- MatrixEntry(a+1,b+1, (a+2).toDouble*(b+1)/(1+a+b)) }.flatten )
+ val data = sc.makeRDD(Array.tabulate(m,n){ (a, b) =>
+ MatrixEntry(a + 1, b + 1, (a + 2).toDouble * (b + 1) / (1 + a + b)) }.flatten )
val decomposed = SVD.sparseSVD(data, m, n, n)
val u = decomposed.U
@@ -75,9 +75,9 @@ class SVDSuite extends FunSuite with BeforeAndAfterAll {
val densea = getDenseMatrix(data, m, n)
val svd = Singular.sparseSVD(densea)
- val retu = getDenseMatrix(u,m,n)
- val rets = getDenseMatrix(s,n,n)
- val retv = getDenseMatrix(v,n,n)
+ val retu = getDenseMatrix(u, m, n)
+ val rets = getDenseMatrix(s, n, n)
+ val retv = getDenseMatrix(v, n, n)
// check individual decomposition
assertMatrixEquals(retu, svd(0))
@@ -106,9 +106,9 @@ class SVDSuite extends FunSuite with BeforeAndAfterAll {
val densea = getDenseMatrix(data, m, n)
val svd = Singular.sparseSVD(densea)
- val retu = getDenseMatrix(u,m,retrank)
- val rets = getDenseMatrix(s,retrank,retrank)
- val retv = getDenseMatrix(v,n,retrank)
+ val retu = getDenseMatrix(u, m, retrank)
+ val rets = getDenseMatrix(s, retrank, retrank)
+ val retv = getDenseMatrix(v, n, retrank)
// check individual decomposition
assertMatrixEquals(retu, svd(0).getColumn(0))
@@ -123,7 +123,7 @@ class SVDSuite extends FunSuite with BeforeAndAfterAll {
val m = 10
val n = 3
val data = sc.makeRDD(Array.tabulate(m,n){ (a, b) =>
- MatrixEntry(a + 1, b + 1, (a + 2).toDouble*(b + 1)/(1 + a + b)) }.flatten )
+ MatrixEntry(a + 1, b + 1, (a + 2).toDouble * (b + 1)/(1 + a + b)) }.flatten )
val k = 1 // only one svalue above this
@@ -136,9 +136,9 @@ class SVDSuite extends FunSuite with BeforeAndAfterAll {
val densea = getDenseMatrix(data, m, n)
val svd = Singular.sparseSVD(densea)
- val retu = getDenseMatrix(u,m,retrank)
- val rets = getDenseMatrix(s,retrank,retrank)
- val retv = getDenseMatrix(v,n,retrank)
+ val retu = getDenseMatrix(u, m, retrank)
+ val rets = getDenseMatrix(s, retrank, retrank)
+ val retv = getDenseMatrix(v, n, retrank)
assert(retrank == 1, "rank returned not one")
[06/50] git commit: cleanup documentation
Posted by ma...@apache.org.
cleanup documentation
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/3369c2d4
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/3369c2d4
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/3369c2d4
Branch: refs/heads/master
Commit: 3369c2d48795d831acd841b8ecb67b5a84083883
Parents: bdb5037
Author: Reza Zadeh <ri...@gmail.com>
Authored: Fri Dec 27 00:41:46 2013 -0500
Committer: Reza Zadeh <ri...@gmail.com>
Committed: Fri Dec 27 00:41:46 2013 -0500
----------------------------------------------------------------------
.../src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/3369c2d4/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala
index a799aa3..1c9f67e 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala
@@ -29,7 +29,7 @@ import org.jblas.{DoubleMatrix, Singular, MatrixFunctions}
* Given an m x n matrix A, this will compute matrices U, S, V such that
* A = U * S * V'
*
- * There is no restriction on m, but we require n doubles to be held in memory.
+ * There is no restriction on m, but we require n^2 doubles to fit in memory.
* Further, n should be less than m.
*
* This is computed by first computing A'A = V S^2 V',
@@ -81,7 +81,7 @@ object SVD {
}.reduceByKey(_+_)
- // Constructi jblas A^T A locally
+ // Construct jblas A^T A locally
val ata = DoubleMatrix.zeros(n, n)
for(entry <- emits.toArray) {
ata.put(entry._1._1-1, entry._1._2-1, entry._2)
[04/50] git commit: test for truncated svd
Posted by ma...@apache.org.
test for truncated svd
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/fa1e8d8c
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/fa1e8d8c
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/fa1e8d8c
Branch: refs/heads/master
Commit: fa1e8d8cbf916f963e1ea000683a11d83551f870
Parents: 16de526
Author: Reza Zadeh <ri...@gmail.com>
Authored: Fri Dec 27 00:34:59 2013 -0500
Committer: Reza Zadeh <ri...@gmail.com>
Committed: Fri Dec 27 00:34:59 2013 -0500
----------------------------------------------------------------------
.../apache/spark/mllib/linalg/sparsesvd.scala | 101 +++++++++----------
1 file changed, 50 insertions(+), 51 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/fa1e8d8c/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala
index 2ce0df1..a799aa3 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala
@@ -57,65 +57,65 @@ object SVD {
m: Int,
n: Int,
min_svalue: Double)
- : ( RDD[((Int, Int), Double)],
- RDD[((Int, Int), Double)],
- RDD[((Int, Int), Double)]) =
+ : ( RDD[((Int, Int), Double)],
+ RDD[((Int, Int), Double)],
+ RDD[((Int, Int), Double)]) =
{
- if (m < n) {
+ if (m < n || m <= 0 || n <= 0) {
throw new IllegalArgumentException("Expecting a tall and skinny matrix")
}
- if (min_svalue < 1.0e-9) {
+ if (min_svalue < 1.0e-9) {
throw new IllegalArgumentException("Minimum singular value must be greater than 1e-9")
}
- val sc = data.sparkContext
+ val sc = data.sparkContext
- // Compute A^T A, assuming rows are sparse enough to fit in memory
- val rows = data.map(entry =>
- (entry._1._1, (entry._1._2, entry._2))).groupByKey().cache()
- val emits = rows.flatMap{ case (rowind, cols) =>
- cols.flatMap{ case (colind1, mval1) =>
- cols.map{ case (colind2, mval2) =>
- ((colind1, colind2), mval1*mval2) } }
- }.reduceByKey(_+_)
+ // Compute A^T A, assuming rows are sparse enough to fit in memory
+ val rows = data.map(entry =>
+ (entry._1._1, (entry._1._2, entry._2))).groupByKey().cache()
+ val emits = rows.flatMap{ case (rowind, cols) =>
+ cols.flatMap{ case (colind1, mval1) =>
+ cols.map{ case (colind2, mval2) =>
+ ((colind1, colind2), mval1*mval2) } }
+ }.reduceByKey(_+_)
- // Constructi jblas A^T A locally
- val ata = DoubleMatrix.zeros(n, n)
- for(entry <- emits.toArray) {
- ata.put(entry._1._1-1, entry._1._2-1, entry._2)
- }
+ // Constructi jblas A^T A locally
+ val ata = DoubleMatrix.zeros(n, n)
+ for(entry <- emits.toArray) {
+ ata.put(entry._1._1-1, entry._1._2-1, entry._2)
+ }
- // Since A^T A is small, we can compute its SVD directly
- val svd = Singular.sparseSVD(ata)
- val V = svd(0)
- val sigma = MatrixFunctions.sqrt(svd(1)).toArray.filter(x => x >= min_svalue)
+ // Since A^T A is small, we can compute its SVD directly
+ val svd = Singular.sparseSVD(ata)
+ val V = svd(0)
+ val sigma = MatrixFunctions.sqrt(svd(1)).toArray.filter(x => x >= min_svalue)
- // threshold s values
- if(sigma.isEmpty) {
+ // threshold s values
+ if(sigma.isEmpty) {
throw new Exception("All singular values are smaller than min_svalue: " + min_svalue)
- }
+ }
- // prepare V for returning
- val retV = sc.makeRDD(
- Array.tabulate(V.rows, sigma.length){ (i,j) =>
- ((i+1, j+1), V.get(i,j)) }.flatten)
+ // prepare V for returning
+ val retV = sc.makeRDD(
+ Array.tabulate(V.rows, sigma.length){ (i,j) =>
+ ((i+1, j+1), V.get(i,j)) }.flatten)
- val retS = sc.makeRDD(Array.tabulate(sigma.length){x=>((x+1,x+1),sigma(x))})
+ val retS = sc.makeRDD(Array.tabulate(sigma.length){x=>((x+1,x+1),sigma(x))})
- // Compute U as U = A V S^-1
- // turn V S^-1 into an RDD as a sparse matrix and cache it
- val vsirdd = sc.makeRDD(Array.tabulate(V.rows, sigma.length)
+ // Compute U as U = A V S^-1
+ // turn V S^-1 into an RDD as a sparse matrix and cache it
+ val vsirdd = sc.makeRDD(Array.tabulate(V.rows, sigma.length)
{ (i,j) => ((i+1, j+1), V.get(i,j)/sigma(j)) }.flatten).cache()
- // Multiply A by VS^-1
- val aCols = data.map(entry => (entry._1._2, (entry._1._1, entry._2)))
- val bRows = vsirdd.map(entry => (entry._1._1, (entry._1._2, entry._2)))
- val retU = aCols.join(bRows).map( {case (key, ( (rowInd, rowVal), (colInd, colVal)) )
+ // Multiply A by VS^-1
+ val aCols = data.map(entry => (entry._1._2, (entry._1._1, entry._2)))
+ val bRows = vsirdd.map(entry => (entry._1._1, (entry._1._2, entry._2)))
+ val retU = aCols.join(bRows).map( {case (key, ( (rowInd, rowVal), (colInd, colVal)) )
=> ((rowInd, colInd), rowVal*colVal)}).reduceByKey(_+_)
-
- (retU, retS, retV)
+
+ (retU, retS, retV)
}
@@ -125,24 +125,23 @@ object SVD {
System.exit(1)
}
val (master, inputFile, m, n, min_svalue, output_u, output_s, output_v) =
- (args(0), args(1), args(2).toInt, args(3).toInt, args(4).toDouble, args(5), args(6), args(7))
+ (args(0), args(1), args(2).toInt, args(3).toInt, args(4).toDouble, args(5), args(6), args(7))
val sc = new SparkContext(master, "SVD")
- val rawdata = sc.textFile(inputFile)
- val data = rawdata.map { line =>
- val parts = line.split(',')
- ((parts(0).toInt, parts(1).toInt), parts(2).toDouble)
- }
+ val rawdata = sc.textFile(inputFile)
+ val data = rawdata.map { line =>
+ val parts = line.split(',')
+ ((parts(0).toInt, parts(1).toInt), parts(2).toDouble)
+ }
- val (u, s, v) = SVD.sparseSVD(data, m, n, min_svalue)
+ val (u, s, v) = SVD.sparseSVD(data, m, n, min_svalue)
println("Computed " + s.toArray.length + " singular values and vectors")
- u.saveAsTextFile(output_u)
- s.saveAsTextFile(output_s)
- v.saveAsTextFile(output_v)
+ u.saveAsTextFile(output_u)
+ s.saveAsTextFile(output_s)
+ v.saveAsTextFile(output_v)
System.exit(0)
}
}
-
[29/50] git commit: new example file
Posted by ma...@apache.org.
new example file
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/e9bd6cb5
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/e9bd6cb5
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/e9bd6cb5
Branch: refs/heads/master
Commit: e9bd6cb51dce9222a5a284cd171b299b0169852b
Parents: 8bfcce1
Author: Reza Zadeh <ri...@gmail.com>
Authored: Sat Jan 4 12:33:22 2014 -0800
Committer: Reza Zadeh <ri...@gmail.com>
Committed: Sat Jan 4 12:33:22 2014 -0800
----------------------------------------------------------------------
.../org/apache/spark/examples/SparkSVD.scala | 58 ++++++++++++++++++++
.../org/apache/spark/mllib/linalg/SVD.scala | 1 -
2 files changed, 58 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/e9bd6cb5/examples/src/main/scala/org/apache/spark/examples/SparkSVD.scala
----------------------------------------------------------------------
diff --git a/examples/src/main/scala/org/apache/spark/examples/SparkSVD.scala b/examples/src/main/scala/org/apache/spark/examples/SparkSVD.scala
new file mode 100644
index 0000000..5590ee7
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/SparkSVD.scala
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples
+
+import org.apache.spark.SparkContext
+import org.apache.spark.mllib.linalg.SVD
+import org.apache.spark.mllib.linalg.MatrixEntry
+
+/**
+ * Compute SVD of an example matrix
+ * Input file should be comma separated, 1 indexed of the form
+ * i,j,value
+ * Where i is the column, j the row, and value is the matrix entry
+ *
+ * For example input file, see:
+ * mllib/data/als/test.data
+ */
+object SparkSVD {
+ def main(args: Array[String]) {
+ if (args.length < 3) {
+ System.err.println("Usage: SVD <master> <file>")
+ System.exit(1)
+ }
+ val sc = new SparkContext(args(0), "SVD",
+ System.getenv("SPARK_HOME"), Seq(System.getenv("SPARK_EXAMPLES_JAR")))
+
+ // Load and parse the data file
+ val data = sc.textFile(args(1)).map { line =>
+ val parts = line.split(',')
+ MatrixEntry(parts(0).toInt, parts(1).toInt, parts(2).toDouble)
+ }
+ val m = 4
+ val n = 4
+
+ // recover largest singular vector
+ val decomposed = SVD.sparseSVD(data, m, n, 1)
+ val u = decomposed.U
+ val s = decomposed.S
+ val v = decomposed.V
+
+ println("singular values = " + s.toArray.mkString)
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/e9bd6cb5/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
index d1ee6c6..e58b8e8 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
@@ -24,7 +24,6 @@ import org.apache.spark.rdd.RDD
import org.jblas.{DoubleMatrix, Singular, MatrixFunctions}
-
/**
* Class used to obtain singular value decompositions
* @param data Matrix in sparse matrix format
[10/50] git commit: New documentation
Posted by ma...@apache.org.
New documentation
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/dd0d3f00
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/dd0d3f00
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/dd0d3f00
Branch: refs/heads/master
Commit: dd0d3f008b5dd478fdfb6d20c53713ca0c7c2be1
Parents: 7c04b31
Author: Reza Zadeh <ri...@gmail.com>
Authored: Wed Jan 1 19:53:04 2014 -0800
Committer: Reza Zadeh <ri...@gmail.com>
Committed: Wed Jan 1 19:53:04 2014 -0800
----------------------------------------------------------------------
docs/mllib-guide.md | 52 ++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 52 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/dd0d3f00/docs/mllib-guide.md
----------------------------------------------------------------------
diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md
index c1ff9c4..8c86369 100644
--- a/docs/mllib-guide.md
+++ b/docs/mllib-guide.md
@@ -210,3 +210,55 @@ at each iteration.
Available algorithms for gradient descent:
* [GradientDescent](api/mllib/index.html#org.apache.spark.mllib.optimization.GradientDescent)
+
+
+
+# Singular Value Decomposition
+Singular Value Decomposition for Tall and Skinny matrices.
+Given an m x n matrix A, this will compute matrices U, S, V such that
+A = U * S * V^T
+
+There is no restriction on m, but we require n^2 doubles to fit in memory.
+Further, n should be less than m.
+
+The decomposition is computed by first computing A^TA = V S^2 V^T,
+computing svd locally on that (since n x n is small),
+from which we recover S and V.
+Then we compute U via easy matrix multiplication
+as U = A * V * S^-1
+
+Only singular vectors associated with singular values
+greater or equal to MIN_SVALUE are recovered. If there are k
+such values, then the dimensions of the return will be:
+
+S is k x k and diagonal, holding the singular values on diagonal
+U is m x k and satisfies U^T*U = eye(k)
+V is n x k and satisfies V^TV = eye(k)
+
+All input and output is expected in sparse matrix format, 1-indexed
+as tuples of the form ((i,j),value) all in RDDs
+
+{% highlight scala %}
+
+import org.apache.spark.SparkContext
+import org.apache.spark.mllib.linalg.SVD
+
+// Load and parse the data file
+val data = sc.textFile("mllib/data/als/test.data").map { line =>
+ val parts = line.split(',')
+ ((parts(0).toInt, parts(1).toInt), parts(2).toDouble)
+}
+val m = 4
+val n = 4
+
+// recover singular vectors for singular values at or above 1e-5
+val (u, s, v) = SVD.sparseSVD(data, m, n, 1e-5)
+
+println("singular values = " + s.toArray.mkString)
+
+{% endhighlight %}
+
+
+
+
+
[18/50] git commit: more docs yay
Posted by ma...@apache.org.
more docs yay
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/2612164f
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/2612164f
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/2612164f
Branch: refs/heads/master
Commit: 2612164f85ae3249c78c130fc51427ace33b3580
Parents: 915d53f
Author: Reza Zadeh <ri...@gmail.com>
Authored: Wed Jan 1 20:22:29 2014 -0800
Committer: Reza Zadeh <ri...@gmail.com>
Committed: Wed Jan 1 20:22:29 2014 -0800
----------------------------------------------------------------------
.../main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/2612164f/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala
index 19173fd..2c82c6b 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala
@@ -23,7 +23,10 @@ import org.apache.spark.rdd.RDD
import org.jblas.{DoubleMatrix, Singular, MatrixFunctions}
-
+/**
+ * Top-level methods for calling Singular Value Decomposition
+ * NOTE: All matrices are in 1-indexed sparse format RDD[((int, int), value)]
+ */
object SVD {
/**
* Singular Value Decomposition for Tall and Skinny matrices.
[36/50] git commit: Merge remote-tracking branch 'upstream/master' into sparsesvd
Posted by ma...@apache.org.
Merge remote-tracking branch 'upstream/master' into sparsesvd
Conflicts:
docs/mllib-guide.md
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/21c8a54c
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/21c8a54c
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/21c8a54c
Branch: refs/heads/master
Commit: 21c8a54c08354f8934fd8ec58b43879c1686ccad
Parents: cf5bd4a 300eaa9
Author: Reza Zadeh <ri...@gmail.com>
Authored: Thu Jan 9 22:45:32 2014 -0800
Committer: Reza Zadeh <ri...@gmail.com>
Committed: Thu Jan 9 22:45:32 2014 -0800
----------------------------------------------------------------------
.gitignore | 3 +
README.md | 20 +-
assembly/lib/PY4J_LICENSE.txt | 27 -
assembly/lib/PY4J_VERSION.txt | 1 -
assembly/lib/net/sf/py4j/py4j/0.7/py4j-0.7.jar | Bin 103286 -> 0 bytes
assembly/lib/net/sf/py4j/py4j/0.7/py4j-0.7.pom | 9 -
.../net/sf/py4j/py4j/maven-metadata-local.xml | 12 -
assembly/pom.xml | 20 +-
assembly/src/main/assembly/assembly.xml | 11 +-
bin/compute-classpath.cmd | 2 +-
bin/compute-classpath.sh | 2 +-
bin/pyspark | 65 ++
bin/pyspark.cmd | 23 +
bin/pyspark2.cmd | 55 +
bin/run-example | 86 ++
bin/run-example.cmd | 23 +
bin/run-example2.cmd | 61 ++
bin/slaves.sh | 91 --
bin/spark-class | 154 +++
bin/spark-class.cmd | 23 +
bin/spark-class2.cmd | 85 ++
bin/spark-config.sh | 36 -
bin/spark-daemon.sh | 183 ----
bin/spark-daemons.sh | 35 -
bin/spark-shell | 102 ++
bin/spark-shell.cmd | 23 +
bin/start-all.sh | 34 -
bin/start-master.sh | 52 -
bin/start-slave.sh | 35 -
bin/start-slaves.sh | 48 -
bin/stop-all.sh | 32 -
bin/stop-master.sh | 27 -
bin/stop-slaves.sh | 35 -
conf/log4j.properties.template | 1 +
conf/spark-env.sh.template | 2 +-
core/pom.xml | 10 +
.../apache/spark/network/netty/FileClient.java | 5 +-
.../netty/FileClientChannelInitializer.java | 2 +-
.../apache/spark/network/netty/FileServer.java | 8 +-
.../netty/FileServerChannelInitializer.java | 4 +-
.../spark/network/netty/FileServerHandler.java | 6 +-
.../spark/network/netty/PathResolver.java | 52 +-
.../org/apache/spark/default-log4j.properties | 8 -
.../org/apache/spark/log4j-defaults.properties | 9 +
.../main/scala/org/apache/spark/Logging.scala | 10 +-
.../org/apache/spark/MapOutputTracker.scala | 14 +-
.../scala/org/apache/spark/Partitioner.scala | 7 +-
.../main/scala/org/apache/spark/SparkConf.scala | 13 +-
.../scala/org/apache/spark/SparkContext.scala | 84 +-
.../main/scala/org/apache/spark/SparkEnv.scala | 18 +-
.../org/apache/spark/SparkHadoopWriter.scala | 4 -
.../apache/spark/api/java/JavaDoubleRDD.scala | 12 +-
.../org/apache/spark/api/java/JavaPairRDD.scala | 6 +
.../org/apache/spark/api/java/JavaRDD.scala | 6 +
.../org/apache/spark/api/java/JavaRDDLike.scala | 6 +
.../spark/api/java/JavaSparkContext.scala | 72 ++
.../org/apache/spark/api/python/PythonRDD.scala | 4 +-
.../apache/spark/broadcast/HttpBroadcast.scala | 4 +-
.../spark/broadcast/TorrentBroadcast.scala | 2 +-
.../spark/deploy/ApplicationDescription.scala | 2 +-
.../scala/org/apache/spark/deploy/Client.scala | 151 +++
.../apache/spark/deploy/ClientArguments.scala | 117 ++
.../org/apache/spark/deploy/DeployMessage.scala | 52 +-
.../apache/spark/deploy/DriverDescription.scala | 29 +
.../apache/spark/deploy/client/AppClient.scala | 201 ++++
.../spark/deploy/client/AppClientListener.scala | 39 +
.../org/apache/spark/deploy/client/Client.scala | 190 ----
.../spark/deploy/client/ClientListener.scala | 39 -
.../apache/spark/deploy/client/TestClient.scala | 6 +-
.../spark/deploy/master/ApplicationInfo.scala | 7 +-
.../apache/spark/deploy/master/DriverInfo.scala | 36 +
.../spark/deploy/master/DriverState.scala | 33 +
.../master/FileSystemPersistenceEngine.scala | 17 +-
.../org/apache/spark/deploy/master/Master.scala | 206 +++-
.../spark/deploy/master/PersistenceEngine.scala | 11 +-
.../apache/spark/deploy/master/WorkerInfo.scala | 20 +-
.../master/ZooKeeperPersistenceEngine.scala | 14 +-
.../spark/deploy/master/ui/IndexPage.scala | 56 +-
.../spark/deploy/worker/CommandUtils.scala | 63 ++
.../spark/deploy/worker/DriverRunner.scala | 234 ++++
.../spark/deploy/worker/DriverWrapper.scala | 31 +
.../spark/deploy/worker/ExecutorRunner.scala | 67 +-
.../org/apache/spark/deploy/worker/Worker.scala | 65 +-
.../spark/deploy/worker/WorkerWatcher.scala | 55 +
.../spark/deploy/worker/ui/IndexPage.scala | 65 +-
.../spark/deploy/worker/ui/WorkerWebUI.scala | 43 +-
.../executor/CoarseGrainedExecutorBackend.scala | 27 +-
.../org/apache/spark/executor/Executor.scala | 24 +-
.../org/apache/spark/io/CompressionCodec.scala | 2 +-
.../spark/network/ConnectionManager.scala | 18 +-
.../spark/network/netty/ShuffleCopier.scala | 2 +-
.../org/apache/spark/rdd/CheckpointRDD.scala | 4 +-
.../org/apache/spark/rdd/NewHadoopRDD.scala | 2 +-
.../org/apache/spark/rdd/PairRDDFunctions.scala | 59 +-
.../spark/rdd/PartitionerAwareUnionRDD.scala | 110 ++
.../main/scala/org/apache/spark/rdd/RDD.scala | 32 +-
.../apache/spark/rdd/RDDCheckpointData.scala | 2 +-
.../spark/scheduler/TaskResultGetter.scala | 2 +-
.../spark/scheduler/TaskSchedulerImpl.scala | 13 +-
.../apache/spark/scheduler/TaskSetManager.scala | 13 +-
.../cluster/CoarseGrainedSchedulerBackend.scala | 5 +-
.../cluster/SimrSchedulerBackend.scala | 2 +-
.../cluster/SparkDeploySchedulerBackend.scala | 12 +-
.../mesos/CoarseMesosSchedulerBackend.scala | 6 +-
.../cluster/mesos/MesosSchedulerBackend.scala | 6 +-
.../spark/serializer/KryoSerializer.scala | 4 +-
.../apache/spark/serializer/Serializer.scala | 3 +
.../spark/serializer/SerializerManager.scala | 15 +-
.../spark/storage/BlockFetcherIterator.scala | 2 +-
.../org/apache/spark/storage/BlockManager.scala | 22 +-
.../spark/storage/BlockManagerMaster.scala | 12 +-
.../spark/storage/BlockManagerMasterActor.scala | 7 +-
.../apache/spark/storage/DiskBlockManager.scala | 2 +-
.../spark/storage/ShuffleBlockManager.scala | 4 +-
.../apache/spark/storage/ThreadingTest.scala | 2 +-
.../apache/spark/ui/UIWorkloadGenerator.scala | 4 +-
.../spark/ui/jobs/JobProgressListener.scala | 2 +-
.../scala/org/apache/spark/util/AkkaUtils.scala | 43 +-
.../org/apache/spark/util/MetadataCleaner.scala | 2 +-
.../org/apache/spark/util/XORShiftRandom.scala | 2 +-
.../test/resources/uncommons-maths-1.2.2.jar | Bin 49019 -> 0 bytes
.../org/apache/spark/CheckpointSuite.scala | 361 ++++---
.../scala/org/apache/spark/DriverSuite.scala | 8 +-
.../org/apache/spark/FileServerSuite.scala | 108 +-
.../apache/spark/MapOutputTrackerSuite.scala | 17 +-
.../apache/spark/deploy/JsonProtocolSuite.scala | 45 +-
.../spark/deploy/worker/DriverRunnerTest.scala | 131 +++
.../deploy/worker/ExecutorRunnerTest.scala | 10 +-
.../deploy/worker/WorkerWatcherSuite.scala | 32 +
.../scala/org/apache/spark/rdd/RDDSuite.scala | 27 +
.../spark/scheduler/TaskSetManagerSuite.scala | 2 +-
.../spark/storage/BlockManagerSuite.scala | 12 +-
.../spark/storage/DiskBlockManagerSuite.scala | 4 +-
.../apache/spark/util/XORShiftRandomSuite.scala | 2 +-
data/kmeans_data.txt | 6 +
data/lr_data.txt | 1000 ++++++++++++++++++
data/pagerank_data.txt | 6 +
docs/bagel-programming-guide.md | 4 +-
docs/building-with-maven.md | 14 +-
docs/configuration.md | 49 +-
docs/css/bootstrap.min.css | 2 +-
docs/index.md | 10 +-
docs/java-programming-guide.md | 4 +-
docs/job-scheduling.md | 5 +-
docs/mllib-guide.md | 344 ++++--
docs/python-programming-guide.md | 31 +-
docs/quick-start.md | 14 +-
docs/running-on-yarn.md | 17 +-
docs/scala-programming-guide.md | 14 +-
docs/spark-debugger.md | 2 +-
docs/spark-standalone.md | 68 +-
docs/streaming-programming-guide.md | 4 +-
ec2/spark_ec2.py | 2 +-
examples/pom.xml | 45 +-
.../org/apache/spark/examples/JavaHdfsLR.java | 31 +-
.../org/apache/spark/examples/JavaKMeans.java | 25 +-
.../org/apache/spark/examples/JavaLogQuery.java | 22 +-
.../org/apache/spark/examples/JavaPageRank.java | 13 +-
.../org/apache/spark/examples/JavaSparkPi.java | 12 +-
.../java/org/apache/spark/examples/JavaTC.java | 18 +-
.../apache/spark/examples/JavaWordCount.java | 12 +-
.../apache/spark/mllib/examples/JavaALS.java | 20 +-
.../apache/spark/mllib/examples/JavaKMeans.java | 18 +-
.../org/apache/spark/mllib/examples/JavaLR.java | 21 +-
.../streaming/examples/JavaFlumeEventCount.java | 18 +-
.../streaming/examples/JavaKafkaWordCount.java | 28 +-
.../examples/JavaNetworkWordCount.java | 18 +-
.../streaming/examples/JavaQueueStream.java | 13 +-
.../apache/spark/examples/BroadcastTest.scala | 2 +-
.../spark/examples/DriverSubmissionTest.scala | 46 +
.../spark/examples/ExceptionHandlingTest.scala | 2 +-
.../org/apache/spark/examples/GroupByTest.scala | 2 +-
.../org/apache/spark/examples/HBaseTest.scala | 2 +-
.../org/apache/spark/examples/HdfsTest.scala | 2 +-
.../org/apache/spark/examples/LogQuery.scala | 2 +-
.../spark/examples/MultiBroadcastTest.scala | 2 +-
.../examples/SimpleSkewedGroupByTest.scala | 2 +-
.../spark/examples/SkewedGroupByTest.scala | 2 +-
.../org/apache/spark/examples/SparkALS.scala | 2 +-
.../org/apache/spark/examples/SparkHdfsLR.scala | 2 +-
.../org/apache/spark/examples/SparkKMeans.scala | 2 +-
.../org/apache/spark/examples/SparkLR.scala | 2 +-
.../apache/spark/examples/SparkPageRank.scala | 2 +-
.../org/apache/spark/examples/SparkPi.scala | 2 +-
.../org/apache/spark/examples/SparkTC.scala | 2 +-
.../streaming/examples/ActorWordCount.scala | 6 +-
.../streaming/examples/FlumeEventCount.scala | 5 +-
.../streaming/examples/HdfsWordCount.scala | 4 +-
.../streaming/examples/KafkaWordCount.scala | 7 +-
.../streaming/examples/MQTTWordCount.scala | 18 +-
.../streaming/examples/NetworkWordCount.scala | 4 +-
.../spark/streaming/examples/QueueStream.scala | 2 +-
.../streaming/examples/RawNetworkGrep.scala | 2 +-
.../examples/StatefulNetworkWordCount.scala | 4 +-
.../streaming/examples/TwitterAlgebirdCMS.scala | 8 +-
.../streaming/examples/TwitterAlgebirdHLL.scala | 6 +-
.../streaming/examples/TwitterPopularTags.scala | 5 +-
.../streaming/examples/ZeroMQWordCount.scala | 15 +-
.../clickstream/PageViewGenerator.scala | 4 +-
.../examples/clickstream/PageViewStream.scala | 6 +-
external/flume/pom.xml | 93 ++
.../streaming/flume/FlumeInputDStream.scala | 155 +++
.../spark/streaming/flume/FlumeUtils.scala | 70 ++
.../streaming/flume/JavaFlumeStreamSuite.java | 34 +
.../flume/src/test/resources/log4j.properties | 29 +
.../streaming/flume/FlumeStreamSuite.scala | 86 ++
external/kafka/pom.xml | 97 ++
.../streaming/kafka/KafkaInputDStream.scala | 154 +++
.../spark/streaming/kafka/KafkaUtils.scala | 153 +++
.../streaming/kafka/JavaKafkaStreamSuite.java | 45 +
.../kafka/src/test/resources/log4j.properties | 29 +
.../streaming/kafka/KafkaStreamSuite.scala | 39 +
external/mqtt/pom.xml | 108 ++
.../spark/streaming/mqtt/MQTTInputDStream.scala | 110 ++
.../apache/spark/streaming/mqtt/MQTTUtils.scala | 75 ++
.../streaming/mqtt/JavaMQTTStreamSuite.java | 37 +
.../mqtt/src/test/resources/log4j.properties | 29 +
.../spark/streaming/mqtt/MQTTStreamSuite.scala | 36 +
external/twitter/pom.xml | 89 ++
.../streaming/twitter/TwitterInputDStream.scala | 100 ++
.../spark/streaming/twitter/TwitterUtils.scala | 126 +++
.../twitter/JavaTwitterStreamSuite.java | 46 +
.../twitter/src/test/resources/log4j.properties | 29 +
.../streaming/twitter/TwitterStreamSuite.scala | 43 +
external/zeromq/pom.xml | 89 ++
.../spark/streaming/zeromq/ZeroMQReceiver.scala | 54 +
.../spark/streaming/zeromq/ZeroMQUtils.scala | 126 +++
.../streaming/zeromq/JavaZeroMQStreamSuite.java | 50 +
.../zeromq/src/test/resources/log4j.properties | 29 +
.../streaming/zeromq/ZeroMQStreamSuite.scala | 44 +
kmeans_data.txt | 6 -
lr_data.txt | 1000 ------------------
make-distribution.sh | 27 +-
.../spark/mllib/api/python/PythonMLLibAPI.scala | 30 +
.../spark/mllib/classification/NaiveBayes.scala | 119 +++
.../spark/mllib/optimization/Gradient.scala | 4 +-
.../MatrixFactorizationModel.scala | 38 +-
.../mllib/classification/NaiveBayesSuite.scala | 108 ++
.../optimization/GradientDescentSuite.scala | 116 ++
.../spark/mllib/recommendation/ALSSuite.scala | 33 +-
new-yarn/pom.xml | 161 ---
.../spark/deploy/yarn/ApplicationMaster.scala | 446 --------
.../yarn/ApplicationMasterArguments.scala | 94 --
.../org/apache/spark/deploy/yarn/Client.scala | 521 ---------
.../spark/deploy/yarn/ClientArguments.scala | 149 ---
.../yarn/ClientDistributedCacheManager.scala | 228 ----
.../spark/deploy/yarn/WorkerLauncher.scala | 222 ----
.../spark/deploy/yarn/WorkerRunnable.scala | 209 ----
.../deploy/yarn/YarnAllocationHandler.scala | 687 ------------
.../spark/deploy/yarn/YarnSparkHadoopUtil.scala | 43 -
.../cluster/YarnClientClusterScheduler.scala | 48 -
.../cluster/YarnClientSchedulerBackend.scala | 110 --
.../cluster/YarnClusterScheduler.scala | 56 -
.../ClientDistributedCacheManagerSuite.scala | 220 ----
pagerank_data.txt | 6 -
pom.xml | 89 +-
project/SparkBuild.scala | 142 ++-
project/build.properties | 1 -
pyspark | 70 --
pyspark.cmd | 23 -
pyspark2.cmd | 55 -
python/lib/py4j-0.8.1-src.zip | Bin 0 -> 37662 bytes
python/lib/py4j0.7.egg | Bin 191756 -> 0 bytes
python/pyspark/__init__.py | 2 +-
python/pyspark/java_gateway.py | 2 +-
python/pyspark/mllib/_common.py | 25 +
python/pyspark/mllib/recommendation.py | 12 +-
python/pyspark/rdd.py | 66 +-
python/pyspark/shell.py | 2 +-
python/run-tests | 2 +-
repl-bin/src/deb/bin/run | 3 +-
repl/pom.xml | 1 -
.../apache/spark/repl/SparkCommandLine.scala | 37 +
.../org/apache/spark/repl/SparkILoop.scala | 11 +-
.../apache/spark/repl/SparkRunnerSettings.scala | 32 +
run-example | 91 --
run-example.cmd | 23 -
run-example2.cmd | 61 --
sbin/slaves.sh | 91 ++
sbin/spark-config.sh | 36 +
sbin/spark-daemon.sh | 183 ++++
sbin/spark-daemons.sh | 35 +
sbin/spark-executor | 23 +
sbin/start-all.sh | 34 +
sbin/start-master.sh | 52 +
sbin/start-slave.sh | 35 +
sbin/start-slaves.sh | 48 +
sbin/stop-all.sh | 32 +
sbin/stop-master.sh | 27 +
sbin/stop-slaves.sh | 35 +
sbt/sbt | 53 +-
sbt/sbt-launch-0.11.3-2.jar | Bin 1096763 -> 0 bytes
sbt/sbt.cmd | 25 -
spark-class | 154 ---
spark-class.cmd | 23 -
spark-class2.cmd | 85 --
spark-executor | 22 -
spark-shell | 102 --
spark-shell.cmd | 22 -
streaming/pom.xml | 83 +-
.../spark/streaming/PairDStreamFunctions.scala | 13 +-
.../spark/streaming/StreamingContext.scala | 151 +--
.../streaming/api/java/JavaPairDStream.scala | 18 +-
.../api/java/JavaStreamingContext.scala | 260 +----
.../streaming/dstream/FlumeInputDStream.scala | 154 ---
.../streaming/dstream/KafkaInputDStream.scala | 153 ---
.../streaming/dstream/MQTTInputDStream.scala | 110 --
.../streaming/dstream/NetworkInputDStream.scala | 4 +-
.../streaming/dstream/ShuffledDStream.scala | 9 +-
.../streaming/dstream/TwitterInputDStream.scala | 99 --
.../streaming/dstream/WindowedDStream.scala | 16 +-
.../streaming/receivers/ZeroMQReceiver.scala | 53 -
.../apache/spark/streaming/scheduler/Job.scala | 2 +-
.../streaming/scheduler/JobGenerator.scala | 2 +-
.../streaming/scheduler/JobScheduler.scala | 2 +-
.../scheduler/NetworkInputTracker.scala | 4 +-
.../apache/spark/streaming/JavaAPISuite.java | 104 +-
.../streaming/LocalJavaStreamingContext.java | 46 +
.../spark/streaming/InputStreamsSuite.scala | 82 +-
.../apache/spark/streaming/TestSuiteBase.scala | 20 +-
.../spark/streaming/WindowOperationsSuite.scala | 4 +-
yarn/README.md | 12 +
yarn/alpha/pom.xml | 32 +
.../spark/deploy/yarn/ApplicationMaster.scala | 464 ++++++++
.../org/apache/spark/deploy/yarn/Client.scala | 509 +++++++++
.../spark/deploy/yarn/WorkerLauncher.scala | 250 +++++
.../spark/deploy/yarn/WorkerRunnable.scala | 236 +++++
.../deploy/yarn/YarnAllocationHandler.scala | 680 ++++++++++++
.../yarn/ApplicationMasterArguments.scala | 94 ++
.../spark/deploy/yarn/ClientArguments.scala | 150 +++
.../yarn/ClientDistributedCacheManager.scala | 228 ++++
.../spark/deploy/yarn/YarnSparkHadoopUtil.scala | 43 +
.../cluster/YarnClientClusterScheduler.scala | 48 +
.../cluster/YarnClientSchedulerBackend.scala | 112 ++
.../cluster/YarnClusterScheduler.scala | 56 +
.../ClientDistributedCacheManagerSuite.scala | 220 ++++
yarn/pom.xml | 84 +-
.../spark/deploy/yarn/ApplicationMaster.scala | 477 ---------
.../yarn/ApplicationMasterArguments.scala | 94 --
.../org/apache/spark/deploy/yarn/Client.scala | 505 ---------
.../spark/deploy/yarn/ClientArguments.scala | 146 ---
.../yarn/ClientDistributedCacheManager.scala | 228 ----
.../spark/deploy/yarn/WorkerLauncher.scala | 243 -----
.../spark/deploy/yarn/WorkerRunnable.scala | 235 ----
.../deploy/yarn/YarnAllocationHandler.scala | 673 ------------
.../spark/deploy/yarn/YarnSparkHadoopUtil.scala | 43 -
.../cluster/YarnClientClusterScheduler.scala | 48 -
.../cluster/YarnClientSchedulerBackend.scala | 110 --
.../cluster/YarnClusterScheduler.scala | 59 --
.../ClientDistributedCacheManagerSuite.scala | 220 ----
yarn/stable/pom.xml | 32 +
.../spark/deploy/yarn/ApplicationMaster.scala | 432 ++++++++
.../org/apache/spark/deploy/yarn/Client.scala | 525 +++++++++
.../spark/deploy/yarn/WorkerLauncher.scala | 230 ++++
.../spark/deploy/yarn/WorkerRunnable.scala | 210 ++++
.../deploy/yarn/YarnAllocationHandler.scala | 695 ++++++++++++
356 files changed, 14177 insertions(+), 11330 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/21c8a54c/docs/mllib-guide.md
----------------------------------------------------------------------
diff --cc docs/mllib-guide.md
index 44e6c8f,45ee166..21d0464
--- a/docs/mllib-guide.md
+++ b/docs/mllib-guide.md
@@@ -211,60 -163,268 +163,318 @@@ Available algorithms for gradient desce
* [GradientDescent](api/mllib/index.html#org.apache.spark.mllib.optimization.GradientDescent)
+ # Using MLLib in Scala
+
+ The following code snippets can be executed in `spark-shell`.
+
+ ## Binary Classification
+
+ The following code snippet illustrates how to load a sample dataset, execute a
+ training algorithm on this training data using a static method in the algorithm
+ object, and make predictions with the resulting model to compute the training
+ error.
+
+ {% highlight scala %}
+ import org.apache.spark.SparkContext
+ import org.apache.spark.mllib.classification.SVMWithSGD
+ import org.apache.spark.mllib.regression.LabeledPoint
+
+ // Load and parse the data file
+ val data = sc.textFile("mllib/data/sample_svm_data.txt")
+ val parsedData = data.map { line =>
+ val parts = line.split(' ')
+ LabeledPoint(parts(0).toDouble, parts.tail.map(x => x.toDouble).toArray)
+ }
+
+ // Run training algorithm to build the model
+ val numIterations = 20
+ val model = SVMWithSGD.train(parsedData, numIterations)
+
+ // Evaluate model on training examples and compute training error
+ val labelAndPreds = parsedData.map { point =>
+ val prediction = model.predict(point.features)
+ (point.label, prediction)
+ }
+ val trainErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / parsedData.count
+ println("Training Error = " + trainErr)
+ {% endhighlight %}
+
+
+ The `SVMWithSGD.train()` method by default performs L2 regularization with the
+ regularization parameter set to 1.0. If we want to configure this algorithm, we
+ can customize `SVMWithSGD` further by creating a new object directly and
+ calling setter methods. All other MLlib algorithms support customization in
+ this way as well. For example, the following code produces an L1 regularized
+ variant of SVMs with regularization parameter set to 0.1, and runs the training
+ algorithm for 200 iterations.
+
+ {% highlight scala %}
+ import org.apache.spark.mllib.optimization.L1Updater
+
+ val svmAlg = new SVMWithSGD()
+ svmAlg.optimizer.setNumIterations(200)
+ .setRegParam(0.1)
+ .setUpdater(new L1Updater)
+ val modelL1 = svmAlg.run(parsedData)
+ {% endhighlight %}
+
+ ## Linear Regression
+ The following example demonstrates how to load training data, parse it as an RDD of LabeledPoint. The
+ example then uses LinearRegressionWithSGD to build a simple linear model to predict label values. We
+ compute the Mean Squared Error at the end to evaluate
+ [goodness of fit](http://en.wikipedia.org/wiki/Goodness_of_fit)
+
+ {% highlight scala %}
+ import org.apache.spark.mllib.regression.LinearRegressionWithSGD
+ import org.apache.spark.mllib.regression.LabeledPoint
+
+ // Load and parse the data
+ val data = sc.textFile("mllib/data/ridge-data/lpsa.data")
+ val parsedData = data.map { line =>
+ val parts = line.split(',')
+ LabeledPoint(parts(0).toDouble, parts(1).split(' ').map(x => x.toDouble).toArray)
+ }
+
+ // Building the model
+ val numIterations = 20
+ val model = LinearRegressionWithSGD.train(parsedData, numIterations)
+
+ // Evaluate model on training examples and compute training error
+ val valuesAndPreds = parsedData.map { point =>
+ val prediction = model.predict(point.features)
+ (point.label, prediction)
+ }
+ val MSE = valuesAndPreds.map{ case(v, p) => math.pow((v - p), 2)}.reduce(_ + _)/valuesAndPreds.count
+ println("training Mean Squared Error = " + MSE)
+ {% endhighlight %}
+
+
+ Similarly you can use RidgeRegressionWithSGD and LassoWithSGD and compare training
+ [Mean Squared Errors](http://en.wikipedia.org/wiki/Mean_squared_error).
+
+ ## Clustering
+ In the following example after loading and parsing data, we use the KMeans object to cluster the data
+ into two clusters. The number of desired clusters is passed to the algorithm. We then compute Within
+ Set Sum of Squared Error (WSSSE). You can reduce this error measure by increasing *k*. In fact the
+ optimal *k* is usually one where there is an "elbow" in the WSSSE graph.
+
+ {% highlight scala %}
+ import org.apache.spark.mllib.clustering.KMeans
+
+ // Load and parse the data
+ val data = sc.textFile("kmeans_data.txt")
+ val parsedData = data.map( _.split(' ').map(_.toDouble))
+
+ // Cluster the data into two classes using KMeans
+ val numIterations = 20
+ val numClusters = 2
+ val clusters = KMeans.train(parsedData, numClusters, numIterations)
+
+ // Evaluate clustering by computing Within Set Sum of Squared Errors
+ val WSSSE = clusters.computeCost(parsedData)
+ println("Within Set Sum of Squared Errors = " + WSSSE)
+ {% endhighlight %}
+
+
+ ## Collaborative Filtering
+ In the following example we load rating data. Each row consists of a user, a product and a rating.
+ We use the default ALS.train() method which assumes ratings are explicit. We evaluate the recommendation
+ model by measuring the Mean Squared Error of rating prediction.
+
+ {% highlight scala %}
+ import org.apache.spark.mllib.recommendation.ALS
+ import org.apache.spark.mllib.recommendation.Rating
+
+ // Load and parse the data
+ val data = sc.textFile("mllib/data/als/test.data")
+ val ratings = data.map(_.split(',') match {
+ case Array(user, item, rate) => Rating(user.toInt, item.toInt, rate.toDouble)
+ })
+
+ // Build the recommendation model using ALS
+ val numIterations = 20
+ val model = ALS.train(ratings, 1, 20, 0.01)
+
+ // Evaluate the model on rating data
+ val usersProducts = ratings.map{ case Rating(user, product, rate) => (user, product)}
+ val predictions = model.predict(usersProducts).map{
+ case Rating(user, product, rate) => ((user, product), rate)
+ }
+ val ratesAndPreds = ratings.map{
+ case Rating(user, product, rate) => ((user, product), rate)
+ }.join(predictions)
+ val MSE = ratesAndPreds.map{
+ case ((user, product), (r1, r2)) => math.pow((r1- r2), 2)
+ }.reduce(_ + _)/ratesAndPreds.count
+ println("Mean Squared Error = " + MSE)
+ {% endhighlight %}
+
+ If the rating matrix is derived from other sources of information (i.e., it is inferred from
+ other signals), you can use the trainImplicit method to get better results.
+
+ {% highlight scala %}
+ val model = ALS.trainImplicit(ratings, 1, 20, 0.01)
+ {% endhighlight %}
+
+ # Using MLLib in Python
+ The following examples can be tested in the PySpark shell.
+
+ ## Binary Classification
+ The following example shows how to load a sample dataset, build Logistic Regression model,
+ and make predictions with the resulting model to compute the training error.
+
+ {% highlight python %}
+ from pyspark.mllib.classification import LogisticRegressionWithSGD
+ from numpy import array
+
+ # Load and parse the data
+ data = sc.textFile("mllib/data/sample_svm_data.txt")
+ parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))
+ model = LogisticRegressionWithSGD.train(sc, parsedData)
+
+ # Build the model
+ labelsAndPreds = parsedData.map(lambda point: (int(point.item(0)),
+ model.predict(point.take(range(1, point.size)))))
+
+ # Evaluating the model on training data
+ trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(parsedData.count())
+ print("Training Error = " + str(trainErr))
+ {% endhighlight %}
+
+ ## Linear Regression
+ The following example demonstrates how to load training data, parse it as an RDD of LabeledPoint. The
+ example then uses LinearRegressionWithSGD to build a simple linear model to predict label values. We
+ compute the Mean Squared Error at the end to evaluate
+ [goodness of fit](http://en.wikipedia.org/wiki/Goodness_of_fit)
+
+ {% highlight python %}
+ from pyspark.mllib.regression import LinearRegressionWithSGD
+ from numpy import array
+
+ # Load and parse the data
+ data = sc.textFile("mllib/data/ridge-data/lpsa.data")
+ parsedData = data.map(lambda line: array([float(x) for x in line.replace(',', ' ').split(' ')]))
+
+ # Build the model
+ model = LinearRegressionWithSGD.train(sc, parsedData)
+
+ # Evaluate the model on training data
+ valuesAndPreds = parsedData.map(lambda point: (point.item(0),
+ model.predict(point.take(range(1, point.size)))))
+ MSE = valuesAndPreds.map(lambda (v, p): (v - p)**2).reduce(lambda x, y: x + y)/valuesAndPreds.count()
+ print("Mean Squared Error = " + str(MSE))
+ {% endhighlight %}
+
+
+ ## Clustering
+ In the following example after loading and parsing data, we use the KMeans object to cluster the data
+ into two clusters. The number of desired clusters is passed to the algorithm. We then compute Within
+ Set Sum of Squared Error (WSSSE). You can reduce this error measure by increasing *k*. In fact the
+ optimal *k* is usually one where there is an "elbow" in the WSSSE graph.
+
+ {% highlight python %}
+ from pyspark.mllib.clustering import KMeans
+ from numpy import array
+ from math import sqrt
+
+ # Load and parse the data
+ data = sc.textFile("kmeans_data.txt")
+ parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))
+
+ # Build the model (cluster the data)
+ clusters = KMeans.train(sc, parsedData, 2, maxIterations=10,
+ runs=30, initialization_mode="random")
+
+ # Evaluate clustering by computing Within Set Sum of Squared Errors
+ def error(point):
+ center = clusters.centers[clusters.predict(point)]
+ return sqrt(sum([x**2 for x in (point - center)]))
+
+ WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
+ print("Within Set Sum of Squared Error = " + str(WSSSE))
+ {% endhighlight %}
+
+ Similarly you can use RidgeRegressionWithSGD and LassoWithSGD and compare training Mean Squared
+ Errors.
+
+ ## Collaborative Filtering
+ In the following example we load rating data. Each row consists of a user, a product and a rating.
+ We use the default ALS.train() method which assumes ratings are explicit. We evaluate the
+ recommendation by measuring the Mean Squared Error of rating prediction.
+
+ {% highlight python %}
+ from pyspark.mllib.recommendation import ALS
+ from numpy import array
+
+ # Load and parse the data
+ data = sc.textFile("mllib/data/als/test.data")
+ ratings = data.map(lambda line: array([float(x) for x in line.split(',')]))
+
+ # Build the recommendation model using Alternating Least Squares
+ model = ALS.train(sc, ratings, 1, 20)
+
+ # Evaluate the model on training data
+ testdata = ratings.map(lambda p: (int(p[0]), int(p[1])))
+ predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
+ ratesAndPreds = ratings.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
+ MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).reduce(lambda x, y: x + y)/ratesAndPreds.count()
+ print("Mean Squared Error = " + str(MSE))
+ {% endhighlight %}
+
+ If the rating matrix is derived from other sources of information (i.e., it is inferred from other
+ signals), you can use the trainImplicit method to get better results.
+
+ {% highlight python %}
+ # Build the recommendation model using Alternating Least Squares based on implicit ratings
+ model = ALS.trainImplicit(sc, ratings, 1, 20)
+ {% endhighlight %}
+
+
+# Singular Value Decomposition
+Singular Value Decomposition for Tall and Skinny matrices.
+Given an *m x n* matrix *A*, we can compute matrices *U, S, V* such that
+
+*A = U * S * V^T*
+
+There is no restriction on m, but we require n^2 doubles to fit in memory.
+Further, n should be less than m.
-
++
+The decomposition is computed by first computing *A^TA = V S^2 V^T*,
+computing svd locally on that (since n x n is small),
- from which we recover S and V.
++from which we recover S and V.
+Then we compute U via easy matrix multiplication
+as *U = A * V * S^-1*
-
++
+Only singular vectors associated with largest k singular values
+are recovered. If there are k
+such values, then the dimensions of the return will be:
+
+* *S* is *k x k* and diagonal, holding the singular values on diagonal.
+* *U* is *m x k* and satisfies U^T*U = eye(k).
+* *V* is *n x k* and satisfies V^TV = eye(k).
+
+All input and output is expected in sparse matrix format, 1-indexed
- as tuples of the form ((i,j),value) all in
++as tuples of the form ((i,j),value) all in
+SparseMatrix RDDs. Below is example usage.
+
+{% highlight scala %}
+
+import org.apache.spark.SparkContext
+import org.apache.spark.mllib.linalg.SVD
+import org.apache.spark.mllib.linalg.SparseMatrix
+import org.apache.spark.mllib.linalg.MatrixEntry
+
+// Load and parse the data file
+val data = sc.textFile("mllib/data/als/test.data").map { line =>
+ val parts = line.split(',')
+ MatrixEntry(parts(0).toInt, parts(1).toInt, parts(2).toDouble)
+}
+val m = 4
+val n = 4
+val k = 1
+
+// recover largest singular vector
+val decomposed = SVD.sparseSVD(SparseMatrix(data, m, n), k)
+val s = decomposed.S.data
+
+println("singular values = " + s.toArray.mkString)
-
- {% endhighlight %}
-
-
-
-
-
[14/50] git commit: doc tweaks
Posted by ma...@apache.org.
doc tweaks
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/53ccf653
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/53ccf653
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/53ccf653
Branch: refs/heads/master
Commit: 53ccf65362d935f89fb9e27b4a3485454fa4c882
Parents: 97dc527
Author: Reza Zadeh <ri...@gmail.com>
Authored: Wed Jan 1 20:03:47 2014 -0800
Committer: Reza Zadeh <ri...@gmail.com>
Committed: Wed Jan 1 20:03:47 2014 -0800
----------------------------------------------------------------------
docs/mllib-guide.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/53ccf653/docs/mllib-guide.md
----------------------------------------------------------------------
diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md
index 8c490eb..711187f 100644
--- a/docs/mllib-guide.md
+++ b/docs/mllib-guide.md
@@ -215,7 +215,7 @@ Available algorithms for gradient descent:
# Singular Value Decomposition
Singular Value Decomposition for Tall and Skinny matrices.
-Given an *m x n* matrix *A*, this will compute matrices *U, S, V* such that
+Given an *m x n* matrix *A*, we can compute matrices *U, S, V* such that
*A = U * S * V^T*
[05/50] git commit: add all tests
Posted by ma...@apache.org.
add all tests
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/bdb50379
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/bdb50379
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/bdb50379
Branch: refs/heads/master
Commit: bdb5037987d472c31f4d7891e60a8997e0e9bdcc
Parents: fa1e8d8
Author: Reza Zadeh <ri...@gmail.com>
Authored: Fri Dec 27 00:36:41 2013 -0500
Committer: Reza Zadeh <ri...@gmail.com>
Committed: Fri Dec 27 00:36:41 2013 -0500
----------------------------------------------------------------------
.../apache/spark/mllib/linalg/SVDSuite.scala | 142 +++++++++++++++++++
1 file changed, 142 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/bdb50379/mllib/src/test/scala/org/apache/spark/mllib/linalg/SVDSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/SVDSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/SVDSuite.scala
new file mode 100644
index 0000000..726650a
--- /dev/null
+++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/SVDSuite.scala
@@ -0,0 +1,142 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.linalg
+
+import scala.util.Random
+
+import org.scalatest.BeforeAndAfterAll
+import org.scalatest.FunSuite
+
+import org.jblas.{DoubleMatrix, Singular, MatrixFunctions}
+
+import org.apache.spark.SparkContext
+import org.apache.spark.SparkContext._
+import org.apache.spark.rdd.RDD
+
+import org.jblas._
+
+class SVDSuite extends FunSuite with BeforeAndAfterAll {
+ @transient private var sc: SparkContext = _
+
+ override def beforeAll() {
+ sc = new SparkContext("local", "test")
+ }
+
+ override def afterAll() {
+ sc.stop()
+ System.clearProperty("spark.driver.port")
+ }
+
+ val EPSILON = 1e-4
+
+ // Return jblas matrix from sparse matrix RDD
+ def getDenseMatrix(matrix:RDD[((Int, Int), Double)], m:Int, n:Int) : DoubleMatrix = {
+ val ret = DoubleMatrix.zeros(m, n)
+ matrix.toArray.map(x => ret.put(x._1._1-1, x._1._2-1, x._2))
+ ret
+ }
+
+ def assertMatrixEquals(a:DoubleMatrix, b:DoubleMatrix) {
+ assert(a.rows == b.rows && a.columns == b.columns, "dimension mismatch")
+ val diff = DoubleMatrix.zeros(a.rows, a.columns)
+ Array.tabulate(a.rows, a.columns){(i,j) =>
+ diff.put(i,j,
+ Math.min(Math.abs(a.get(i,j)-b.get(i,j)),
+ Math.abs(a.get(i,j)+b.get(i,j)))) }
+ assert(diff.norm1 < EPSILON, "matrix mismatch: " + diff.norm1)
+ }
+
+ test("full rank matrix svd") {
+ val m = 10
+ val n = 3
+ val data = sc.makeRDD(Array.tabulate(m,n){ (a,b)=>
+ ((a+1,b+1), (a+2).toDouble*(b+1)/(1+a+b)) }.flatten )
+ val min_svalue = 1.0e-8
+
+ val (u, s, v) = SVD.sparseSVD(data, m, n, min_svalue)
+
+ val densea = getDenseMatrix(data, m, n)
+ val svd = Singular.sparseSVD(densea)
+
+ val retu = getDenseMatrix(u,m,n)
+ val rets = getDenseMatrix(s,n,n)
+ val retv = getDenseMatrix(v,n,n)
+
+ // check individual decomposition
+ assertMatrixEquals(retu, svd(0))
+ assertMatrixEquals(rets, DoubleMatrix.diag(svd(1)))
+ assertMatrixEquals(retv, svd(2))
+
+ // check multiplication guarantee
+ assertMatrixEquals(retu.mmul(rets).mmul(retv.transpose), densea)
+ }
+
+ test("rank one matrix svd") {
+ val m = 10
+ val n = 3
+ val data = sc.makeRDD(Array.tabulate(m,n){ (a,b)=>
+ ((a+1,b+1), 1.0) }.flatten )
+ val min_svalue = 1.0e-4
+
+ val (u, s, v) = SVD.sparseSVD(data, m, n, min_svalue)
+ val retrank = s.toArray.length
+
+ assert(retrank == 1, "rank returned not one")
+
+ val densea = getDenseMatrix(data, m, n)
+ val svd = Singular.sparseSVD(densea)
+
+ val retu = getDenseMatrix(u,m,retrank)
+ val rets = getDenseMatrix(s,retrank,retrank)
+ val retv = getDenseMatrix(v,n,retrank)
+
+ // check individual decomposition
+ assertMatrixEquals(retu, svd(0).getColumn(0))
+ assertMatrixEquals(rets, DoubleMatrix.diag(svd(1).getRow(0)))
+ assertMatrixEquals(retv, svd(2).getColumn(0))
+
+ // check multiplication guarantee
+ assertMatrixEquals(retu.mmul(rets).mmul(retv.transpose), densea)
+ }
+
+ test("truncated with min singular value") {
+ val m = 10
+ val n = 3
+ val data = sc.makeRDD(Array.tabulate(m,n){ (a,b)=>
+ ((a+1,b+1), (a+2).toDouble*(b+1)/(1+a+b)) }.flatten )
+
+ val min_svalue = 5.0 // only one svalue above this
+
+ val (u, s, v) = SVD.sparseSVD(data, m, n, min_svalue)
+ val retrank = s.toArray.length
+
+ val densea = getDenseMatrix(data, m, n)
+ val svd = Singular.sparseSVD(densea)
+
+ val retu = getDenseMatrix(u,m,retrank)
+ val rets = getDenseMatrix(s,retrank,retrank)
+ val retv = getDenseMatrix(v,n,retrank)
+
+ assert(retrank == 1, "rank returned not one")
+
+ // check individual decomposition
+ assertMatrixEquals(retu, svd(0).getColumn(0))
+ assertMatrixEquals(rets, DoubleMatrix.diag(svd(1).getRow(0)))
+ assertMatrixEquals(retv, svd(2).getColumn(0))
+ }
+}
[16/50] git commit: old version of spark_ec2
Posted by ma...@apache.org.
old version of spark_ec2
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/c868d71b
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/c868d71b
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/c868d71b
Branch: refs/heads/master
Commit: c868d71b0ba122906f9041823ef2442578fa06b9
Parents: 0c3797d
Author: Reza Zadeh <ri...@gmail.com>
Authored: Wed Jan 1 20:08:01 2014 -0800
Committer: Reza Zadeh <ri...@gmail.com>
Committed: Wed Jan 1 20:08:01 2014 -0800
----------------------------------------------------------------------
ec2/spark_ec2.py | 10 ++++++++++
1 file changed, 10 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/c868d71b/ec2/spark_ec2.py
----------------------------------------------------------------------
diff --git a/ec2/spark_ec2.py b/ec2/spark_ec2.py
index ac309cc..a2b0e7e 100755
--- a/ec2/spark_ec2.py
+++ b/ec2/spark_ec2.py
@@ -113,6 +113,16 @@ def parse_args():
# Boto config check
# http://boto.cloudhackers.com/en/latest/boto_config_tut.html
home_dir = os.getenv('HOME')
+ if home_dir == None or not os.path.isfile(home_dir + '/.boto'):
+ if not os.path.isfile('/etc/boto.cfg'):
+ if os.getenv('AWS_ACCESS_KEY_ID') == None:
+ print >> stderr, ("ERROR: The environment variable AWS_ACCESS_KEY_ID " +
+ "must be set")
+ sys.exit(1)
+ if os.getenv('AWS_SECRET_ACCESS_KEY') == None:
+ print >> stderr, ("ERROR: The environment variable AWS_SECRET_ACCESS_KEY " +
+ "must be set")
+ sys.exit(1)
return (opts, action, cluster_name)
[49/50] git commit: rename to MatrixSVD
Posted by ma...@apache.org.
rename to MatrixSVD
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/85b95d03
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/85b95d03
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/85b95d03
Branch: refs/heads/master
Commit: 85b95d039ddfc7a2b2b27f506852859181ed16c1
Parents: fa32998
Author: Reza Zadeh <ri...@gmail.com>
Authored: Fri Jan 17 14:40:51 2014 -0800
Committer: Reza Zadeh <ri...@gmail.com>
Committed: Fri Jan 17 14:40:51 2014 -0800
----------------------------------------------------------------------
mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/85b95d03/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
index cab98b3..e476b53 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
@@ -41,7 +41,7 @@ class SVD {
/**
* Compute SVD using the current set parameters
*/
- def compute(matrix: SparseMatrix) : SVDecomposedMatrix = {
+ def compute(matrix: SparseMatrix) : MatrixSVD = {
SVD.sparseSVD(matrix, k)
}
}
@@ -84,7 +84,7 @@ object SVD {
def sparseSVD(
matrix: SparseMatrix,
k: Int)
- : SVDecomposedMatrix =
+ : MatrixSVD =
{
val data = matrix.data
val m = matrix.m
[26/50] git commit: add k parameter
Posted by ma...@apache.org.
add k parameter
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/73daa700
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/73daa700
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/73daa700
Branch: refs/heads/master
Commit: 73daa700bd2acff7ff196c9262dffb2d8b9354bf
Parents: 26a74f0
Author: Reza Zadeh <ri...@gmail.com>
Authored: Sat Jan 4 01:52:28 2014 -0800
Committer: Reza Zadeh <ri...@gmail.com>
Committed: Sat Jan 4 01:52:28 2014 -0800
----------------------------------------------------------------------
docs/mllib-guide.md | 5 ++--
.../org/apache/spark/mllib/linalg/SVD.scala | 24 ++++++++++----------
.../apache/spark/mllib/linalg/SVDSuite.scala | 3 +--
3 files changed, 16 insertions(+), 16 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/73daa700/docs/mllib-guide.md
----------------------------------------------------------------------
diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md
index 711187f..abeb55d 100644
--- a/docs/mllib-guide.md
+++ b/docs/mllib-guide.md
@@ -251,9 +251,10 @@ val data = sc.textFile("mllib/data/als/test.data").map { line =>
}
val m = 4
val n = 4
+val k = 1
-// recover singular vectors for singular values at or above 1e-5
-val (u, s, v) = SVD.sparseSVD(data, m, n, 1e-5)
+// recover largest singular vector
+val (u, s, v) = SVD.sparseSVD(data, m, n, 1)
println("singular values = " + s.toArray.mkString)
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/73daa700/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
index ac9178e..465fc74 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
@@ -43,9 +43,8 @@ object SVD {
* Then we compute U via easy matrix multiplication
* as U = A * V * S^-1
*
- * Only singular vectors associated with singular values
- * greater or equal to MIN_SVALUE are recovered. If there are k
- * such values, then the dimensions of the return will be:
+ * Only the k largest singular values and associated vectors are found.
+ * If there are k such values, then the dimensions of the return will be:
*
* S is k x k and diagonal, holding the singular values on diagonal
* U is m x k and satisfies U'U = eye(k)
@@ -57,22 +56,22 @@ object SVD {
* @param data RDD Matrix in sparse 1-index format ((int, int), value)
* @param m number of rows
* @param n number of columns
- * @param min_svalue Recover singular values greater or equal to min_svalue
+ * @param k Recover k singular values and vectors
* @return Three sparse matrices: U, S, V such that A = USV^T
*/
def sparseSVD(
data: RDD[MatrixEntry],
m: Int,
n: Int,
- min_svalue: Double)
+ k: Int)
: SVDecomposedMatrix =
{
if (m < n || m <= 0 || n <= 0) {
throw new IllegalArgumentException("Expecting a tall and skinny matrix")
}
- if (min_svalue < 1.0e-8) {
- throw new IllegalArgumentException("Minimum singular value requested is too small")
+ if (k < 1 || k > n) {
+ throw new IllegalArgumentException("Must request up to n singular values")
}
// Compute A^T A, assuming rows are sparse enough to fit in memory
@@ -93,12 +92,13 @@ object SVD {
// Since A^T A is small, we can compute its SVD directly
val svd = Singular.sparseSVD(ata)
val V = svd(0)
- val sigma = MatrixFunctions.sqrt(svd(1)).toArray.filter(x => x >= min_svalue)
+ val sigmas = MatrixFunctions.sqrt(svd(1)).toArray.filter(x => x > 1e-9)
- // threshold s values
- if(sigma.isEmpty) {
- throw new Exception("All singular values are smaller than min_svalue: " + min_svalue)
- }
+ if(sigmas.size < k) {
+ throw new Exception("Not enough singular values to return")
+ }
+
+ val sigma = sigmas.take(k)
val sc = data.sparkContext
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/73daa700/mllib/src/test/scala/org/apache/spark/mllib/linalg/SVDSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/SVDSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/SVDSuite.scala
index 71749ff..dc4e923 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/SVDSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/SVDSuite.scala
@@ -66,9 +66,8 @@ class SVDSuite extends FunSuite with BeforeAndAfterAll {
val n = 3
val data = sc.makeRDD(Array.tabulate(m,n){ (a,b)=>
MatrixEntry(a+1,b+1, (a+2).toDouble*(b+1)/(1+a+b)) }.flatten )
- val min_svalue = 1.0e-8
- val decomposed = SVD.sparseSVD(data, m, n, min_svalue)
+ val decomposed = SVD.sparseSVD(data, m, n, n)
val u = decomposed.U
val v = decomposed.V
val s = decomposed.S
[24/50] git commit: new return struct
Posted by ma...@apache.org.
new return struct
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/d2d5e5e0
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/d2d5e5e0
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/d2d5e5e0
Branch: refs/heads/master
Commit: d2d5e5e062e8aab5c3f019fbf97ad5e673a3f75f
Parents: 7f631dd
Author: Reza Zadeh <ri...@gmail.com>
Authored: Sat Jan 4 00:15:04 2014 -0800
Committer: Reza Zadeh <ri...@gmail.com>
Committed: Sat Jan 4 00:15:04 2014 -0800
----------------------------------------------------------------------
.../spark/mllib/linalg/SVDecomposedMatrix.scala | 33 ++++++++++++++++++++
1 file changed, 33 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/d2d5e5e0/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVDecomposedMatrix.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVDecomposedMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVDecomposedMatrix.scala
new file mode 100644
index 0000000..c3ec428
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVDecomposedMatrix.scala
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.linalg
+
+import org.apache.spark.rdd.RDD
+
+import org.apache.spark.linalg.MatrixEntry
+
+/**
+ * Class that represents the SV decomposition of a matrix
+ *
+ * @param U such that A = USV^T
+ * @param S such that A = USV^T
+ * @param V such that A = USV^T
+ */
+case class SVDecomposedMatrix(val U: RDD[MatrixEntry],
+ val S: RDD[MatrixEntry],
+ val V: RDD[MatrixEntry])
[33/50] git commit: More sparse matrix usage.
Posted by ma...@apache.org.
More sparse matrix usage.
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/7d7490b6
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/7d7490b6
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/7d7490b6
Branch: refs/heads/master
Commit: 7d7490b67b8d0ff7f731e9ff6328ed0fca3f43c1
Parents: 746148b
Author: Reza Zadeh <ri...@gmail.com>
Authored: Tue Jan 7 17:16:17 2014 -0800
Committer: Reza Zadeh <ri...@gmail.com>
Committed: Tue Jan 7 17:16:17 2014 -0800
----------------------------------------------------------------------
mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/7d7490b6/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
index a8efdc7..6590e8f 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
@@ -74,7 +74,8 @@ object SVD {
* V is n x k and satisfies V'V = eye(k)
*
* All input and output is expected in sparse matrix format, 1-indexed
- * as tuples of the form ((i,j),value) all in RDDs
+ * as tuples of the form ((i,j),value) all in RDDs using the
+ * SparseMatrix class
*
* @param matrix sparse matrix to factorize
* @param k Recover k singular values and vectors
[44/50] git commit: prettify
Posted by ma...@apache.org.
prettify
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/c9b4845b
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/c9b4845b
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/c9b4845b
Branch: refs/heads/master
Commit: c9b4845bc120b8b63d5e033dd1d506d84c420b20
Parents: dbec69b
Author: Reza Zadeh <ri...@gmail.com>
Authored: Fri Jan 17 14:14:29 2014 -0800
Committer: Reza Zadeh <ri...@gmail.com>
Committed: Fri Jan 17 14:14:29 2014 -0800
----------------------------------------------------------------------
.../src/test/scala/org/apache/spark/mllib/linalg/SVDSuite.scala | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/c9b4845b/mllib/src/test/scala/org/apache/spark/mllib/linalg/SVDSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/SVDSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/SVDSuite.scala
index 12b3801..32f3f14 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/SVDSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/SVDSuite.scala
@@ -45,7 +45,7 @@ class SVDSuite extends FunSuite with BeforeAndAfterAll {
val EPSILON = 1e-4
// Return jblas matrix from sparse matrix RDD
- def getDenseMatrix(matrix:SparseMatrix) : DoubleMatrix = {
+ def getDenseMatrix(matrix: SparseMatrix) : DoubleMatrix = {
val data = matrix.data
val m = matrix.m
val n = matrix.n
@@ -54,7 +54,7 @@ class SVDSuite extends FunSuite with BeforeAndAfterAll {
ret
}
- def assertMatrixEquals(a:DoubleMatrix, b:DoubleMatrix) {
+ def assertMatrixEquals(a: DoubleMatrix, b: DoubleMatrix) {
assert(a.rows == b.rows && a.columns == b.columns, "dimension mismatch")
val diff = DoubleMatrix.zeros(a.rows, a.columns)
Array.tabulate(a.rows, a.columns){(i, j) =>
[27/50] git commit: set methods
Posted by ma...@apache.org.
set methods
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/35adc727
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/35adc727
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/35adc727
Branch: refs/heads/master
Commit: 35adc72794f25223502562f2dc0077f61d91cb79
Parents: 73daa70
Author: Reza Zadeh <ri...@gmail.com>
Authored: Sat Jan 4 11:30:36 2014 -0800
Committer: Reza Zadeh <ri...@gmail.com>
Committed: Sat Jan 4 11:30:36 2014 -0800
----------------------------------------------------------------------
.../org/apache/spark/mllib/linalg/SVD.scala | 59 +++++++++++++++++---
1 file changed, 52 insertions(+), 7 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/35adc727/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
index 465fc74..9703e84 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
@@ -24,6 +24,50 @@ import org.apache.spark.rdd.RDD
import org.jblas.{DoubleMatrix, Singular, MatrixFunctions}
+
+/**
+ * Class used to obtain singular value decompositions
+ * @param data Matrix in sparse matrix format
+ * @param m number of rows
+ * @param n number of columns
+ */
+class GradientDescent(var data: RDD[MatrixEntry], var m: Int, var n: Int) {
+ private var k: Int = 1
+
+ /**
+ * Set the number of top-k singular vectors to return
+ */
+ def setK(k: Int): this.type = {
+ this.k = k
+ this
+ }
+
+ /**
+ * Set matrix to be used for SVD
+ */
+ def setDatadata(data: RDD[MatrixEntry]): this.type = {
+ this.data = data
+ this
+ }
+
+ /**
+ * Set dimensions of matrix: rows
+ */
+ def setNumRows(m: Int): this.type = {
+ this.m = m
+ this
+ }
+
+ /**
+ * Set dimensions of matrix: columns
+ */
+ def setNumCols(n: Int): this.type = {
+ this.n = n
+ this
+ }
+}
+
+
/**
* Top-level methods for calling Singular Value Decomposition
* NOTE: All matrices are in 1-indexed sparse format RDD[((int, int), value)]
@@ -76,7 +120,7 @@ object SVD {
// Compute A^T A, assuming rows are sparse enough to fit in memory
val rows = data.map(entry =>
- (entry.i, (entry.j, entry.mval))).groupByKey().cache()
+ (entry.i, (entry.j, entry.mval))).groupByKey()
val emits = rows.flatMap{ case (rowind, cols) =>
cols.flatMap{ case (colind1, mval1) =>
cols.map{ case (colind2, mval2) =>
@@ -85,7 +129,7 @@ object SVD {
// Construct jblas A^T A locally
val ata = DoubleMatrix.zeros(n, n)
- for(entry <- emits.toArray) {
+ for (entry <- emits.toArray) {
ata.put(entry._1._1-1, entry._1._2-1, entry._2)
}
@@ -94,7 +138,7 @@ object SVD {
val V = svd(0)
val sigmas = MatrixFunctions.sqrt(svd(1)).toArray.filter(x => x > 1e-9)
- if(sigmas.size < k) {
+ if (sigmas.size < k) {
throw new Exception("Not enough singular values to return")
}
@@ -105,14 +149,15 @@ object SVD {
// prepare V for returning
val retV = sc.makeRDD(
Array.tabulate(V.rows, sigma.length){ (i,j) =>
- MatrixEntry(i+1, j+1, V.get(i,j)) }.flatten)
+ MatrixEntry(i + 1, j + 1, V.get(i,j)) }.flatten)
- val retS = sc.makeRDD(Array.tabulate(sigma.length){x=>MatrixEntry(x+1,x+1,sigma(x))})
+ val retS = sc.makeRDD(Array.tabulate(sigma.length){
+ x => MatrixEntry(x + 1,x + 1, sigma(x))})
// Compute U as U = A V S^-1
- // turn V S^-1 into an RDD as a sparse matrix and cache it
+ // turn V S^-1 into an RDD as a sparse matrix
val vsirdd = sc.makeRDD(Array.tabulate(V.rows, sigma.length)
- { (i,j) => ((i+1, j+1), V.get(i,j)/sigma(j)) }.flatten).cache()
+ { (i,j) => ((i + 1, j + 1), V.get(i,j)/sigma(j)) }.flatten)
// Multiply A by VS^-1
val aCols = data.map(entry => (entry.j, (entry.i, entry.mval)))
[20/50] git commit: fix error message
Posted by ma...@apache.org.
fix error message
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/e617ae2d
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/e617ae2d
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/e617ae2d
Branch: refs/heads/master
Commit: e617ae2dad20950e5358c15fa1290d52ca03a874
Parents: 6140578
Author: Reza Zadeh <ri...@gmail.com>
Authored: Thu Jan 2 01:51:38 2014 -0800
Committer: Reza Zadeh <ri...@gmail.com>
Committed: Thu Jan 2 01:51:38 2014 -0800
----------------------------------------------------------------------
mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/e617ae2d/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala
index 2c82c6b..2198e6a 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala
@@ -73,7 +73,7 @@ object SVD {
}
if (min_svalue < 1.0e-8) {
- throw new IllegalArgumentException("Minimum singular value requested must be greater than 1e-9")
+ throw new IllegalArgumentException("Minimum singular value requested is too small")
}
// Compute A^T A, assuming rows are sparse enough to fit in memory
[50/50] git commit: Merge pull request #315 from rezazadeh/sparsesvd
Posted by ma...@apache.org.
Merge pull request #315 from rezazadeh/sparsesvd
Sparse SVD
# Singular Value Decomposition
Given an *m x n* matrix *A*, compute matrices *U, S, V* such that
*A = U * S * V^T*
There is no restriction on m, but we require n^2 doubles to fit in memory.
Further, n should be less than m.
The decomposition is computed by first computing *A^TA = V S^2 V^T*,
computing svd locally on that (since n x n is small),
from which we recover S and V.
Then we compute U via easy matrix multiplication
as *U = A * V * S^-1*
Only singular vectors associated with the largest k singular values are recovered.
If there are k such values, then the dimensions of the return will be:
* *S* is *k x k* and diagonal, holding the singular values on diagonal.
* *U* is *m x k* and satisfies U^T*U = eye(k).
* *V* is *n x k* and satisfies V^TV = eye(k).
All input and output is expected in sparse matrix format, 1-indexed
as tuples of the form ((i,j),value) all in RDDs.
# Testing
Tests included. They test:
- Decomposition promise (A = USV^T)
- For small matrices, output is compared to that of jblas
- Rank 1 matrix test included
- Full Rank matrix test included
- Middle-rank matrix forced via k included
# Example Usage
import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.SVD
import org.apache.spark.mllib.linalg.SparseMatrix
import org.apache.spark.mllib.linalg.MatrixEntry
// Load and parse the data file
val data = sc.textFile("mllib/data/als/test.data").map { line =>
val parts = line.split(',')
MatrixEntry(parts(0).toInt, parts(1).toInt, parts(2).toDouble)
}
val m = 4
val n = 4
// recover top 1 singular vector
val decomposed = SVD.sparseSVD(SparseMatrix(data, m, n), 1)
println("singular values = " + decomposed.S.data.toArray.mkString)
# Documentation
Added to docs/mllib-guide.md
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/d009b17d
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/d009b17d
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/d009b17d
Branch: refs/heads/master
Commit: d009b17d137edf2f1b9da04254e55fb7455faa3d
Parents: 749f842 85b95d0
Author: Matei Zaharia <ma...@databricks.com>
Authored: Wed Jan 22 14:01:30 2014 -0800
Committer: Matei Zaharia <ma...@databricks.com>
Committed: Wed Jan 22 14:01:30 2014 -0800
----------------------------------------------------------------------
docs/mllib-guide.md | 51 +++++
.../apache/spark/examples/mllib/SparkSVD.scala | 59 ++++++
.../apache/spark/mllib/linalg/MatrixEntry.scala | 27 +++
.../apache/spark/mllib/linalg/MatrixSVD.scala | 29 +++
.../org/apache/spark/mllib/linalg/SVD.scala | 189 +++++++++++++++++++
.../spark/mllib/linalg/SparseMatrix.scala | 30 +++
.../apache/spark/mllib/linalg/SVDSuite.scala | 158 ++++++++++++++++
7 files changed, 543 insertions(+)
----------------------------------------------------------------------
[22/50] git commit: rename sparsesvd.scala
Posted by ma...@apache.org.
rename sparsesvd.scala
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/6bcdb762
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/6bcdb762
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/6bcdb762
Branch: refs/heads/master
Commit: 6bcdb762a107c82ef095553ab31284623475cb2c
Parents: b059a2a
Author: Reza Zadeh <ri...@gmail.com>
Authored: Fri Jan 3 21:55:38 2014 -0800
Committer: Reza Zadeh <ri...@gmail.com>
Committed: Fri Jan 3 21:55:38 2014 -0800
----------------------------------------------------------------------
.../org/apache/spark/mllib/linalg/SVD.scala | 153 +++++++++++++++++++
.../apache/spark/mllib/linalg/sparsesvd.scala | 153 -------------------
2 files changed, 153 insertions(+), 153 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/6bcdb762/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
new file mode 100644
index 0000000..2198e6a
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SVD.scala
@@ -0,0 +1,153 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.linalg
+
+import org.apache.spark.SparkContext
+import org.apache.spark.SparkContext._
+import org.apache.spark.rdd.RDD
+
+import org.jblas.{DoubleMatrix, Singular, MatrixFunctions}
+
+/**
+ * Top-level methods for calling Singular Value Decomposition
+ * NOTE: All matrices are in 1-indexed sparse format RDD[((int, int), value)]
+ */
+object SVD {
+/**
+ * Singular Value Decomposition for Tall and Skinny matrices.
+ * Given an m x n matrix A, this will compute matrices U, S, V such that
+ * A = U * S * V'
+ *
+ * There is no restriction on m, but we require n^2 doubles to fit in memory.
+ * Further, n should be less than m.
+ *
+ * The decomposition is computed by first computing A'A = V S^2 V',
+ * computing svd locally on that (since n x n is small),
+ * from which we recover S and V.
+ * Then we compute U via easy matrix multiplication
+ * as U = A * V * S^-1
+ *
+ * Only singular vectors associated with singular values
+ * greater or equal to MIN_SVALUE are recovered. If there are k
+ * such values, then the dimensions of the return will be:
+ *
+ * S is k x k and diagonal, holding the singular values on diagonal
+ * U is m x k and satisfies U'U = eye(k)
+ * V is n x k and satisfies V'V = eye(k)
+ *
+ * All input and output is expected in sparse matrix format, 1-indexed
+ * as tuples of the form ((i,j),value) all in RDDs
+ *
+ * @param data RDD Matrix in sparse 1-index format ((int, int), value)
+ * @param m number of rows
+ * @param n number of columns
+ * @param min_svalue Recover singular values greater or equal to min_svalue
+ * @return Three sparse matrices: U, S, V such that A = USV^T
+ */
+ def sparseSVD(
+ data: RDD[((Int, Int), Double)],
+ m: Int,
+ n: Int,
+ min_svalue: Double)
+ : ( RDD[((Int, Int), Double)],
+ RDD[((Int, Int), Double)],
+ RDD[((Int, Int), Double)]) =
+ {
+ if (m < n || m <= 0 || n <= 0) {
+ throw new IllegalArgumentException("Expecting a tall and skinny matrix")
+ }
+
+ if (min_svalue < 1.0e-8) {
+ throw new IllegalArgumentException("Minimum singular value requested is too small")
+ }
+
+ // Compute A^T A, assuming rows are sparse enough to fit in memory
+ val rows = data.map(entry =>
+ (entry._1._1, (entry._1._2, entry._2))).groupByKey().cache()
+ val emits = rows.flatMap{ case (rowind, cols) =>
+ cols.flatMap{ case (colind1, mval1) =>
+ cols.map{ case (colind2, mval2) =>
+ ((colind1, colind2), mval1*mval2) } }
+ }.reduceByKey(_+_)
+
+ // Construct jblas A^T A locally
+ val ata = DoubleMatrix.zeros(n, n)
+ for(entry <- emits.toArray) {
+ ata.put(entry._1._1-1, entry._1._2-1, entry._2)
+ }
+
+ // Since A^T A is small, we can compute its SVD directly
+ val svd = Singular.sparseSVD(ata)
+ val V = svd(0)
+ val sigma = MatrixFunctions.sqrt(svd(1)).toArray.filter(x => x >= min_svalue)
+
+ // threshold s values
+ if(sigma.isEmpty) {
+ throw new Exception("All singular values are smaller than min_svalue: " + min_svalue)
+ }
+
+ val sc = data.sparkContext
+
+ // prepare V for returning
+ val retV = sc.makeRDD(
+ Array.tabulate(V.rows, sigma.length){ (i,j) =>
+ ((i+1, j+1), V.get(i,j)) }.flatten)
+
+ val retS = sc.makeRDD(Array.tabulate(sigma.length){x=>((x+1,x+1),sigma(x))})
+
+ // Compute U as U = A V S^-1
+ // turn V S^-1 into an RDD as a sparse matrix and cache it
+ val vsirdd = sc.makeRDD(Array.tabulate(V.rows, sigma.length)
+ { (i,j) => ((i+1, j+1), V.get(i,j)/sigma(j)) }.flatten).cache()
+
+ // Multiply A by VS^-1
+ val aCols = data.map(entry => (entry._1._2, (entry._1._1, entry._2)))
+ val bRows = vsirdd.map(entry => (entry._1._1, (entry._1._2, entry._2)))
+ val retU = aCols.join(bRows).map( {case (key, ( (rowInd, rowVal), (colInd, colVal)) )
+ => ((rowInd, colInd), rowVal*colVal)}).reduceByKey(_+_)
+
+ (retU, retS, retV)
+ }
+
+
+ def main(args: Array[String]) {
+ if (args.length < 8) {
+ println("Usage: SVD <master> <matrix_file> <m> <n> <minimum_singular_value> <output_U_file> <output_S_file> <output_V_file>")
+ System.exit(1)
+ }
+ val (master, inputFile, m, n, min_svalue, output_u, output_s, output_v) =
+ (args(0), args(1), args(2).toInt, args(3).toInt, args(4).toDouble, args(5), args(6), args(7))
+
+ val sc = new SparkContext(master, "SVD")
+
+ val rawdata = sc.textFile(inputFile)
+ val data = rawdata.map { line =>
+ val parts = line.split(',')
+ ((parts(0).toInt, parts(1).toInt), parts(2).toDouble)
+ }
+
+ val (u, s, v) = SVD.sparseSVD(data, m, n, min_svalue)
+ println("Computed " + s.toArray.length + " singular values and vectors")
+ u.saveAsTextFile(output_u)
+ s.saveAsTextFile(output_s)
+ v.saveAsTextFile(output_v)
+ System.exit(0)
+ }
+}
+
+
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/6bcdb762/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala
deleted file mode 100644
index 2198e6a..0000000
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/sparsesvd.scala
+++ /dev/null
@@ -1,153 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.mllib.linalg
-
-import org.apache.spark.SparkContext
-import org.apache.spark.SparkContext._
-import org.apache.spark.rdd.RDD
-
-import org.jblas.{DoubleMatrix, Singular, MatrixFunctions}
-
-/**
- * Top-level methods for calling Singular Value Decomposition
- * NOTE: All matrices are in 1-indexed sparse format RDD[((int, int), value)]
- */
-object SVD {
-/**
- * Singular Value Decomposition for Tall and Skinny matrices.
- * Given an m x n matrix A, this will compute matrices U, S, V such that
- * A = U * S * V'
- *
- * There is no restriction on m, but we require n^2 doubles to fit in memory.
- * Further, n should be less than m.
- *
- * The decomposition is computed by first computing A'A = V S^2 V',
- * computing svd locally on that (since n x n is small),
- * from which we recover S and V.
- * Then we compute U via easy matrix multiplication
- * as U = A * V * S^-1
- *
- * Only singular vectors associated with singular values
- * greater or equal to MIN_SVALUE are recovered. If there are k
- * such values, then the dimensions of the return will be:
- *
- * S is k x k and diagonal, holding the singular values on diagonal
- * U is m x k and satisfies U'U = eye(k)
- * V is n x k and satisfies V'V = eye(k)
- *
- * All input and output is expected in sparse matrix format, 1-indexed
- * as tuples of the form ((i,j),value) all in RDDs
- *
- * @param data RDD Matrix in sparse 1-index format ((int, int), value)
- * @param m number of rows
- * @param n number of columns
- * @param min_svalue Recover singular values greater or equal to min_svalue
- * @return Three sparse matrices: U, S, V such that A = USV^T
- */
- def sparseSVD(
- data: RDD[((Int, Int), Double)],
- m: Int,
- n: Int,
- min_svalue: Double)
- : ( RDD[((Int, Int), Double)],
- RDD[((Int, Int), Double)],
- RDD[((Int, Int), Double)]) =
- {
- if (m < n || m <= 0 || n <= 0) {
- throw new IllegalArgumentException("Expecting a tall and skinny matrix")
- }
-
- if (min_svalue < 1.0e-8) {
- throw new IllegalArgumentException("Minimum singular value requested is too small")
- }
-
- // Compute A^T A, assuming rows are sparse enough to fit in memory
- val rows = data.map(entry =>
- (entry._1._1, (entry._1._2, entry._2))).groupByKey().cache()
- val emits = rows.flatMap{ case (rowind, cols) =>
- cols.flatMap{ case (colind1, mval1) =>
- cols.map{ case (colind2, mval2) =>
- ((colind1, colind2), mval1*mval2) } }
- }.reduceByKey(_+_)
-
- // Construct jblas A^T A locally
- val ata = DoubleMatrix.zeros(n, n)
- for(entry <- emits.toArray) {
- ata.put(entry._1._1-1, entry._1._2-1, entry._2)
- }
-
- // Since A^T A is small, we can compute its SVD directly
- val svd = Singular.sparseSVD(ata)
- val V = svd(0)
- val sigma = MatrixFunctions.sqrt(svd(1)).toArray.filter(x => x >= min_svalue)
-
- // threshold s values
- if(sigma.isEmpty) {
- throw new Exception("All singular values are smaller than min_svalue: " + min_svalue)
- }
-
- val sc = data.sparkContext
-
- // prepare V for returning
- val retV = sc.makeRDD(
- Array.tabulate(V.rows, sigma.length){ (i,j) =>
- ((i+1, j+1), V.get(i,j)) }.flatten)
-
- val retS = sc.makeRDD(Array.tabulate(sigma.length){x=>((x+1,x+1),sigma(x))})
-
- // Compute U as U = A V S^-1
- // turn V S^-1 into an RDD as a sparse matrix and cache it
- val vsirdd = sc.makeRDD(Array.tabulate(V.rows, sigma.length)
- { (i,j) => ((i+1, j+1), V.get(i,j)/sigma(j)) }.flatten).cache()
-
- // Multiply A by VS^-1
- val aCols = data.map(entry => (entry._1._2, (entry._1._1, entry._2)))
- val bRows = vsirdd.map(entry => (entry._1._1, (entry._1._2, entry._2)))
- val retU = aCols.join(bRows).map( {case (key, ( (rowInd, rowVal), (colInd, colVal)) )
- => ((rowInd, colInd), rowVal*colVal)}).reduceByKey(_+_)
-
- (retU, retS, retV)
- }
-
-
- def main(args: Array[String]) {
- if (args.length < 8) {
- println("Usage: SVD <master> <matrix_file> <m> <n> <minimum_singular_value> <output_U_file> <output_S_file> <output_V_file>")
- System.exit(1)
- }
- val (master, inputFile, m, n, min_svalue, output_u, output_s, output_v) =
- (args(0), args(1), args(2).toInt, args(3).toInt, args(4).toDouble, args(5), args(6), args(7))
-
- val sc = new SparkContext(master, "SVD")
-
- val rawdata = sc.textFile(inputFile)
- val data = rawdata.map { line =>
- val parts = line.split(',')
- ((parts(0).toInt, parts(1).toInt), parts(2).toDouble)
- }
-
- val (u, s, v) = SVD.sparseSVD(data, m, n, min_svalue)
- println("Computed " + s.toArray.length + " singular values and vectors")
- u.saveAsTextFile(output_u)
- s.saveAsTextFile(output_s)
- v.saveAsTextFile(output_v)
- System.exit(0)
- }
-}
-
-
[32/50] git commit: fix docs to use SparseMatrix
Posted by ma...@apache.org.
fix docs to use SparseMatrix
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/746148bc
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/746148bc
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/746148bc
Branch: refs/heads/master
Commit: 746148bc18d5e25ea93f5ff17a6cb4da9b671b75
Parents: 06c0f76
Author: Reza Zadeh <ri...@gmail.com>
Authored: Sun Jan 5 18:03:57 2014 -0800
Committer: Reza Zadeh <ri...@gmail.com>
Committed: Sun Jan 5 18:03:57 2014 -0800
----------------------------------------------------------------------
docs/mllib-guide.md | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/746148bc/docs/mllib-guide.md
----------------------------------------------------------------------
diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md
index abeb55d..653848b 100644
--- a/docs/mllib-guide.md
+++ b/docs/mllib-guide.md
@@ -243,18 +243,21 @@ as tuples of the form ((i,j),value) all in RDDs. Below is example usage.
import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.SVD
+import org.apache.spark.mllib.linalg.SparseMatrix
+import org.apache.spark.mllib.linalg.MatrixEntry
// Load and parse the data file
val data = sc.textFile("mllib/data/als/test.data").map { line =>
val parts = line.split(',')
- ((parts(0).toInt, parts(1).toInt), parts(2).toDouble)
+ MatrixEntry(parts(0).toInt, parts(1).toInt, parts(2).toDouble)
}
val m = 4
val n = 4
val k = 1
// recover largest singular vector
-val (u, s, v) = SVD.sparseSVD(data, m, n, 1)
+val decomposed = SVD.sparseSVD(SparseMatrix(data, m, n), k)
+val s = decomposed.S.data
println("singular values = " + s.toArray.mkString)