You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by ma...@apache.org on 2014/01/22 23:02:22 UTC
[39/50] git commit: Merge remote-tracking branch 'upstream/master'
into sparsesvd
Merge remote-tracking branch 'upstream/master' into sparsesvd
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/845e568f
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/845e568f
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/845e568f
Branch: refs/heads/master
Commit: 845e568fada0550e632e7381748c5a9ebbe53e16
Parents: f324d53 fdaabdc
Author: Reza Zadeh <ri...@gmail.com>
Authored: Mon Jan 13 23:52:34 2014 -0800
Committer: Reza Zadeh <ri...@gmail.com>
Committed: Mon Jan 13 23:52:34 2014 -0800
----------------------------------------------------------------------
.../org/apache/spark/bagel/BagelSuite.scala | 1 -
bin/compute-classpath.sh | 2 +
.../scala/org/apache/spark/Accumulators.scala | 6 +-
.../scala/org/apache/spark/Aggregator.scala | 14 +-
.../scala/org/apache/spark/CacheManager.scala | 4 +-
.../scala/org/apache/spark/FutureAction.scala | 8 +-
.../scala/org/apache/spark/HttpFileServer.scala | 6 +-
.../apache/spark/InterruptibleIterator.scala | 2 +-
.../main/scala/org/apache/spark/Logging.scala | 4 +-
.../org/apache/spark/MapOutputTracker.scala | 12 +-
.../scala/org/apache/spark/Partitioner.scala | 4 +-
.../scala/org/apache/spark/SparkContext.scala | 75 +-
.../main/scala/org/apache/spark/SparkEnv.scala | 10 -
.../org/apache/spark/SparkHadoopWriter.scala | 15 +-
.../org/apache/spark/api/python/PythonRDD.scala | 3 +-
.../org/apache/spark/broadcast/Broadcast.scala | 1 +
.../spark/broadcast/BroadcastFactory.scala | 2 +-
.../apache/spark/broadcast/HttpBroadcast.scala | 5 +-
.../spark/broadcast/TorrentBroadcast.scala | 12 +-
.../scala/org/apache/spark/deploy/Client.scala | 3 +-
.../spark/deploy/worker/CommandUtils.scala | 3 +-
.../executor/CoarseGrainedExecutorBackend.scala | 1 -
.../org/apache/spark/executor/Executor.scala | 4 +-
.../org/apache/spark/executor/TaskMetrics.scala | 10 +
.../apache/spark/network/BufferMessage.scala | 2 +-
.../org/apache/spark/network/Connection.scala | 6 +-
.../org/apache/spark/network/Message.scala | 6 +-
.../spark/network/netty/ShuffleSender.scala | 2 +-
.../main/scala/org/apache/spark/package.scala | 3 +
.../org/apache/spark/rdd/CoGroupedRDD.scala | 3 +
.../org/apache/spark/rdd/CoalescedRDD.scala | 10 +-
.../scala/org/apache/spark/rdd/HadoopRDD.scala | 32 +-
.../org/apache/spark/rdd/NewHadoopRDD.scala | 24 +-
.../org/apache/spark/rdd/PairRDDFunctions.scala | 12 +-
.../scala/org/apache/spark/rdd/PipedRDD.scala | 5 +-
.../main/scala/org/apache/spark/rdd/RDD.scala | 9 +-
.../apache/spark/scheduler/DAGScheduler.scala | 4 +-
.../spark/scheduler/InputFormatInfo.scala | 8 +-
.../scala/org/apache/spark/scheduler/Pool.scala | 8 +-
.../spark/scheduler/SchedulingAlgorithm.scala | 11 +-
.../apache/spark/scheduler/SparkListener.scala | 16 +-
.../spark/scheduler/SparkListenerBus.scala | 17 +-
.../org/apache/spark/scheduler/Stage.scala | 2 +-
.../org/apache/spark/scheduler/TaskResult.scala | 2 +-
.../spark/scheduler/TaskResultGetter.scala | 2 +-
.../spark/scheduler/TaskSchedulerImpl.scala | 5 +-
.../apache/spark/scheduler/TaskSetManager.scala | 14 +-
.../cluster/CoarseGrainedSchedulerBackend.scala | 2 +-
.../mesos/CoarseMesosSchedulerBackend.scala | 2 +-
.../cluster/mesos/MesosSchedulerBackend.scala | 6 +-
.../org/apache/spark/storage/BlockManager.scala | 7 +-
.../spark/storage/BlockManagerWorker.scala | 20 +-
.../org/apache/spark/storage/BlockMessage.scala | 2 +-
.../spark/storage/BlockMessageArray.scala | 2 +-
.../spark/storage/BlockObjectWriter.scala | 4 +-
.../org/apache/spark/storage/MemoryStore.scala | 2 +-
.../spark/storage/ShuffleBlockManager.scala | 2 +-
.../org/apache/spark/storage/StorageLevel.scala | 6 +-
.../apache/spark/ui/jobs/ExecutorSummary.scala | 2 +
.../apache/spark/ui/jobs/ExecutorTable.scala | 4 +
.../spark/ui/jobs/JobProgressListener.scala | 14 +
.../org/apache/spark/ui/jobs/StagePage.scala | 53 +-
.../org/apache/spark/ui/jobs/StageTable.scala | 2 +-
.../org/apache/spark/util/ClosureCleaner.scala | 10 +-
.../apache/spark/util/CompletionIterator.scala | 11 +-
.../org/apache/spark/util/MetadataCleaner.scala | 8 +-
.../spark/util/RateLimitedOutputStream.scala | 79 --
.../org/apache/spark/util/SizeEstimator.scala | 10 +-
.../scala/org/apache/spark/util/Utils.scala | 36 +-
.../scala/org/apache/spark/util/Vector.scala | 12 +-
.../spark/util/collection/AppendOnlyMap.scala | 2 +-
.../apache/spark/util/collection/BitSet.scala | 87 +-
.../util/collection/ExternalAppendOnlyMap.scala | 72 +-
.../spark/util/collection/OpenHashSet.scala | 23 +-
.../org/apache/spark/LocalSparkContext.scala | 1 -
.../apache/spark/MapOutputTrackerSuite.scala | 1 -
.../spark/scheduler/ClusterSchedulerSuite.scala | 9 +-
.../spark/scheduler/DAGSchedulerSuite.scala | 3 +-
.../apache/spark/scheduler/JobLoggerSuite.scala | 7 +-
.../spark/storage/BlockManagerSuite.scala | 4 -
.../apache/spark/util/ClosureCleanerSuite.scala | 14 +-
.../util/RateLimitedOutputStreamSuite.scala | 40 -
.../collection/ExternalAppendOnlyMapSuite.scala | 77 +-
docs/_config.yml | 2 +-
docs/_layouts/global.html | 8 +-
docs/_plugins/copy_api_dirs.rb | 2 +-
docs/api.md | 1 +
docs/bagel-programming-guide.md | 10 +-
docs/configuration.md | 13 +-
docs/graphx-programming-guide.md | 1003 ++++++++++++++++++
docs/img/data_parallel_vs_graph_parallel.png | Bin 0 -> 432725 bytes
docs/img/edge-cut.png | Bin 0 -> 12563 bytes
docs/img/edge_cut_vs_vertex_cut.png | Bin 0 -> 79745 bytes
docs/img/graph_analytics_pipeline.png | Bin 0 -> 427220 bytes
docs/img/graph_parallel.png | Bin 0 -> 92288 bytes
docs/img/graphx_figures.pptx | Bin 0 -> 1123363 bytes
docs/img/graphx_logo.png | Bin 0 -> 40324 bytes
docs/img/graphx_performance_comparison.png | Bin 0 -> 166343 bytes
docs/img/property_graph.png | Bin 0 -> 225151 bytes
docs/img/tables_and_graphs.png | Bin 0 -> 166265 bytes
docs/img/triplet.png | Bin 0 -> 31489 bytes
docs/img/vertex-cut.png | Bin 0 -> 12246 bytes
docs/img/vertex_routing_edge_tables.png | Bin 0 -> 570007 bytes
docs/index.md | 4 +-
docs/mllib-guide.md | 19 +-
docs/python-programming-guide.md | 8 +-
docs/streaming-programming-guide.md | 6 +-
.../org/apache/spark/examples/LocalALS.scala | 8 +-
.../org/apache/spark/examples/LocalFileLR.scala | 2 +-
.../org/apache/spark/examples/LocalKMeans.scala | 2 +-
.../org/apache/spark/examples/SparkALS.scala | 6 +-
.../org/apache/spark/examples/SparkHdfsLR.scala | 2 +-
.../org/apache/spark/examples/SparkKMeans.scala | 12 +-
.../examples/graphx/LiveJournalPageRank.scala | 49 +
.../streaming/examples/RawNetworkGrep.scala | 2 +-
.../examples/RecoverableNetworkWordCount.scala | 2 +-
.../streaming/examples/TwitterAlgebirdCMS.scala | 4 +-
.../streaming/examples/TwitterAlgebirdHLL.scala | 4 +-
.../streaming/examples/TwitterPopularTags.scala | 4 +-
.../clickstream/PageViewGenerator.scala | 2 +-
.../examples/clickstream/PageViewStream.scala | 2 +-
.../spark/streaming/flume/FlumeUtils.scala | 4 +-
.../spark/streaming/kafka/KafkaUtils.scala | 6 +-
.../apache/spark/streaming/mqtt/MQTTUtils.scala | 4 +-
.../spark/streaming/mqtt/MQTTStreamSuite.scala | 2 +-
.../spark/streaming/twitter/TwitterUtils.scala | 7 +-
.../streaming/twitter/TwitterStreamSuite.scala | 2 +-
.../spark/streaming/zeromq/ZeroMQUtils.scala | 3 +-
graphx/data/followers.txt | 8 +
graphx/data/users.txt | 7 +
graphx/pom.xml | 67 ++
.../scala/org/apache/spark/graphx/Edge.scala | 45 +
.../org/apache/spark/graphx/EdgeDirection.scala | 44 +
.../scala/org/apache/spark/graphx/EdgeRDD.scala | 102 ++
.../org/apache/spark/graphx/EdgeTriplet.scala | 49 +
.../scala/org/apache/spark/graphx/Graph.scala | 405 +++++++
.../spark/graphx/GraphKryoRegistrator.scala | 31 +
.../org/apache/spark/graphx/GraphLoader.scala | 72 ++
.../org/apache/spark/graphx/GraphOps.scala | 301 ++++++
.../apache/spark/graphx/PartitionStrategy.scala | 103 ++
.../scala/org/apache/spark/graphx/Pregel.scala | 139 +++
.../org/apache/spark/graphx/VertexRDD.scala | 347 ++++++
.../spark/graphx/impl/EdgePartition.scala | 220 ++++
.../graphx/impl/EdgePartitionBuilder.scala | 45 +
.../spark/graphx/impl/EdgeTripletIterator.scala | 42 +
.../apache/spark/graphx/impl/GraphImpl.scala | 379 +++++++
.../spark/graphx/impl/MessageToPartition.scala | 98 ++
.../graphx/impl/ReplicatedVertexView.scala | 195 ++++
.../apache/spark/graphx/impl/RoutingTable.scala | 65 ++
.../apache/spark/graphx/impl/Serializers.scala | 395 +++++++
.../spark/graphx/impl/VertexPartition.scala | 261 +++++
.../org/apache/spark/graphx/impl/package.scala | 7 +
.../org/apache/spark/graphx/lib/Analytics.scala | 136 +++
.../spark/graphx/lib/ConnectedComponents.scala | 38 +
.../org/apache/spark/graphx/lib/PageRank.scala | 147 +++
.../apache/spark/graphx/lib/SVDPlusPlus.scala | 138 +++
.../lib/StronglyConnectedComponents.scala | 94 ++
.../apache/spark/graphx/lib/TriangleCount.scala | 76 ++
.../scala/org/apache/spark/graphx/package.scala | 18 +
.../spark/graphx/util/BytecodeUtils.scala | 117 ++
.../spark/graphx/util/GraphGenerators.scala | 218 ++++
.../collection/PrimitiveKeyOpenHashMap.scala | 153 +++
graphx/src/test/resources/log4j.properties | 28 +
.../org/apache/spark/graphx/GraphOpsSuite.scala | 66 ++
.../org/apache/spark/graphx/GraphSuite.scala | 273 +++++
.../apache/spark/graphx/LocalSparkContext.scala | 28 +
.../org/apache/spark/graphx/PregelSuite.scala | 41 +
.../apache/spark/graphx/SerializerSuite.scala | 183 ++++
.../apache/spark/graphx/VertexRDDSuite.scala | 85 ++
.../spark/graphx/impl/EdgePartitionSuite.scala | 76 ++
.../graphx/impl/VertexPartitionSuite.scala | 113 ++
.../graphx/lib/ConnectedComponentsSuite.scala | 113 ++
.../apache/spark/graphx/lib/PageRankSuite.scala | 119 +++
.../spark/graphx/lib/SVDPlusPlusSuite.scala | 31 +
.../lib/StronglyConnectedComponentsSuite.scala | 57 +
.../spark/graphx/lib/TriangleCountSuite.scala | 70 ++
.../spark/graphx/util/BytecodeUtilsSuite.scala | 93 ++
mllib/data/sample_naive_bayes_data.txt | 6 +
.../spark/mllib/api/python/PythonMLLibAPI.scala | 46 +-
.../classification/LogisticRegression.scala | 4 +-
.../spark/mllib/classification/NaiveBayes.scala | 65 +-
.../apache/spark/mllib/classification/SVM.scala | 2 +
.../apache/spark/mllib/recommendation/ALS.scala | 15 +-
.../spark/mllib/regression/LabeledPoint.scala | 6 +-
.../apache/spark/mllib/regression/Lasso.scala | 4 +-
.../mllib/regression/LinearRegression.scala | 2 +
.../mllib/regression/RidgeRegression.scala | 4 +-
.../classification/JavaNaiveBayesSuite.java | 72 ++
pom.xml | 5 +-
project/SparkBuild.scala | 21 +-
python/pyspark/mllib/_common.py | 2 +-
python/pyspark/mllib/classification.py | 77 +-
python/pyspark/mllib/clustering.py | 11 +-
python/pyspark/mllib/recommendation.py | 10 +-
python/pyspark/mllib/regression.py | 35 +-
python/pyspark/worker.py | 4 +
python/run-tests | 5 +
.../scala/org/apache/spark/repl/ReplSuite.scala | 2 -
.../org/apache/spark/streaming/Checkpoint.scala | 6 +-
.../apache/spark/streaming/ContextWaiter.scala | 28 +
.../org/apache/spark/streaming/DStream.scala | 741 -------------
.../spark/streaming/DStreamCheckpointData.scala | 128 ---
.../apache/spark/streaming/DStreamGraph.scala | 12 +-
.../spark/streaming/PairDStreamFunctions.scala | 621 -----------
.../spark/streaming/StreamingContext.scala | 117 +-
.../spark/streaming/api/java/JavaDStream.scala | 3 +-
.../streaming/api/java/JavaDStreamLike.scala | 30 +-
.../streaming/api/java/JavaPairDStream.scala | 1 +
.../api/java/JavaStreamingContext.scala | 28 +-
.../spark/streaming/dstream/DStream.scala | 775 ++++++++++++++
.../dstream/DStreamCheckpointData.scala | 126 +++
.../streaming/dstream/FileInputDStream.scala | 86 +-
.../streaming/dstream/FilteredDStream.scala | 2 +-
.../dstream/FlatMapValuedDStream.scala | 2 +-
.../streaming/dstream/FlatMappedDStream.scala | 2 +-
.../streaming/dstream/ForEachDStream.scala | 2 +-
.../streaming/dstream/GlommedDStream.scala | 2 +-
.../spark/streaming/dstream/InputDStream.scala | 4 +-
.../dstream/MapPartitionedDStream.scala | 2 +-
.../streaming/dstream/MapValuedDStream.scala | 2 +-
.../spark/streaming/dstream/MappedDStream.scala | 2 +-
.../streaming/dstream/NetworkInputDStream.scala | 2 +-
.../dstream/PairDStreamFunctions.scala | 622 +++++++++++
.../dstream/ReducedWindowedDStream.scala | 2 +-
.../streaming/dstream/ShuffledDStream.scala | 2 +-
.../spark/streaming/dstream/StateDStream.scala | 10 +-
.../streaming/dstream/TransformedDStream.scala | 2 +-
.../spark/streaming/dstream/UnionDStream.scala | 3 +-
.../streaming/dstream/WindowedDStream.scala | 17 +-
.../apache/spark/streaming/scheduler/Job.scala | 9 +-
.../streaming/scheduler/JobGenerator.scala | 81 +-
.../streaming/scheduler/JobScheduler.scala | 141 ++-
.../spark/streaming/scheduler/JobSet.scala | 18 +-
.../scheduler/NetworkInputTracker.scala | 40 +-
.../streaming/scheduler/StreamingListener.scala | 3 +-
.../scheduler/StreamingListenerBus.scala | 23 +-
.../org/apache/spark/streaming/util/Clock.scala | 4 +-
.../streaming/util/MasterFailureTest.scala | 4 +-
.../util/RateLimitedOutputStream.scala | 79 ++
.../spark/streaming/util/RawTextHelper.scala | 2 +-
.../spark/streaming/util/RawTextSender.scala | 13 +-
.../spark/streaming/util/RecurringTimer.scala | 13 +-
.../streaming/LocalJavaStreamingContext.java | 2 -
.../spark/streaming/BasicOperationsSuite.scala | 79 +-
.../spark/streaming/CheckpointSuite.scala | 25 +-
.../spark/streaming/StreamingContextSuite.scala | 219 ++++
.../streaming/StreamingListenerSuite.scala | 1 +
.../apache/spark/streaming/TestSuiteBase.scala | 10 +-
.../spark/streaming/WindowOperationsSuite.scala | 15 +
.../util/RateLimitedOutputStreamSuite.scala | 40 +
.../tools/JavaAPICompletenessChecker.scala | 50 +-
.../org/apache/spark/deploy/yarn/Client.scala | 7 +-
.../spark/deploy/yarn/WorkerLauncher.scala | 8 +-
.../spark/deploy/yarn/WorkerRunnable.scala | 7 +-
.../yarn/ClientDistributedCacheManager.scala | 10 +-
.../ClientDistributedCacheManagerSuite.scala | 2 +-
.../org/apache/spark/deploy/yarn/Client.scala | 3 +-
257 files changed, 10443 insertions(+), 2434 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/845e568f/docs/mllib-guide.md
----------------------------------------------------------------------
diff --cc docs/mllib-guide.md
index 21d0464,1a5c640..a140ecb
--- a/docs/mllib-guide.md
+++ b/docs/mllib-guide.md
@@@ -426,55 -435,5 +435,55 @@@ signals), you can use the trainImplici
{% highlight python %}
# Build the recommendation model using Alternating Least Squares based on implicit ratings
- model = ALS.trainImplicit(sc, ratings, 1, 20)
+ model = ALS.trainImplicit(ratings, 1, 20)
{% endhighlight %}
+
+
+# Singular Value Decomposition
+Singular Value Decomposition for Tall and Skinny matrices.
+Given an *m x n* matrix *A*, we can compute matrices *U, S, V* such that
+
+*A = U S V^T*
+
+There is no restriction on m, but we require n^2 doubles to fit in memory.
+Further, n should be less than m.
+
+The decomposition is computed by first computing *A^TA = V S^2 V^T*,
+computing svd locally on that (since n x n is small),
+from which we recover S and V.
+Then we compute U via a simple matrix multiplication
+as *U = A V S^-1*.
+
+Only the singular vectors associated with the k largest singular values
+are recovered. If there are k
+such values, then the dimensions of the returned matrices will be:
+
+* *S* is *k x k* and diagonal, holding the singular values on diagonal.
+* *U* is *m x k* and satisfies U^TU = eye(k).
+* *V* is *n x k* and satisfies V^TV = eye(k).
+
+All input and output is expected in sparse matrix format, 1-indexed
+as tuples of the form ((i,j),value) all in
+SparseMatrix RDDs. Below is example usage.
+
+{% highlight scala %}
+
+import org.apache.spark.SparkContext
+import org.apache.spark.mllib.linalg.SVD
+import org.apache.spark.mllib.linalg.SparseMatrix
+import org.apache.spark.mllib.linalg.MatrixEntry
+
+// Load and parse the data file
+val data = sc.textFile("mllib/data/als/test.data").map { line =>
+ val parts = line.split(',')
+ MatrixEntry(parts(0).toInt, parts(1).toInt, parts(2).toDouble)
+}
+val m = 4
+val n = 4
+val k = 1
+
+// recover largest singular vector
+val decomposed = SVD.sparseSVD(SparseMatrix(data, m, n), k)
+val s = decomposed.S.data
+
+println("singular values = " + s.toArray.mkString)