You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by pw...@apache.org on 2014/01/14 08:00:11 UTC

[47/50] git commit: Fix all code examples in guide

Fix all code examples in guide


Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/af645be5
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/af645be5
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/af645be5

Branch: refs/heads/master
Commit: af645be5b8d41d5a0fd4a529956c5ab438198db4
Parents: 2cd9358
Author: Ankur Dave <an...@gmail.com>
Authored: Mon Jan 13 22:29:45 2014 -0800
Committer: Ankur Dave <an...@gmail.com>
Committed: Mon Jan 13 22:29:45 2014 -0800

----------------------------------------------------------------------
 docs/graphx-programming-guide.md | 46 +++++++++++++++++------------------
 graphx/data/users.txt            | 13 +++++-----
 2 files changed, 30 insertions(+), 29 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/af645be5/docs/graphx-programming-guide.md
----------------------------------------------------------------------
diff --git a/docs/graphx-programming-guide.md b/docs/graphx-programming-guide.md
index 226299a..a7ab003 100644
--- a/docs/graphx-programming-guide.md
+++ b/docs/graphx-programming-guide.md
@@ -357,7 +357,7 @@ val relationships: RDD[Edge[String]] =
 val defaultUser = ("John Doe", "Missing")
 // Build the initial Graph
 val graph = Graph(users, relationships, defaultUser)
-// Notice that there is a user 0 (for which we have no information) connecting users
+// Notice that there is a user 0 (for which we have no information) connected to users
 // 4 (peter) and 5 (franklin).
 graph.triplets.map(
     triplet => triplet.srcAttr._1 + " is the " + triplet.attr + " of " + triplet.dstAttr._1
@@ -858,11 +858,11 @@ val graph = GraphLoader.edgeListFile(sc, "graphx/data/followers.txt")
 val ranks = graph.pageRank(0.0001).vertices
 // Join the ranks with the usernames
 val users = sc.textFile("graphx/data/users.txt").map { line =>
-  val fields = line.split("\\s+")
+  val fields = line.split(",")
   (fields(0).toLong, fields(1))
 }
-val ranksByUsername = users.leftOuterJoin(ranks).map {
-  case (id, (username, rankOpt)) => (username, rankOpt.getOrElse(0.0))
+val ranksByUsername = users.join(ranks).map {
+  case (id, (username, rank)) => (username, rank)
 }
 // Print the result
 println(ranksByUsername.collect().mkString("\n"))
@@ -881,11 +881,11 @@ val graph = GraphLoader.edgeListFile(sc, "graphx/data/followers.txt")
 val cc = graph.connectedComponents().vertices
 // Join the connected components with the usernames
 val users = sc.textFile("graphx/data/users.txt").map { line =>
-  val fields = line.split("\\s+")
+  val fields = line.split(",")
   (fields(0).toLong, fields(1))
 }
-val ccByUsername = users.join(cc).map { case (id, (username, cc)) =>
-  (username, cc)
+val ccByUsername = users.join(cc).map {
+  case (id, (username, cc)) => (username, cc)
 }
 // Print the result
 println(ccByUsername.collect().mkString("\n"))
@@ -900,12 +900,12 @@ A vertex is part of a triangle when it has two adjacent vertices with an edge be
 
 {% highlight scala %}
 // Load the edges in canonical order and partition the graph for triangle count
-val graph = GraphLoader.edgeListFile(sc, "graphx/data/followers.txt", true).partitionBy(RandomVertexCut)
+val graph = GraphLoader.edgeListFile(sc, "graphx/data/followers.txt", true).partitionBy(PartitionStrategy.RandomVertexCut)
 // Find the triangle count for each vertex
 val triCounts = graph.triangleCount().vertices
 // Join the triangle counts with the usernames
 val users = sc.textFile("graphx/data/users.txt").map { line =>
-  val fields = line.split("\\s+")
+  val fields = line.split(",")
   (fields(0).toLong, fields(1))
 }
 val triCountByUsername = users.join(triCounts).map { case (id, (username, tc)) =>
@@ -934,32 +934,32 @@ all of this in just a few lines with GraphX:
 // Connect to the Spark cluster
 val sc = new SparkContext("spark://master.amplab.org", "research")
 
-// Load my user data and prase into tuples of user id and attribute list
-val users = sc.textFile("hdfs://user_attributes.tsv")
-  .map(line => line.split).map( parts => (parts.head, parts.tail) )
+// Load my user data and parse into tuples of user id and attribute list
+val users = (sc.textFile("graphx/data/users.txt")
+  .map(line => line.split(",")).map( parts => (parts.head.toLong, parts.tail) ))
 
 // Parse the edge data which is already in userId -> userId format
-val followerGraph = Graph.textFile(sc, "hdfs://followers.tsv")
+val followerGraph = GraphLoader.edgeListFile(sc, "graphx/data/followers.txt")
 
 // Attach the user attributes
-val graph = followerGraph.outerJoinVertices(users){
+val graph = followerGraph.outerJoinVertices(users) {
   case (uid, deg, Some(attrList)) => attrList
   // Some users may not have attributes so we set them as empty
   case (uid, deg, None) => Array.empty[String]
-  }
+}
 
-// Restrict the graph to users which have exactly two attributes
-val subgraph = graph.subgraph((vid, attr) => attr.size == 2)
+// Restrict the graph to users with usernames and names
+val subgraph = graph.subgraph(vpred = (vid, attr) => attr.size == 2)
 
 // Compute the PageRank
-val pagerankGraph = Analytics.pagerank(subgraph)
+val pagerankGraph = subgraph.pageRank(0.001)
 
 // Get the attributes of the top pagerank users
-val userInfoWithPageRank = subgraph.outerJoinVertices(pagerankGraph.vertices){
-  case (uid, attrList, Some(pr)) => (pr, attrList)
-  case (uid, attrList, None) => (pr, attrList)
-  }
+val userInfoWithPageRank = subgraph.outerJoinVertices(pagerankGraph.vertices) {
+  case (uid, attrList, Some(pr)) => (pr, attrList.toList)
+  case (uid, attrList, None) => (0.0, attrList.toList)
+}
 
-println(userInfoWithPageRank.top(5))
+println(userInfoWithPageRank.vertices.top(5)(Ordering.by(_._2._1)).mkString("\n"))
 
 {% endhighlight %}

http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/af645be5/graphx/data/users.txt
----------------------------------------------------------------------
diff --git a/graphx/data/users.txt b/graphx/data/users.txt
index 26e3b3b..982d19d 100644
--- a/graphx/data/users.txt
+++ b/graphx/data/users.txt
@@ -1,6 +1,7 @@
-1 BarackObama
-2 ladygaga
-3 jeresig
-4 justinbieber
-6 matei_zaharia
-7 odersky
+1,BarackObama,Barack Obama
+2,ladygaga,Goddess of Love
+3,jeresig,John Resig
+4,justinbieber,Justin Bieber
+6,matei_zaharia,Matei Zaharia
+7,odersky,Martin Odersky
+8,anonsys