You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ap...@apache.org on 2015/04/02 22:46:23 UTC
mahout git commit: (nojira) set spark.executor.memory = 1g in spark-shell. fix -ma option in 20newsgroups shell script. a few other mostly cosmetic changes, version numbers, and shell script example. closes apache/mahout#95

Repository: mahout
Updated Branches:
  refs/heads/master 5e07c8646 -> 260753fdb


(nojira) set spark.executor.memory = 1g in spark-shell. fix -ma option in 20newsgroups shell script.  a few other mostly cosmetic changes, version numbers, and shell script example. closes apache/mahout#95


Project: http://git-wip-us.apache.org/repos/asf/mahout/repo
Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/260753fd
Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/260753fd
Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/260753fd

Branch: refs/heads/master
Commit: 260753fdb9cc2b17a3e4dbf373e79c3de4887654
Parents: 5e07c86
Author: Andrew Palumbo <ap...@apache.org>
Authored: Thu Apr 2 13:08:26 2015 -0400
Committer: Andrew Palumbo <ap...@apache.org>
Committed: Thu Apr 2 16:43:51 2015 -0400

----------------------------------------------------------------------
 examples/bin/classify-20newsgroups.sh           |  6 ++---
 examples/bin/spark-document-classifier.mscala   | 27 +++++++++++++-------
 .../sparkbindings/shell/MahoutSparkILoop.scala  |  2 ++
 .../mahout/drivers/ItemSimilarityDriver.scala   |  2 +-
 .../mahout/drivers/RowSimilarityDriver.scala    |  2 +-
 .../apache/mahout/drivers/TestNBDriver.scala    |  2 +-
 .../apache/mahout/drivers/TrainNBDriver.scala   |  2 +-
 7 files changed, 27 insertions(+), 16 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mahout/blob/260753fd/examples/bin/classify-20newsgroups.sh
----------------------------------------------------------------------
diff --git a/examples/bin/classify-20newsgroups.sh b/examples/bin/classify-20newsgroups.sh
index 7d44480..e92dc7d 100755
--- a/examples/bin/classify-20newsgroups.sh
+++ b/examples/bin/classify-20newsgroups.sh
@@ -154,19 +154,19 @@ if  ( [ "x$alg" == "xnaivebayes-MapReduce" ] ||  [ "x$alg" == "xcnaivebayes-MapR
       echo "Training Naive Bayes model"
       ./bin/mahout spark-trainnb \
         -i ${WORK_DIR}/20news-train-vectors \
-        -o ${WORK_DIR}/spark-model $c --ma $MASTER
+        -o ${WORK_DIR}/spark-model $c -ma $MASTER
 
       echo "Self testing on training set"
       ./bin/mahout spark-testnb \
         -i ${WORK_DIR}/20news-train-vectors\
         -o ${WORK_DIR}\
-        -m ${WORK_DIR}/spark-model $c --ma $MASTER
+        -m ${WORK_DIR}/spark-model $c -ma $MASTER
 
       echo "Testing on holdout set"
       ./bin/mahout spark-testnb \
         -i ${WORK_DIR}/20news-test-vectors\
         -o ${WORK_DIR}\
-        -m ${WORK_DIR}/spark-model $c --ma $MASTER
+        -m ${WORK_DIR}/spark-model $c -ma $MASTER
     fi
 elif [ "x$alg" == "xsgd-MapReduce" ]; then
   if [ ! -e "/tmp/news-group.model" ]; then

http://git-wip-us.apache.org/repos/asf/mahout/blob/260753fd/examples/bin/spark-document-classifier.mscala
----------------------------------------------------------------------
diff --git a/examples/bin/spark-document-classifier.mscala b/examples/bin/spark-document-classifier.mscala
index 9700253..62d1f55 100644
--- a/examples/bin/spark-document-classifier.mscala
+++ b/examples/bin/spark-document-classifier.mscala
@@ -25,7 +25,7 @@
  *    $MAHOUT_HOME/examples/bin/classify-wikipedia.sh --> option 2 
  *
  * then from the mahout spark-shell:
- *    :load $MAHOUT_HOME/examples/spark-document-classifier.mscala
+ *    :load {MAHOUT_HOME}/examples/bin/spark-document-classifier.mscala
 */
  
 import org.apache.mahout.classifier.naivebayes._
@@ -40,6 +40,8 @@ val pathToData = "/tmp/mahout-work-wiki/"
 
 // read in our full set as vectorized by seq2sparse in classify-wikipedia.sh
 val fullData = drmDfsRead(pathToData + "wikipediaVecs/tfidf-vectors")
+
+// uncomment if you want to train and test on the split "fullData" set and adjust below as necessary
 //val trainData = drmDfsRead(pathToData + "training")
 //val testData = drmDfsRead(pathToData + "testing")
 
@@ -65,7 +67,7 @@ val dictionaryMap = dictionaryRDD.collect.map(x => x._1.toString -> x._2.toInt).
 val dfCountMap = documentFrequencyCountRDD.collect.map(x => x._1.toInt -> x._2.toLong).toMap
 
 // for this simple example, tokenize our document into unigrams using native string methods and vectorize using
-// our dictionary and document frequencies.  You could also use a lucene analyzer for bigrams, trigrams, etc.   
+// our dictionary and document frequencies.  You could also use a lucene analyzer for bigrams, trigrams, etc.
 def vectorizeDocument(document: String,
 		      dictionaryMap: Map[String,Int],
 		      dfMap: Map[Int,Long]): Vector = {
@@ -115,13 +117,13 @@ def argmax(v: Vector): (Int, Double) = {
 }
   
 // our final classifier
-def classifyDocument(clvec: Vector) : String ={
+def classifyDocument(clvec: Vector) : String = {
   val cvec = classifier.classifyFull(clvec)
   val (bestIdx, bestScore) = argmax(cvec)
   reverseLabelMap(bestIdx)
 }   
 
-// A random United States footbal article
+// A random United States football article
 //http://www.reuters.com/article/2015/01/28/us-nfl-superbowl-security-idUSKBN0L12JR20150128
 val UStextToClassify = new String("(Reuters) - Super Bowl security officials acknowledge the NFL championship game represents" +
   " a high profile target on a world stage but are unaware of any specific credible threats against" + 
@@ -150,7 +152,7 @@ val UStextToClassify = new String("(Reuters) - Super Bowl security officials ack
   " planning process are going to have their best and brightest out there this weekend and we will have" +
   " a very safe Super Bowl.")
 
-// A random United Kingdom footbal article 
+// A random United Kingdom football article
 // http://www.reuters.com/article/2015/01/26/manchester-united-swissquote-idUSL6N0V52RZ20150126
 val UKtextToClassify = new String("(Reuters) - Manchester United have signed a sponsorship deal with online financial trading company" +
   " Swissquote, expanding the commercial partnerships that have helped to make the English club one of" +
@@ -180,16 +182,23 @@ val UKtextToClassify = new String("(Reuters) - Manchester United have signed a s
 val usVec = vectorizeDocument(UStextToClassify, dictionaryMap, dfCountMap)
 val ukVec = vectorizeDocument(UKtextToClassify, dictionaryMap, dfCountMap)
 
-println("Classifing the news article about the superbowl (united states)")
+println("Classifying the news article about superbowl security (united states)")
 classifyDocument(usVec)
 
-println("Classifing the news article about the Manchester United (united kingdom)")
+println("Classifying the news article about Manchester United (united kingdom)")
 classifyDocument(ukVec)
 
-// to classify new text, simply run this method on a string
-def classifyText(txt: String): String ={
+// to classify new text, tie everything together in a new method
+def classifyText(txt: String): String = {
   val v = vectorizeDocument(txt, dictionaryMap, dfCountMap)
   classifyDocument(v)
 }
+
+// now we can simply call our classifyText method on any string
+classifyText("Hello world from Queens")
+
+classifyText("Hello world from London")
+
+
   
 

http://git-wip-us.apache.org/repos/asf/mahout/blob/260753fd/spark-shell/src/main/scala/org/apache/mahout/sparkbindings/shell/MahoutSparkILoop.scala
----------------------------------------------------------------------
diff --git a/spark-shell/src/main/scala/org/apache/mahout/sparkbindings/shell/MahoutSparkILoop.scala b/spark-shell/src/main/scala/org/apache/mahout/sparkbindings/shell/MahoutSparkILoop.scala
index 7ef2b4c..5ffc18c 100644
--- a/spark-shell/src/main/scala/org/apache/mahout/sparkbindings/shell/MahoutSparkILoop.scala
+++ b/spark-shell/src/main/scala/org/apache/mahout/sparkbindings/shell/MahoutSparkILoop.scala
@@ -45,6 +45,8 @@ class MahoutSparkILoop extends SparkILoop {
       conf.set("spark.executor.uri", execUri)
     }
 
+    conf.set("spark.executor.memory", "1g")
+
     sparkContext = mahoutSparkContext(
       masterUrl = master,
       appName = "Mahout Spark Shell",

http://git-wip-us.apache.org/repos/asf/mahout/blob/260753fd/spark/src/main/scala/org/apache/mahout/drivers/ItemSimilarityDriver.scala
----------------------------------------------------------------------
diff --git a/spark/src/main/scala/org/apache/mahout/drivers/ItemSimilarityDriver.scala b/spark/src/main/scala/org/apache/mahout/drivers/ItemSimilarityDriver.scala
index 63da80f..34e8cf9 100644
--- a/spark/src/main/scala/org/apache/mahout/drivers/ItemSimilarityDriver.scala
+++ b/spark/src/main/scala/org/apache/mahout/drivers/ItemSimilarityDriver.scala
@@ -59,7 +59,7 @@ object ItemSimilarityDriver extends MahoutSparkDriver {
   override def main(args: Array[String]): Unit = {
 
     parser = new MahoutSparkOptionParser(programName = "spark-itemsimilarity") {
-      head("spark-itemsimilarity", "Mahout 1.0")
+      head("spark-itemsimilarity", "Mahout 0.10.0")
 
       //Input output options, non-driver specific
       parseIOOptions(numInputs = 2)

http://git-wip-us.apache.org/repos/asf/mahout/blob/260753fd/spark/src/main/scala/org/apache/mahout/drivers/RowSimilarityDriver.scala
----------------------------------------------------------------------
diff --git a/spark/src/main/scala/org/apache/mahout/drivers/RowSimilarityDriver.scala b/spark/src/main/scala/org/apache/mahout/drivers/RowSimilarityDriver.scala
index 3b47452..cfa8f99 100644
--- a/spark/src/main/scala/org/apache/mahout/drivers/RowSimilarityDriver.scala
+++ b/spark/src/main/scala/org/apache/mahout/drivers/RowSimilarityDriver.scala
@@ -54,7 +54,7 @@ object RowSimilarityDriver extends MahoutSparkDriver {
   override def main(args: Array[String]): Unit = {
 
     parser = new MahoutSparkOptionParser(programName = "spark-rowsimilarity") {
-      head("spark-rowsimilarity", "Mahout 1.0")
+      head("spark-rowsimilarity", "Mahout 0.10.0")
 
       //Input output options, non-driver specific
       parseIOOptions()

http://git-wip-us.apache.org/repos/asf/mahout/blob/260753fd/spark/src/main/scala/org/apache/mahout/drivers/TestNBDriver.scala
----------------------------------------------------------------------
diff --git a/spark/src/main/scala/org/apache/mahout/drivers/TestNBDriver.scala b/spark/src/main/scala/org/apache/mahout/drivers/TestNBDriver.scala
index 8531a0a..9e73094 100644
--- a/spark/src/main/scala/org/apache/mahout/drivers/TestNBDriver.scala
+++ b/spark/src/main/scala/org/apache/mahout/drivers/TestNBDriver.scala
@@ -35,7 +35,7 @@ object TestNBDriver extends MahoutSparkDriver {
   override def main(args: Array[String]): Unit = {
 
     parser = new MahoutSparkOptionParser(programName = "spark-testnb") {
-      head("spark-testnb", "Mahout 1.0")
+      head("spark-testnb", "Mahout 0.10.0")
 
       //Input output options, non-driver specific
       parseIOOptions(numInputs = 1)

http://git-wip-us.apache.org/repos/asf/mahout/blob/260753fd/spark/src/main/scala/org/apache/mahout/drivers/TrainNBDriver.scala
----------------------------------------------------------------------
diff --git a/spark/src/main/scala/org/apache/mahout/drivers/TrainNBDriver.scala b/spark/src/main/scala/org/apache/mahout/drivers/TrainNBDriver.scala
index 4f88c13..2edebca 100644
--- a/spark/src/main/scala/org/apache/mahout/drivers/TrainNBDriver.scala
+++ b/spark/src/main/scala/org/apache/mahout/drivers/TrainNBDriver.scala
@@ -35,7 +35,7 @@ object TrainNBDriver extends MahoutSparkDriver {
   override def main(args: Array[String]): Unit = {
 
     parser = new MahoutSparkOptionParser(programName = "spark-trainnb") {
-      head("spark-trainnb", "Mahout 1.0")
+      head("spark-trainnb", "Mahout 0.10.0")
 
       //Input output options, non-driver specific
       parseIOOptions(numInputs = 1)