You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ap...@apache.org on 2015/04/02 22:46:23 UTC
mahout git commit: (nojira) set spark.executor.memory = 1g in spark-shell. fix -ma option in 20newsgroups shell script. a few other mostly cosmetic changes, version numbers, and shell script example. closes apache/mahout#95
Repository: mahout
Updated Branches:
refs/heads/master 5e07c8646 -> 260753fdb
(nojira) set spark.executor.memory = 1g in spark-shell. fix -ma option in 20newsgroups shell script. a few other mostly cosmetic changes, version numbers, and shell script example. closes apache/mahout#95
Project: http://git-wip-us.apache.org/repos/asf/mahout/repo
Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/260753fd
Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/260753fd
Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/260753fd
Branch: refs/heads/master
Commit: 260753fdb9cc2b17a3e4dbf373e79c3de4887654
Parents: 5e07c86
Author: Andrew Palumbo <ap...@apache.org>
Authored: Thu Apr 2 13:08:26 2015 -0400
Committer: Andrew Palumbo <ap...@apache.org>
Committed: Thu Apr 2 16:43:51 2015 -0400
----------------------------------------------------------------------
examples/bin/classify-20newsgroups.sh | 6 ++---
examples/bin/spark-document-classifier.mscala | 27 +++++++++++++-------
.../sparkbindings/shell/MahoutSparkILoop.scala | 2 ++
.../mahout/drivers/ItemSimilarityDriver.scala | 2 +-
.../mahout/drivers/RowSimilarityDriver.scala | 2 +-
.../apache/mahout/drivers/TestNBDriver.scala | 2 +-
.../apache/mahout/drivers/TrainNBDriver.scala | 2 +-
7 files changed, 27 insertions(+), 16 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mahout/blob/260753fd/examples/bin/classify-20newsgroups.sh
----------------------------------------------------------------------
diff --git a/examples/bin/classify-20newsgroups.sh b/examples/bin/classify-20newsgroups.sh
index 7d44480..e92dc7d 100755
--- a/examples/bin/classify-20newsgroups.sh
+++ b/examples/bin/classify-20newsgroups.sh
@@ -154,19 +154,19 @@ if ( [ "x$alg" == "xnaivebayes-MapReduce" ] || [ "x$alg" == "xcnaivebayes-MapR
echo "Training Naive Bayes model"
./bin/mahout spark-trainnb \
-i ${WORK_DIR}/20news-train-vectors \
- -o ${WORK_DIR}/spark-model $c --ma $MASTER
+ -o ${WORK_DIR}/spark-model $c -ma $MASTER
echo "Self testing on training set"
./bin/mahout spark-testnb \
-i ${WORK_DIR}/20news-train-vectors\
-o ${WORK_DIR}\
- -m ${WORK_DIR}/spark-model $c --ma $MASTER
+ -m ${WORK_DIR}/spark-model $c -ma $MASTER
echo "Testing on holdout set"
./bin/mahout spark-testnb \
-i ${WORK_DIR}/20news-test-vectors\
-o ${WORK_DIR}\
- -m ${WORK_DIR}/spark-model $c --ma $MASTER
+ -m ${WORK_DIR}/spark-model $c -ma $MASTER
fi
elif [ "x$alg" == "xsgd-MapReduce" ]; then
if [ ! -e "/tmp/news-group.model" ]; then
http://git-wip-us.apache.org/repos/asf/mahout/blob/260753fd/examples/bin/spark-document-classifier.mscala
----------------------------------------------------------------------
diff --git a/examples/bin/spark-document-classifier.mscala b/examples/bin/spark-document-classifier.mscala
index 9700253..62d1f55 100644
--- a/examples/bin/spark-document-classifier.mscala
+++ b/examples/bin/spark-document-classifier.mscala
@@ -25,7 +25,7 @@
* $MAHOUT_HOME/examples/bin/classify-wikipedia.sh --> option 2
*
* then from the mahout spark-shell:
- * :load $MAHOUT_HOME/examples/spark-document-classifier.mscala
+ * :load {MAHOUT_HOME}/examples/spark-document-classifier.mscala
*/
import org.apache.mahout.classifier.naivebayes._
@@ -40,6 +40,8 @@ val pathToData = "/tmp/mahout-work-wiki/"
// read in our full set as vectorized by seq2sparse in classify-wikipedia.sh
val fullData = drmDfsRead(pathToData + "wikipediaVecs/tfidf-vectors")
+
+// uncomment if you want to train and test on the split "fullData" set and adjust below as necessary
//val trainData = drmDfsRead(pathToData + "training")
//val testData = drmDfsRead(pathToData + "testing")
@@ -65,7 +67,7 @@ val dictionaryMap = dictionaryRDD.collect.map(x => x._1.toString -> x._2.toInt).
val dfCountMap = documentFrequencyCountRDD.collect.map(x => x._1.toInt -> x._2.toLong).toMap
// for this simple example, tokenize our document into unigrams using native string methods and vectorize using
-// our dictionary and document frequencies. You could also use a lucene analyzer for bigrams, trigrams, etc.
+// our dictionary and document frequencies. You could also use a lucene analyzer for bigrams, trigrams, etc.
def vectorizeDocument(document: String,
dictionaryMap: Map[String,Int],
dfMap: Map[Int,Long]): Vector = {
@@ -115,13 +117,13 @@ def argmax(v: Vector): (Int, Double) = {
}
// our final classifier
-def classifyDocument(clvec: Vector) : String ={
+def classifyDocument(clvec: Vector) : String = {
val cvec = classifier.classifyFull(clvec)
val (bestIdx, bestScore) = argmax(cvec)
reverseLabelMap(bestIdx)
}
-// A random United States footbal article
+// A random United States football article
//http://www.reuters.com/article/2015/01/28/us-nfl-superbowl-security-idUSKBN0L12JR20150128
val UStextToClassify = new String("(Reuters) - Super Bowl security officials acknowledge the NFL championship game represents" +
" a high profile target on a world stage but are unaware of any specific credible threats against" +
@@ -150,7 +152,7 @@ val UStextToClassify = new String("(Reuters) - Super Bowl security officials ack
" planning process are going to have their best and brightest out there this weekend and we will have" +
" a very safe Super Bowl.")
-// A random United Kingdom footbal article
+// A random United Kingdom football article
// http://www.reuters.com/article/2015/01/26/manchester-united-swissquote-idUSL6N0V52RZ20150126
val UKtextToClassify = new String("(Reuters) - Manchester United have signed a sponsorship deal with online financial trading company" +
" Swissquote, expanding the commercial partnerships that have helped to make the English club one of" +
@@ -180,16 +182,23 @@ val UKtextToClassify = new String("(Reuters) - Manchester United have signed a s
val usVec = vectorizeDocument(UStextToClassify, dictionaryMap, dfCountMap)
val ukVec = vectorizeDocument(UKtextToClassify, dictionaryMap, dfCountMap)
-println("Classifing the news article about the superbowl (united states)")
+println("Classifying the news article about superbowl security (united states)")
classifyDocument(usVec)
-println("Classifing the news article about the Manchester United (united kingdom)")
+println("Classifying the news article about Manchester United (united kingdom)")
classifyDocument(ukVec)
-// to classify new text, simply run this method on a string
-def classifyText(txt: String): String ={
+// to classify new text, tie everything together in a new method
+def classifyText(txt: String): String = {
val v = vectorizeDocument(txt, dictionaryMap, dfCountMap)
classifyDocument(v)
}
+
+// now we can simply call our classifyText method on any string
+classifyText("Hello world from Queens")
+
+classifyText("Hello world from London")
+
+
http://git-wip-us.apache.org/repos/asf/mahout/blob/260753fd/spark-shell/src/main/scala/org/apache/mahout/sparkbindings/shell/MahoutSparkILoop.scala
----------------------------------------------------------------------
diff --git a/spark-shell/src/main/scala/org/apache/mahout/sparkbindings/shell/MahoutSparkILoop.scala b/spark-shell/src/main/scala/org/apache/mahout/sparkbindings/shell/MahoutSparkILoop.scala
index 7ef2b4c..5ffc18c 100644
--- a/spark-shell/src/main/scala/org/apache/mahout/sparkbindings/shell/MahoutSparkILoop.scala
+++ b/spark-shell/src/main/scala/org/apache/mahout/sparkbindings/shell/MahoutSparkILoop.scala
@@ -45,6 +45,8 @@ class MahoutSparkILoop extends SparkILoop {
conf.set("spark.executor.uri", execUri)
}
+ conf.set("spark.executor.memory", "1g")
+
sparkContext = mahoutSparkContext(
masterUrl = master,
appName = "Mahout Spark Shell",
http://git-wip-us.apache.org/repos/asf/mahout/blob/260753fd/spark/src/main/scala/org/apache/mahout/drivers/ItemSimilarityDriver.scala
----------------------------------------------------------------------
diff --git a/spark/src/main/scala/org/apache/mahout/drivers/ItemSimilarityDriver.scala b/spark/src/main/scala/org/apache/mahout/drivers/ItemSimilarityDriver.scala
index 63da80f..34e8cf9 100644
--- a/spark/src/main/scala/org/apache/mahout/drivers/ItemSimilarityDriver.scala
+++ b/spark/src/main/scala/org/apache/mahout/drivers/ItemSimilarityDriver.scala
@@ -59,7 +59,7 @@ object ItemSimilarityDriver extends MahoutSparkDriver {
override def main(args: Array[String]): Unit = {
parser = new MahoutSparkOptionParser(programName = "spark-itemsimilarity") {
- head("spark-itemsimilarity", "Mahout 1.0")
+ head("spark-itemsimilarity", "Mahout 0.10.0")
//Input output options, non-driver specific
parseIOOptions(numInputs = 2)
http://git-wip-us.apache.org/repos/asf/mahout/blob/260753fd/spark/src/main/scala/org/apache/mahout/drivers/RowSimilarityDriver.scala
----------------------------------------------------------------------
diff --git a/spark/src/main/scala/org/apache/mahout/drivers/RowSimilarityDriver.scala b/spark/src/main/scala/org/apache/mahout/drivers/RowSimilarityDriver.scala
index 3b47452..cfa8f99 100644
--- a/spark/src/main/scala/org/apache/mahout/drivers/RowSimilarityDriver.scala
+++ b/spark/src/main/scala/org/apache/mahout/drivers/RowSimilarityDriver.scala
@@ -54,7 +54,7 @@ object RowSimilarityDriver extends MahoutSparkDriver {
override def main(args: Array[String]): Unit = {
parser = new MahoutSparkOptionParser(programName = "spark-rowsimilarity") {
- head("spark-rowsimilarity", "Mahout 1.0")
+ head("spark-rowsimilarity", "Mahout 0.10.0")
//Input output options, non-driver specific
parseIOOptions()
http://git-wip-us.apache.org/repos/asf/mahout/blob/260753fd/spark/src/main/scala/org/apache/mahout/drivers/TestNBDriver.scala
----------------------------------------------------------------------
diff --git a/spark/src/main/scala/org/apache/mahout/drivers/TestNBDriver.scala b/spark/src/main/scala/org/apache/mahout/drivers/TestNBDriver.scala
index 8531a0a..9e73094 100644
--- a/spark/src/main/scala/org/apache/mahout/drivers/TestNBDriver.scala
+++ b/spark/src/main/scala/org/apache/mahout/drivers/TestNBDriver.scala
@@ -35,7 +35,7 @@ object TestNBDriver extends MahoutSparkDriver {
override def main(args: Array[String]): Unit = {
parser = new MahoutSparkOptionParser(programName = "spark-testnb") {
- head("spark-testnb", "Mahout 1.0")
+ head("spark-testnb", "Mahout 0.10.0")
//Input output options, non-driver specific
parseIOOptions(numInputs = 1)
http://git-wip-us.apache.org/repos/asf/mahout/blob/260753fd/spark/src/main/scala/org/apache/mahout/drivers/TrainNBDriver.scala
----------------------------------------------------------------------
diff --git a/spark/src/main/scala/org/apache/mahout/drivers/TrainNBDriver.scala b/spark/src/main/scala/org/apache/mahout/drivers/TrainNBDriver.scala
index 4f88c13..2edebca 100644
--- a/spark/src/main/scala/org/apache/mahout/drivers/TrainNBDriver.scala
+++ b/spark/src/main/scala/org/apache/mahout/drivers/TrainNBDriver.scala
@@ -35,7 +35,7 @@ object TrainNBDriver extends MahoutSparkDriver {
override def main(args: Array[String]): Unit = {
parser = new MahoutSparkOptionParser(programName = "spark-trainnb") {
- head("spark-trainnb", "Mahout 1.0")
+ head("spark-trainnb", "Mahout 0.10.0")
//Input output options, non-driver specific
parseIOOptions(numInputs = 1)