You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ra...@apache.org on 2018/06/27 13:14:50 UTC
[24/24] mahout git commit: MAHOUT-2034 Split MR and New Examples into
separate modules
MAHOUT-2034 Split MR and New Examples into separate modules
Project: http://git-wip-us.apache.org/repos/asf/mahout/repo
Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/02f75f99
Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/02f75f99
Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/02f75f99
Branch: refs/heads/branch-0.14.0
Commit: 02f75f997bbc01083a345287072e821bfe4f1558
Parents: aa57e2f
Author: Trevor a.k.a @rawkintrevo <tr...@gmail.com>
Authored: Wed Jun 27 08:13:16 2018 -0500
Committer: Trevor a.k.a @rawkintrevo <tr...@gmail.com>
Committed: Wed Jun 27 08:13:16 2018 -0500
----------------------------------------------------------------------
bin/load-shell.scala | 2 +-
bin/mahout | 196 +-
bin/mahout.bu | 395 +
community/mahout-mr/bin/mahout | 395 +
community/mahout-mr/bin/mahout.cmd | 397 +
community/mahout-mr/examples/bin/README.txt | 13 +
.../examples/bin/classify-20newsgroups.sh | 197 +
.../examples/bin/classify-wikipedia.sh | 196 +
.../mahout-mr/examples/bin/cluster-reuters.sh | 203 +
.../examples/bin/cluster-syntheticcontrol.sh | 105 +
.../examples/bin/factorize-movielens-1M.sh | 85 +
.../mahout-mr/examples/bin/factorize-netflix.sh | 90 +
.../mahout-mr/examples/bin/get-all-examples.sh | 36 +
community/mahout-mr/examples/bin/lda.algorithm | 45 +
.../examples/bin/resources/bank-full.csv | 45212 +++++++++++++++++
.../examples/bin/resources/country.txt | 229 +
.../examples/bin/resources/country10.txt | 10 +
.../examples/bin/resources/country2.txt | 2 +
.../examples/bin/resources/donut-test.csv | 41 +
.../mahout-mr/examples/bin/resources/donut.csv | 41 +
.../examples/bin/resources/test-data.csv | 61 +
.../mahout-mr/examples/bin/set-dfs-commands.sh | 54 +
community/mahout-mr/examples/pom.xml | 199 +
.../examples/src/main/assembly/job.xml | 46 +
.../cf/taste/example/TasteOptionParser.java | 75 +
.../BookCrossingBooleanRecommender.java | 102 +
.../BookCrossingBooleanRecommenderBuilder.java | 32 +
...ossingBooleanRecommenderEvaluatorRunner.java | 59 +
.../bookcrossing/BookCrossingDataModel.java | 99 +
.../BookCrossingDataModelBuilder.java | 33 +
.../bookcrossing/BookCrossingRecommender.java | 101 +
.../BookCrossingRecommenderBuilder.java | 32 +
.../BookCrossingRecommenderEvaluatorRunner.java | 54 +
.../mahout/cf/taste/example/bookcrossing/README | 9 +
.../cf/taste/example/email/EmailUtility.java | 104 +
.../email/FromEmailToDictionaryMapper.java | 61 +
.../example/email/MailToDictionaryReducer.java | 43 +
.../taste/example/email/MailToPrefsDriver.java | 274 +
.../cf/taste/example/email/MailToRecMapper.java | 101 +
.../taste/example/email/MailToRecReducer.java | 53 +
.../example/email/MsgIdToDictionaryMapper.java | 49 +
.../taste/example/kddcup/DataFileIterable.java | 44 +
.../taste/example/kddcup/DataFileIterator.java | 158 +
.../taste/example/kddcup/KDDCupDataModel.java | 231 +
.../mahout/cf/taste/example/kddcup/ToCSV.java | 77 +
.../kddcup/track1/EstimateConverter.java | 43 +
.../example/kddcup/track1/Track1Callable.java | 67 +
.../kddcup/track1/Track1Recommender.java | 94 +
.../kddcup/track1/Track1RecommenderBuilder.java | 32 +
.../track1/Track1RecommenderEvaluator.java | 108 +
.../Track1RecommenderEvaluatorRunner.java | 56 +
.../example/kddcup/track1/Track1Runner.java | 95 +
.../svd/DataModelFactorizablePreferences.java | 107 +
.../track1/svd/FactorizablePreferences.java | 44 +
.../svd/KDDCupFactorizablePreferences.java | 123 +
.../track1/svd/ParallelArraysSGDFactorizer.java | 265 +
.../kddcup/track1/svd/Track1SVDRunner.java | 141 +
.../example/kddcup/track2/HybridSimilarity.java | 62 +
.../example/kddcup/track2/Track2Callable.java | 106 +
.../kddcup/track2/Track2Recommender.java | 100 +
.../kddcup/track2/Track2RecommenderBuilder.java | 33 +
.../example/kddcup/track2/Track2Runner.java | 100 +
.../taste/example/kddcup/track2/TrackData.java | 71 +
.../kddcup/track2/TrackItemSimilarity.java | 106 +
.../taste/example/kddcup/track2/UserResult.java | 54 +
.../als/netflix/NetflixDatasetConverter.java | 140 +
.../example/BatchItemSimilaritiesGroupLens.java | 65 +
.../precompute/example/GroupLensDataModel.java | 96 +
.../mahout/classifier/NewsgroupHelper.java | 128 +
.../classifier/email/PrepEmailMapper.java | 65 +
.../classifier/email/PrepEmailReducer.java | 47 +
.../email/PrepEmailVectorsDriver.java | 76 +
.../sequencelearning/hmm/PosTagger.java | 277 +
.../sgd/AdaptiveLogisticModelParameters.java | 236 +
.../classifier/sgd/LogisticModelParameters.java | 265 +
.../classifier/sgd/PrintResourceOrFile.java | 42 +
.../classifier/sgd/RunAdaptiveLogistic.java | 197 +
.../mahout/classifier/sgd/RunLogistic.java | 163 +
.../apache/mahout/classifier/sgd/SGDHelper.java | 151 +
.../apache/mahout/classifier/sgd/SGDInfo.java | 59 +
.../classifier/sgd/SimpleCsvExamples.java | 283 +
.../mahout/classifier/sgd/TestASFEmail.java | 152 +
.../mahout/classifier/sgd/TestNewsGroups.java | 141 +
.../mahout/classifier/sgd/TrainASFEmail.java | 137 +
.../classifier/sgd/TrainAdaptiveLogistic.java | 377 +
.../mahout/classifier/sgd/TrainLogistic.java | 311 +
.../mahout/classifier/sgd/TrainNewsGroups.java | 154 +
.../sgd/ValidateAdaptiveLogistic.java | 218 +
.../BankMarketingClassificationMain.java | 70 +
.../sgd/bankmarketing/TelephoneCall.java | 104 +
.../sgd/bankmarketing/TelephoneCallParser.java | 66 +
.../clustering/display/ClustersFilter.java | 31 +
.../clustering/display/DisplayCanopy.java | 88 +
.../clustering/display/DisplayClustering.java | 374 +
.../clustering/display/DisplayFuzzyKMeans.java | 110 +
.../clustering/display/DisplayKMeans.java | 106 +
.../display/DisplaySpectralKMeans.java | 85 +
.../apache/mahout/clustering/display/README.txt | 22 +
.../tools/ClusterQualitySummarizer.java | 279 +
.../clustering/streaming/tools/IOUtils.java | 80 +
.../clustering/syntheticcontrol/canopy/Job.java | 125 +
.../syntheticcontrol/fuzzykmeans/Job.java | 144 +
.../clustering/syntheticcontrol/kmeans/Job.java | 187 +
.../fpm/pfpgrowth/DeliciousTagsExample.java | 94 +
.../dataset/KeyBasedStringTupleCombiner.java | 40 +
.../dataset/KeyBasedStringTupleGrouper.java | 77 +
.../dataset/KeyBasedStringTupleMapper.java | 90 +
.../dataset/KeyBasedStringTupleReducer.java | 74 +
.../examples/src/main/resources/bank-full.csv | 45212 +++++++++++++++++
.../src/main/resources/cf-data-purchase.txt | 7 +
.../src/main/resources/cf-data-view.txt | 12 +
.../examples/src/main/resources/donut-test.csv | 41 +
.../examples/src/main/resources/donut.csv | 41 +
.../examples/src/main/resources/test-data.csv | 61 +
.../sgd/LogisticModelParametersTest.java | 43 +
.../classifier/sgd/ModelDissectorTest.java | 40 +
.../classifier/sgd/TrainLogisticTest.java | 167 +
.../clustering/display/ClustersFilterTest.java | 75 +
.../apache/mahout/examples/MahoutTestCase.java | 30 +
.../examples/src/test/resources/country.txt | 229 +
.../examples/src/test/resources/country10.txt | 10 +
.../examples/src/test/resources/country2.txt | 2 +
.../examples/src/test/resources/subjects.txt | 2 +
.../examples/src/test/resources/wdbc.infos | 32 +
.../examples/src/test/resources/wdbc/wdbc.data | 569 +
community/mahout-mr/pom.xml | 4 +
community/spark-cli-drivers/pom.xml | 21 +
.../src/main/assembly/dependency-reduced.xml | 51 +
.../src/main/assembly/dependency-reduced.xml | 2 +-
examples/bin/README.txt | 13 -
examples/bin/basicOLS.scala | 61 +
examples/bin/cco-lastfm.scala | 112 +
examples/bin/classify-20newsgroups.sh | 197 -
examples/bin/classify-wikipedia.sh | 196 -
examples/bin/cluster-reuters.sh | 203 -
examples/bin/cluster-syntheticcontrol.sh | 105 -
examples/bin/factorize-movielens-1M.sh | 85 -
examples/bin/factorize-netflix.sh | 90 -
examples/bin/get-all-examples.sh | 36 -
examples/bin/lda.algorithm | 45 -
examples/bin/resources/bank-full.csv | 45212 -----------------
examples/bin/resources/country.txt | 229 -
examples/bin/resources/country10.txt | 10 -
examples/bin/resources/country2.txt | 2 -
examples/bin/resources/donut-test.csv | 41 -
examples/bin/resources/donut.csv | 41 -
examples/bin/resources/test-data.csv | 61 -
examples/bin/run-item-sim.sh | 6 +-
examples/bin/set-dfs-commands.sh | 54 -
examples/pom.xml | 173 +-
examples/src/main/assembly/job.xml | 46 -
.../cf/taste/example/TasteOptionParser.java | 75 -
.../BookCrossingBooleanRecommender.java | 102 -
.../BookCrossingBooleanRecommenderBuilder.java | 32 -
...ossingBooleanRecommenderEvaluatorRunner.java | 59 -
.../bookcrossing/BookCrossingDataModel.java | 99 -
.../BookCrossingDataModelBuilder.java | 33 -
.../bookcrossing/BookCrossingRecommender.java | 101 -
.../BookCrossingRecommenderBuilder.java | 32 -
.../BookCrossingRecommenderEvaluatorRunner.java | 54 -
.../mahout/cf/taste/example/bookcrossing/README | 9 -
.../cf/taste/example/email/EmailUtility.java | 104 -
.../email/FromEmailToDictionaryMapper.java | 61 -
.../example/email/MailToDictionaryReducer.java | 43 -
.../taste/example/email/MailToPrefsDriver.java | 274 -
.../cf/taste/example/email/MailToRecMapper.java | 101 -
.../taste/example/email/MailToRecReducer.java | 53 -
.../example/email/MsgIdToDictionaryMapper.java | 49 -
.../taste/example/kddcup/DataFileIterable.java | 44 -
.../taste/example/kddcup/DataFileIterator.java | 158 -
.../taste/example/kddcup/KDDCupDataModel.java | 231 -
.../mahout/cf/taste/example/kddcup/ToCSV.java | 77 -
.../kddcup/track1/EstimateConverter.java | 43 -
.../example/kddcup/track1/Track1Callable.java | 67 -
.../kddcup/track1/Track1Recommender.java | 94 -
.../kddcup/track1/Track1RecommenderBuilder.java | 32 -
.../track1/Track1RecommenderEvaluator.java | 108 -
.../Track1RecommenderEvaluatorRunner.java | 56 -
.../example/kddcup/track1/Track1Runner.java | 95 -
.../svd/DataModelFactorizablePreferences.java | 107 -
.../track1/svd/FactorizablePreferences.java | 44 -
.../svd/KDDCupFactorizablePreferences.java | 123 -
.../track1/svd/ParallelArraysSGDFactorizer.java | 265 -
.../kddcup/track1/svd/Track1SVDRunner.java | 141 -
.../example/kddcup/track2/HybridSimilarity.java | 62 -
.../example/kddcup/track2/Track2Callable.java | 106 -
.../kddcup/track2/Track2Recommender.java | 100 -
.../kddcup/track2/Track2RecommenderBuilder.java | 33 -
.../example/kddcup/track2/Track2Runner.java | 100 -
.../taste/example/kddcup/track2/TrackData.java | 71 -
.../kddcup/track2/TrackItemSimilarity.java | 106 -
.../taste/example/kddcup/track2/UserResult.java | 54 -
.../als/netflix/NetflixDatasetConverter.java | 140 -
.../example/BatchItemSimilaritiesGroupLens.java | 65 -
.../precompute/example/GroupLensDataModel.java | 96 -
.../mahout/classifier/NewsgroupHelper.java | 128 -
.../classifier/email/PrepEmailMapper.java | 65 -
.../classifier/email/PrepEmailReducer.java | 47 -
.../email/PrepEmailVectorsDriver.java | 76 -
.../sequencelearning/hmm/PosTagger.java | 277 -
.../sgd/AdaptiveLogisticModelParameters.java | 236 -
.../classifier/sgd/LogisticModelParameters.java | 265 -
.../classifier/sgd/PrintResourceOrFile.java | 42 -
.../classifier/sgd/RunAdaptiveLogistic.java | 197 -
.../mahout/classifier/sgd/RunLogistic.java | 163 -
.../apache/mahout/classifier/sgd/SGDHelper.java | 151 -
.../apache/mahout/classifier/sgd/SGDInfo.java | 59 -
.../classifier/sgd/SimpleCsvExamples.java | 283 -
.../mahout/classifier/sgd/TestASFEmail.java | 152 -
.../mahout/classifier/sgd/TestNewsGroups.java | 141 -
.../mahout/classifier/sgd/TrainASFEmail.java | 137 -
.../classifier/sgd/TrainAdaptiveLogistic.java | 377 -
.../mahout/classifier/sgd/TrainLogistic.java | 311 -
.../mahout/classifier/sgd/TrainNewsGroups.java | 154 -
.../sgd/ValidateAdaptiveLogistic.java | 218 -
.../BankMarketingClassificationMain.java | 70 -
.../sgd/bankmarketing/TelephoneCall.java | 104 -
.../sgd/bankmarketing/TelephoneCallParser.java | 66 -
.../clustering/display/ClustersFilter.java | 31 -
.../clustering/display/DisplayCanopy.java | 88 -
.../clustering/display/DisplayClustering.java | 374 -
.../clustering/display/DisplayFuzzyKMeans.java | 110 -
.../clustering/display/DisplayKMeans.java | 106 -
.../display/DisplaySpectralKMeans.java | 85 -
.../apache/mahout/clustering/display/README.txt | 22 -
.../tools/ClusterQualitySummarizer.java | 279 -
.../clustering/streaming/tools/IOUtils.java | 80 -
.../clustering/syntheticcontrol/canopy/Job.java | 125 -
.../syntheticcontrol/fuzzykmeans/Job.java | 144 -
.../clustering/syntheticcontrol/kmeans/Job.java | 187 -
.../fpm/pfpgrowth/DeliciousTagsExample.java | 94 -
.../dataset/KeyBasedStringTupleCombiner.java | 40 -
.../dataset/KeyBasedStringTupleGrouper.java | 77 -
.../dataset/KeyBasedStringTupleMapper.java | 90 -
.../dataset/KeyBasedStringTupleReducer.java | 74 -
examples/src/main/resources/bank-full.csv | 45212 -----------------
.../src/main/resources/cf-data-purchase.txt | 7 -
examples/src/main/resources/cf-data-view.txt | 12 -
examples/src/main/resources/donut-test.csv | 41 -
examples/src/main/resources/donut.csv | 41 -
examples/src/main/resources/test-data.csv | 61 -
.../sgd/LogisticModelParametersTest.java | 43 -
.../classifier/sgd/ModelDissectorTest.java | 40 -
.../classifier/sgd/TrainLogisticTest.java | 167 -
.../clustering/display/ClustersFilterTest.java | 75 -
.../apache/mahout/examples/MahoutTestCase.java | 30 -
examples/src/test/resources/country.txt | 229 -
examples/src/test/resources/country10.txt | 10 -
examples/src/test/resources/country2.txt | 2 -
examples/src/test/resources/subjects.txt | 2 -
examples/src/test/resources/wdbc.infos | 32 -
examples/src/test/resources/wdbc/wdbc.data | 569 -
pom.xml | 4 +-
253 files changed, 104613 insertions(+), 103131 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/bin/load-shell.scala
----------------------------------------------------------------------
diff --git a/bin/load-shell.scala b/bin/load-shell.scala
index 7468b76..f60705c 100644
--- a/bin/load-shell.scala
+++ b/bin/load-shell.scala
@@ -29,6 +29,6 @@ println("""
_ __ ___ __ _| |__ ___ _ _| |_
'_ ` _ \ / _` | '_ \ / _ \| | | | __|
| | | | | (_| | | | | (_) | |_| | |_
-_| |_| |_|\__,_|_| |_|\___/ \__,_|\__| version 0.13.0
+_| |_| |_|\__,_|_| |_|\___/ \__,_|\__| version 0.14.0
""")
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/bin/mahout
----------------------------------------------------------------------
diff --git a/bin/mahout b/bin/mahout
index 3017c9e..fd40fe0 100755
--- a/bin/mahout
+++ b/bin/mahout
@@ -57,6 +57,8 @@ case "`uname`" in
CYGWIN*) cygwin=true;;
esac
+# Check that mahout home is set, if not set it to one dir up.
+
# resolve links - $0 may be a softlink
THIS="$0"
while [ -h "$THIS" ]; do
@@ -123,6 +125,13 @@ if [ "$JAVA_HOME" = "" ]; then
exit 1
fi
+if [ "$SPARK" = "1" ]; then
+ if [ "$SPARK_HOME" = "" ]; then
+ echo "Error: SPARK_HOME is not set."
+ exit 1
+ fi
+fi
+
JAVA=$JAVA_HOME/bin/java
JAVA_HEAP_MAX=-Xmx4g
@@ -133,53 +142,57 @@ if [ "$MAHOUT_HEAPSIZE" != "" ]; then
#echo $JAVA_HEAP_MAX
fi
-if [ "x$MAHOUT_CONF_DIR" = "x" ]; then
- if [ -d $MAHOUT_HOME/src/conf ]; then
- MAHOUT_CONF_DIR=$MAHOUT_HOME/src/conf
- else
- if [ -d $MAHOUT_HOME/conf ]; then
- MAHOUT_CONF_DIR=$MAHOUT_HOME/conf
- else
- echo No MAHOUT_CONF_DIR found
- fi
- fi
-fi
+#if [ "x$MAHOUT_CONF_DIR" = "x" ]; then
+# if [ -d $MAHOUT_HOME/src/conf ]; then
+# MAHOUT_CONF_DIR=$MAHOUT_HOME/src/conf
+# else
+# if [ -d $MAHOUT_HOME/conf ]; then
+# MAHOUT_CONF_DIR=$MAHOUT_HOME/conf
+# else
+# echo No MAHOUT_CONF_DIR found
+# fi
+# fi
+#fi
# CLASSPATH initially contains $MAHOUT_CONF_DIR, or defaults to $MAHOUT_HOME/src/conf
-CLASSPATH=${CLASSPATH}:$MAHOUT_CONF_DIR
+#CLASSPATH=${CLASSPATH}:$MAHOUT_CONF_DIR
-if [ "$MAHOUT_LOCAL" != "" ]; then
- echo "MAHOUT_LOCAL is set, so we don't add HADOOP_CONF_DIR to classpath."
-elif [ -n "$HADOOP_CONF_DIR" ] ; then
- echo "MAHOUT_LOCAL is not set; adding HADOOP_CONF_DIR to classpath."
- CLASSPATH=${CLASSPATH}:$HADOOP_CONF_DIR
-fi
+#if [ "$MAHOUT_LOCAL" != "" ]; then
+# echo "MAHOUT_LOCAL is set, so we don't add HADOOP_CONF_DIR to classpath."
+#elif [ -n "$HADOOP_CONF_DIR" ] ; then
+# echo "MAHOUT_LOCAL is not set; adding HADOOP_CONF_DIR to classpath."
+# CLASSPATH=${CLASSPATH}:$HADOOP_CONF_DIR
+#fi
-CLASSPATH=${CLASSPATH}:$JAVA_HOME/lib/tools.jar
+#CLASSPATH=${CLASSPATH}:$JAVA_HOME/lib/tools.jar
# so that filenames w/ spaces are handled correctly in loops below
IFS=
+
if [ $IS_CORE == 0 ]
then
# add release dependencies to CLASSPATH
- for f in $MAHOUT_HOME/mahout-*.jar; do
+ echo "Adding lib/ to CLASSPATH"
+ for f in $MAHOUT_HOME/lib/*.jar; do
CLASSPATH=${CLASSPATH}:$f;
done
- if [ "$SPARK" != "1" ]; then
+ CLASSPATH="${CLASSPATH}:${SPARK_HOME}/jars/*"
- # add dev targets if they exist
- for f in $MAHOUT_HOME/examples/target/mahout-examples-*-job.jar $MAHOUT_HOME/mahout-examples-*-job.jar ; do
- CLASSPATH=${CLASSPATH}:$f;
- done
- fi
+
+# if [ "$SPARK" != "1" ]; then
+# # add dev targets if they exist
+# for f in $MAHOUT_HOME/examples/target/mahout-examples-*-job.jar $MAHOUT_HOME/mahout-examples-*-job.jar ; do
+# CLASSPATH=${CLASSPATH}:$f;
+# done
+# fi
# add scala dev target
- for f in $MAHOUT_HOME/math-scala/target/mahout-math-scala_*.jar ; do
- CLASSPATH=${CLASSPATH}:$f;
- done
+# for f in $MAHOUT_HOME/math-scala/target/mahout-math-scala_*.jar ; do
+# CLASSPATH=${CLASSPATH}:$f;
+# done
if [ "$H2O" == "1" ]; then
for f in $MAHOUT_HOME/hdfs/target/mahout-hdfs-*.jar; do
@@ -193,38 +206,34 @@ then
fi
# add jars for running from the command line if we requested shell or spark CLI driver
- if [ "$SPARK" == "1" ]; then
-
- for f in $MAHOUT_HOME/hdfs/target/mahout-hdfs-*.jar ; do
- CLASSPATH=${CLASSPATH}:$f;
- done
-
- for f in $MAHOUT_HOME/math/target/mahout-math-*.jar ; do
- CLASSPATH=${CLASSPATH}:$f;
- done
-
- for f in $MAHOUT_HOME/spark/target/mahout-spark_*.jar ; do
- CLASSPATH=${CLASSPATH}:$f;
- done
-
- for f in $MAHOUT_HOME/spark-shell/target/mahout-spark-shell_*.jar ; do
- CLASSPATH=${CLASSPATH}:$f;
- done
-
- # viennacl jars- may or may not be available depending on build profile
- for f in $MAHOUT_HOME/viennacl/target/mahout-native-viennacl_*.jar ; do
- CLASSPATH=${CLASSPATH}:$f;
- done
-
- # viennacl jars- may or may not be available depending on build profile
- for f in $MAHOUT_HOME/viennacl-omp/target/mahout-native-viennacl-omp_*.jar ; do
- CLASSPATH=${CLASSPATH}:$f;
- done
+# if [ "$SPARK" == "1" ]; then
+#
+# for f in $MAHOUT_HOME/lib/mahout-hdfs-*.jar ; do
+# CLASSPATH=${CLASSPATH}:$f;
+# done
+#
+# for f in $MAHOUT_HOME/lib/mahout-core-*.jar ; do
+# CLASSPATH=${CLASSPATH}:$f;
+# done
+#
+# for f in $MAHOUT_HOME/lib/spark_*.jar ; do
+# CLASSPATH=${CLASSPATH}:$f;
+# done
+#
+# for f in $MAHOUT_HOME/lib/spark-cli_*.jar ; do
+# CLASSPATH=${CLASSPATH}:$f;
+# done
+#
+# # viennacl jars- may or may not be available depending on build profile
+# for f in $MAHOUT_HOME/viennacl/target/mahout-native-viennacl_*.jar ; do
+# CLASSPATH=${CLASSPATH}:$f;
+# done
+#
+# # viennacl jars- may or may not be available depending on build profile
+# for f in $MAHOUT_HOME/viennacl-omp/target/mahout-native-viennacl-omp_*.jar ; do
+# CLASSPATH=${CLASSPATH}:$f;
+# done
- # viennacl jars- may or may not be available depending on build profile
- for f in $MAHOUT_HOME/viennacl-omp/target/mahout-native-viennacl-omp_*.jar ; do
- CLASSPATH=${CLASSPATH}:$f;
- done
SPARK_CP_BIN="${MAHOUT_HOME}/bin/compute-classpath.sh"
if [ -x "${SPARK_CP_BIN}" ]; then
@@ -245,39 +254,39 @@ then
fi
fi
- # add vcl jars at any point.
- # viennacl jars- may or may not be available depending on build profile
- for f in $MAHOUT_HOME/viennacl/target/mahout-native-viennacl_*.jar ; do
- CLASSPATH=${CLASSPATH}:$f;
- done
-
- # viennacl jars- may or may not be available depending on build profile
- for f in $MAHOUT_HOME/viennacl-omp/target/mahout-native-viennacl-omp_*.jar ; do
- CLASSPATH=${CLASSPATH}:$f;
- done
-
- # add release dependencies to CLASSPATH
- for f in $MAHOUT_HOME/lib/*.jar; do
- CLASSPATH=${CLASSPATH}:$f;
- done
-else
- CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/math/target/classes
- CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/hdfs/target/classes
- CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/mr/target/classes
- CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/integration/target/classes
- CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/examples/target/classes
- CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/math-scala/target/classes
- CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/spark/target/classes
- CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/spark-shell/target/classes
- CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/h2o/target/classes
-fi
+ # add vcl jars at any point.
+ # viennacl jars- may or may not be available depending on build profile
+# for f in $MAHOUT_HOME/viennacl/target/mahout-native-viennacl_*.jar ; do
+# CLASSPATH=${CLASSPATH}:$f;
+# done
+#
+# # viennacl jars- may or may not be available depending on build profile
+# for f in $MAHOUT_HOME/viennacl-omp/target/mahout-native-viennacl-omp_*.jar ; do
+# CLASSPATH=${CLASSPATH}:$f;
+# done
+#
+# # add release dependencies to CLASSPATH
+# for f in $MAHOUT_HOME/lib/*.jar; do
+# CLASSPATH=${CLASSPATH}:$f;
+# done
+#else
+# CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/math/target/classes
+# CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/hdfs/target/classes
+# CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/mr/target/classes
+# CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/integration/target/classes
+# CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/examples/target/classes
+# CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/math-scala/target/classes
+# CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/spark/target/classes
+# CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/spark-shell/target/classes
+# CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/h2o/target/classes
+#fi
# add development dependencies to CLASSPATH
-if [ "$SPARK" != "1" ]; then
- for f in $MAHOUT_HOME/examples/target/dependency/*.jar; do
- CLASSPATH=${CLASSPATH}:$f;
- done
-fi
+#if [ "$SPARK" != "1" ]; then
+# for f in $MAHOUT_HOME/examples/target/dependency/*.jar; do
+# CLASSPATH=${CLASSPATH}:$f;
+# done
+#fi
# cygwin path translation
@@ -287,7 +296,7 @@ fi
# restore ordinary behaviour
unset IFS
-JARS=$(echo "$MAHOUT_HOME"/*.jar | tr ' ' ',')
+JARS=$(echo "$MAHOUT_HOME"/lib/*.jar | tr ' ' ',')
case "$1" in
(spark-shell)
save_stty=$(stty -g 2>/dev/null);
@@ -297,6 +306,7 @@ case "$1" in
# Spark CLI drivers go here
(spark-itemsimilarity)
shift
+ echo $CLASSPATH
"$JAVA" $JAVA_HEAP_MAX -classpath "$CLASSPATH" "org.apache.mahout.drivers.ItemSimilarityDriver" "$@"
;;
(spark-rowsimilarity)
@@ -333,7 +343,7 @@ case "$1" in
MAHOUT_OPTS="$MAHOUT_OPTS -Dhadoop.log.dir=$MAHOUT_LOG_DIR"
MAHOUT_OPTS="$MAHOUT_OPTS -Dhadoop.log.file=$MAHOUT_LOGFILE"
-
+
if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then
MAHOUT_OPTS="$MAHOUT_OPTS -Djava.library.path=$JAVA_LIBRARY_PATH"
http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/bin/mahout.bu
----------------------------------------------------------------------
diff --git a/bin/mahout.bu b/bin/mahout.bu
new file mode 100755
index 0000000..20f9c3d
--- /dev/null
+++ b/bin/mahout.bu
@@ -0,0 +1,395 @@
+#!/bin/bash
+#
+# The Mahout command script
+#
+# Environment Variables
+#
+# MAHOUT_JAVA_HOME The java implementation to use. Overrides JAVA_HOME.
+#
+# MAHOUT_HEAPSIZE The maximum amount of heap to use, in MB.
+# Default is 4000.
+#
+# HADOOP_CONF_DIR The location of a hadoop config directory
+#
+# MAHOUT_OPTS Extra Java runtime options.
+#
+# MAHOUT_CONF_DIR The location of the program short-name to class name
+# mappings and the default properties files
+# defaults to "$MAHOUT_HOME/src/conf"
+#
+# MAHOUT_LOCAL set to anything other than an empty string to force
+# mahout to run locally even if
+# HADOOP_CONF_DIR and HADOOP_HOME are set
+#
+# MAHOUT_CORE set to anything other than an empty string to force
+# mahout to run in developer 'core' mode, just as if the
+# -core option was presented on the command-line
+# Command-line Options
+#
+# -core -core is used to switch into 'developer mode' when
+# running mahout locally. If specified, the classes
+# from the 'target/classes' directories in each project
+# are used. Otherwise classes will be retrieved from
+# jars in the binary release collection or *-job.jar files
+# found in build directories. When running on hadoop
+# the job files will always be used.
+
+#
+#/**
+# * Licensed to the Apache Software Foundation (ASF) under one or more
+# * contributor license agreements. See the NOTICE file distributed with
+# * this work for additional information regarding copyright ownership.
+# * The ASF licenses this file to You under the Apache License, Version 2.0
+# * (the "License"); you may not use this file except in compliance with
+# * the License. You may obtain a copy of the License at
+# *
+# * http://www.apache.org/licenses/LICENSE-2.0
+# *
+# * Unless required by applicable law or agreed to in writing, software
+# * distributed under the License is distributed on an "AS IS" BASIS,
+# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# * See the License for the specific language governing permissions and
+# * limitations under the License.
+# */
+
+cygwin=false
+case "`uname`" in
+CYGWIN*) cygwin=true;;
+esac
+
+# Check that mahout home is set, if not set it to one dir up.
+
+# resolve links - $0 may be a softlink
+THIS="$0"
+while [ -h "$THIS" ]; do
+ ls=`ls -ld "$THIS"`
+ link=`expr "$ls" : '.*-> \(.*\)$'`
+ if expr "$link" : '.*/.*' > /dev/null; then
+ THIS="$link"
+ else
+ THIS=`dirname "$THIS"`/"$link"
+ fi
+done
+
+IS_CORE=0
+if [ "$1" == "-core" ] ; then
+ IS_CORE=1
+ shift
+fi
+
+if [ "$1" == "-spark" ]; then
+ SPARK=1
+ shift
+fi
+
+if [ "$1" == "spark-shell" ]; then
+ SPARK=1
+fi
+
+if [ "$1" == "spark-itemsimilarity" ]; then
+ SPARK=1
+fi
+
+if [ "$1" == "spark-rowsimilarity" ]; then
+ SPARK=1
+fi
+
+if [ "$1" == "spark-trainnb" ]; then
+ SPARK=1
+fi
+
+if [ "$1" == "spark-testnb" ]; then
+ SPARK=1
+fi
+
+if [ "$MAHOUT_CORE" != "" ]; then
+ IS_CORE=1
+fi
+
+if [ "$1" == "h2o-node" ]; then
+ H2O=1
+fi
+
+# some directories
+THIS_DIR=`dirname "$THIS"`
+MAHOUT_HOME=`cd "$THIS_DIR/.." ; pwd`
+
+# some Java parameters
+if [ "$MAHOUT_JAVA_HOME" != "" ]; then
+ #echo "run java in $MAHOUT_JAVA_HOME"
+ JAVA_HOME=$MAHOUT_JAVA_HOME
+fi
+
+if [ "$JAVA_HOME" = "" ]; then
+ echo "Error: JAVA_HOME is not set."
+ exit 1
+fi
+
+JAVA=$JAVA_HOME/bin/java
+JAVA_HEAP_MAX=-Xmx4g
+
+# check envvars which might override default args
+if [ "$MAHOUT_HEAPSIZE" != "" ]; then
+ #echo "run with heapsize $MAHOUT_HEAPSIZE"
+ JAVA_HEAP_MAX="-Xmx""$MAHOUT_HEAPSIZE""m"
+ #echo $JAVA_HEAP_MAX
+fi
+
+if [ "x$MAHOUT_CONF_DIR" = "x" ]; then
+ if [ -d $MAHOUT_HOME/src/conf ]; then
+ MAHOUT_CONF_DIR=$MAHOUT_HOME/src/conf
+ else
+ if [ -d $MAHOUT_HOME/conf ]; then
+ MAHOUT_CONF_DIR=$MAHOUT_HOME/conf
+ else
+ echo No MAHOUT_CONF_DIR found
+ fi
+ fi
+fi
+
+
+# CLASSPATH initially contains $MAHOUT_CONF_DIR, or defaults to $MAHOUT_HOME/src/conf
+CLASSPATH=${CLASSPATH}:$MAHOUT_CONF_DIR
+
+if [ "$MAHOUT_LOCAL" != "" ]; then
+ echo "MAHOUT_LOCAL is set, so we don't add HADOOP_CONF_DIR to classpath."
+elif [ -n "$HADOOP_CONF_DIR" ] ; then
+ echo "MAHOUT_LOCAL is not set; adding HADOOP_CONF_DIR to classpath."
+ CLASSPATH=${CLASSPATH}:$HADOOP_CONF_DIR
+fi
+
+CLASSPATH=${CLASSPATH}:$JAVA_HOME/lib/tools.jar
+
+# so that filenames w/ spaces are handled correctly in loops below
+IFS=
+
+if [ $IS_CORE == 0 ]
+then
+ # add release dependencies to CLASSPATH
+ for f in $MAHOUT_HOME/lib/*.jar; do
+ CLASSPATH=${CLASSPATH}:$f;
+ done
+
+ if [ "$SPARK" != "1" ]; then
+ if [$SPARK_HOME == ""]; then
+ echo "Have you set SPARK_HOME ?"
+ fi
+ # add dev targets if they exist
+ for f in $MAHOUT_HOME/examples/target/mahout-examples-*-job.jar $MAHOUT_HOME/mahout-examples-*-job.jar ; do
+ CLASSPATH=${CLASSPATH}:$f;
+ done
+ fi
+
+ # add scala dev target
+ for f in $MAHOUT_HOME/math-scala/target/mahout-math-scala_*.jar ; do
+ CLASSPATH=${CLASSPATH}:$f;
+ done
+
+ if [ "$H2O" == "1" ]; then
+ for f in $MAHOUT_HOME/hdfs/target/mahout-hdfs-*.jar; do
+ CLASSPATH=${CLASSPATH}:$f;
+ done
+
+ for f in $MAHOUT_HOME/h2o/target/mahout-h2o*.jar; do
+ CLASSPATH=${CLASSPATH}:$f;
+ done
+
+ fi
+
+ # add jars for running from the command line if we requested shell or spark CLI driver
+ if [ "$SPARK" == "1" ]; then
+
+ for f in $MAHOUT_HOME/lib/mahout-hdfs-*.jar ; do
+ CLASSPATH=${CLASSPATH}:$f;
+ done
+
+ for f in $MAHOUT_HOME/lib/mahout-core-*.jar ; do
+ CLASSPATH=${CLASSPATH}:$f;
+ done
+
+ for f in $MAHOUT_HOME/lib/spark_*.jar ; do
+ CLASSPATH=${CLASSPATH}:$f;
+ done
+
+ for f in $MAHOUT_HOME/lib/spark-cli_*.jar ; do
+ CLASSPATH=${CLASSPATH}:$f;
+ done
+
+ # viennacl jars- may or may not be available depending on build profile
+ for f in $MAHOUT_HOME/viennacl/target/mahout-native-viennacl_*.jar ; do
+ CLASSPATH=${CLASSPATH}:$f;
+ done
+
+ # viennacl jars- may or may not be available depending on build profile
+ for f in $MAHOUT_HOME/viennacl-omp/target/mahout-native-viennacl-omp_*.jar ; do
+ CLASSPATH=${CLASSPATH}:$f;
+ done
+
+
+ SPARK_CP_BIN="${MAHOUT_HOME}/bin/compute-classpath.sh"
+ if [ -x "${SPARK_CP_BIN}" ]; then
+ SPARK_CLASSPATH=$("${SPARK_CP_BIN}" 2>/dev/null)
+ CLASSPATH="${CLASSPATH}:${SPARK_CLASSPATH}"
+ else
+ echo "Cannot find Spark classpath. Is 'SPARK_HOME' set?"
+ exit -1
+ fi
+
+ SPARK_ASSEMBLY_BIN="${MAHOUT_HOME}/bin/mahout-spark-class.sh"
+ if [ -x "${SPARK_ASSEMBLY_BIN}" ]; then
+ SPARK_ASSEMBLY_CLASSPATH=$("${SPARK_ASSEMBLY_BIN}" 2>/dev/null)
+ CLASSPATH="${CLASSPATH}:${SPARK_ASSEMBLY_BIN}"
+ else
+ echo "Cannot find Spark assembly classpath. Is 'SPARK_HOME' set?"
+ exit -1
+ fi
+ fi
+
+ # add vcl jars at any point.
+ # viennacl jars- may or may not be available depending on build profile
+ for f in $MAHOUT_HOME/viennacl/target/mahout-native-viennacl_*.jar ; do
+ CLASSPATH=${CLASSPATH}:$f;
+ done
+
+ # viennacl jars- may or may not be available depending on build profile
+ for f in $MAHOUT_HOME/viennacl-omp/target/mahout-native-viennacl-omp_*.jar ; do
+ CLASSPATH=${CLASSPATH}:$f;
+ done
+
+ # add release dependencies to CLASSPATH
+ for f in $MAHOUT_HOME/lib/*.jar; do
+ CLASSPATH=${CLASSPATH}:$f;
+ done
+else
+ CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/math/target/classes
+ CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/hdfs/target/classes
+ CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/mr/target/classes
+ CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/integration/target/classes
+ CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/examples/target/classes
+ CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/math-scala/target/classes
+ CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/spark/target/classes
+ CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/spark-shell/target/classes
+ CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/h2o/target/classes
+fi
+
+# add development dependencies to CLASSPATH
+if [ "$SPARK" != "1" ]; then
+ for f in $MAHOUT_HOME/examples/target/dependency/*.jar; do
+ CLASSPATH=${CLASSPATH}:$f;
+ done
+fi
+
+
+# cygwin path translation
+if $cygwin; then
+ CLASSPATH=`cygpath -p -w "$CLASSPATH"`
+fi
+
+# restore ordinary behaviour
+unset IFS
+JARS=$(echo "$MAHOUT_HOME"/*.jar | tr ' ' ',')
+case "$1" in
+ (spark-shell)
+ save_stty=$(stty -g 2>/dev/null);
+ $SPARK_HOME/bin/spark-shell --jars "$JARS" -i $MAHOUT_HOME/bin/load-shell.scala --conf spark.kryo.referenceTracking=false --conf spark.kryo.registrator=org.apache.mahout.sparkbindings.io.MahoutKryoRegistrator --conf spark.kryoserializer.buffer=32k --conf spark.kryoserializer.buffer.max=600m --conf spark.serializer=org.apache.spark.serializer.KryoSerializer $@
+ stty sane; stty $save_stty
+ ;;
+ # Spark CLI drivers go here
+ (spark-itemsimilarity)
+ shift
+ "$JAVA" $JAVA_HEAP_MAX -classpath "$CLASSPATH" "org.apache.mahout.drivers.ItemSimilarityDriver" "$@"
+ ;;
+ (spark-rowsimilarity)
+ shift
+ "$JAVA" $JAVA_HEAP_MAX -classpath "$CLASSPATH" "org.apache.mahout.drivers.RowSimilarityDriver" "$@"
+ ;;
+ (spark-trainnb)
+ shift
+ "$JAVA" $JAVA_HEAP_MAX -classpath "$CLASSPATH" "org.apache.mahout.drivers.TrainNBDriver" "$@"
+ ;;
+ (spark-testnb)
+ shift
+ "$JAVA" $JAVA_HEAP_MAX -classpath "$CLASSPATH" "org.apache.mahout.drivers.TestNBDriver" "$@"
+ ;;
+
+ (h2o-node)
+ shift
+ "$JAVA" $JAVA_HEAP_MAX -classpath "$CLASSPATH" "water.H2O" -md5skip "$@" -name mah2out
+ ;;
+ (*)
+
+ # default log directory & file
+ if [ "$MAHOUT_LOG_DIR" = "" ]; then
+ MAHOUT_LOG_DIR="$MAHOUT_HOME/logs"
+ fi
+ if [ "$MAHOUT_LOGFILE" = "" ]; then
+ MAHOUT_LOGFILE='mahout.log'
+ fi
+
+ #Fix log path under cygwin
+ if $cygwin; then
+ MAHOUT_LOG_DIR=`cygpath -p -w "$MAHOUT_LOG_DIR"`
+ fi
+
+ MAHOUT_OPTS="$MAHOUT_OPTS -Dhadoop.log.dir=$MAHOUT_LOG_DIR"
+ MAHOUT_OPTS="$MAHOUT_OPTS -Dhadoop.log.file=$MAHOUT_LOGFILE"
+
+
+ if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then
+ MAHOUT_OPTS="$MAHOUT_OPTS -Djava.library.path=$JAVA_LIBRARY_PATH"
+ fi
+
+ CLASS=org.apache.mahout.driver.MahoutDriver
+
+ for f in $MAHOUT_HOME/examples/target/mahout-examples-*-job.jar $MAHOUT_HOME/mahout-examples-*-job.jar ; do
+ if [ -e "$f" ]; then
+ MAHOUT_JOB=$f
+ fi
+ done
+
+ # run it
+
+ HADOOP_BINARY=$(PATH="${HADOOP_HOME:-${HADOOP_PREFIX}}/bin:$PATH" which hadoop 2>/dev/null)
+ if [ -x "$HADOOP_BINARY" ] ; then
+ HADOOP_BINARY_CLASSPATH=$("$HADOOP_BINARY" classpath)
+ fi
+ if [ ! -x "$HADOOP_BINARY" ] || [ "$MAHOUT_LOCAL" != "" ] ; then
+ if [ ! -x "$HADOOP_BINARY" ] ; then
+ echo "hadoop binary is not in PATH,HADOOP_HOME/bin,HADOOP_PREFIX/bin, running locally"
+ elif [ "$MAHOUT_LOCAL" != "" ] ; then
+ echo "MAHOUT_LOCAL is set, running locally"
+ fi
+ CLASSPATH="${CLASSPATH}:${MAHOUT_HOME}/lib/hadoop/*"
+ case $1 in
+ (classpath)
+ echo $CLASSPATH
+ ;;
+ (*)
+ exec "$JAVA" $JAVA_HEAP_MAX $MAHOUT_OPTS -classpath "$CLASSPATH" $CLASS "$@"
+ esac
+ else
+ echo "Running on hadoop, using $HADOOP_BINARY and HADOOP_CONF_DIR=$HADOOP_CONF_DIR"
+
+ if [ "$MAHOUT_JOB" = "" ] ; then
+ echo "ERROR: Could not find mahout-examples-*.job in $MAHOUT_HOME or $MAHOUT_HOME/examples/target, please run 'mvn install' to create the .job file"
+ exit 1
+ else
+ case "$1" in
+ (hadoop)
+ shift
+ export HADOOP_CLASSPATH=$MAHOUT_CONF_DIR:${HADOOP_CLASSPATH}:$CLASSPATH
+ exec "$HADOOP_BINARY" "$@"
+ ;;
+ (classpath)
+ echo $CLASSPATH
+ ;;
+ (*)
+ echo "MAHOUT-JOB: $MAHOUT_JOB"
+ export HADOOP_CLASSPATH=$MAHOUT_CONF_DIR:${HADOOP_CLASSPATH}
+ exec "$HADOOP_BINARY" jar $MAHOUT_JOB $CLASS "$@"
+ esac
+ fi
+ fi
+ ;;
+esac
+
http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/community/mahout-mr/bin/mahout
----------------------------------------------------------------------
diff --git a/community/mahout-mr/bin/mahout b/community/mahout-mr/bin/mahout
new file mode 100755
index 0000000..3017c9e
--- /dev/null
+++ b/community/mahout-mr/bin/mahout
@@ -0,0 +1,395 @@
+#!/bin/bash
+#
+# The Mahout command script
+#
+# Environment Variables
+#
+# MAHOUT_JAVA_HOME The java implementation to use. Overrides JAVA_HOME.
+#
+# MAHOUT_HEAPSIZE The maximum amount of heap to use, in MB.
+# Default is 4000.
+#
+# HADOOP_CONF_DIR The location of a hadoop config directory
+#
+# MAHOUT_OPTS Extra Java runtime options.
+#
+# MAHOUT_CONF_DIR The location of the program short-name to class name
+# mappings and the default properties files
+# defaults to "$MAHOUT_HOME/src/conf"
+#
+# MAHOUT_LOCAL set to anything other than an empty string to force
+# mahout to run locally even if
+# HADOOP_CONF_DIR and HADOOP_HOME are set
+#
+# MAHOUT_CORE set to anything other than an empty string to force
+# mahout to run in developer 'core' mode, just as if the
+# -core option was presented on the command-line
+# Command-line Options
+#
+# -core -core is used to switch into 'developer mode' when
+# running mahout locally. If specified, the classes
+# from the 'target/classes' directories in each project
+# are used. Otherwise classes will be retrieved from
+# jars in the binary release collection or *-job.jar files
+# found in build directories. When running on hadoop
+# the job files will always be used.
+
+#
+#/**
+# * Licensed to the Apache Software Foundation (ASF) under one or more
+# * contributor license agreements. See the NOTICE file distributed with
+# * this work for additional information regarding copyright ownership.
+# * The ASF licenses this file to You under the Apache License, Version 2.0
+# * (the "License"); you may not use this file except in compliance with
+# * the License. You may obtain a copy of the License at
+# *
+# * http://www.apache.org/licenses/LICENSE-2.0
+# *
+# * Unless required by applicable law or agreed to in writing, software
+# * distributed under the License is distributed on an "AS IS" BASIS,
+# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# * See the License for the specific language governing permissions and
+# * limitations under the License.
+# */
+
+cygwin=false
+case "`uname`" in
+CYGWIN*) cygwin=true;;
+esac
+
+# resolve links - $0 may be a softlink
+THIS="$0"
+while [ -h "$THIS" ]; do
+ ls=`ls -ld "$THIS"`
+ link=`expr "$ls" : '.*-> \(.*\)$'`
+ if expr "$link" : '.*/.*' > /dev/null; then
+ THIS="$link"
+ else
+ THIS=`dirname "$THIS"`/"$link"
+ fi
+done
+
+IS_CORE=0
+if [ "$1" == "-core" ] ; then
+ IS_CORE=1
+ shift
+fi
+
+if [ "$1" == "-spark" ]; then
+ SPARK=1
+ shift
+fi
+
+if [ "$1" == "spark-shell" ]; then
+ SPARK=1
+fi
+
+if [ "$1" == "spark-itemsimilarity" ]; then
+ SPARK=1
+fi
+
+if [ "$1" == "spark-rowsimilarity" ]; then
+ SPARK=1
+fi
+
+if [ "$1" == "spark-trainnb" ]; then
+ SPARK=1
+fi
+
+if [ "$1" == "spark-testnb" ]; then
+ SPARK=1
+fi
+
+if [ "$MAHOUT_CORE" != "" ]; then
+ IS_CORE=1
+fi
+
+if [ "$1" == "h2o-node" ]; then
+ H2O=1
+fi
+
+# some directories
+THIS_DIR=`dirname "$THIS"`
+MAHOUT_HOME=`cd "$THIS_DIR/.." ; pwd`
+
+# some Java parameters
+if [ "$MAHOUT_JAVA_HOME" != "" ]; then
+ #echo "run java in $MAHOUT_JAVA_HOME"
+ JAVA_HOME=$MAHOUT_JAVA_HOME
+fi
+
+if [ "$JAVA_HOME" = "" ]; then
+ echo "Error: JAVA_HOME is not set."
+ exit 1
+fi
+
+JAVA=$JAVA_HOME/bin/java
+JAVA_HEAP_MAX=-Xmx4g
+
+# check envvars which might override default args
+if [ "$MAHOUT_HEAPSIZE" != "" ]; then
+ #echo "run with heapsize $MAHOUT_HEAPSIZE"
+ JAVA_HEAP_MAX="-Xmx""$MAHOUT_HEAPSIZE""m"
+ #echo $JAVA_HEAP_MAX
+fi
+
+if [ "x$MAHOUT_CONF_DIR" = "x" ]; then
+ if [ -d $MAHOUT_HOME/src/conf ]; then
+ MAHOUT_CONF_DIR=$MAHOUT_HOME/src/conf
+ else
+ if [ -d $MAHOUT_HOME/conf ]; then
+ MAHOUT_CONF_DIR=$MAHOUT_HOME/conf
+ else
+ echo No MAHOUT_CONF_DIR found
+ fi
+ fi
+fi
+
+
+# CLASSPATH initially contains $MAHOUT_CONF_DIR, or defaults to $MAHOUT_HOME/src/conf
+CLASSPATH=${CLASSPATH}:$MAHOUT_CONF_DIR
+
+if [ "$MAHOUT_LOCAL" != "" ]; then
+ echo "MAHOUT_LOCAL is set, so we don't add HADOOP_CONF_DIR to classpath."
+elif [ -n "$HADOOP_CONF_DIR" ] ; then
+ echo "MAHOUT_LOCAL is not set; adding HADOOP_CONF_DIR to classpath."
+ CLASSPATH=${CLASSPATH}:$HADOOP_CONF_DIR
+fi
+
+CLASSPATH=${CLASSPATH}:$JAVA_HOME/lib/tools.jar
+
+# so that filenames w/ spaces are handled correctly in loops below
+IFS=
+
+if [ $IS_CORE == 0 ]
+then
+ # add release dependencies to CLASSPATH
+ for f in $MAHOUT_HOME/mahout-*.jar; do
+ CLASSPATH=${CLASSPATH}:$f;
+ done
+
+ if [ "$SPARK" != "1" ]; then
+
+ # add dev targets if they exist
+ for f in $MAHOUT_HOME/examples/target/mahout-examples-*-job.jar $MAHOUT_HOME/mahout-examples-*-job.jar ; do
+ CLASSPATH=${CLASSPATH}:$f;
+ done
+ fi
+
+ # add scala dev target
+ for f in $MAHOUT_HOME/math-scala/target/mahout-math-scala_*.jar ; do
+ CLASSPATH=${CLASSPATH}:$f;
+ done
+
+ if [ "$H2O" == "1" ]; then
+ for f in $MAHOUT_HOME/hdfs/target/mahout-hdfs-*.jar; do
+ CLASSPATH=${CLASSPATH}:$f;
+ done
+
+ for f in $MAHOUT_HOME/h2o/target/mahout-h2o*.jar; do
+ CLASSPATH=${CLASSPATH}:$f;
+ done
+
+ fi
+
+ # add jars for running from the command line if we requested shell or spark CLI driver
+ if [ "$SPARK" == "1" ]; then
+
+ for f in $MAHOUT_HOME/hdfs/target/mahout-hdfs-*.jar ; do
+ CLASSPATH=${CLASSPATH}:$f;
+ done
+
+ for f in $MAHOUT_HOME/math/target/mahout-math-*.jar ; do
+ CLASSPATH=${CLASSPATH}:$f;
+ done
+
+ for f in $MAHOUT_HOME/spark/target/mahout-spark_*.jar ; do
+ CLASSPATH=${CLASSPATH}:$f;
+ done
+
+ for f in $MAHOUT_HOME/spark-shell/target/mahout-spark-shell_*.jar ; do
+ CLASSPATH=${CLASSPATH}:$f;
+ done
+
+ # viennacl jars- may or may not be available depending on build profile
+ for f in $MAHOUT_HOME/viennacl/target/mahout-native-viennacl_*.jar ; do
+ CLASSPATH=${CLASSPATH}:$f;
+ done
+
+ # viennacl jars- may or may not be available depending on build profile
+ for f in $MAHOUT_HOME/viennacl-omp/target/mahout-native-viennacl-omp_*.jar ; do
+ CLASSPATH=${CLASSPATH}:$f;
+ done
+
+ # viennacl jars- may or may not be available depending on build profile
+ for f in $MAHOUT_HOME/viennacl-omp/target/mahout-native-viennacl-omp_*.jar ; do
+ CLASSPATH=${CLASSPATH}:$f;
+ done
+
+ SPARK_CP_BIN="${MAHOUT_HOME}/bin/compute-classpath.sh"
+ if [ -x "${SPARK_CP_BIN}" ]; then
+ SPARK_CLASSPATH=$("${SPARK_CP_BIN}" 2>/dev/null)
+ CLASSPATH="${CLASSPATH}:${SPARK_CLASSPATH}"
+ else
+ echo "Cannot find Spark classpath. Is 'SPARK_HOME' set?"
+ exit -1
+ fi
+
+ SPARK_ASSEMBLY_BIN="${MAHOUT_HOME}/bin/mahout-spark-class.sh"
+ if [ -x "${SPARK_ASSEMBLY_BIN}" ]; then
+ SPARK_ASSEMBLY_CLASSPATH=$("${SPARK_ASSEMBLY_BIN}" 2>/dev/null)
+ CLASSPATH="${CLASSPATH}:${SPARK_ASSEMBLY_BIN}"
+ else
+ echo "Cannot find Spark assembly classpath. Is 'SPARK_HOME' set?"
+ exit -1
+ fi
+ fi
+
+ # add vcl jars at any point.
+ # viennacl jars- may or may not be available depending on build profile
+ for f in $MAHOUT_HOME/viennacl/target/mahout-native-viennacl_*.jar ; do
+ CLASSPATH=${CLASSPATH}:$f;
+ done
+
+ # viennacl jars- may or may not be available depending on build profile
+ for f in $MAHOUT_HOME/viennacl-omp/target/mahout-native-viennacl-omp_*.jar ; do
+ CLASSPATH=${CLASSPATH}:$f;
+ done
+
+ # add release dependencies to CLASSPATH
+ for f in $MAHOUT_HOME/lib/*.jar; do
+ CLASSPATH=${CLASSPATH}:$f;
+ done
+else
+ CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/math/target/classes
+ CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/hdfs/target/classes
+ CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/mr/target/classes
+ CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/integration/target/classes
+ CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/examples/target/classes
+ CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/math-scala/target/classes
+ CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/spark/target/classes
+ CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/spark-shell/target/classes
+ CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/h2o/target/classes
+fi
+
+# add development dependencies to CLASSPATH
+if [ "$SPARK" != "1" ]; then
+ for f in $MAHOUT_HOME/examples/target/dependency/*.jar; do
+ CLASSPATH=${CLASSPATH}:$f;
+ done
+fi
+
+
+# cygwin path translation
+if $cygwin; then
+ CLASSPATH=`cygpath -p -w "$CLASSPATH"`
+fi
+
+# restore ordinary behaviour
+unset IFS
+JARS=$(echo "$MAHOUT_HOME"/*.jar | tr ' ' ',')
+case "$1" in
+ (spark-shell)
+ save_stty=$(stty -g 2>/dev/null);
+ $SPARK_HOME/bin/spark-shell --jars "$JARS" -i $MAHOUT_HOME/bin/load-shell.scala --conf spark.kryo.referenceTracking=false --conf spark.kryo.registrator=org.apache.mahout.sparkbindings.io.MahoutKryoRegistrator --conf spark.kryoserializer.buffer=32k --conf spark.kryoserializer.buffer.max=600m --conf spark.serializer=org.apache.spark.serializer.KryoSerializer $@
+ stty sane; stty $save_stty
+ ;;
+ # Spark CLI drivers go here
+ (spark-itemsimilarity)
+ shift
+ "$JAVA" $JAVA_HEAP_MAX -classpath "$CLASSPATH" "org.apache.mahout.drivers.ItemSimilarityDriver" "$@"
+ ;;
+ (spark-rowsimilarity)
+ shift
+ "$JAVA" $JAVA_HEAP_MAX -classpath "$CLASSPATH" "org.apache.mahout.drivers.RowSimilarityDriver" "$@"
+ ;;
+ (spark-trainnb)
+ shift
+ "$JAVA" $JAVA_HEAP_MAX -classpath "$CLASSPATH" "org.apache.mahout.drivers.TrainNBDriver" "$@"
+ ;;
+ (spark-testnb)
+ shift
+ "$JAVA" $JAVA_HEAP_MAX -classpath "$CLASSPATH" "org.apache.mahout.drivers.TestNBDriver" "$@"
+ ;;
+
+ (h2o-node)
+ shift
+ "$JAVA" $JAVA_HEAP_MAX -classpath "$CLASSPATH" "water.H2O" -md5skip "$@" -name mah2out
+ ;;
+ (*)
+
+ # default log directory & file
+ if [ "$MAHOUT_LOG_DIR" = "" ]; then
+ MAHOUT_LOG_DIR="$MAHOUT_HOME/logs"
+ fi
+ if [ "$MAHOUT_LOGFILE" = "" ]; then
+ MAHOUT_LOGFILE='mahout.log'
+ fi
+
+ #Fix log path under cygwin
+ if $cygwin; then
+ MAHOUT_LOG_DIR=`cygpath -p -w "$MAHOUT_LOG_DIR"`
+ fi
+
+ MAHOUT_OPTS="$MAHOUT_OPTS -Dhadoop.log.dir=$MAHOUT_LOG_DIR"
+ MAHOUT_OPTS="$MAHOUT_OPTS -Dhadoop.log.file=$MAHOUT_LOGFILE"
+
+
+ if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then
+ MAHOUT_OPTS="$MAHOUT_OPTS -Djava.library.path=$JAVA_LIBRARY_PATH"
+ fi
+
+ CLASS=org.apache.mahout.driver.MahoutDriver
+
+ for f in $MAHOUT_HOME/examples/target/mahout-examples-*-job.jar $MAHOUT_HOME/mahout-examples-*-job.jar ; do
+ if [ -e "$f" ]; then
+ MAHOUT_JOB=$f
+ fi
+ done
+
+ # run it
+
+ HADOOP_BINARY=$(PATH="${HADOOP_HOME:-${HADOOP_PREFIX}}/bin:$PATH" which hadoop 2>/dev/null)
+ if [ -x "$HADOOP_BINARY" ] ; then
+ HADOOP_BINARY_CLASSPATH=$("$HADOOP_BINARY" classpath)
+ fi
+ if [ ! -x "$HADOOP_BINARY" ] || [ "$MAHOUT_LOCAL" != "" ] ; then
+ if [ ! -x "$HADOOP_BINARY" ] ; then
+ echo "hadoop binary is not in PATH,HADOOP_HOME/bin,HADOOP_PREFIX/bin, running locally"
+ elif [ "$MAHOUT_LOCAL" != "" ] ; then
+ echo "MAHOUT_LOCAL is set, running locally"
+ fi
+ CLASSPATH="${CLASSPATH}:${MAHOUT_HOME}/lib/hadoop/*"
+ case $1 in
+ (classpath)
+ echo $CLASSPATH
+ ;;
+ (*)
+ exec "$JAVA" $JAVA_HEAP_MAX $MAHOUT_OPTS -classpath "$CLASSPATH" $CLASS "$@"
+ esac
+ else
+ echo "Running on hadoop, using $HADOOP_BINARY and HADOOP_CONF_DIR=$HADOOP_CONF_DIR"
+
+ if [ "$MAHOUT_JOB" = "" ] ; then
+ echo "ERROR: Could not find mahout-examples-*.job in $MAHOUT_HOME or $MAHOUT_HOME/examples/target, please run 'mvn install' to create the .job file"
+ exit 1
+ else
+ case "$1" in
+ (hadoop)
+ shift
+ export HADOOP_CLASSPATH=$MAHOUT_CONF_DIR:${HADOOP_CLASSPATH}:$CLASSPATH
+ exec "$HADOOP_BINARY" "$@"
+ ;;
+ (classpath)
+ echo $CLASSPATH
+ ;;
+ (*)
+ echo "MAHOUT-JOB: $MAHOUT_JOB"
+ export HADOOP_CLASSPATH=$MAHOUT_CONF_DIR:${HADOOP_CLASSPATH}
+ exec "$HADOOP_BINARY" jar $MAHOUT_JOB $CLASS "$@"
+ esac
+ fi
+ fi
+ ;;
+esac
+
http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/community/mahout-mr/bin/mahout.cmd
----------------------------------------------------------------------
diff --git a/community/mahout-mr/bin/mahout.cmd b/community/mahout-mr/bin/mahout.cmd
new file mode 100644
index 0000000..86bae79
--- /dev/null
+++ b/community/mahout-mr/bin/mahout.cmd
@@ -0,0 +1,397 @@
+@echo off
+
+echo "===============DEPRECATION WARNING==============="
+echo "This script is no longer supported for new drivers as of Mahout 0.10.0"
+echo "Mahout's bash script is supported and if someone wants to contribute a fix for this"
+echo "it would be appreciated."
+
+
+@rem
+@rem The Mahout command script
+@rem
+@rem Environment Variables
+@rem
+@rem MAHOUT_JAVA_HOME The java implementation to use. Overrides JAVA_HOME.
+@rem
+@rem MAHOUT_HEAPSIZE The maximum amount of heap to use, in MB.
+@rem Default is 1000.
+@rem
+@rem HADOOP_CONF_DIR The location of a hadoop config directory
+@rem
+@rem MAHOUT_OPTS Extra Java runtime options.
+@rem
+@rem MAHOUT_CONF_DIR The location of the program short-name to class name
+@rem mappings and the default properties files
+@rem defaults to "$MAHOUT_HOME/src/conf"
+@rem
+@rem MAHOUT_LOCAL set to anything other than an empty string to force
+@rem mahout to run locally even if
+@rem HADOOP_CONF_DIR and HADOOP_HOME are set
+@rem
+@rem MAHOUT_CORE set to anything other than an empty string to force
+@rem mahout to run in developer 'core' mode, just as if the
+@rem -core option was presented on the command-line
+@rem Command-line Options
+@rem
+@rem -core -core is used to switch into 'developer mode' when
+@rem running mahout locally. If specified, the classes
+@rem from the 'target/classes' directories in each project
+@rem are used. Otherwise classes will be retrieved from
+@rem jars in the binary release collection or *-job.jar files
+@rem found in build directories. When running on hadoop
+@rem the job files will always be used.
+
+@rem
+@rem /*
+@rem * Licensed to the Apache Software Foundation (ASF) under one or more
+@rem * contributor license agreements. See the NOTICE file distributed with
+@rem * this work for additional information regarding copyright ownership.
+@rem * The ASF licenses this file to You under the Apache License, Version 2.0
+@rem * (the "License"); you may not use this file except in compliance with
+@rem * the License. You may obtain a copy of the License at
+@rem *
+@rem * http://www.apache.org/licenses/LICENSE-2.0
+@rem *
+@rem * Unless required by applicable law or agreed to in writing, software
+@rem * distributed under the License is distributed on an "AS IS" BASIS,
+@rem * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@rem * See the License for the specific language governing permissions and
+@rem * limitations under the License.
+@rem */
+
+setlocal enabledelayedexpansion
+
+@rem disable "developer mode"
+set IS_CORE=0
+if [%1] == [-core] (
+ set IS_CORE=1
+ shift
+)
+
+if not [%MAHOUT_CORE%] == [] (
+set IS_CORE=1
+)
+
+if [%MAHOUT_HOME%] == [] set MAHOUT_HOME=%~dp0..
+
+echo "Mahout home set %MAHOUT_HOME%"
+
+@rem some Java parameters
+if not [%MAHOUT_JAVA_HOME%] == [] (
+@rem echo run java in %MAHOUT_JAVA_HOME%
+set JAVA_HOME=%MAHOUT_JAVA_HOME%
+)
+
+if [%JAVA_HOME%] == [] (
+ echo Error: JAVA_HOME is not set.
+ exit /B 1
+)
+
+set JAVA=%JAVA_HOME%\bin\java
+set JAVA_HEAP_MAX=-Xmx3g
+
+@rem check envvars which might override default args
+if not [%MAHOUT_HEAPSIZE%] == [] (
+@rem echo run with heapsize %MAHOUT_HEAPSIZE%
+set JAVA_HEAP_MAX=-Xmx%MAHOUT_HEAPSIZE%m
+@rem echo %JAVA_HEAP_MAX%
+)
+
+if [%MAHOUT_CONF_DIR%] == [] (
+set MAHOUT_CONF_DIR=%MAHOUT_HOME%\conf
+)
+
+:main
+@rem MAHOUT_CLASSPATH initially contains $MAHOUT_CONF_DIR, or defaults to $MAHOUT_HOME/src/conf
+set CLASSPATH=%CLASSPATH%;%MAHOUT_CONF_DIR%
+
+if not [%MAHOUT_LOCAL%] == [] (
+echo "MAHOUT_LOCAL is set, so we do not add HADOOP_CONF_DIR to classpath."
+) else (
+if not [%HADOOP_CONF_DIR%] == [] (
+echo "MAHOUT_LOCAL is not set; adding HADOOP_CONF_DIR to classpath."
+set CLASSPATH=%CLASSPATH%;%HADOOP_CONF_DIR%
+)
+)
+
+set CLASSPATH=%CLASSPATH%;%JAVA_HOME%\lib\tools.jar
+
+if %IS_CORE% == 0 (
+@rem add release dependencies to CLASSPATH
+for %%f in (%MAHOUT_HOME%\mahout-*.jar) do (
+set CLASSPATH=!CLASSPATH!;%%f
+)
+@rem add dev targets if they exist
+for %%f in (%MAHOUT_HOME%\examples\target\mahout-examples-*-job.jar) do (
+set CLASSPATH=!CLASSPATH!;%%f
+)
+for %%f in (%MAHOUT_HOME%\mahout-examples-*-job.jar) do (
+set CLASSPATH=!CLASSPATH!;%%f
+)
+@rem add release dependencies to CLASSPATH
+for %%f in (%MAHOUT_HOME%\lib\*.jar) do (
+set CLASSPATH=!CLASSPATH!;%%f
+)
+) else (
+set CLASSPATH=!CLASSPATH!;%MAHOUT_HOME%\math\target\classes
+set CLASSPATH=!CLASSPATH!;%MAHOUT_HOME%\core\target\classes
+set CLASSPATH=!CLASSPATH!;%MAHOUT_HOME%\integration\target\classes
+set CLASSPATH=!CLASSPATH!;%MAHOUT_HOME%\examples\target\classes
+@rem set CLASSPATH=%CLASSPATH%;%MAHOUT_HOME%\core\src\main\resources
+)
+
+@rem add development dependencies to CLASSPATH
+for %%f in (%MAHOUT_HOME%\examples\target\dependency\*.jar) do (
+set CLASSPATH=!CLASSPATH!;%%f
+)
+
+@rem default log directory & file
+if [%MAHOUT_LOG_DIR%] == [] (
+set MAHOUT_LOG_DIR=%MAHOUT_HOME%\logs
+)
+if [%MAHOUT_LOGFILE%] == [] (
+set MAHOUT_LOGFILE=mahout.log
+)
+
+set MAHOUT_OPTS=%MAHOUT_OPTS% -Dhadoop.log.dir=%MAHOUT_LOG_DIR%
+set MAHOUT_OPTS=%MAHOUT_OPTS% -Dhadoop.log.file=%MAHOUT_LOGFILE%
+
+if not [%JAVA_LIBRARY_PATH%] == [] (
+set MAHOUT_OPTS=%MAHOUT_OPTS% -Djava.library.path=%JAVA_LIBRARY_PATH%
+)
+
+set CLASS=org.apache.mahout.driver.MahoutDriver
+
+for %%f in (%MAHOUT_HOME%\examples\target\mahout-examples-*-job.jar) do (
+set MAHOUT_JOB=%%f
+)
+
+@rem run it
+
+if not [%MAHOUT_LOCAL%] == [] (
+ echo "MAHOUT_LOCAL is set, running locally"
+ %JAVA% %JAVA_HEAP_MAX% %MAHOUT_OPTS% -classpath %MAHOUT_CLASSPATH% %CLASS% %*
+) else (
+ if [%MAHOUT_JOB%] == [] (
+ echo "ERROR: Could not find mahout-examples-*.job in %MAHOUT_HOME% or %MAHOUT_HOME%/examples/target, please run 'mvn install' to create the .job file"
+ exit /B 1
+ ) else (
+ set HADOOP_CLASSPATH=%MAHOUT_CLASSPATH%
+ if /i [%1] == [hadoop] (
+shift
+set HADOOP_CLASSPATH=%MAHOUT_CONF_DIR%;%HADOOP_CLASSPATH%
+ call %HADOOP_HOME%\bin\%*
+ ) else (
+if /i [%1] == [classpath] (
+echo %CLASSPATH%
+) else (
+echo MAHOUT_JOB: %MAHOUT_JOB%
+set HADOOP_CLASSPATH=%MAHOUT_CONF_DIR%;%HADOOP_CLASSPATH%
+set HADOOP_CLIENT_OPTS=%JAVA_HEAP_MAX%
+call %HADOOP_HOME%\bin\hadoop jar %MAHOUT_JOB% %CLASS% %*
+)
+
+ )
+ )
+)
+@echo off
+
+@rem
+@rem The Mahout command script
+@rem
+@rem Environment Variables
+@rem
+@rem MAHOUT_JAVA_HOME The java implementation to use. Overrides JAVA_HOME.
+@rem
+@rem MAHOUT_HEAPSIZE The maximum amount of heap to use, in MB.
+@rem Default is 1000.
+@rem
+@rem HADOOP_CONF_DIR The location of a hadoop config directory
+@rem
+@rem MAHOUT_OPTS Extra Java runtime options.
+@rem
+@rem MAHOUT_CONF_DIR The location of the program short-name to class name
+@rem mappings and the default properties files
+@rem defaults to "$MAHOUT_HOME/src/conf"
+@rem
+@rem MAHOUT_LOCAL set to anything other than an empty string to force
+@rem mahout to run locally even if
+@rem HADOOP_CONF_DIR and HADOOP_HOME are set
+@rem
+@rem MAHOUT_CORE set to anything other than an empty string to force
+@rem mahout to run in developer 'core' mode, just as if the
+@rem -core option was presented on the command-line
+@rem Command-line Options
+@rem
+@rem -core -core is used to switch into 'developer mode' when
+@rem running mahout locally. If specified, the classes
+@rem from the 'target/classes' directories in each project
+@rem are used. Otherwise classes will be retrieved from
+@rem jars in the binary release collection or *-job.jar files
+@rem found in build directories. When running on hadoop
+@rem the job files will always be used.
+
+@rem
+@rem /*
+@rem * Licensed to the Apache Software Foundation (ASF) under one or more
+@rem * contributor license agreements. See the NOTICE file distributed with
+@rem * this work for additional information regarding copyright ownership.
+@rem * The ASF licenses this file to You under the Apache License, Version 2.0
+@rem * (the "License"); you may not use this file except in compliance with
+@rem * the License. You may obtain a copy of the License at
+@rem *
+@rem * http://www.apache.org/licenses/LICENSE-2.0
+@rem *
+@rem * Unless required by applicable law or agreed to in writing, software
+@rem * distributed under the License is distributed on an "AS IS" BASIS,
+@rem * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@rem * See the License for the specific language governing permissions and
+@rem * limitations under the License.
+@rem */
+
+setlocal enabledelayedexpansion
+
+@rem disable "developer mode"
+set IS_CORE=0
+if [%1] == [-core] (
+ set IS_CORE=1
+ shift
+)
+
+if not [%MAHOUT_CORE%] == [] (
+set IS_CORE=1
+)
+
+if [%MAHOUT_HOME%] == [] set MAHOUT_HOME=%~dp0..
+
+echo "Mahout home set %MAHOUT_HOME%"
+
+@rem some Java parameters
+if not [%MAHOUT_JAVA_HOME%] == [] (
+@rem echo run java in %MAHOUT_JAVA_HOME%
+set JAVA_HOME=%MAHOUT_JAVA_HOME%
+)
+
+if [%JAVA_HOME%] == [] (
+ echo Error: JAVA_HOME is not set.
+ exit /B 1
+)
+
+set JAVA=%JAVA_HOME%\bin\java
+set JAVA_HEAP_MAX=-Xmx3g
+
+@rem check envvars which might override default args
+if not [%MAHOUT_HEAPSIZE%] == [] (
+@rem echo run with heapsize %MAHOUT_HEAPSIZE%
+set JAVA_HEAP_MAX=-Xmx%MAHOUT_HEAPSIZE%m
+@rem echo %JAVA_HEAP_MAX%
+)
+
+if [%MAHOUT_CONF_DIR%] == [] (
+set MAHOUT_CONF_DIR=%MAHOUT_HOME%\conf
+)
+
+:main
+@rem MAHOUT_CLASSPATH initially contains $MAHOUT_CONF_DIR, or defaults to $MAHOUT_HOME/src/conf
+set CLASSPATH=%CLASSPATH%;%MAHOUT_CONF_DIR%
+
+if not [%MAHOUT_LOCAL%] == [] (
+echo "MAHOUT_LOCAL is set, so we do not add HADOOP_CONF_DIR to classpath."
+) else (
+if not [%HADOOP_CONF_DIR%] == [] (
+echo "MAHOUT_LOCAL is not set; adding HADOOP_CONF_DIR to classpath."
+set CLASSPATH=%CLASSPATH%;%HADOOP_CONF_DIR%
+)
+)
+
+set CLASSPATH=%CLASSPATH%;%JAVA_HOME%\lib\tools.jar
+
+if %IS_CORE% == 0 (
+@rem add release dependencies to CLASSPATH
+for %%f in (%MAHOUT_HOME%\mahout-*.jar) do (
+set CLASSPATH=!CLASSPATH!;%%f
+)
+@rem add dev targets if they exist
+for %%f in (%MAHOUT_HOME%\examples\target\mahout-examples-*-job.jar) do (
+set CLASSPATH=!CLASSPATH!;%%f
+)
+for %%f in (%MAHOUT_HOME%\mahout-examples-*-job.jar) do (
+set CLASSPATH=!CLASSPATH!;%%f
+)
+@rem add release dependencies to CLASSPATH
+for %%f in (%MAHOUT_HOME%\lib\*.jar) do (
+set CLASSPATH=!CLASSPATH!;%%f
+)
+) else (
+set CLASSPATH=!CLASSPATH!;%MAHOUT_HOME%\math\target\classes
+set CLASSPATH=!CLASSPATH!;%MAHOUT_HOME%\core\target\classes
+set CLASSPATH=!CLASSPATH!;%MAHOUT_HOME%\integration\target\classes
+set CLASSPATH=!CLASSPATH!;%MAHOUT_HOME%\examples\target\classes
+@rem set CLASSPATH=%CLASSPATH%;%MAHOUT_HOME%\core\src\main\resources
+)
+
+@rem add development dependencies to CLASSPATH
+for %%f in (%MAHOUT_HOME%\examples\target\dependency\*.jar) do (
+set CLASSPATH=!CLASSPATH!;%%f
+)
+
+@rem default log directory & file
+if [%MAHOUT_LOG_DIR%] == [] (
+set MAHOUT_LOG_DIR=%MAHOUT_HOME%\logs
+)
+if [%MAHOUT_LOGFILE%] == [] (
+set MAHOUT_LOGFILE=mahout.log
+)
+
+set MAHOUT_OPTS=%MAHOUT_OPTS% -Dhadoop.log.dir=%MAHOUT_LOG_DIR%
+set MAHOUT_OPTS=%MAHOUT_OPTS% -Dhadoop.log.file=%MAHOUT_LOGFILE%
+set MAHOUT_OPTS=%MAHOUT_OPTS% -Dmapred.min.split.size=512MB
+set MAHOUT_OPTS=%MAHOUT_OPTS% -Dmapred.map.child.java.opts=-Xmx4096m
+set MAHOUT_OPTS=%MAHOUT_OPTS% -Dmapred.reduce.child.java.opts=-Xmx4096m
+set MAHOUT_OPTS=%MAHOUT_OPTS% -Dmapred.output.compress=true
+set MAHOUT_OPTS=%MAHOUT_OPTS% -Dmapred.compress.map.output=true
+set MAHOUT_OPTS=%MAHOUT_OPTS% -Dmapred.map.tasks=1
+set MAHOUT_OPTS=%MAHOUT_OPTS% -Dmapred.reduce.tasks=1
+set MAHOUT_OPTS=%MAHOUT_OPTS% -Dio.sort.factor=30
+set MAHOUT_OPTS=%MAHOUT_OPTS% -Dio.sort.mb=1024
+set MAHOUT_OPTS=%MAHOUT_OPTS% -Dio.file.buffer.size=32786
+set HADOOP_OPTS=%HADOOP_OPTS% -Djava.library.path=%HADOOP_HOME%\bin
+
+if not [%JAVA_LIBRARY_PATH%] == [] (
+set MAHOUT_OPTS=%MAHOUT_OPTS% -Djava.library.path=%JAVA_LIBRARY_PATH%
+)
+
+set CLASS=org.apache.mahout.driver.MahoutDriver
+
+for %%f in (%MAHOUT_HOME%\examples\target\mahout-examples-*-job.jar) do (
+set MAHOUT_JOB=%%f
+)
+
+@rem run it
+
+if not [%MAHOUT_LOCAL%] == [] (
+ echo "MAHOUT_LOCAL is set, running locally"
+ %JAVA% %JAVA_HEAP_MAX% %MAHOUT_OPTS% -classpath %MAHOUT_CLASSPATH% %CLASS% %*
+) else (
+ if [%MAHOUT_JOB%] == [] (
+ echo "ERROR: Could not find mahout-examples-*.job in %MAHOUT_HOME% or %MAHOUT_HOME%/examples/target, please run 'mvn install' to create the .job file"
+ exit /B 1
+ ) else (
+ set HADOOP_CLASSPATH=%MAHOUT_CLASSPATH%
+ if /i [%1] == [hadoop] (
+shift
+set HADOOP_CLASSPATH=%MAHOUT_CONF_DIR%;%HADOOP_CLASSPATH%
+ call %HADOOP_HOME%\bin\%*
+ ) else (
+if /i [%1] == [classpath] (
+echo %CLASSPATH%
+) else (
+echo MAHOUT_JOB: %MAHOUT_JOB%
+set HADOOP_CLASSPATH=%MAHOUT_CONF_DIR%;%HADOOP_CLASSPATH%
+set HADOOP_CLIENT_OPTS=%JAVA_HEAP_MAX%
+call %HADOOP_HOME%\bin\hadoop jar %MAHOUT_JOB% %CLASS% %*
+)
+
+ )
+ )
+)
http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/community/mahout-mr/examples/bin/README.txt
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/README.txt b/community/mahout-mr/examples/bin/README.txt
new file mode 100644
index 0000000..7ad3a38
--- /dev/null
+++ b/community/mahout-mr/examples/bin/README.txt
@@ -0,0 +1,13 @@
+This directory contains helpful shell scripts for working with some of Mahout's examples.
+
+To set a non-default temporary work directory: `export MAHOUT_WORK_DIR=/path/in/hdfs/to/temp/dir`
+ Note that this requires the same path to be writable both on the local file system as well as on HDFS.
+
+Here's a description of what each does:
+
+classify-20newsgroups.sh -- Run SGD and Bayes classifiers over the classic 20 News Groups. Downloads the data set automatically.
+cluster-reuters.sh -- Cluster the Reuters data set using a variety of algorithms. Downloads the data set automatically.
+cluster-syntheticcontrol.sh -- Cluster the Synthetic Control data set. Downloads the data set automatically.
+factorize-movielens-1M.sh -- Run the Alternating Least Squares Recommender on the Grouplens data set (size 1M).
+factorize-netflix.sh -- (Deprecated due to lack of availability of the data set) Run the ALS Recommender on the Netflix data set.
+spark-document-classifier.mscala -- A mahout-shell script which trains and tests a Naive Bayes model on the Wikipedia XML dump and defines simple methods to classify new text.
http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/community/mahout-mr/examples/bin/classify-20newsgroups.sh
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/classify-20newsgroups.sh b/community/mahout-mr/examples/bin/classify-20newsgroups.sh
new file mode 100755
index 0000000..f47d5c5
--- /dev/null
+++ b/community/mahout-mr/examples/bin/classify-20newsgroups.sh
@@ -0,0 +1,197 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+#
+# Downloads the 20newsgroups dataset, trains and tests a classifier.
+#
+# To run: change into the mahout directory and type:
+# examples/bin/classify-20newsgroups.sh
+
+if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
+ echo "This script runs SGD and Bayes classifiers over the classic 20 News Groups."
+ exit
+fi
+
+SCRIPT_PATH=${0%/*}
+if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
+ cd $SCRIPT_PATH
+fi
+START_PATH=`pwd`
+
+# Set commands for dfs
+source ${START_PATH}/set-dfs-commands.sh
+
+if [[ -z "$MAHOUT_WORK_DIR" ]]; then
+ WORK_DIR=/tmp/mahout-work-${USER}
+else
+ WORK_DIR=$MAHOUT_WORK_DIR
+fi
+algorithm=( cnaivebayes-MapReduce naivebayes-MapReduce cnaivebayes-Spark naivebayes-Spark sgd clean)
+if [ -n "$1" ]; then
+ choice=$1
+else
+ echo "Please select a number to choose the corresponding task to run"
+ echo "1. ${algorithm[0]}"
+ echo "2. ${algorithm[1]}"
+ echo "3. ${algorithm[2]}"
+ echo "4. ${algorithm[3]}"
+ echo "5. ${algorithm[4]}"
+  echo "6. ${algorithm[5]} -- cleans up the work area in $WORK_DIR"
+ read -p "Enter your choice : " choice
+fi
+
+echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]}"
+alg=${algorithm[$choice-1]}
+
+# Spark specific check and work
+if [ "x$alg" == "xnaivebayes-Spark" -o "x$alg" == "xcnaivebayes-Spark" ]; then
+ if [ "$MASTER" == "" ] ; then
+ echo "Please set your MASTER env variable to point to your Spark Master URL. exiting..."
+ exit 1
+ fi
+ if [ "$MAHOUT_LOCAL" != "" ] ; then
+ echo "Options 3 and 4 can not run in MAHOUT_LOCAL mode. exiting..."
+ exit 1
+ fi
+fi
+
+if [ "x$alg" != "xclean" ]; then
+ echo "creating work directory at ${WORK_DIR}"
+
+ mkdir -p ${WORK_DIR}
+ if [ ! -e ${WORK_DIR}/20news-bayesinput ]; then
+ if [ ! -e ${WORK_DIR}/20news-bydate ]; then
+ if [ ! -f ${WORK_DIR}/20news-bydate.tar.gz ]; then
+ echo "Downloading 20news-bydate"
+ curl http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz -o ${WORK_DIR}/20news-bydate.tar.gz
+ fi
+ mkdir -p ${WORK_DIR}/20news-bydate
+ echo "Extracting..."
+ cd ${WORK_DIR}/20news-bydate && tar xzf ../20news-bydate.tar.gz && cd .. && cd ..
+ fi
+ fi
+fi
+#echo $START_PATH
+cd $START_PATH
+cd ../..
+
+set -e
+
+if ( [ "x$alg" == "xnaivebayes-MapReduce" ] || [ "x$alg" == "xcnaivebayes-MapReduce" ] || [ "x$alg" == "xnaivebayes-Spark" ] || [ "x$alg" == "xcnaivebayes-Spark" ] ); then
+ c=""
+
+ if [ "x$alg" == "xcnaivebayes-MapReduce" -o "x$alg" == "xnaivebayes-Spark" ]; then
+ c=" -c"
+ fi
+
+ set -x
+ echo "Preparing 20newsgroups data"
+ rm -rf ${WORK_DIR}/20news-all
+ mkdir ${WORK_DIR}/20news-all
+ cp -R ${WORK_DIR}/20news-bydate/*/* ${WORK_DIR}/20news-all
+
+ if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
+ echo "Copying 20newsgroups data to HDFS"
+ set +e
+ $DFSRM ${WORK_DIR}/20news-all
+ $DFS -mkdir -p ${WORK_DIR}
+ $DFS -mkdir ${WORK_DIR}/20news-all
+ set -e
+ if [ $HVERSION -eq "1" ] ; then
+ echo "Copying 20newsgroups data to Hadoop 1 HDFS"
+ $DFS -put ${WORK_DIR}/20news-all ${WORK_DIR}/20news-all
+ elif [ $HVERSION -eq "2" ] ; then
+ echo "Copying 20newsgroups data to Hadoop 2 HDFS"
+ $DFS -put ${WORK_DIR}/20news-all ${WORK_DIR}/
+ fi
+ fi
+
+ echo "Creating sequence files from 20newsgroups data"
+ ./bin/mahout seqdirectory \
+ -i ${WORK_DIR}/20news-all \
+ -o ${WORK_DIR}/20news-seq -ow
+
+ echo "Converting sequence files to vectors"
+ ./bin/mahout seq2sparse \
+ -i ${WORK_DIR}/20news-seq \
+ -o ${WORK_DIR}/20news-vectors -lnorm -nv -wt tfidf
+
+ echo "Creating training and holdout set with a random 80-20 split of the generated vector dataset"
+ ./bin/mahout split \
+ -i ${WORK_DIR}/20news-vectors/tfidf-vectors \
+ --trainingOutput ${WORK_DIR}/20news-train-vectors \
+ --testOutput ${WORK_DIR}/20news-test-vectors \
+ --randomSelectionPct 40 --overwrite --sequenceFiles -xm sequential
+
+ if [ "x$alg" == "xnaivebayes-MapReduce" -o "x$alg" == "xcnaivebayes-MapReduce" ]; then
+
+ echo "Training Naive Bayes model"
+ ./bin/mahout trainnb \
+ -i ${WORK_DIR}/20news-train-vectors \
+ -o ${WORK_DIR}/model \
+ -li ${WORK_DIR}/labelindex \
+ -ow $c
+
+ echo "Self testing on training set"
+
+ ./bin/mahout testnb \
+ -i ${WORK_DIR}/20news-train-vectors\
+ -m ${WORK_DIR}/model \
+ -l ${WORK_DIR}/labelindex \
+ -ow -o ${WORK_DIR}/20news-testing $c
+
+ echo "Testing on holdout set"
+
+ ./bin/mahout testnb \
+ -i ${WORK_DIR}/20news-test-vectors\
+ -m ${WORK_DIR}/model \
+ -l ${WORK_DIR}/labelindex \
+ -ow -o ${WORK_DIR}/20news-testing $c
+
+ elif [ "x$alg" == "xnaivebayes-Spark" -o "x$alg" == "xcnaivebayes-Spark" ]; then
+
+ echo "Training Naive Bayes model"
+ ./bin/mahout spark-trainnb \
+ -i ${WORK_DIR}/20news-train-vectors \
+ -o ${WORK_DIR}/spark-model $c -ow -ma $MASTER
+
+ echo "Self testing on training set"
+ ./bin/mahout spark-testnb \
+ -i ${WORK_DIR}/20news-train-vectors\
+ -m ${WORK_DIR}/spark-model $c -ma $MASTER
+
+ echo "Testing on holdout set"
+ ./bin/mahout spark-testnb \
+ -i ${WORK_DIR}/20news-test-vectors\
+ -m ${WORK_DIR}/spark-model $c -ma $MASTER
+
+ fi
+elif [ "x$alg" == "xsgd" ]; then
+ if [ ! -e "/tmp/news-group.model" ]; then
+ echo "Training on ${WORK_DIR}/20news-bydate/20news-bydate-train/"
+ ./bin/mahout org.apache.mahout.classifier.sgd.TrainNewsGroups ${WORK_DIR}/20news-bydate/20news-bydate-train/
+ fi
+ echo "Testing on ${WORK_DIR}/20news-bydate/20news-bydate-test/ with model: /tmp/news-group.model"
+ ./bin/mahout org.apache.mahout.classifier.sgd.TestNewsGroups --input ${WORK_DIR}/20news-bydate/20news-bydate-test/ --model /tmp/news-group.model
+elif [ "x$alg" == "xclean" ]; then
+ rm -rf $WORK_DIR
+ rm -rf /tmp/news-group.model
+ $DFSRM $WORK_DIR
+fi
+# Remove the work directory
+#
http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/community/mahout-mr/examples/bin/classify-wikipedia.sh
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/classify-wikipedia.sh b/community/mahout-mr/examples/bin/classify-wikipedia.sh
new file mode 100755
index 0000000..41dc0c9
--- /dev/null
+++ b/community/mahout-mr/examples/bin/classify-wikipedia.sh
@@ -0,0 +1,196 @@
#!/bin/bash
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

#
# Downloads a (partial) wikipedia dump, trains and tests a classifier.
#
# To run: change into the mahout directory and type:
# examples/bin/classify-wikipedia.sh

if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
  # Fixed: the original message was missing its verb
  # ("This script Bayes and CBayes classifiers ...").
  echo "This script runs the Bayes and CBayes classifiers over the last wikipedia dump."
  exit
fi

# ensure that MAHOUT_HOME is set
if [[ -z "$MAHOUT_HOME" ]]; then
  echo "Please set MAHOUT_HOME."
  exit
fi

# Change into the directory containing this script so relative paths resolve.
SCRIPT_PATH=${0%/*}
if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
  cd $SCRIPT_PATH
fi
START_PATH=`pwd`

# Set commands for dfs ($DFS / $DFSRM), shared with the other example scripts.
source ${START_PATH}/set-dfs-commands.sh

# Work area for downloads and intermediate output; override via MAHOUT_WORK_DIR.
if [[ -z "$MAHOUT_WORK_DIR" ]]; then
  WORK_DIR=/tmp/mahout-work-wiki
else
  WORK_DIR=$MAHOUT_WORK_DIR
fi
algorithm=( CBayes BinaryCBayes clean)
if [ -n "$1" ]; then
  choice=$1
else
  echo "Please select a number to choose the corresponding task to run"
  echo "1. ${algorithm[0]} (may require increased heap space on yarn)"
  echo "2. ${algorithm[1]}"
  echo "3. ${algorithm[2]} -- cleans up the work area in $WORK_DIR"
  read -p "Enter your choice : " choice
fi

echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]}"
alg=${algorithm[$choice-1]}

if [ "x$alg" != "xclean" ]; then
  echo "creating work directory at ${WORK_DIR}"

  mkdir -p ${WORK_DIR}
  if [ ! -e ${WORK_DIR}/wikixml ]; then
    mkdir -p ${WORK_DIR}/wikixml
  fi
  if [ ! -e ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2 ]; then
    echo "Downloading wikipedia XML dump"
    ########################################################
    # Datasets: uncomment and run "clean" to change dataset
    ########################################################
    ########## partial small 42.5M zipped
    # curl https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles1.xml-p000000010p000030302.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2
    ########## partial larger 256M zipped
    curl https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles10.xml-p2336425p3046511.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2
    ########## full wikipedia dump: 10G zipped
    # curl https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2
    ########################################################
  fi
  if [ ! -e ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml ]; then
    echo "Extracting..."
    # Decompress in a subshell so the script's working directory is untouched.
    # (The original "cd ${WORK_DIR}/wikixml && bunzip2 ... && cd .. && cd .."
    # left the shell two levels above WORK_DIR rather than back at START_PATH.)
    ( cd ${WORK_DIR}/wikixml && bunzip2 enwiki-latest-pages-articles.xml.bz2 )
  fi

echo $START_PATH

# Fail fast from here on: every remaining step depends on the previous one.
set -e

if [ "x$alg" == "xCBayes" ] || [ "x$alg" == "xBinaryCBayes" ] ; then

  set -x
  echo "Preparing wikipedia data"
  rm -rf ${WORK_DIR}/wiki
  mkdir ${WORK_DIR}/wiki

  if [ "x$alg" == "xCBayes" ] ; then
    # use a list of 10 countries as categories
    cp $MAHOUT_HOME/examples/bin/resources/country10.txt ${WORK_DIR}/country.txt
    chmod 666 ${WORK_DIR}/country.txt
  fi

  if [ "x$alg" == "xBinaryCBayes" ] ; then
    # use United States and United Kingdom as categories
    cp $MAHOUT_HOME/examples/bin/resources/country2.txt ${WORK_DIR}/country.txt
    chmod 666 ${WORK_DIR}/country.txt
  fi

  # When Hadoop is configured (and MAHOUT_LOCAL is unset), stage the XML on HDFS.
  if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
    echo "Copying wikipedia data to HDFS"
    set +e
    # Best-effort removal of a stale copy; ignore "not found" failures.
    $DFSRM ${WORK_DIR}/wikixml
    $DFS -mkdir -p ${WORK_DIR}
    set -e
    $DFS -put ${WORK_DIR}/wikixml ${WORK_DIR}/wikixml
  fi

  echo "Creating sequence files from wikiXML"
  $MAHOUT_HOME/bin/mahout seqwiki -c ${WORK_DIR}/country.txt \
    -i ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml \
    -o ${WORK_DIR}/wikipediainput

  # if using the 10 class problem use bigrams
  if [ "x$alg" == "xCBayes" ] ; then
    echo "Converting sequence files to vectors using bigrams"
    $MAHOUT_HOME/bin/mahout seq2sparse -i ${WORK_DIR}/wikipediainput \
      -o ${WORK_DIR}/wikipediaVecs \
      -wt tfidf \
      -lnorm -nv \
      -ow -ng 2
  fi

  # if using the 2 class problem try different options
  if [ "x$alg" == "xBinaryCBayes" ] ; then
    echo "Converting sequence files to vectors using unigrams and a max document frequency of 30%"
    $MAHOUT_HOME/bin/mahout seq2sparse -i ${WORK_DIR}/wikipediainput \
      -o ${WORK_DIR}/wikipediaVecs \
      -wt tfidf \
      -lnorm \
      -nv \
      -ow \
      -ng 1 \
      -x 30
  fi

  echo "Creating training and holdout set with a random 80-20 split of the generated vector dataset"
  $MAHOUT_HOME/bin/mahout split -i ${WORK_DIR}/wikipediaVecs/tfidf-vectors/ \
    --trainingOutput ${WORK_DIR}/training \
    --testOutput ${WORK_DIR}/testing \
    -rp 20 \
    -ow \
    -seq \
    -xm sequential

  echo "Training Naive Bayes model"
  $MAHOUT_HOME/bin/mahout trainnb -i ${WORK_DIR}/training \
    -o ${WORK_DIR}/model \
    -li ${WORK_DIR}/labelindex \
    -ow \
    -c

  echo "Self testing on training set"
  $MAHOUT_HOME/bin/mahout testnb -i ${WORK_DIR}/training \
    -m ${WORK_DIR}/model \
    -l ${WORK_DIR}/labelindex \
    -ow \
    -o ${WORK_DIR}/output \
    -c

  echo "Testing on holdout set: Bayes"
  $MAHOUT_HOME/bin/mahout testnb -i ${WORK_DIR}/testing \
    -m ${WORK_DIR}/model \
    -l ${WORK_DIR}/labelindex \
    -ow \
    -o ${WORK_DIR}/output \
    -seq

  echo "Testing on holdout set: CBayes"
  $MAHOUT_HOME/bin/mahout testnb -i ${WORK_DIR}/testing \
    -m ${WORK_DIR}/model \
    -l ${WORK_DIR}/labelindex \
    -ow \
    -o ${WORK_DIR}/output \
    -c \
    -seq
fi

elif [ "x$alg" == "xclean" ]; then
  rm -rf $WORK_DIR
  $DFSRM $WORK_DIR
fi
# Remove the work directory
http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/community/mahout-mr/examples/bin/cluster-reuters.sh
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/cluster-reuters.sh b/community/mahout-mr/examples/bin/cluster-reuters.sh
new file mode 100755
index 0000000..49f6c94
--- /dev/null
+++ b/community/mahout-mr/examples/bin/cluster-reuters.sh
@@ -0,0 +1,203 @@
#!/bin/bash
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

#
# Downloads the Reuters dataset and prepares it for clustering
#
# To run: change into the mahout directory and type:
# examples/bin/cluster-reuters.sh

if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
  echo "This script clusters the Reuters data set using a variety of algorithms. The data set is downloaded automatically."
  exit
fi

# Change into the directory containing this script so relative paths resolve.
SCRIPT_PATH=${0%/*}
if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
  cd $SCRIPT_PATH
fi
START_PATH=`pwd`

# Set commands for dfs ($DFS / $DFSRM), shared with the other example scripts.
source ${START_PATH}/set-dfs-commands.sh

# Mahout driver, located relative to this script's directory.
MAHOUT="../../bin/mahout"

if [ ! -e $MAHOUT ]; then
  echo "Can't find mahout driver in $MAHOUT, cwd `pwd`, exiting.."
  exit 1
fi

# Work area for downloads and intermediate output; override via MAHOUT_WORK_DIR.
if [[ -z "$MAHOUT_WORK_DIR" ]]; then
  WORK_DIR=/tmp/mahout-work-${USER}
else
  WORK_DIR=$MAHOUT_WORK_DIR
fi

algorithm=( kmeans fuzzykmeans lda streamingkmeans clean)
if [ -n "$1" ]; then
  choice=$1
else
  echo "Please select a number to choose the corresponding clustering algorithm"
  echo "1. ${algorithm[0]} clustering (runs from this example script in cluster mode only)"
  echo "2. ${algorithm[1]} clustering (may require increased heap space on yarn)"
  echo "3. ${algorithm[2]} clustering"
  echo "4. ${algorithm[3]} clustering"
  echo "5. ${algorithm[4]} -- cleans up the work area in $WORK_DIR"
  read -p "Enter your choice : " choice
fi

echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]} Clustering"
clustertype=${algorithm[$choice-1]}

if [ "x$clustertype" == "xclean" ]; then
  rm -rf $WORK_DIR
  $DFSRM $WORK_DIR
  # Fixed: a successful cleanup previously ran "exit 1", which made callers
  # checking the exit status see the clean task as a failure.
  exit 0
else
  $DFS -mkdir -p $WORK_DIR
  mkdir -p $WORK_DIR
  echo "Creating work directory at ${WORK_DIR}"
fi

# Fetch, unpack, and sequence-file the Reuters-21578 corpus if not already done.
if [ ! -e ${WORK_DIR}/reuters-out-seqdir ]; then
  if [ ! -e ${WORK_DIR}/reuters-out ]; then
    if [ ! -e ${WORK_DIR}/reuters-sgm ]; then
      if [ ! -f ${WORK_DIR}/reuters21578.tar.gz ]; then
        if [ -n "$2" ]; then
          echo "Copying Reuters from local download"
          cp $2 ${WORK_DIR}/reuters21578.tar.gz
        else
          echo "Downloading Reuters-21578"
          curl http://kdd.ics.uci.edu/databases/reuters21578/reuters21578.tar.gz -o ${WORK_DIR}/reuters21578.tar.gz
        fi
      fi
      #make sure it was actually downloaded
      if [ ! -f ${WORK_DIR}/reuters21578.tar.gz ]; then
        echo "Failed to download reuters"
        exit 1
      fi
      mkdir -p ${WORK_DIR}/reuters-sgm
      echo "Extracting..."
      tar xzf ${WORK_DIR}/reuters21578.tar.gz -C ${WORK_DIR}/reuters-sgm
    fi
    echo "Extracting Reuters"
    $MAHOUT org.apache.lucene.benchmark.utils.ExtractReuters ${WORK_DIR}/reuters-sgm ${WORK_DIR}/reuters-out
    # When Hadoop is configured (and MAHOUT_LOCAL is unset), stage the data on HDFS.
    if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
      echo "Copying Reuters data to Hadoop"
      set +e
      # Best-effort removal of stale copies; ignore "not found" failures.
      $DFSRM ${WORK_DIR}/reuters-sgm
      $DFSRM ${WORK_DIR}/reuters-out
      $DFS -mkdir -p ${WORK_DIR}/
      $DFS -mkdir ${WORK_DIR}/reuters-sgm
      $DFS -mkdir ${WORK_DIR}/reuters-out
      $DFS -put ${WORK_DIR}/reuters-sgm ${WORK_DIR}/reuters-sgm
      $DFS -put ${WORK_DIR}/reuters-out ${WORK_DIR}/reuters-out
      set -e
    fi
  fi
  echo "Converting to Sequence Files from Directory"
  $MAHOUT seqdirectory -i ${WORK_DIR}/reuters-out -o ${WORK_DIR}/reuters-out-seqdir -c UTF-8 -chunk 64 -xm sequential
fi

if [ "x$clustertype" == "xkmeans" ]; then
  $MAHOUT seq2sparse \
    -i ${WORK_DIR}/reuters-out-seqdir/ \
    -o ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans --maxDFPercent 85 --namedVector \
  && \
  $MAHOUT kmeans \
    -i ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans/tfidf-vectors/ \
    -c ${WORK_DIR}/reuters-kmeans-clusters \
    -o ${WORK_DIR}/reuters-kmeans \
    -dm org.apache.mahout.common.distance.EuclideanDistanceMeasure \
    -x 10 -k 20 -ow --clustering \
  && \
  $MAHOUT clusterdump \
    -i `$DFS -ls -d ${WORK_DIR}/reuters-kmeans/clusters-*-final | awk '{print $8}'` \
    -o ${WORK_DIR}/reuters-kmeans/clusterdump \
    -d ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans/dictionary.file-0 \
    -dt sequencefile -b 100 -n 20 --evaluate -dm org.apache.mahout.common.distance.EuclideanDistanceMeasure -sp 0 \
    --pointsDir ${WORK_DIR}/reuters-kmeans/clusteredPoints \
  && \
  cat ${WORK_DIR}/reuters-kmeans/clusterdump
elif [ "x$clustertype" == "xfuzzykmeans" ]; then
  $MAHOUT seq2sparse \
    -i ${WORK_DIR}/reuters-out-seqdir/ \
    -o ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans --maxDFPercent 85 --namedVector \
  && \
  $MAHOUT fkmeans \
    -i ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans/tfidf-vectors/ \
    -c ${WORK_DIR}/reuters-fkmeans-clusters \
    -o ${WORK_DIR}/reuters-fkmeans \
    -dm org.apache.mahout.common.distance.EuclideanDistanceMeasure \
    -x 10 -k 20 -ow -m 1.1 \
  && \
  $MAHOUT clusterdump \
    -i ${WORK_DIR}/reuters-fkmeans/clusters-*-final \
    -o ${WORK_DIR}/reuters-fkmeans/clusterdump \
    -d ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans/dictionary.file-0 \
    -dt sequencefile -b 100 -n 20 -sp 0 \
  && \
  cat ${WORK_DIR}/reuters-fkmeans/clusterdump
elif [ "x$clustertype" == "xlda" ]; then
  $MAHOUT seq2sparse \
    -i ${WORK_DIR}/reuters-out-seqdir/ \
    -o ${WORK_DIR}/reuters-out-seqdir-sparse-lda -ow --maxDFPercent 85 --namedVector \
  && \
  $MAHOUT rowid \
    -i ${WORK_DIR}/reuters-out-seqdir-sparse-lda/tfidf-vectors \
    -o ${WORK_DIR}/reuters-out-matrix \
  && \
  rm -rf ${WORK_DIR}/reuters-lda ${WORK_DIR}/reuters-lda-topics ${WORK_DIR}/reuters-lda-model \
  && \
  $MAHOUT cvb \
    -i ${WORK_DIR}/reuters-out-matrix/matrix \
    -o ${WORK_DIR}/reuters-lda -k 20 -ow -x 20 \
    -dict ${WORK_DIR}/reuters-out-seqdir-sparse-lda/dictionary.file-* \
    -dt ${WORK_DIR}/reuters-lda-topics \
    -mt ${WORK_DIR}/reuters-lda-model \
  && \
  $MAHOUT vectordump \
    -i ${WORK_DIR}/reuters-lda-topics/part-m-00000 \
    -o ${WORK_DIR}/reuters-lda/vectordump \
    -vs 10 -p true \
    -d ${WORK_DIR}/reuters-out-seqdir-sparse-lda/dictionary.file-* \
    -dt sequencefile -sort ${WORK_DIR}/reuters-lda-topics/part-m-00000 \
  && \
  cat ${WORK_DIR}/reuters-lda/vectordump
elif [ "x$clustertype" == "xstreamingkmeans" ]; then
  $MAHOUT seq2sparse \
    -i ${WORK_DIR}/reuters-out-seqdir/ \
    -o ${WORK_DIR}/reuters-out-seqdir-sparse-streamingkmeans -ow --maxDFPercent 85 --namedVector \
  && \
  rm -rf ${WORK_DIR}/reuters-streamingkmeans \
  && \
  $MAHOUT streamingkmeans \
    -i ${WORK_DIR}/reuters-out-seqdir-sparse-streamingkmeans/tfidf-vectors/ \
    --tempDir ${WORK_DIR}/tmp \
    -o ${WORK_DIR}/reuters-streamingkmeans \
    -sc org.apache.mahout.math.neighborhood.FastProjectionSearch \
    -dm org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure \
    -k 10 -km 100 -ow \
  && \
  $MAHOUT qualcluster \
    -i ${WORK_DIR}/reuters-out-seqdir-sparse-streamingkmeans/tfidf-vectors/part-r-00000 \
    -c ${WORK_DIR}/reuters-streamingkmeans/part-r-00000 \
    -o ${WORK_DIR}/reuters-cluster-distance.csv \
  && \
  cat ${WORK_DIR}/reuters-cluster-distance.csv
fi