Posted to commits@mahout.apache.org by ra...@apache.org on 2018/06/28 14:55:18 UTC

[50/51] [partial] mahout git commit: NO-JIRA Clean up MR refactor

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/conf/log4j.xml
----------------------------------------------------------------------
diff --git a/community/mahout-mr/conf/log4j.xml b/community/mahout-mr/conf/log4j.xml
index 6231b48..179f1a9 100644
--- a/community/mahout-mr/conf/log4j.xml
+++ b/community/mahout-mr/conf/log4j.xml
@@ -1,4 +1,21 @@
 <?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
 <!DOCTYPE log4j:configuration SYSTEM "log4j.dtd">
 <log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/">
     <appender class="org.apache.log4j.ConsoleAppender" name="console">

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/bin/README.txt
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/README.txt b/community/mahout-mr/examples/bin/README.txt
deleted file mode 100644
index 7ad3a38..0000000
--- a/community/mahout-mr/examples/bin/README.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-This directory contains helpful shell scripts for working with some of Mahout's examples.  
-
-To set a non-default temporary work directory: `export MAHOUT_WORK_DIR=/path/in/hdfs/to/temp/dir`
-  Note that this requires the same path to be writable both on the local file system and on HDFS.
-
-Here's a description of what each does:
-
-classify-20newsgroups.sh -- Run SGD and Bayes classifiers over the classic 20 News Groups.  Downloads the data set automatically.
-cluster-reuters.sh -- Cluster the Reuters data set using a variety of algorithms.  Downloads the data set automatically.
-cluster-syntheticcontrol.sh -- Cluster the Synthetic Control data set.  Downloads the data set automatically.
-factorize-movielens-1M.sh -- Run the Alternating Least Squares Recommender on the Grouplens data set (size 1M).
-factorize-netflix.sh -- (Deprecated due to lack of availability of the data set) Run the ALS Recommender on the Netflix data set.
-spark-document-classifier.mscala -- A mahout-shell script which trains and tests a Naive Bayes model on the Wikipedia XML dump and defines simple methods to classify new text.

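A minimal usage sketch for the helper scripts described in the deleted README above, assuming MAHOUT_HOME points at a built Mahout tree with the driver at bin/mahout; the work-directory path shown is only an example:

  # Optional: use a non-default work directory. The same path must be
  # creatable both on the local filesystem and on HDFS.
  export MAHOUT_WORK_DIR=/tmp/mahout-work-$USER

  # Run any of the example drivers from the top of the Mahout tree;
  # each one downloads its data set on first use.
  cd $MAHOUT_HOME
  examples/bin/classify-20newsgroups.sh
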
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/bin/classify-20newsgroups.sh
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/classify-20newsgroups.sh b/community/mahout-mr/examples/bin/classify-20newsgroups.sh
deleted file mode 100755
index f47d5c5..0000000
--- a/community/mahout-mr/examples/bin/classify-20newsgroups.sh
+++ /dev/null
@@ -1,197 +0,0 @@
-#!/bin/bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-#
-# Downloads the 20newsgroups dataset, trains and tests a classifier.
-#
-# To run:  change into the mahout directory and type:
-# examples/bin/classify-20newsgroups.sh
-
-if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
-  echo "This script runs SGD and Bayes classifiers over the classic 20 News Groups."
-  exit
-fi
-
-SCRIPT_PATH=${0%/*}
-if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
-  cd $SCRIPT_PATH
-fi
-START_PATH=`pwd`
-
-# Set commands for dfs
-source ${START_PATH}/set-dfs-commands.sh
-
-if [[ -z "$MAHOUT_WORK_DIR" ]]; then
-  WORK_DIR=/tmp/mahout-work-${USER}
-else
-  WORK_DIR=$MAHOUT_WORK_DIR
-fi
-algorithm=( cnaivebayes-MapReduce naivebayes-MapReduce cnaivebayes-Spark naivebayes-Spark sgd clean)
-if [ -n "$1" ]; then
-  choice=$1
-else
-  echo "Please select a number to choose the corresponding task to run"
-  echo "1. ${algorithm[0]}"
-  echo "2. ${algorithm[1]}"
-  echo "3. ${algorithm[2]}"
-  echo "4. ${algorithm[3]}"
-  echo "5. ${algorithm[4]}"
-  echo "6. ${algorithm[5]}-- cleans up the work area in $WORK_DIR"
-  read -p "Enter your choice : " choice
-fi
-
-echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]}"
-alg=${algorithm[$choice-1]}
-
-# Spark specific check and work 
-if [ "x$alg" == "xnaivebayes-Spark" -o "x$alg" == "xcnaivebayes-Spark" ]; then
-  if [ "$MASTER" == "" ] ; then
-    echo "Please set your MASTER env variable to point to your Spark Master URL. exiting..."
-    exit 1
-  fi
-  if [ "$MAHOUT_LOCAL" != "" ] ; then
-    echo "Options 3 and 4 can not run in MAHOUT_LOCAL mode. exiting..."
-    exit 1
-  fi
-fi
-
-if [ "x$alg" != "xclean" ]; then
-  echo "creating work directory at ${WORK_DIR}"
-
-  mkdir -p ${WORK_DIR}
-  if [ ! -e ${WORK_DIR}/20news-bayesinput ]; then
-    if [ ! -e ${WORK_DIR}/20news-bydate ]; then
-      if [ ! -f ${WORK_DIR}/20news-bydate.tar.gz ]; then
-        echo "Downloading 20news-bydate"
-        curl http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz -o ${WORK_DIR}/20news-bydate.tar.gz
-      fi
-      mkdir -p ${WORK_DIR}/20news-bydate
-      echo "Extracting..."
-      cd ${WORK_DIR}/20news-bydate && tar xzf ../20news-bydate.tar.gz && cd .. && cd ..
-    fi
-  fi
-fi
-#echo $START_PATH
-cd $START_PATH
-cd ../..
-
-set -e
-
-if  ( [ "x$alg" == "xnaivebayes-MapReduce" ] ||  [ "x$alg" == "xcnaivebayes-MapReduce" ] || [ "x$alg" == "xnaivebayes-Spark"  ] || [ "x$alg" == "xcnaivebayes-Spark" ] ); then
-  c=""
-
-  if [ "x$alg" == "xcnaivebayes-MapReduce" -o "x$alg" == "xnaivebayes-Spark" ]; then
-    c=" -c"
-  fi
-
-  set -x
-  echo "Preparing 20newsgroups data"
-  rm -rf ${WORK_DIR}/20news-all
-  mkdir ${WORK_DIR}/20news-all
-  cp -R ${WORK_DIR}/20news-bydate/*/* ${WORK_DIR}/20news-all
-
-  if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
-    echo "Copying 20newsgroups data to HDFS"
-    set +e
-    $DFSRM ${WORK_DIR}/20news-all
-    $DFS -mkdir -p ${WORK_DIR}
-    $DFS -mkdir ${WORK_DIR}/20news-all
-    set -e
-    if [ $HVERSION -eq "1" ] ; then
-      echo "Copying 20newsgroups data to Hadoop 1 HDFS"
-      $DFS -put ${WORK_DIR}/20news-all ${WORK_DIR}/20news-all
-    elif [ $HVERSION -eq "2" ] ; then
-      echo "Copying 20newsgroups data to Hadoop 2 HDFS"
-      $DFS -put ${WORK_DIR}/20news-all ${WORK_DIR}/
-    fi
-  fi
-
-  echo "Creating sequence files from 20newsgroups data"
-  ./bin/mahout seqdirectory \
-    -i ${WORK_DIR}/20news-all \
-    -o ${WORK_DIR}/20news-seq -ow
-
-  echo "Converting sequence files to vectors"
-  ./bin/mahout seq2sparse \
-    -i ${WORK_DIR}/20news-seq \
-    -o ${WORK_DIR}/20news-vectors  -lnorm -nv  -wt tfidf
-
-  echo "Creating training and holdout set with a random 80-20 split of the generated vector dataset"
-  ./bin/mahout split \
-    -i ${WORK_DIR}/20news-vectors/tfidf-vectors \
-    --trainingOutput ${WORK_DIR}/20news-train-vectors \
-    --testOutput ${WORK_DIR}/20news-test-vectors  \
-    --randomSelectionPct 40 --overwrite --sequenceFiles -xm sequential
-
-    if [ "x$alg" == "xnaivebayes-MapReduce"  -o  "x$alg" == "xcnaivebayes-MapReduce" ]; then
-
-      echo "Training Naive Bayes model"
-      ./bin/mahout trainnb \
-        -i ${WORK_DIR}/20news-train-vectors \
-        -o ${WORK_DIR}/model \
-        -li ${WORK_DIR}/labelindex \
-        -ow $c
-
-      echo "Self testing on training set"
-
-      ./bin/mahout testnb \
-        -i ${WORK_DIR}/20news-train-vectors\
-        -m ${WORK_DIR}/model \
-        -l ${WORK_DIR}/labelindex \
-        -ow -o ${WORK_DIR}/20news-testing $c
-
-      echo "Testing on holdout set"
-
-      ./bin/mahout testnb \
-        -i ${WORK_DIR}/20news-test-vectors\
-        -m ${WORK_DIR}/model \
-        -l ${WORK_DIR}/labelindex \
-        -ow -o ${WORK_DIR}/20news-testing $c
-
-    elif [ "x$alg" == "xnaivebayes-Spark" -o "x$alg" == "xcnaivebayes-Spark" ]; then
-
-      echo "Training Naive Bayes model"
-      ./bin/mahout spark-trainnb \
-        -i ${WORK_DIR}/20news-train-vectors \
-        -o ${WORK_DIR}/spark-model $c -ow -ma $MASTER
-
-      echo "Self testing on training set"
-      ./bin/mahout spark-testnb \
-        -i ${WORK_DIR}/20news-train-vectors\
-        -m ${WORK_DIR}/spark-model $c -ma $MASTER
-
-      echo "Testing on holdout set"
-      ./bin/mahout spark-testnb \
-        -i ${WORK_DIR}/20news-test-vectors\
-        -m ${WORK_DIR}/spark-model $c -ma $MASTER
-        
-    fi
-elif [ "x$alg" == "xsgd" ]; then
-  if [ ! -e "/tmp/news-group.model" ]; then
-    echo "Training on ${WORK_DIR}/20news-bydate/20news-bydate-train/"
-    ./bin/mahout org.apache.mahout.classifier.sgd.TrainNewsGroups ${WORK_DIR}/20news-bydate/20news-bydate-train/
-  fi
-  echo "Testing on ${WORK_DIR}/20news-bydate/20news-bydate-test/ with model: /tmp/news-group.model"
-  ./bin/mahout org.apache.mahout.classifier.sgd.TestNewsGroups --input ${WORK_DIR}/20news-bydate/20news-bydate-test/ --model /tmp/news-group.model
-elif [ "x$alg" == "xclean" ]; then
-  rm -rf $WORK_DIR
-  rm -rf /tmp/news-group.model
-  $DFSRM $WORK_DIR
-fi
-# Remove the work directory
-#

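For reference, the MapReduce Naive Bayes path in the deleted script above reduces to five mahout commands. A condensed sketch, assuming the extracted 20news-all directory already sits under WORK_DIR and the commands run from the top of the Mahout tree; add -c to trainnb/testnb for the complementary variant:

  WORK_DIR=/tmp/mahout-work-${USER}

  # Raw text -> SequenceFiles -> TF-IDF vectors
  ./bin/mahout seqdirectory -i ${WORK_DIR}/20news-all -o ${WORK_DIR}/20news-seq -ow
  ./bin/mahout seq2sparse -i ${WORK_DIR}/20news-seq -o ${WORK_DIR}/20news-vectors -lnorm -nv -wt tfidf

  # Hold out 40% of the vectors, train on the rest, then score the holdout
  ./bin/mahout split -i ${WORK_DIR}/20news-vectors/tfidf-vectors \
    --trainingOutput ${WORK_DIR}/20news-train-vectors \
    --testOutput ${WORK_DIR}/20news-test-vectors \
    --randomSelectionPct 40 --overwrite --sequenceFiles -xm sequential
  ./bin/mahout trainnb -i ${WORK_DIR}/20news-train-vectors -o ${WORK_DIR}/model \
    -li ${WORK_DIR}/labelindex -ow
  ./bin/mahout testnb -i ${WORK_DIR}/20news-test-vectors -m ${WORK_DIR}/model \
    -l ${WORK_DIR}/labelindex -ow -o ${WORK_DIR}/20news-testing
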
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/bin/classify-wikipedia.sh
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/classify-wikipedia.sh b/community/mahout-mr/examples/bin/classify-wikipedia.sh
deleted file mode 100755
index 41dc0c9..0000000
--- a/community/mahout-mr/examples/bin/classify-wikipedia.sh
+++ /dev/null
@@ -1,196 +0,0 @@
-#!/bin/bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-#
-# Downloads a (partial) wikipedia dump, trains and tests a classifier.
-#
-# To run:  change into the mahout directory and type:
-# examples/bin/classify-wikipedia.sh
-
-if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
-  echo "This script Bayes and CBayes classifiers over the last wikipedia dump."
-  exit
-fi
-
-# ensure that MAHOUT_HOME is set
-if [[ -z "$MAHOUT_HOME" ]]; then
-  echo "Please set MAHOUT_HOME."
-  exit
-fi
-
-SCRIPT_PATH=${0%/*}
-if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
-  cd $SCRIPT_PATH
-fi
-START_PATH=`pwd`
-
-# Set commands for dfs
-source ${START_PATH}/set-dfs-commands.sh
-
-if [[ -z "$MAHOUT_WORK_DIR" ]]; then
-  WORK_DIR=/tmp/mahout-work-wiki
-else
-  WORK_DIR=$MAHOUT_WORK_DIR
-fi
-algorithm=( CBayes BinaryCBayes clean)
-if [ -n "$1" ]; then
-  choice=$1
-else
-  echo "Please select a number to choose the corresponding task to run"
-  echo "1. ${algorithm[0]} (may require increased heap space on yarn)"
-  echo "2. ${algorithm[1]}"
-  echo "3. ${algorithm[2]} -- cleans up the work area in $WORK_DIR"
-  read -p "Enter your choice : " choice
-fi
-
-echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]}"
-alg=${algorithm[$choice-1]}
-
-if [ "x$alg" != "xclean" ]; then
-  echo "creating work directory at ${WORK_DIR}"
-
-  mkdir -p ${WORK_DIR}
-    if [ ! -e ${WORK_DIR}/wikixml ]; then
-        mkdir -p ${WORK_DIR}/wikixml
-    fi
-    if [ ! -e ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2 ]; then
-        echo "Downloading wikipedia XML dump"
-        ########################################################   
-        #  Datasets: uncomment and run "clean" to change dataset   
-        ########################################################
-        ########## partial small 42.5M zipped
-        # curl https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles1.xml-p000000010p000030302.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2
-        ########## partial larger 256M zipped
-        curl https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles10.xml-p2336425p3046511.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2
-        ######### full wikipedia dump: 10G zipped
-        # curl https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2
-        ########################################################
-    fi
-    if [ ! -e ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml ]; then
-        echo "Extracting..."
-       
-        cd ${WORK_DIR}/wikixml && bunzip2 enwiki-latest-pages-articles.xml.bz2 && cd .. && cd ..
-    fi
-
-echo $START_PATH
-
-set -e
-
-if [ "x$alg" == "xCBayes" ] || [ "x$alg" == "xBinaryCBayes" ] ; then
-
-  set -x
-  echo "Preparing wikipedia data"
-  rm -rf ${WORK_DIR}/wiki
-  mkdir ${WORK_DIR}/wiki
-  
-  if [ "x$alg" == "xCBayes" ] ; then
-    # use a list of 10 countries as categories
-    cp $MAHOUT_HOME/examples/bin/resources/country10.txt ${WORK_DIR}/country.txt
-    chmod 666 ${WORK_DIR}/country.txt
-  fi
-  
-  if [ "x$alg" == "xBinaryCBayes" ] ; then
-    # use United States and United Kingdom as categories
-    cp $MAHOUT_HOME/examples/bin/resources/country2.txt ${WORK_DIR}/country.txt
-    chmod 666 ${WORK_DIR}/country.txt
-  fi
-
-  if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
-    echo "Copying wikipedia data to HDFS"
-    set +e
-    $DFSRM ${WORK_DIR}/wikixml
-    $DFS -mkdir -p ${WORK_DIR}
-    set -e
-    $DFS -put ${WORK_DIR}/wikixml ${WORK_DIR}/wikixml
-  fi
-
-  echo "Creating sequence files from wikiXML"
-  $MAHOUT_HOME/bin/mahout seqwiki -c ${WORK_DIR}/country.txt \
-                                  -i ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml \
-                                  -o ${WORK_DIR}/wikipediainput
-   
-  # if using the 10 class problem use bigrams
-  if [ "x$alg" == "xCBayes" ] ; then
-    echo "Converting sequence files to vectors using bigrams"
-    $MAHOUT_HOME/bin/mahout seq2sparse -i ${WORK_DIR}/wikipediainput \
-                                       -o ${WORK_DIR}/wikipediaVecs \
-                                       -wt tfidf \
-                                       -lnorm -nv \
-                                       -ow -ng 2
-  fi
-  
-  # if using the 2 class problem try different options
-  if [ "x$alg" == "xBinaryCBayes" ] ; then
-    echo "Converting sequence files to vectors using unigrams and a max document frequency of 30%"
-    $MAHOUT_HOME/bin/mahout seq2sparse -i ${WORK_DIR}/wikipediainput \
-                                       -o ${WORK_DIR}/wikipediaVecs \
-                                       -wt tfidf \
-                                       -lnorm \
-                                       -nv \
-                                       -ow \
-                                       -ng 1 \
-                                       -x 30
-  fi
-  
-  echo "Creating training and holdout set with a random 80-20 split of the generated vector dataset"
-  $MAHOUT_HOME/bin/mahout split -i ${WORK_DIR}/wikipediaVecs/tfidf-vectors/ \
-                                --trainingOutput ${WORK_DIR}/training \
-                                --testOutput ${WORK_DIR}/testing \
-                                -rp 20 \
-                                -ow \
-                                -seq \
-                                -xm sequential
-
-  echo "Training Naive Bayes model"
-  $MAHOUT_HOME/bin/mahout trainnb -i ${WORK_DIR}/training \
-                                  -o ${WORK_DIR}/model \
-                                  -li ${WORK_DIR}/labelindex \
-                                  -ow \
-                                  -c
-
-  echo "Self testing on training set"
-  $MAHOUT_HOME/bin/mahout testnb -i ${WORK_DIR}/training \
-                                 -m ${WORK_DIR}/model \
-                                 -l ${WORK_DIR}/labelindex \
-                                 -ow \
-                                 -o ${WORK_DIR}/output \
-                                 -c
-
-  echo "Testing on holdout set: Bayes"
-  $MAHOUT_HOME/bin/mahout testnb -i ${WORK_DIR}/testing \
-                                 -m ${WORK_DIR}/model \
-                                 -l ${WORK_DIR}/labelindex \
-                                 -ow \
-                                 -o ${WORK_DIR}/output \
-                                 -seq
-
- echo "Testing on holdout set: CBayes"
-  $MAHOUT_HOME/bin/mahout testnb -i ${WORK_DIR}/testing \
-                                 -m ${WORK_DIR}/model -l \
-                                 ${WORK_DIR}/labelindex \
-                                 -ow \
-                                 -o ${WORK_DIR}/output  \
-                                 -c \
-                                 -seq
-fi
-
-elif [ "x$alg" == "xclean" ]; then
-  rm -rf $WORK_DIR
-  $DFSRM $WORK_DIR
-fi
-# Remove the work directory

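The deleted classify-wikipedia.sh above can also be driven non-interactively by passing the menu choice as its first argument. A brief sketch, assuming MAHOUT_HOME points at the Mahout tree (the install path shown is only illustrative):

  export MAHOUT_HOME=/opt/mahout        # illustrative location of the Mahout tree
  export MAHOUT_WORK_DIR=/tmp/mahout-work-wiki

  # 1 = CBayes over ten country categories, 2 = BinaryCBayes (United States
  # vs. United Kingdom), 3 = clean up the work area locally and on HDFS
  $MAHOUT_HOME/examples/bin/classify-wikipedia.sh 2
  $MAHOUT_HOME/examples/bin/classify-wikipedia.sh 3
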
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/bin/cluster-reuters.sh
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/cluster-reuters.sh b/community/mahout-mr/examples/bin/cluster-reuters.sh
deleted file mode 100755
index 49f6c94..0000000
--- a/community/mahout-mr/examples/bin/cluster-reuters.sh
+++ /dev/null
@@ -1,203 +0,0 @@
-#!/bin/bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-#
-# Downloads the Reuters dataset and prepares it for clustering
-#
-# To run:  change into the mahout directory and type:
-#  examples/bin/cluster-reuters.sh
-
-if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
-  echo "This script clusters the Reuters data set using a variety of algorithms.  The data set is downloaded automatically."
-  exit
-fi
-
-SCRIPT_PATH=${0%/*}
-if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then 
-  cd $SCRIPT_PATH
-fi
-START_PATH=`pwd`
-
-# Set commands for dfs
-source ${START_PATH}/set-dfs-commands.sh
-
-MAHOUT="../../bin/mahout"
-
-if [ ! -e $MAHOUT ]; then
-  echo "Can't find mahout driver in $MAHOUT, cwd `pwd`, exiting.."
-  exit 1
-fi
-
-if [[ -z "$MAHOUT_WORK_DIR" ]]; then
-  WORK_DIR=/tmp/mahout-work-${USER}
-else
-  WORK_DIR=$MAHOUT_WORK_DIR
-fi
-
-algorithm=( kmeans fuzzykmeans lda streamingkmeans clean)
-if [ -n "$1" ]; then
-  choice=$1
-else
-  echo "Please select a number to choose the corresponding clustering algorithm"
-  echo "1. ${algorithm[0]} clustering (runs from this example script in cluster mode only)" 
-  echo "2. ${algorithm[1]} clustering (may require increased heap space on yarn)"
-  echo "3. ${algorithm[2]} clustering"
-  echo "4. ${algorithm[3]} clustering"
-  echo "5. ${algorithm[4]} -- cleans up the work area in $WORK_DIR"
-  read -p "Enter your choice : " choice
-fi
-
-echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]} Clustering"
-clustertype=${algorithm[$choice-1]}
-
-if [ "x$clustertype" == "xclean" ]; then
-  rm -rf $WORK_DIR
-  $DFSRM $WORK_DIR
-  exit 0
-else
-  $DFS -mkdir -p $WORK_DIR
-  mkdir -p $WORK_DIR
-  echo "Creating work directory at ${WORK_DIR}"
-fi
-if [ ! -e ${WORK_DIR}/reuters-out-seqdir ]; then
-  if [ ! -e ${WORK_DIR}/reuters-out ]; then
-    if [ ! -e ${WORK_DIR}/reuters-sgm ]; then
-      if [ ! -f ${WORK_DIR}/reuters21578.tar.gz ]; then
-	  if [ -n "$2" ]; then
-	      echo "Copying Reuters from local download"
-	      cp $2 ${WORK_DIR}/reuters21578.tar.gz
-	  else
-              echo "Downloading Reuters-21578"
-              curl http://kdd.ics.uci.edu/databases/reuters21578/reuters21578.tar.gz -o ${WORK_DIR}/reuters21578.tar.gz
-	  fi
-      fi
-      #make sure it was actually downloaded
-      if [ ! -f ${WORK_DIR}/reuters21578.tar.gz ]; then
-	  echo "Failed to download reuters"
-	  exit 1
-      fi
-      mkdir -p ${WORK_DIR}/reuters-sgm
-      echo "Extracting..."
-      tar xzf ${WORK_DIR}/reuters21578.tar.gz -C ${WORK_DIR}/reuters-sgm
-    fi
-    echo "Extracting Reuters"
-    $MAHOUT org.apache.lucene.benchmark.utils.ExtractReuters ${WORK_DIR}/reuters-sgm ${WORK_DIR}/reuters-out
-    if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
-        echo "Copying Reuters data to Hadoop"
-        set +e
-        $DFSRM ${WORK_DIR}/reuters-sgm
-        $DFSRM ${WORK_DIR}/reuters-out
-        $DFS -mkdir -p ${WORK_DIR}/
-        $DFS -mkdir ${WORK_DIR}/reuters-sgm
-        $DFS -mkdir ${WORK_DIR}/reuters-out
-        $DFS -put ${WORK_DIR}/reuters-sgm ${WORK_DIR}/reuters-sgm
-        $DFS -put ${WORK_DIR}/reuters-out ${WORK_DIR}/reuters-out
-        set -e
-    fi
-  fi
-  echo "Converting to Sequence Files from Directory"
-  $MAHOUT seqdirectory -i ${WORK_DIR}/reuters-out -o ${WORK_DIR}/reuters-out-seqdir -c UTF-8 -chunk 64 -xm sequential
-fi
-
-if [ "x$clustertype" == "xkmeans" ]; then
-  $MAHOUT seq2sparse \
-    -i ${WORK_DIR}/reuters-out-seqdir/ \
-    -o ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans --maxDFPercent 85 --namedVector \
-  && \
-  $MAHOUT kmeans \
-    -i ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans/tfidf-vectors/ \
-    -c ${WORK_DIR}/reuters-kmeans-clusters \
-    -o ${WORK_DIR}/reuters-kmeans \
-    -dm org.apache.mahout.common.distance.EuclideanDistanceMeasure \
-    -x 10 -k 20 -ow --clustering \
-  && \
-  $MAHOUT clusterdump \
-    -i `$DFS -ls -d ${WORK_DIR}/reuters-kmeans/clusters-*-final | awk '{print $8}'` \
-    -o ${WORK_DIR}/reuters-kmeans/clusterdump \
-    -d ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans/dictionary.file-0 \
-    -dt sequencefile -b 100 -n 20 --evaluate -dm org.apache.mahout.common.distance.EuclideanDistanceMeasure -sp 0 \
-    --pointsDir ${WORK_DIR}/reuters-kmeans/clusteredPoints \
-    && \
-  cat ${WORK_DIR}/reuters-kmeans/clusterdump
-elif [ "x$clustertype" == "xfuzzykmeans" ]; then
-  $MAHOUT seq2sparse \
-    -i ${WORK_DIR}/reuters-out-seqdir/ \
-    -o ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans --maxDFPercent 85 --namedVector \
-  && \
-  $MAHOUT fkmeans \
-    -i ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans/tfidf-vectors/ \
-    -c ${WORK_DIR}/reuters-fkmeans-clusters \
-    -o ${WORK_DIR}/reuters-fkmeans \
-    -dm org.apache.mahout.common.distance.EuclideanDistanceMeasure \
-    -x 10 -k 20 -ow -m 1.1 \
-  && \
-  $MAHOUT clusterdump \
-    -i ${WORK_DIR}/reuters-fkmeans/clusters-*-final \
-    -o ${WORK_DIR}/reuters-fkmeans/clusterdump \
-    -d ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans/dictionary.file-0 \
-    -dt sequencefile -b 100 -n 20 -sp 0 \
-    && \
-  cat ${WORK_DIR}/reuters-fkmeans/clusterdump
-elif [ "x$clustertype" == "xlda" ]; then
-  $MAHOUT seq2sparse \
-    -i ${WORK_DIR}/reuters-out-seqdir/ \
-    -o ${WORK_DIR}/reuters-out-seqdir-sparse-lda -ow --maxDFPercent 85 --namedVector \
-  && \
-  $MAHOUT rowid \
-    -i ${WORK_DIR}/reuters-out-seqdir-sparse-lda/tfidf-vectors \
-    -o ${WORK_DIR}/reuters-out-matrix \
-  && \
-  rm -rf ${WORK_DIR}/reuters-lda ${WORK_DIR}/reuters-lda-topics ${WORK_DIR}/reuters-lda-model \
-  && \
-  $MAHOUT cvb \
-    -i ${WORK_DIR}/reuters-out-matrix/matrix \
-    -o ${WORK_DIR}/reuters-lda -k 20 -ow -x 20 \
-    -dict ${WORK_DIR}/reuters-out-seqdir-sparse-lda/dictionary.file-* \
-    -dt ${WORK_DIR}/reuters-lda-topics \
-    -mt ${WORK_DIR}/reuters-lda-model \
-  && \
-  $MAHOUT vectordump \
-    -i ${WORK_DIR}/reuters-lda-topics/part-m-00000 \
-    -o ${WORK_DIR}/reuters-lda/vectordump \
-    -vs 10 -p true \
-    -d ${WORK_DIR}/reuters-out-seqdir-sparse-lda/dictionary.file-* \
-    -dt sequencefile -sort ${WORK_DIR}/reuters-lda-topics/part-m-00000 \
-    && \
-  cat ${WORK_DIR}/reuters-lda/vectordump
-elif [ "x$clustertype" == "xstreamingkmeans" ]; then
-  $MAHOUT seq2sparse \
-    -i ${WORK_DIR}/reuters-out-seqdir/ \
-    -o ${WORK_DIR}/reuters-out-seqdir-sparse-streamingkmeans -ow --maxDFPercent 85 --namedVector \
-  && \
-  rm -rf ${WORK_DIR}/reuters-streamingkmeans \
-  && \
-  $MAHOUT streamingkmeans \
-    -i ${WORK_DIR}/reuters-out-seqdir-sparse-streamingkmeans/tfidf-vectors/ \
-    --tempDir ${WORK_DIR}/tmp \
-    -o ${WORK_DIR}/reuters-streamingkmeans \
-    -sc org.apache.mahout.math.neighborhood.FastProjectionSearch \
-    -dm org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure \
-    -k 10 -km 100 -ow \
-  && \
-  $MAHOUT qualcluster \
-    -i ${WORK_DIR}/reuters-out-seqdir-sparse-streamingkmeans/tfidf-vectors/part-r-00000 \
-    -c ${WORK_DIR}/reuters-streamingkmeans/part-r-00000   \
-    -o ${WORK_DIR}/reuters-cluster-distance.csv \
-    && \
-  cat ${WORK_DIR}/reuters-cluster-distance.csv
-fi

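Likewise, the deleted cluster-reuters.sh takes its menu choice as the first argument and, optionally, an already-downloaded copy of the Reuters archive as the second, which skips the download. A brief sketch; the local tarball path is hypothetical:

  # 1 = kmeans, 2 = fuzzykmeans, 3 = lda, 4 = streamingkmeans, 5 = clean
  cd $MAHOUT_HOME                       # assumed: top of the Mahout checkout
  examples/bin/cluster-reuters.sh 1 /path/to/reuters21578.tar.gz
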
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/bin/cluster-syntheticcontrol.sh
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/cluster-syntheticcontrol.sh b/community/mahout-mr/examples/bin/cluster-syntheticcontrol.sh
deleted file mode 100755
index 796da33..0000000
--- a/community/mahout-mr/examples/bin/cluster-syntheticcontrol.sh
+++ /dev/null
@@ -1,105 +0,0 @@
-#!/bin/bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-#
-# Downloads the Synthetic control dataset and prepares it for clustering
-#
-# To run:  change into the mahout directory and type:
-#  examples/bin/cluster-syntheticcontrol.sh
-
-if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
-  echo "This script clusters the Synthetic Control data set.  The data set is downloaded automatically."
-  exit
-fi
-
-algorithm=( kmeans fuzzykmeans )
-if [ -n "$1" ]; then
-  choice=$1
-else
-  echo "Please select a number to choose the corresponding clustering algorithm"
-  echo "1. ${algorithm[0]} clustering"
-  echo "2. ${algorithm[1]} clustering"
-  read -p "Enter your choice : " choice
-fi
-echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]} Clustering"
-clustertype=${algorithm[$choice-1]}
-
-SCRIPT_PATH=${0%/*}
-if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
-  cd $SCRIPT_PATH
-fi
-START_PATH=`pwd`
-
-# Set commands for dfs
-source ${START_PATH}/set-dfs-commands.sh
-
-if [[ -z "$MAHOUT_WORK_DIR" ]]; then
-  WORK_DIR=/tmp/mahout-work-${USER}
-else
-  WORK_DIR=$MAHOUT_WORK_DIR
-fi
-
-echo "creating work directory at ${WORK_DIR}"
-mkdir -p ${WORK_DIR}
-if [ ! -f ${WORK_DIR}/synthetic_control.data ]; then
-  if [ -n "$2" ]; then
-    cp $2 ${WORK_DIR}/.
-  else
-    echo "Downloading Synthetic control data"
-    curl http://archive.ics.uci.edu/ml/databases/synthetic_control/synthetic_control.data  -o ${WORK_DIR}/synthetic_control.data
-  fi
-fi
-if [ ! -f ${WORK_DIR}/synthetic_control.data ]; then
-  echo "Couldn't download synthetic control"
-  exit 1
-fi
-if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ]; then
-  echo "Checking the health of DFS..."
-  $DFS -ls /
-  if [ $? -eq 0 ];then 
-    echo "DFS is healthy... "
-    echo "Uploading Synthetic control data to HDFS"
-    $DFSRM ${WORK_DIR}/testdata
-    $DFS -mkdir -p ${WORK_DIR}/testdata
-    $DFS -put ${WORK_DIR}/synthetic_control.data ${WORK_DIR}/testdata
-    echo "Successfully Uploaded Synthetic control data to HDFS "
-
-    options="--input ${WORK_DIR}/testdata --output ${WORK_DIR}/output --maxIter 10 --convergenceDelta 0.5"
-
-    if [ "${clustertype}" == "kmeans" ]; then
-      options="${options} --numClusters 6"
-      # t1 & t2 not used if --numClusters specified, but parser requires input
-      options="${options} --t1 1 --t2 2"
-      ../../bin/mahout org.apache.mahout.clustering.syntheticcontrol."${clustertype}".Job ${options}
-    else
-      options="${options} --m 2.0f --t1 80 --t2 55"
-      ../../bin/mahout org.apache.mahout.clustering.syntheticcontrol."${clustertype}".Job ${options}
-    fi
-  else
-    echo " HADOOP is not running. Please make sure you hadoop is running. "
-  fi
-elif [ "$MAHOUT_LOCAL" != "" ]; then
-  echo "running MAHOUT_LOCAL"
-  cp ${WORK_DIR}/synthetic_control.data testdata
-  ../../bin/mahout org.apache.mahout.clustering.syntheticcontrol."${clustertype}".Job
-  rm testdata
-else
-  echo " HADOOP_HOME variable is not set. Please set this environment variable and rerun the script"
-fi
-# Remove the work directory
-rm -rf ${WORK_DIR}

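For reference, the k-means branch of the deleted script above effectively invokes the synthetic-control Job class with the options below. A sketch, assuming the data has already been uploaded to ${WORK_DIR}/testdata on HDFS and the command runs from the top of the Mahout tree:

  WORK_DIR=/tmp/mahout-work-${USER}
  bin/mahout org.apache.mahout.clustering.syntheticcontrol.kmeans.Job \
    --input ${WORK_DIR}/testdata --output ${WORK_DIR}/output \
    --maxIter 10 --convergenceDelta 0.5 \
    --numClusters 6 --t1 1 --t2 2   # t1/t2 are ignored when --numClusters is given
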
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/bin/factorize-movielens-1M.sh
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/factorize-movielens-1M.sh b/community/mahout-mr/examples/bin/factorize-movielens-1M.sh
deleted file mode 100755
index 29730e1..0000000
--- a/community/mahout-mr/examples/bin/factorize-movielens-1M.sh
+++ /dev/null
@@ -1,85 +0,0 @@
-#!/bin/bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-# Instructions:
-#
-# Before using this script, you have to download and extract the Movielens 1M dataset
-# from http://www.grouplens.org/node/73
-#
-# To run:  change into the mahout directory and type:
-#  export MAHOUT_LOCAL=true
-# Then:
-#  examples/bin/factorize-movielens-1M.sh /path/to/ratings.dat
-
-if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
-  echo "This script runs the Alternating Least Squares Recommender on the Grouplens data set (size 1M)."
-  echo "Syntax: $0 /path/to/ratings.dat\n"
-  exit
-fi
-
-if [ $# -ne 1 ]
-then
-  echo -e "\nYou have to download the Movielens 1M dataset from http://www.grouplens.org/node/73 before"
-  echo -e "you can run this example. After that extract it and supply the path to the ratings.dat file.\n"
-  echo -e "Syntax: $0 /path/to/ratings.dat\n"
-  exit -1
-fi
-
-export MAHOUT_LOCAL=true
-MAHOUT="$MAHOUT_HOME/bin/mahout"
-
-if [[ -z "$MAHOUT_WORK_DIR" ]]; then
-  WORK_DIR=/tmp/mahout-work-${USER}
-else
-  WORK_DIR=$MAHOUT_WORK_DIR
-fi
-
-echo "creating work directory at ${WORK_DIR}"
-mkdir -p ${WORK_DIR}/movielens
-
-echo "Converting ratings..."
-cat $1 |sed -e s/::/,/g| cut -d, -f1,2,3 > ${WORK_DIR}/movielens/ratings.csv
-
-# create a 90% percent training set and a 10% probe set
-$MAHOUT splitDataset --input ${WORK_DIR}/movielens/ratings.csv --output ${WORK_DIR}/dataset \
-    --trainingPercentage 0.9 --probePercentage 0.1 --tempDir ${WORK_DIR}/dataset/tmp
-
-# run distributed ALS-WR to factorize the rating matrix defined by the training set
-$MAHOUT parallelALS --input ${WORK_DIR}/dataset/trainingSet/ --output ${WORK_DIR}/als/out \
-    --tempDir ${WORK_DIR}/als/tmp --numFeatures 20 --numIterations 10 --lambda 0.065 --numThreadsPerSolver 2
-
-# compute predictions against the probe set, measure the error
-$MAHOUT evaluateFactorization --input ${WORK_DIR}/dataset/probeSet/ --output ${WORK_DIR}/als/rmse/ \
-    --userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/ --tempDir ${WORK_DIR}/als/tmp
-
-# compute recommendations
-$MAHOUT recommendfactorized --input ${WORK_DIR}/als/out/userRatings/ --output ${WORK_DIR}/recommendations/ \
-    --userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/ \
-    --numRecommendations 6 --maxRating 5 --numThreads 2
-
-# print the error
-echo -e "\nRMSE is:\n"
-cat ${WORK_DIR}/als/rmse/rmse.txt
-echo -e "\n"
-
-echo -e "\nSample recommendations:\n"
-shuf ${WORK_DIR}/recommendations/part-m-00000 |head
-echo -e "\n\n"
-
-echo "removing work directory"
-rm -rf ${WORK_DIR}

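To make the ratings conversion in the deleted script above concrete: MovieLens 1M ratings.dat lines have the form UserID::MovieID::Rating::Timestamp, and the sed/cut pair keeps only the user,item,rating triple that splitDataset and parallelALS consume. A one-line check:

  echo "1::1193::5::978300760" | sed -e s/::/,/g | cut -d, -f1,2,3
  # prints: 1,1193,5
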
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/bin/factorize-netflix.sh
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/factorize-netflix.sh b/community/mahout-mr/examples/bin/factorize-netflix.sh
deleted file mode 100755
index 26faf66..0000000
--- a/community/mahout-mr/examples/bin/factorize-netflix.sh
+++ /dev/null
@@ -1,90 +0,0 @@
-#!/bin/bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-# Instructions:
-#
-# You can only use this script in conjunction with the Netflix dataset. Unpack the Netflix dataset and provide the
-# following:
-#
-#   1) the path to the folder 'training_set' that contains all the movie rating files
-#   2) the path to the file 'qualifying.txt' that contains the user,item pairs to predict
-#   3) the path to the file 'judging.txt' that contains the ratings of user,item pairs to predict for
-#
-# To run:
-#  ./factorize-netflix.sh /path/to/training_set/ /path/to/qualifying.txt /path/to/judging.txt
-
-echo "Note this script has been deprecated due to the lack of access to the Netflix data set."
-exit 1
-
-if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
-  echo "This script runs the ALS Recommender on the Netflix data set."
-  echo "Syntax: $0 /path/to/training_set/ /path/to/qualifying.txt /path/to/judging.txt\n"
-  exit
-fi
-
-if [ $# -ne 3 ]
-then
-  echo -e "Syntax: $0 /path/to/training_set/ /path/to/qualifying.txt /path/to/judging.txt\n"
-  exit -1
-fi
-
-MAHOUT="../../bin/mahout"
-
-if [[ -z "$MAHOUT_WORK_DIR" ]]; then
-  WORK_DIR=/tmp/mahout-work-${USER}
-else
-  WORK_DIR=$MAHOUT_WORK_DIR
-fi
-
-START_PATH=`pwd`
-
-# Set commands for dfs
-source ${START_PATH}/set-dfs-commands.sh
-
-echo "Preparing data..."
-$MAHOUT org.apache.mahout.cf.taste.hadoop.example.als.netflix.NetflixDatasetConverter $1 $2 $3 ${WORK_DIR}
-
-# run distributed ALS-WR to factorize the rating matrix defined by the training set
-$MAHOUT parallelALS --input ${WORK_DIR}/trainingSet/ratings.tsv --output ${WORK_DIR}/als/out \
-    --tempDir ${WORK_DIR}/als/tmp --numFeatures 25 --numIterations 10 --lambda 0.065 --numThreadsPerSolver 4
-
-# compute predictions against the probe set, measure the error
-$MAHOUT evaluateFactorization --input ${WORK_DIR}/probeSet/ratings.tsv --output ${WORK_DIR}/als/rmse/ \
-    --userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/ --tempDir ${WORK_DIR}/als/tmp
-
-if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
-
-  # print the error, should be around 0.923
-  echo -e "\nRMSE is:\n"
-  $DFS -tail ${WORK_DIR}/als/rmse/rmse.txt
-  echo -e "\n"
-  echo "removing work directory"
-  set +e
-  $DFSRM ${WORK_DIR}
-
-else
-
-  # print the error, should be around 0.923
-  echo -e "\nRMSE is:\n"
-  cat ${WORK_DIR}/als/rmse/rmse.txt
-  echo -e "\n"
-  echo "removing work directory"
-  rm -rf ${WORK_DIR}
-
-fi
-

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/bin/get-all-examples.sh
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/get-all-examples.sh b/community/mahout-mr/examples/bin/get-all-examples.sh
deleted file mode 100755
index 4128e47..0000000
--- a/community/mahout-mr/examples/bin/get-all-examples.sh
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/usr/bin/env bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Clones Mahout example code from remote repositories with their own 
-# build process.  Follow the README for each example for instructions.
-#
-# Usage:  change into the mahout directory and type:
-#  examples/bin/get-all-examples.sh
-
-# Solr-recommender
-echo " Solr-recommender example: "
-echo " 1) imports text 'log files' of some delimited form for user preferences"
-echo " 2) creates the correct Mahout files and stores distionaries to translate external Id to and from Mahout Ids"
-echo " 3) it implements a prototype two actions 'cross-recommender', which takes two actions made by the same user and creates recommendations"
-echo " 4) it creates output for user->preference history CSV and and item->similar items 'similarity' matrix for use in a Solr-recommender."
-echo "    To use Solr you would index the similarity matrix CSV, and use user preference history from the history CSV as a query, the result"
-echo "    from Solr will be an ordered list of recommendations returning the same item Ids as were input."
-echo " For further description see the README.md here https://github.com/pferrel/solr-recommender"
-echo " To build run 'cd solr-recommender; mvn install'"
-echo " To process the example after building make sure MAHOUT_LOCAL IS SET and hadoop is in local mode then "
-echo " run 'cd scripts; ./solr-recommender-example'"
-git clone https://github.com/pferrel/solr-recommender

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/bin/lda.algorithm
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/lda.algorithm b/community/mahout-mr/examples/bin/lda.algorithm
deleted file mode 100644
index fb84ea0..0000000
--- a/community/mahout-mr/examples/bin/lda.algorithm
+++ /dev/null
@@ -1,45 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-merge.policy=org.apache.lucene.index.LogDocMergePolicy
-merge.factor=mrg:10:20
-max.buffered=buf:100:1000
-compound=true
-
-analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
-directory=FSDirectory
-
-doc.stored=true
-doc.term.vector=true
-doc.tokenized=true
-log.step=600
-
-content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
-content.source.forever=false
-doc.maker.forever=false
-query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker
-
-# task at this depth or less would print when they start
-task.max.depth.log=2
-
-log.queries=false
-# --------- alg
-{ "BuildReuters"
-  CreateIndex 
-  { "AddDocs" AddDoc > : *
-#  Optimize
-  CloseIndex
-}
-
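The deleted lda.algorithm file above is a Lucene benchmark task definition that builds a Reuters index with stored, tokenized fields and term vectors. A sketch of how such a file is typically driven, assuming the Lucene benchmark jar and its dependencies are on the classpath (the jar names below are placeholders):

  # Jar names are placeholders -- substitute the versions actually on hand.
  java -cp "lucene-core.jar:lucene-benchmark.jar:lucene-analyzers-common.jar" \
    org.apache.lucene.benchmark.byTask.Benchmark \
    community/mahout-mr/examples/bin/lda.algorithm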