Posted to commits@mahout.apache.org by sm...@apache.org on 2014/02/28 09:49:57 UTC

svn commit: r1572878 - in /mahout/trunk: examples/bin/README.txt examples/bin/create-rf-data.sh examples/bin/run-rf.sh src/conf/driver.classes.default.props

Author: smarthi
Date: Fri Feb 28 08:49:57 2014
New Revision: 1572878

URL: http://svn.apache.org/r1572878
Log:
MAHOUT-1419: Add example shell scripts for testing Random Forests

Added:
    mahout/trunk/examples/bin/create-rf-data.sh   (with props)
    mahout/trunk/examples/bin/run-rf.sh   (with props)
Modified:
    mahout/trunk/examples/bin/README.txt
    mahout/trunk/src/conf/driver.classes.default.props

Modified: mahout/trunk/examples/bin/README.txt
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/bin/README.txt?rev=1572878&r1=1572877&r2=1572878&view=diff
==============================================================================
--- mahout/trunk/examples/bin/README.txt (original)
+++ mahout/trunk/examples/bin/README.txt Fri Feb 28 08:49:57 2014
@@ -6,4 +6,5 @@ classify-20newsgroups.sh -- Run SGD and 
 cluster-reuters.sh -- Cluster the Reuters data set using a variety of algorithms.  Downloads the data set automatically.
 cluster-syntheticcontrol.sh -- Cluster the Synthetic Control data set.  Downloads the data set automatically.
 factorize-movielens-1m.sh -- Run the Alternating Least Squares Recommender on the Grouplens data set (size 1M).
-factorize-netflix.sh -- Run the ALS Recommender on the Netflix data set
\ No newline at end of file
+factorize-netflix.sh -- Run the ALS Recommender on the Netflix data set
+run-rf.sh -- Generate synthetic data, train a random forest, and test its accuracy on a holdout set.
\ No newline at end of file
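
A usage sketch for the new run-rf.sh entry above (the row count is an arbitrary example):

    # From the top of a Mahout checkout, with $HADOOP_HOME set: generate
    # 100000 rows of synthetic data, train a random forest on a random 60%
    # split, and report accuracy on the remaining 40%.
    ./examples/bin/run-rf.sh 100000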

Added: mahout/trunk/examples/bin/create-rf-data.sh
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/bin/create-rf-data.sh?rev=1572878&view=auto
==============================================================================
--- mahout/trunk/examples/bin/create-rf-data.sh (added)
+++ mahout/trunk/examples/bin/create-rf-data.sh Fri Feb 28 08:49:57 2014
@@ -0,0 +1,36 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#
+# Create a synthetic data set with four numeric fields and a boolean (0/1) label.
+#
+# Requires Scala; invoked by run-rf.sh.
+
+exec scala "$0" "$@"
+!#
+val r = new scala.util.Random()
+val pw = new java.io.PrintWriter(args(1))
+val numRows = args(0).toInt
+(1 to numRows).foreach(_ =>
+  pw.println(r.nextDouble() + "," +
+  r.nextDouble() + "," +
+  r.nextDouble() + "," +
+  r.nextDouble() + "," +
+  (if (r.nextBoolean()) 1 else 0))
+)
+pw.close()
+

Propchange: mahout/trunk/examples/bin/create-rf-data.sh
------------------------------------------------------------------------------
    svn:executable = *
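
A quick sanity check of the generator (row count and file name are arbitrary):

    # Emit five rows and inspect them; each line should hold four
    # comma-separated doubles in [0,1) followed by a 0/1 label.
    ./examples/bin/create-rf-data.sh 5 sample.csv
    cat sample.csv
    rm sample.csv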

Added: mahout/trunk/examples/bin/run-rf.sh
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/bin/run-rf.sh?rev=1572878&view=auto
==============================================================================
--- mahout/trunk/examples/bin/run-rf.sh (added)
+++ mahout/trunk/examples/bin/run-rf.sh Fri Feb 28 08:49:57 2014
@@ -0,0 +1,89 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#
+# Requires Scala, and $HADOOP_HOME must be set.
+#
+# Creates test data for a random forest classifier, splits the data into
+# train and test sets, trains the classifier on the training set, then
+# tests the model on the test set.
+#
+# To run:  change into the mahout directory and type:
+# ./examples/bin/run-rf.sh <num-rows>
+WORK_DIR=/tmp/mahout-work-${USER}/
+input="rf-input.csv"
+
+
+# Remove old files
+echo
+echo "Removing old temp files if they exist; this will mention they're not there if not."
+echo
+$HADOOP_HOME/bin/hadoop fs -rmr -skipTrash $WORK_DIR forest
+$HADOOP_HOME/bin/hadoop fs -mkdir $WORK_DIR
+
+# Create test data
+numrows=$1
+echo
+echo "Writing random data to $input"
+./examples/bin/create-rf-data.sh $numrows $input
+
+# Put the test file in HDFS
+$HADOOP_HOME/bin/hadoop fs -rmr -skipTrash ${WORK_DIR}
+$HADOOP_HOME/bin/hadoop fs -mkdir -p ${WORK_DIR}/input
+if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
+  HADOOP="$HADOOP_HOME/bin/hadoop"
+  if [ ! -e $HADOOP ]; then
+    echo "Can't find hadoop in $HADOOP, exiting"
+    exit 1
+  fi
+fi
+if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
+  echo "Copying random data to HDFS"
+  set +e
+  $HADOOP dfs -rmr ${WORK_DIR}
+  set -e
+  $HADOOP dfs -put $input ${WORK_DIR}/input/$input
+fi
+
+# Split original file into train and test
+echo "Creating training and holdout set with a random 60-40 split of the generated vector dataset"
+./bin/mahout split \
+  -i ${WORK_DIR}/input \
+  --trainingOutput ${WORK_DIR}/train.csv \
+  --testOutput ${WORK_DIR}/test.csv \
+  --randomSelectionPct 40 --overwrite -xm sequential
+
+# Describe input file schema
+# Note:  "-d 4 N L" indicates four numerical fields and one label, as built by the step above.
+./bin/mahout describe -p $WORK_DIR/input/$input -f $WORK_DIR/info -d 4 N L
+
+# Train rf model
+echo
+echo "Training random forest."
+echo
+./bin/mahout buildforest -DXmx10000m -Dmapred.max.split.size=1000000 -d $WORK_DIR/train.csv -ds $WORK_DIR/info -sl 7 -p -t 500 -o $WORK_DIR/forest
+
+# Test predictions
+echo
+echo "Testing predictions on test set."
+echo
+./bin/mahout testforest -DXmx10000m -Dmapred.output.compress=false -i $WORK_DIR/test.csv -ds $WORK_DIR/info -m $WORK_DIR/forest -a -mr -o $WORK_DIR/predictions
+
+# Remove old files
+$HADOOP_HOME/bin/hadoop fs -rmr -skipTrash $WORK_DIR
+rm $input
+

Propchange: mahout/trunk/examples/bin/run-rf.sh
------------------------------------------------------------------------------
    svn:executable = *
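
The describe step's "-d" argument is a compact schema: an optional repeat count followed by a type token (N numeric, C categorical, L label), read left to right. Beyond the "4 N L" used by run-rf.sh, a sketch with a hypothetical mixed layout:

    # Five columns: two numeric, one categorical, one numeric, then the label.
    # (data.csv and its layout are illustrative, not part of this commit.)
    ./bin/mahout describe -p $WORK_DIR/input/data.csv -f $WORK_DIR/info \
      -d 2 N C 1 N L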
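
Stripped of the JVM property flags, the train and test stages of run-rf.sh reduce to the following; the flag glosses are a best-effort reading of the df.mapreduce drivers:

    # Train: -t 500 trees, -sl 7 variables sampled at each node, -p selects
    # the partial (mapreduce) builder; the model is written to $WORK_DIR/forest.
    ./bin/mahout buildforest -d $WORK_DIR/train.csv -ds $WORK_DIR/info \
      -sl 7 -p -t 500 -o $WORK_DIR/forest

    # Test: -a prints a confusion matrix and accuracy, -mr scores with a
    # mapreduce job; predictions are written to $WORK_DIR/predictions.
    ./bin/mahout testforest -i $WORK_DIR/test.csv -ds $WORK_DIR/info \
      -m $WORK_DIR/forest -a -mr -o $WORK_DIR/predictions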

Modified: mahout/trunk/src/conf/driver.classes.default.props
URL: http://svn.apache.org/viewvc/mahout/trunk/src/conf/driver.classes.default.props?rev=1572878&r1=1572877&r2=1572878&view=diff
==============================================================================
--- mahout/trunk/src/conf/driver.classes.default.props (original)
+++ mahout/trunk/src/conf/driver.classes.default.props Fri Feb 28 08:49:57 2014
@@ -17,6 +17,7 @@ org.apache.mahout.text.SequenceFilesFrom
 org.apache.mahout.utils.ConcatenateVectorsJob = concatmatrices : Concatenates 2 matrices of same cardinality into a single matrix
 org.apache.mahout.clustering.streaming.tools.ResplitSequenceFiles = resplit : Splits a set of SequenceFiles into a number of equal splits
 org.apache.mahout.clustering.streaming.tools.ClusterQualitySummarizer = qualcluster : Runs clustering experiments and summarizes results in a CSV
+org.apache.mahout.classifier.df.tools.Describe = describe : Describe the fields and target variable in a data set
 
 #Math
 org.apache.mahout.math.hadoop.TransposeJob = transpose : Take the transpose of a matrix
@@ -26,6 +27,7 @@ org.apache.mahout.math.hadoop.decomposer
 org.apache.mahout.math.hadoop.similarity.cooccurrence.RowSimilarityJob = rowsimilarity : Compute the pairwise similarities of the rows of a matrix
 org.apache.mahout.math.hadoop.similarity.VectorDistanceSimilarityJob =  vecdist : Compute the distances between a set of Vectors (or Cluster or Canopy, they must fit in memory) and a list of Vectors
 org.apache.mahout.math.hadoop.stochasticsvd.SSVDCli = ssvd : Stochastic SVD
+
 #Clustering
 org.apache.mahout.clustering.kmeans.KMeansDriver = kmeans : K-means clustering
 org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver = fkmeans : Fuzzy K-means clustering
@@ -40,6 +42,9 @@ org.apache.mahout.clustering.streaming.m
 #new bayes
 org.apache.mahout.classifier.naivebayes.training.TrainNaiveBayesJob = trainnb : Train the Vector-based Bayes classifier
 org.apache.mahout.classifier.naivebayes.test.TestNaiveBayesDriver = testnb : Test the Vector-based Bayes classifier
+org.apache.mahout.classifier.df.mapreduce.BuildForest = buildforest : Build the random forest classifier
+org.apache.mahout.classifier.df.mapreduce.TestForest = testforest : Test the random forest classifier
+
 #SGD
 org.apache.mahout.classifier.sgd.TrainLogistic = trainlogistic : Train a logistic regression using stochastic gradient descent
 org.apache.mahout.classifier.sgd.RunLogistic = runlogistic : Run a logistic regression model against CSV data
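
With these entries registered, the new subcommands resolve through the mahout launcher. A minimal smoke test (assuming a built checkout; each driver should print its usage text rather than an "Unknown program" error):

    ./bin/mahout describe
    ./bin/mahout buildforest
    ./bin/mahout testforest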