Posted to commits@mahout.apache.org by ra...@apache.org on 2018/06/27 13:14:49 UTC

[23/24] mahout git commit: MAHOUT-2034 Split MR and New Examples into separate modules

http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/community/mahout-mr/examples/bin/cluster-syntheticcontrol.sh
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/cluster-syntheticcontrol.sh b/community/mahout-mr/examples/bin/cluster-syntheticcontrol.sh
new file mode 100755
index 0000000..796da33
--- /dev/null
+++ b/community/mahout-mr/examples/bin/cluster-syntheticcontrol.sh
@@ -0,0 +1,105 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+#
+# Downloads the Synthetic control dataset and prepares it for clustering
+#
+# To run:  change into the mahout directory and type:
+#  examples/bin/cluster-syntheticcontrol.sh
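+#
+# A hypothetical invocation that skips the interactive prompt, passing the
+# algorithm choice (1 = kmeans, 2 = fuzzykmeans) and, optionally, a local copy
+# of the data file as the second argument:
+#  examples/bin/cluster-syntheticcontrol.sh 1 /path/to/synthetic_control.data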
+
+if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
+  echo "This script clusters the Synthetic Control data set.  The data set is downloaded automatically."
+  exit
+fi
+
+algorithm=( kmeans fuzzykmeans )
+if [ -n "$1" ]; then
+  choice=$1
+else
+  echo "Please select a number to choose the corresponding clustering algorithm"
+  echo "1. ${algorithm[0]} clustering"
+  echo "2. ${algorithm[1]} clustering"
+  read -p "Enter your choice : " choice
+fi
+echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]} Clustering"
+clustertype=${algorithm[$choice-1]}
+
+SCRIPT_PATH=${0%/*}
+if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
+  cd "$SCRIPT_PATH"
+fi
+START_PATH=`pwd`
+
+# Set commands for dfs
+source ${START_PATH}/set-dfs-commands.sh
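+# set-dfs-commands.sh is expected to define the $DFS and $DFSRM commands used
+# for the HDFS operations below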
+
+if [[ -z "$MAHOUT_WORK_DIR" ]]; then
+  WORK_DIR=/tmp/mahout-work-${USER}
+else
+  WORK_DIR=$MAHOUT_WORK_DIR
+fi
+
+echo "creating work directory at ${WORK_DIR}"
+mkdir -p ${WORK_DIR}
+if [ ! -f ${WORK_DIR}/synthetic_control.data ]; then
+  if [ -n "$2" ]; then
+    cp "$2" ${WORK_DIR}/synthetic_control.data
+  else
+    echo "Downloading Synthetic control data"
+    curl http://archive.ics.uci.edu/ml/databases/synthetic_control/synthetic_control.data  -o ${WORK_DIR}/synthetic_control.data
+  fi
+fi
+if [ ! -f ${WORK_DIR}/synthetic_control.data ]; then
+  echo "Couldn't download synthetic control"
+  exit 1
+fi
+if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ]; then
+  echo "Checking the health of DFS..."
+  $DFS -ls /
+  if [ $? -eq 0 ]; then
+    echo "DFS is healthy... "
+    echo "Uploading Synthetic control data to HDFS"
+    $DFSRM ${WORK_DIR}/testdata
+    $DFS -mkdir -p ${WORK_DIR}/testdata
+    $DFS -put ${WORK_DIR}/synthetic_control.data ${WORK_DIR}/testdata
+    echo "Successfully Uploaded Synthetic control data to HDFS "
+
+    options="--input ${WORK_DIR}/testdata --output ${WORK_DIR}/output --maxIter 10 --convergenceDelta 0.5"
+
+    if [ "${clustertype}" == "kmeans" ]; then
+      options="${options} --numClusters 6"
+      # t1 & t2 not used if --numClusters specified, but parser requires input
+      options="${options} --t1 1 --t2 2"
+      ../../bin/mahout.bu org.apache.mahout.clustering.syntheticcontrol."${clustertype}".Job ${options}
+    else
+      options="${options} --m 2.0f --t1 80 --t2 55"
+      ../../bin/mahout.bu org.apache.mahout.clustering.syntheticcontrol."${clustertype}".Job ${options}
+    fi
+  else
+    echo " HADOOP is not running. Please make sure you hadoop is running. "
+  fi
+elif [ "$MAHOUT_LOCAL" != "" ]; then
+  echo "running MAHOUT_LOCAL"
+  cp ${WORK_DIR}/synthetic_control.data testdata
+  ../../bin/mahout.bu org.apache.mahout.clustering.syntheticcontrol."${clustertype}".Job
+  rm testdata
+else
+  echo " HADOOP_HOME variable is not set. Please set this environment variable and rerun the script"
+fi
+# Remove the work directory
+rm -rf ${WORK_DIR}

http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/community/mahout-mr/examples/bin/factorize-movielens-1M.sh
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/factorize-movielens-1M.sh b/community/mahout-mr/examples/bin/factorize-movielens-1M.sh
new file mode 100755
index 0000000..29730e1
--- /dev/null
+++ b/community/mahout-mr/examples/bin/factorize-movielens-1M.sh
@@ -0,0 +1,85 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Instructions:
+#
+# Before using this script, you have to download and extract the MovieLens 1M dataset
+# from http://www.grouplens.org/node/73
+#
+# To run:  change into the mahout directory and type:
+#  export MAHOUT_LOCAL=true
+# Then:
+#  examples/bin/factorize-movielens-1M.sh /path/to/ratings.dat
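+#
+# ratings.dat is expected in the MovieLens 1M format, one rating per line:
+#  UserID::MovieID::Rating::Timestamp   e.g.  1::1193::5::978300760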
+
+if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
+  echo "This script runs the Alternating Least Squares Recommender on the Grouplens data set (size 1M)."
+  echo "Syntax: $0 /path/to/ratings.dat\n"
+  exit
+fi
+
+if [ $# -ne 1 ]
+then
+  echo -e "\nYou have to download the Movielens 1M dataset from http://www.grouplens.org/node/73 before"
+  echo -e "you can run this example. After that extract it and supply the path to the ratings.dat file.\n"
+  echo -e "Syntax: $0 /path/to/ratings.dat\n"
+  exit 1
+fi
+
+export MAHOUT_LOCAL=true
+MAHOUT="$MAHOUT_HOME/bin/mahout"
+
+if [[ -z "$MAHOUT_WORK_DIR" ]]; then
+  WORK_DIR=/tmp/mahout-work-${USER}
+else
+  WORK_DIR=$MAHOUT_WORK_DIR
+fi
+
+echo "creating work directory at ${WORK_DIR}"
+mkdir -p ${WORK_DIR}/movielens
+
+echo "Converting ratings..."
+sed -e 's/::/,/g' "$1" | cut -d, -f1,2,3 > ${WORK_DIR}/movielens/ratings.csv
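+# for illustration: the line "1::1193::5::978300760" becomes "1,1193,5"
+# (the timestamp field is dropped by the cut)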
+
+# create a 90% training set and a 10% probe set
+$MAHOUT splitDataset --input ${WORK_DIR}/movielens/ratings.csv --output ${WORK_DIR}/dataset \
+    --trainingPercentage 0.9 --probePercentage 0.1 --tempDir ${WORK_DIR}/dataset/tmp
+
+# run distributed ALS-WR to factorize the rating matrix defined by the training set
+$MAHOUT parallelALS --input ${WORK_DIR}/dataset/trainingSet/ --output ${WORK_DIR}/als/out \
+    --tempDir ${WORK_DIR}/als/tmp --numFeatures 20 --numIterations 10 --lambda 0.065 --numThreadsPerSolver 2
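+# parallelALS writes the user-feature matrix to ${WORK_DIR}/als/out/U/ and the
+# item-feature matrix to ${WORK_DIR}/als/out/M/; both are consumed by the
+# evaluation and recommendation steps below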
+
+# compute predictions against the probe set, measure the error
+$MAHOUT evaluateFactorization --input ${WORK_DIR}/dataset/probeSet/ --output ${WORK_DIR}/als/rmse/ \
+    --userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/ --tempDir ${WORK_DIR}/als/tmp
+
+# compute recommendations
+$MAHOUT recommendfactorized --input ${WORK_DIR}/als/out/userRatings/ --output ${WORK_DIR}/recommendations/ \
+    --userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/ \
+    --numRecommendations 6 --maxRating 5 --numThreads 2
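+# each line of the recommendation output pairs a user ID with a list of
+# itemID:score entries, roughly "userID  [itemID:score,...]"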
+
+# print the error
+echo -e "\nRMSE is:\n"
+cat ${WORK_DIR}/als/rmse/rmse.txt
+echo -e "\n"
+
+echo -e "\nSample recommendations:\n"
+shuf ${WORK_DIR}/recommendations/part-m-00000 |head
+echo -e "\n\n"
+
+echo "removing work directory"
+rm -rf ${WORK_DIR}

http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/community/mahout-mr/examples/bin/factorize-netflix.sh
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/factorize-netflix.sh b/community/mahout-mr/examples/bin/factorize-netflix.sh
new file mode 100755
index 0000000..26faf66
--- /dev/null
+++ b/community/mahout-mr/examples/bin/factorize-netflix.sh
@@ -0,0 +1,90 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Instructions:
+#
+# You can only use this script in conjunction with the Netflix dataset. Unpack the Netflix dataset and provide the
+# following:
+#
+#   1) the path to the folder 'training_set' that contains all the movie rating files
+#   2) the path to the file 'qualifying.txt' that contains the user,item pairs to predict
+#   3) the path to the file 'judging.txt' that contains the ratings of user,item pairs to predict for
+#
+# To run:
+#  ./factorize-netflix.sh /path/to/training_set/ /path/to/qualifying.txt /path/to/judging.txt
+
+echo "Note this script has been deprecated due to the lack of access to the Netflix data set."
+exit 1
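+
+# NOTE: because of the early exit above, the remainder of this script is
+# retained for reference only and will not run.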
+
+if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
+  echo "This script runs the ALS Recommender on the Netflix data set."
+  echo "Syntax: $0 /path/to/training_set/ /path/to/qualifying.txt /path/to/judging.txt\n"
+  exit
+fi
+
+if [ $# -ne 3 ]
+then
+  echo -e "Syntax: $0 /path/to/training_set/ /path/to/qualifying.txt /path/to/judging.txt\n"
+  exit 1
+fi
+
+MAHOUT="../../bin/mahout"
+
+if [[ -z "$MAHOUT_WORK_DIR" ]]; then
+  WORK_DIR=/tmp/mahout-work-${USER}
+else
+  WORK_DIR=$MAHOUT_WORK_DIR
+fi
+
+START_PATH=`pwd`
+
+# Set commands for dfs
+source ${START_PATH}/set-dfs-commands.sh
+
+echo "Preparing data..."
+$MAHOUT org.apache.mahout.cf.taste.hadoop.example.als.netflix.NetflixDatasetConverter $1 $2 $3 ${WORK_DIR}
+
+# run distributed ALS-WR to factorize the rating matrix defined by the training set
+$MAHOUT parallelALS --input ${WORK_DIR}/trainingSet/ratings.tsv --output ${WORK_DIR}/als/out \
+    --tempDir ${WORK_DIR}/als/tmp --numFeatures 25 --numIterations 10 --lambda 0.065 --numThreadsPerSolver 4
+
+# compute predictions against the probe set, measure the error
+$MAHOUT evaluateFactorization --input ${WORK_DIR}/probeSet/ratings.tsv --output ${WORK_DIR}/als/rmse/ \
+    --userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/ --tempDir ${WORK_DIR}/als/tmp
+
+if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
+
+  # print the error, should be around 0.923
+  echo -e "\nRMSE is:\n"
+  $DFS -tail ${WORK_DIR}/als/rmse/rmse.txt
+  echo -e "\n"
+  echo "removing work directory"
+  set +e
+  $DFSRM ${WORK_DIR}
+
+else
+
+  # print the error, should be around 0.923
+  echo -e "\nRMSE is:\n"
+  cat ${WORK_DIR}/als/rmse/rmse.txt
+  echo -e "\n"
+  echo "removing work directory"
+  rm -rf ${WORK_DIR}
+
+fi
+

http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/community/mahout-mr/examples/bin/get-all-examples.sh
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/get-all-examples.sh b/community/mahout-mr/examples/bin/get-all-examples.sh
new file mode 100755
index 0000000..4128e47
--- /dev/null
+++ b/community/mahout-mr/examples/bin/get-all-examples.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Clones Mahout example code from remote repositories with their own 
+# build process.  Follow the README for each example for instructions.
+#
+# Usage:  change into the mahout directory and type:
+#  examples/bin/get-all-examples.sh
+
+# Solr-recommender
+echo " Solr-recommender example: "
+echo " 1) imports text 'log files' of some delimited form for user preferences"
+echo " 2) creates the correct Mahout files and stores distionaries to translate external Id to and from Mahout Ids"
+echo " 3) it implements a prototype two actions 'cross-recommender', which takes two actions made by the same user and creates recommendations"
+echo " 4) it creates output for user->preference history CSV and and item->similar items 'similarity' matrix for use in a Solr-recommender."
+echo "    To use Solr you would index the similarity matrix CSV, and use user preference history from the history CSV as a query, the result"
+echo "    from Solr will be an ordered list of recommendations returning the same item Ids as were input."
+echo " For further description see the README.md here https://github.com/pferrel/solr-recommender"
+echo " To build run 'cd solr-recommender; mvn install'"
+echo " To process the example after building make sure MAHOUT_LOCAL IS SET and hadoop is in local mode then "
+echo " run 'cd scripts; ./solr-recommender-example'"
+git clone https://github.com/pferrel/solr-recommender

http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/community/mahout-mr/examples/bin/lda.algorithm
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/lda.algorithm b/community/mahout-mr/examples/bin/lda.algorithm
new file mode 100644
index 0000000..fb84ea0
--- /dev/null
+++ b/community/mahout-mr/examples/bin/lda.algorithm
@@ -0,0 +1,45 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+merge.policy=org.apache.lucene.index.LogDocMergePolicy
+merge.factor=mrg:10:20
+max.buffered=buf:100:1000
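+# values written as name:v1:v2 ask the benchmark framework to cycle the
+# property across rounds, e.g. merge.factor alternates between 10 and 20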
+compound=true
+
+analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
+directory=FSDirectory
+
+doc.stored=true
+doc.term.vector=true
+doc.tokenized=true
+log.step=600
+
+content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
+content.source.forever=false
+doc.maker.forever=false
+query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker
+
+# tasks at this depth or less log a message when they start
+task.max.depth.log=2
+
+log.queries=false
+# --------- alg
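+# the sequence below creates an index, repeats AddDoc until the Reuters
+# content source is exhausted (the ": *" repetition), then closes the index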
+{ "BuildReuters"
+  CreateIndex 
+  { "AddDocs" AddDoc > : *
+#  Optimize
+  CloseIndex
+}
+