You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by is...@apache.org on 2009/10/03 20:23:06 UTC
svn commit: r821385 - in /lucene/mahout/trunk:
core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/bayes/
core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/cbayes/
core/src/main/java/org/apache/mahout/classifier/bayes/mapreduc...
Author: isabel
Date: Sat Oct 3 18:23:05 2009
New Revision: 821385
URL: http://svn.apache.org/viewvc?rev=821385&view=rev
Log:
MAHOUT-138 - adjusted jobs for bayes and cbayes to use cli.
Added:
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesJob.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/JobExecutor.java
Modified:
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/bayes/BayesDriver.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/bayes/BayesThetaNormalizerDriver.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/cbayes/CBayesDriver.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/cbayes/CBayesNormalizedWeightDriver.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/cbayes/CBayesThetaDriver.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/cbayes/CBayesThetaNormalizerDriver.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesFeatureDriver.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesTfIdfDriver.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesWeightSummerDriver.java
lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/TrainClassifier.java
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/bayes/BayesDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/bayes/BayesDriver.java?rev=821385&r1=821384&r2=821385&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/bayes/BayesDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/bayes/BayesDriver.java Sat Oct 3 18:23:05 2009
@@ -22,35 +22,31 @@
import org.apache.hadoop.mapred.JobConf;
import org.apache.mahout.classifier.bayes.mapreduce.common.BayesFeatureDriver;
import org.apache.mahout.classifier.bayes.common.BayesParameters;
+import org.apache.mahout.classifier.bayes.mapreduce.common.BayesJob;
import org.apache.mahout.classifier.bayes.mapreduce.common.BayesTfIdfDriver;
import org.apache.mahout.classifier.bayes.mapreduce.common.BayesWeightSummerDriver;
+import org.apache.mahout.classifier.bayes.mapreduce.common.JobExecutor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
/** Create and run the Bayes Trainer. */
-public class BayesDriver {
+public class BayesDriver implements BayesJob {
private static final Logger log = LoggerFactory.getLogger(BayesDriver.class);
- private BayesDriver() {
- }
-
/**
* Takes in two arguments: <ol> <li>The input {@link org.apache.hadoop.fs.Path} where the input documents live</li>
* <li>The output {@link org.apache.hadoop.fs.Path} where to write the Model as a
* {@link org.apache.hadoop.io.SequenceFile}</li> </ol>
*
* @param args The args
- * @throws ClassNotFoundException
- * @throws InterruptedException
+ * @throws Exception in case of job execution problems.
*/
- public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
- String input = args[0];
- String output = args[1];
-
- runJob(input, output, new BayesParameters(1));
+ public static void main(String[] args) throws Exception {
+ JobExecutor executor = new JobExecutor();
+ executor.execute(args, new BayesDriver());
}
/**
@@ -61,7 +57,7 @@
* @throws ClassNotFoundException
* @throws InterruptedException
*/
- public static void runJob(String input, String output, BayesParameters params) throws IOException, InterruptedException, ClassNotFoundException {
+ public void runJob(String input, String output, BayesParameters params) throws IOException, InterruptedException, ClassNotFoundException {
JobConf conf = new JobConf(BayesDriver.class);
Path outPath = new Path(output);
FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
@@ -71,22 +67,26 @@
log.info("Reading features...");
//Read the features in each document normalized by length of each document
- BayesFeatureDriver.runJob(input, output, params);
+ BayesFeatureDriver feature = new BayesFeatureDriver();
+ feature.runJob(input, output, params);
log.info("Calculating Tf-Idf...");
//Calculate the TfIdf for each word in each label
- BayesTfIdfDriver.runJob(input, output, params);
+ BayesTfIdfDriver tfidf = new BayesTfIdfDriver();
+ tfidf.runJob(input, output, params);
log.info("Calculating weight sums for labels and features...");
//Calculate the Sums of weights for each label, for each feature and for each feature and for each label
- BayesWeightSummerDriver.runJob(input, output, params);
+ BayesWeightSummerDriver summer = new BayesWeightSummerDriver();
+ summer.runJob(input, output, params);
//Calculate the W_ij = log(Theta) for each label, feature. This step actually generates the complement class
//CBayesThetaDriver.runJob(input, output);
log.info("Calculating the weight Normalisation factor for each class...");
//Calculate the normalization factor Sigma_W_ij for each complement class.
- BayesThetaNormalizerDriver.runJob(input, output, params);
+ BayesThetaNormalizerDriver normalizer = new BayesThetaNormalizerDriver();
+ normalizer.runJob(input, output, params);
//Calculate the normalization factor Sigma_W_ij for each complement class.
//CBayesNormalizedWeightDriver.runJob(input, output);
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/bayes/BayesThetaNormalizerDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/bayes/BayesThetaNormalizerDriver.java?rev=821385&r1=821384&r2=821385&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/bayes/BayesThetaNormalizerDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/bayes/BayesThetaNormalizerDriver.java Sat Oct 3 18:23:05 2009
@@ -31,6 +31,10 @@
import org.apache.hadoop.util.GenericsUtil;
import org.apache.mahout.classifier.bayes.common.BayesParameters;
import org.apache.mahout.classifier.bayes.io.SequenceFileModelReader;
+import org.apache.mahout.classifier.bayes.mapreduce.common.BayesJob;
+import org.apache.mahout.classifier.bayes.mapreduce.common.JobExecutor;
+import org.apache.mahout.common.CommandLineUtil;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -38,25 +42,20 @@
import java.util.Map;
/** Create and run the Bayes Theta Normalization Step. */
-public class BayesThetaNormalizerDriver {
+public class BayesThetaNormalizerDriver implements BayesJob {
private static final Logger log = LoggerFactory.getLogger(BayesThetaNormalizerDriver.class);
- private BayesThetaNormalizerDriver() {
- }
-
/**
* Takes in two arguments: <ol> <li>The input {@link org.apache.hadoop.fs.Path} where the input documents live</li>
* <li>The output {@link org.apache.hadoop.fs.Path} where to write the interim files as a {@link
* org.apache.hadoop.io.SequenceFile}</li> </ol>
*
- * @param args The args
+ * @param args The args - should contain input and output path.
*/
- public static void main(String[] args) throws IOException {
- String input = args[0];
- String output = args[1];
-
- runJob(input, output, new BayesParameters(1));
+ public static void main(String[] args) throws Exception {
+ JobExecutor executor = new JobExecutor();
+ executor.execute(args, new BayesThetaNormalizerDriver());
}
/**
@@ -65,7 +64,7 @@
* @param input the input pathname String
* @param output the output pathname String
*/
- public static void runJob(String input, String output, BayesParameters params) throws IOException {
+ public void runJob(String input, String output, BayesParameters params) throws IOException {
JobClient client = new JobClient();
JobConf conf = new JobConf(BayesThetaNormalizerDriver.class);
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/cbayes/CBayesDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/cbayes/CBayesDriver.java?rev=821385&r1=821384&r2=821385&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/cbayes/CBayesDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/cbayes/CBayesDriver.java Sat Oct 3 18:23:05 2009
@@ -22,35 +22,31 @@
import org.apache.hadoop.mapred.JobConf;
import org.apache.mahout.classifier.bayes.mapreduce.common.BayesFeatureDriver;
import org.apache.mahout.classifier.bayes.common.BayesParameters;
+import org.apache.mahout.classifier.bayes.mapreduce.common.BayesJob;
import org.apache.mahout.classifier.bayes.mapreduce.common.BayesTfIdfDriver;
import org.apache.mahout.classifier.bayes.mapreduce.common.BayesWeightSummerDriver;
+import org.apache.mahout.classifier.bayes.mapreduce.common.JobExecutor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
/** Create and run the Bayes Trainer. */
-public class CBayesDriver {
+public class CBayesDriver implements BayesJob{
private static final Logger log = LoggerFactory.getLogger(CBayesDriver.class);
- private CBayesDriver() {
- }
-
/**
* Takes in two arguments: <ol> <li>The input {@link Path} where the input documents live</li>
* <li>The output {@link Path} where to write the Model as a
* {@link org.apache.hadoop.io.SequenceFile}</li> </ol>
*
- * @param args The args
- * @throws ClassNotFoundException
- * @throws InterruptedException
+ * @param args The args input and output path.
+ * @throws Exception in case of problems during job execution.
*/
- public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
- String input = args[0];
- String output = args[1];
-
- runJob(input, output, new BayesParameters(1));
+ public static void main(String[] args) throws Exception {
+ JobExecutor executor = new JobExecutor();
+ executor.execute(args, new CBayesDriver());
}
/**
@@ -61,7 +57,7 @@
* @throws ClassNotFoundException
* @throws InterruptedException
*/
- public static void runJob(String input, String output, BayesParameters params) throws IOException, InterruptedException, ClassNotFoundException {
+ public void runJob(String input, String output, BayesParameters params) throws IOException, InterruptedException, ClassNotFoundException {
JobConf conf = new JobConf(CBayesDriver.class);
Path outPath = new Path(output);
FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
@@ -71,22 +67,26 @@
log.info("Reading features...");
//Read the features in each document normalized by length of each document
- BayesFeatureDriver.runJob(input, output, params);
+ BayesFeatureDriver feature = new BayesFeatureDriver();
+ feature.runJob(input, output, params);
log.info("Calculating Tf-Idf...");
//Calculate the TfIdf for each word in each label
- BayesTfIdfDriver.runJob(input, output,params);
+ BayesTfIdfDriver tfidf = new BayesTfIdfDriver();
+ tfidf.runJob(input, output,params);
log.info("Calculating weight sums for labels and features...");
//Calculate the Sums of weights for each label, for each feature and for each feature and for each label
- BayesWeightSummerDriver.runJob(input, output, params);
+ BayesWeightSummerDriver summer = new BayesWeightSummerDriver();
+ summer.runJob(input, output, params);
//Calculate the W_ij = log(Theta) for each label, feature. This step actually generates the complement class
//CBayesThetaDriver.runJob(input, output);
log.info("Calculating the weight Normalisation factor for each complement class...");
//Calculate the normalization factor Sigma_W_ij for each complement class.
- CBayesThetaNormalizerDriver.runJob(input, output, params);
+ CBayesThetaNormalizerDriver normalizer = new CBayesThetaNormalizerDriver();
+ normalizer.runJob(input, output, params);
//Calculate the normalization factor Sigma_W_ij for each complement class.
//CBayesNormalizedWeightDriver.runJob(input, output);
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/cbayes/CBayesNormalizedWeightDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/cbayes/CBayesNormalizedWeightDriver.java?rev=821385&r1=821384&r2=821385&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/cbayes/CBayesNormalizedWeightDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/cbayes/CBayesNormalizedWeightDriver.java Sat Oct 3 18:23:05 2009
@@ -31,6 +31,8 @@
import org.apache.hadoop.util.GenericsUtil;
import org.apache.mahout.classifier.bayes.common.BayesParameters;
import org.apache.mahout.classifier.bayes.io.SequenceFileModelReader;
+import org.apache.mahout.classifier.bayes.mapreduce.common.BayesJob;
+import org.apache.mahout.classifier.bayes.mapreduce.common.JobExecutor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -38,25 +40,20 @@
import java.util.Map;
/** Create and run the Bayes Trainer. */
-public class CBayesNormalizedWeightDriver {
+public class CBayesNormalizedWeightDriver implements BayesJob {
private static final Logger log = LoggerFactory.getLogger(CBayesNormalizedWeightDriver.class);
- private CBayesNormalizedWeightDriver() {
- }
-
/**
* Takes in two arguments: <ol> <li>The input {@link org.apache.hadoop.fs.Path} where the input documents live</li>
* <li>The output {@link org.apache.hadoop.fs.Path} where to write the Model as a
* {@link org.apache.hadoop.io.SequenceFile}</li> </ol>
*
- * @param args The args
+ * @param args The args - contains input and output path.
*/
- public static void main(String[] args) throws IOException {
- String input = args[0];
- String output = args[1];
-
- runJob(input, output, new BayesParameters(1));
+ public static void main(String[] args) throws Exception {
+ JobExecutor executor = new JobExecutor();
+ executor.execute(args, new CBayesNormalizedWeightDriver());
}
/**
@@ -65,7 +62,7 @@
* @param input the input pathname String
* @param output the output pathname String
*/
- public static void runJob(String input, String output, BayesParameters params) throws IOException {
+ public void runJob(String input, String output, BayesParameters params) throws IOException {
JobClient client = new JobClient();
JobConf conf = new JobConf(CBayesNormalizedWeightDriver.class);
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/cbayes/CBayesThetaDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/cbayes/CBayesThetaDriver.java?rev=821385&r1=821384&r2=821385&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/cbayes/CBayesThetaDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/cbayes/CBayesThetaDriver.java Sat Oct 3 18:23:05 2009
@@ -26,11 +26,14 @@
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.JobConfigurable;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.util.GenericsUtil;
import org.apache.mahout.classifier.bayes.common.BayesParameters;
import org.apache.mahout.classifier.bayes.io.SequenceFileModelReader;
+import org.apache.mahout.classifier.bayes.mapreduce.common.BayesJob;
+import org.apache.mahout.classifier.bayes.mapreduce.common.JobExecutor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -38,7 +41,7 @@
import java.util.Map;
/** Create and run the Bayes Trainer. */
-public class CBayesThetaDriver {
+public class CBayesThetaDriver implements BayesJob {
private static final Logger log = LoggerFactory.getLogger(CBayesThetaDriver.class);
@@ -50,13 +53,11 @@
* <li>The output {@link org.apache.hadoop.fs.Path} where to write the Model as a
* {@link org.apache.hadoop.io.SequenceFile}</li> </ol>
*
- * @param args The args
+ * @param args The args - input path and output path
*/
- public static void main(String[] args) throws IOException {
- String input = args[0];
- String output = args[1];
-
- runJob(input, output, new BayesParameters(1));
+ public static void main(String[] args) throws Exception {
+ JobExecutor executor = new JobExecutor();
+ executor.execute(args, new CBayesThetaDriver());
}
/**
@@ -65,7 +66,7 @@
* @param input the input pathname String
* @param output the output pathname String
*/
- public static void runJob(String input, String output, BayesParameters params) throws IOException {
+ public void runJob(String input, String output, BayesParameters params) throws IOException {
JobClient client = new JobClient();
JobConf conf = new JobConf(CBayesThetaDriver.class);
conf.setJobName("Complementary Theta Driver running over input: " + input);
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/cbayes/CBayesThetaNormalizerDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/cbayes/CBayesThetaNormalizerDriver.java?rev=821385&r1=821384&r2=821385&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/cbayes/CBayesThetaNormalizerDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/cbayes/CBayesThetaNormalizerDriver.java Sat Oct 3 18:23:05 2009
@@ -31,6 +31,8 @@
import org.apache.hadoop.util.GenericsUtil;
import org.apache.mahout.classifier.bayes.common.BayesParameters;
import org.apache.mahout.classifier.bayes.io.SequenceFileModelReader;
+import org.apache.mahout.classifier.bayes.mapreduce.common.BayesJob;
+import org.apache.mahout.classifier.bayes.mapreduce.common.JobExecutor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -38,25 +40,20 @@
import java.util.Map;
/** Create and run the Bayes Trainer. */
-public class CBayesThetaNormalizerDriver {
+public class CBayesThetaNormalizerDriver implements BayesJob {
private static final Logger log = LoggerFactory.getLogger(CBayesThetaNormalizerDriver.class);
- private CBayesThetaNormalizerDriver() {
- }
-
/**
* Takes in two arguments: <ol> <li>The input {@link org.apache.hadoop.fs.Path} where the input documents live</li>
* <li>The output {@link org.apache.hadoop.fs.Path} where to write the Model as a
* {@link org.apache.hadoop.io.SequenceFile}</li> </ol>
*
- * @param args The args
+ * @param args The args input and output path.
*/
- public static void main(String[] args) throws IOException {
- String input = args[0];
- String output = args[1];
-
- runJob(input, output, new BayesParameters(1));
+ public static void main(String[] args) throws Exception {
+ JobExecutor executor = new JobExecutor();
+ executor.execute(args, new CBayesThetaNormalizerDriver());
}
/**
@@ -65,7 +62,7 @@
* @param input the input pathname String
* @param output the output pathname String
*/
- public static void runJob(String input, String output, BayesParameters params) throws IOException {
+ public void runJob(String input, String output, BayesParameters params) throws IOException {
JobClient client = new JobClient();
JobConf conf = new JobConf(CBayesThetaNormalizerDriver.class);
conf.setJobName("Complementary Bayes Theta Normalizer Driver running over input: " + input);
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesFeatureDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesFeatureDriver.java?rev=821385&r1=821384&r2=821385&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesFeatureDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesFeatureDriver.java Sat Oct 3 18:23:05 2009
@@ -33,25 +33,21 @@
import java.io.IOException;
/** Create and run the Bayes Feature Reader Step. */
-public class BayesFeatureDriver {
+public class BayesFeatureDriver implements BayesJob {
private static final Logger log = LoggerFactory.getLogger(BayesFeatureDriver.class);
- private BayesFeatureDriver() {
- }
-
/**
* Takes in two arguments: <ol> <li>The input {@link org.apache.hadoop.fs.Path} where the input documents live</li>
* <li>The output {@link org.apache.hadoop.fs.Path} where to write the interim files as a {@link
* org.apache.hadoop.io.SequenceFile}</li> </ol>
*
- * @param args The args
+ * @param args The args - input and output path.
+ * @throws Exception in case of problems during job execution.
*/
- public static void main(String[] args) throws IOException {
- String input = args[0];
- String output = args[1];
-
- runJob(input, output, new BayesParameters(1));
+ public static void main(String[] args) throws Exception {
+ JobExecutor executor = new JobExecutor();
+ executor.execute(args, new BayesFeatureDriver());
}
/**
@@ -60,7 +56,7 @@
* @param input the input pathname String
* @param output the output pathname String
*/
- public static void runJob(String input, String output, BayesParameters params) throws IOException {
+ public void runJob(String input, String output, BayesParameters params) throws IOException {
JobClient client = new JobClient();
JobConf conf = new JobConf(BayesFeatureDriver.class);
conf.setJobName("Bayes Feature Driver running over input: " + input);
Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesJob.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesJob.java?rev=821385&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesJob.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesJob.java Sat Oct 3 18:23:05 2009
@@ -0,0 +1,34 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.classifier.bayes.mapreduce.common;
+
+import org.apache.mahout.classifier.bayes.common.BayesParameters;
+
+/**
+ * Implementors of this interface provide a way for running bayes training jobs on
+ * a hadoop cluster.
+ */
+public interface BayesJob {
+ /**
+ * Run the job on a cluster.
+ * @param input path to training documents.
+ * @param output path to output directory.
+ * @param params the bayes parameters handed to the job.
+ * @throws Exception if the job fails; implementations may declare narrower exceptions.
+ */
+ void runJob(final String input, final String output, final BayesParameters params) throws Exception;
+}
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesTfIdfDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesTfIdfDriver.java?rev=821385&r1=821384&r2=821385&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesTfIdfDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesTfIdfDriver.java Sat Oct 3 18:23:05 2009
@@ -31,6 +31,7 @@
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.JobConfigurable;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.util.GenericsUtil;
import org.apache.mahout.classifier.bayes.common.BayesParameters;
@@ -42,13 +43,10 @@
import java.util.Map;
/** The Driver which drives the Tf-Idf Generation */
-public class BayesTfIdfDriver {
+public class BayesTfIdfDriver implements BayesJob {
private static final Logger log = LoggerFactory.getLogger(BayesTfIdfDriver.class);
- private BayesTfIdfDriver() {
- }
-
/**
* Takes in two arguments:
* <ol>
@@ -58,14 +56,12 @@
* files as a {@link org.apache.hadoop.io.SequenceFile}</li>
* </ol>
*
- * @param args The args
- * @throws ClassNotFoundException
+ * @param args The args - input and output path.
+ * @throws Exception in case of problems during job execution.
*/
- public static void main(String[] args) throws IOException {
- String input = args[0];
- String output = args[1];
-
- runJob(input, output, new BayesParameters(1));
+ public static void main(String[] args) throws Exception {
+ JobExecutor executor = new JobExecutor();
+ executor.execute(args, new BayesTfIdfDriver());
}
/**
@@ -75,7 +71,7 @@
* @param output the output pathname String
* @throws ClassNotFoundException
*/
- public static void runJob(String input, String output, BayesParameters params) throws IOException {
+ public void runJob(String input, String output, BayesParameters params) throws IOException {
JobClient client = new JobClient();
JobConf conf = new JobConf(BayesWeightSummerDriver.class);
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesWeightSummerDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesWeightSummerDriver.java?rev=821385&r1=821384&r2=821385&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesWeightSummerDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesWeightSummerDriver.java Sat Oct 3 18:23:05 2009
@@ -31,22 +31,18 @@
import java.io.IOException;
/** Create and run the Bayes Trainer. */
-public class BayesWeightSummerDriver {
- private BayesWeightSummerDriver() {
- }
+public class BayesWeightSummerDriver implements BayesJob {
/**
* Takes in two arguments: <ol> <li>The input {@link org.apache.hadoop.fs.Path} where the input documents live</li>
* <li>The output {@link org.apache.hadoop.fs.Path} where to write the interim files as a {@link
* org.apache.hadoop.io.SequenceFile}</li> </ol>
*
- * @param args The args
+ * @param args The args - should contain input and output path.
*/
- public static void main(String[] args) throws IOException {
- String input = args[0];
- String output = args[1];
-
- runJob(input, output, new BayesParameters(1));
+ public static void main(String[] args) throws Exception {
+ JobExecutor executor = new JobExecutor();
+ executor.execute(args, new BayesWeightSummerDriver());
}
/**
@@ -55,7 +51,7 @@
* @param input the input pathname String
* @param output the output pathname String
*/
- public static void runJob(String input, String output, BayesParameters params) throws IOException {
+ public void runJob(String input, String output, BayesParameters params) throws IOException {
JobClient client = new JobClient();
JobConf conf = new JobConf(BayesWeightSummerDriver.class);
conf.setJobName("Bayes Weight Summer Driver running over input: " + input);
Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/JobExecutor.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/JobExecutor.java?rev=821385&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/JobExecutor.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/JobExecutor.java Sat Oct 3 18:23:05 2009
@@ -0,0 +1,77 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.classifier.bayes.mapreduce.common;
+
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.OptionException;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.mahout.classifier.bayes.common.BayesParameters;
+import org.apache.mahout.classifier.bayes.mapreduce.bayes.BayesDriver;
+import org.apache.mahout.common.CommandLineUtil;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/** Parses the input, output and help command line options and hands them to a {@link BayesJob}. */
+public class JobExecutor {
+  /** Logger for this class. */
+  private static final Logger log = LoggerFactory.getLogger(JobExecutor.class);
+
+  /**
+   * Execute a bayes classification job. Input and output path are parsed from
+   * the input parameters; on a help request or parse error only usage is printed.
+   * @param args input parameters.
+   * @param job the job to execute.
+   * @throws Exception any exception thrown at job execution.
+   */
+  public void execute(final String[] args, final BayesJob job) throws Exception {
+    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
+    ArgumentBuilder abuilder = new ArgumentBuilder();
+    GroupBuilder gbuilder = new GroupBuilder();
+
+    Option inputOpt = DefaultOptionCreator.inputOption(obuilder, abuilder);
+    Option outputOpt = DefaultOptionCreator.outputOption(obuilder, abuilder);
+    Option helpOpt = DefaultOptionCreator.helpOption(obuilder);
+
+    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt)
+        .withOption(helpOpt).create();
+
+    try {
+      Parser parser = new Parser();
+      parser.setGroup(group);
+      CommandLine cmdLine = parser.parse(args);
+
+      if (cmdLine.hasOption(helpOpt)) {
+        CommandLineUtil.printHelp(group);
+        return;
+      }
+
+      String input = cmdLine.getValue(inputOpt).toString();
+      String output = cmdLine.getValue(outputOpt).toString();
+      // BayesParameters(1) is the same argument the drivers' previous main methods passed.
+      job.runJob(input, output, new BayesParameters(1));
+    } catch (OptionException e) {
+      log.error("Exception", e);
+      CommandLineUtil.printHelp(group);
+    }
+  }
+}
Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/TrainClassifier.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/TrainClassifier.java?rev=821385&r1=821384&r2=821385&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/TrainClassifier.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/TrainClassifier.java Sat Oct 3 18:23:05 2009
@@ -61,11 +61,13 @@
}
public static void trainNaiveBayes(String dir, String outputDir, BayesParameters params) throws IOException, InterruptedException, ClassNotFoundException {
- BayesDriver.runJob(dir, outputDir, params);
+ BayesDriver driver = new BayesDriver();
+ driver.runJob(dir, outputDir, params);
}
public static void trainCNaiveBayes(String dir, String outputDir, BayesParameters params) throws IOException, InterruptedException, ClassNotFoundException {
- CBayesDriver.runJob(dir, outputDir, params);
+ CBayesDriver driver = new CBayesDriver();
+ driver.runJob(dir, outputDir, params);
}
public static void main(String[] args) throws IOException, OptionException, NumberFormatException, IllegalStateException, InterruptedException, ClassNotFoundException {