You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by sr...@apache.org on 2010/05/07 22:21:49 UTC

svn commit: r942203 - in /lucene/mahout/trunk: examples/ examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/ examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/ examples/src/main/java/org/apache/maho...

Author: srowen
Date: Fri May  7 20:21:49 2010
New Revision: 942203

URL: http://svn.apache.org/viewvc?rev=942203&view=rev
Log:
More MAHOUT-302 madness; this time substantially fixed by letting examples and core depend on math

Modified:
    lucene/mahout/trunk/examples/pom.xml
    lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java
    lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/Job.java
    lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
    lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java
    lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/CDFitnessEvaluator.java
    lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/CDGA.java
    lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/FileInfoParser.java
    lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/hadoop/CDMahoutEvaluator.java
    lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/tool/CDInfosTool.java
    lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/hadoop/CDMahoutEvaluatorTest.java
    lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/tool/CDInfosToolTest.java
    lucene/mahout/trunk/utils/pom.xml

Modified: lucene/mahout/trunk/examples/pom.xml
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/pom.xml?rev=942203&r1=942202&r2=942203&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/pom.xml (original)
+++ lucene/mahout/trunk/examples/pom.xml Fri May  7 20:21:49 2010
@@ -143,6 +143,18 @@
     </dependency>
     <dependency>
       <groupId>org.apache.mahout</groupId>
+      <artifactId>mahout-math</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.mahout</groupId>
+      <artifactId>mahout-math</artifactId>
+      <version>${project.version}</version>
+      <type>test-jar</type>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.mahout</groupId>
       <artifactId>mahout-utils</artifactId>
       <version>${project.version}</version>
     </dependency>

Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java?rev=942203&r1=942202&r2=942203&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java Fri May  7 20:21:49 2010
@@ -27,7 +27,6 @@ import org.apache.commons.cli2.builder.A
 import org.apache.commons.cli2.builder.DefaultOptionBuilder;
 import org.apache.commons.cli2.builder.GroupBuilder;
 import org.apache.commons.cli2.commandline.Parser;
-import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.mapred.JobClient;
 import org.apache.hadoop.mapred.JobConf;
@@ -141,7 +140,8 @@ public final class Job {
       "org.apache.mahout.math.RandomAccessSparseVector");
     CanopyDriver.runJob(directoryContainingConvertedInput, output, measureClassName, t1, t2, true);
     
-    ClusterDumper clusterDumper = new ClusterDumper("output/clusters-0", "output/clusteredPoints");
+    ClusterDumper clusterDumper =
+        new ClusterDumper(new Path(output, "clusters-0"), new Path(output, "clusteredPoints"));
     clusterDumper.printClusters(null);
 
   }

Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/Job.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/Job.java?rev=942203&r1=942202&r2=942203&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/Job.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/Job.java Fri May  7 20:21:49 2010
@@ -30,7 +30,6 @@ import org.apache.commons.cli2.builder.A
 import org.apache.commons.cli2.builder.DefaultOptionBuilder;
 import org.apache.commons.cli2.builder.GroupBuilder;
 import org.apache.commons.cli2.commandline.Parser;
-import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.mapred.JobConf;
 import org.apache.mahout.clustering.dirichlet.DirichletCluster;
@@ -130,13 +129,6 @@ public class Job {
    *          the alpha0 value for the DirichletDistribution
    * @param numReducers
    *          the desired number of reducers
-   * @throws IllegalAccessException
-   * @throws InstantiationException
-   * @throws ClassNotFoundException
-   * @throws InvocationTargetException
-   * @throws NoSuchMethodException
-   * @throws IllegalArgumentException
-   * @throws SecurityException
    */
   public static void runJob(Path input,
                             Path output,
@@ -159,8 +151,9 @@ public class Job {
     InputDriver.runJob(input, directoryContainingConvertedInput, vectorClassName);
     DirichletDriver.runJob(directoryContainingConvertedInput, output, modelFactory,
       vectorClassName, 60, numModels, maxIterations, alpha_0, numReducers, true, true, 0);
-    
-    ClusterDumper clusterDumper = new ClusterDumper("output/clusters-5", "output/clusteredPoints");
+
+    ClusterDumper clusterDumper =
+        new ClusterDumper(new Path(output, "clusters-5"), new Path(output, "clusteredPoints"));
     clusterDumper.printClusters(null);
 
   }

Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java?rev=942203&r1=942202&r2=942203&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java Fri May  7 20:21:49 2010
@@ -27,7 +27,6 @@ import org.apache.commons.cli2.builder.A
 import org.apache.commons.cli2.builder.DefaultOptionBuilder;
 import org.apache.commons.cli2.builder.GroupBuilder;
 import org.apache.commons.cli2.commandline.Parser;
-import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.mapred.JobClient;
 import org.apache.hadoop.mapred.JobConf;
@@ -149,7 +148,8 @@ public final class Job {
     KMeansDriver.runJob(directoryContainingConvertedInput, new Path(output, Cluster.INITIAL_CLUSTERS_DIR), output, measureClass,
         convergenceDelta, maxIterations, 1, true);
 
-    ClusterDumper clusterDumper = new ClusterDumper("output/clusters-10", "output/clusteredPoints");
+    ClusterDumper clusterDumper =
+        new ClusterDumper(new Path(output, "clusters-10"), new Path(output, "clusteredPoints"));
     clusterDumper.printClusters(null);
   }
 }

Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java?rev=942203&r1=942202&r2=942203&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java Fri May  7 20:21:49 2010
@@ -133,7 +133,8 @@ public final class Job {
     MeanShiftCanopyDriver.runJob(directoryContainingConvertedInput, output, measureClassName, t1, t2,
         convergenceDelta, maxIterations, true, true);
 
-    ClusterDumper clusterDumper = new ClusterDumper("output/clusters-10", "output/clusteredPoints");
+    ClusterDumper clusterDumper =
+        new ClusterDumper(new Path(output, "clusters-10"), new Path(output, "clusteredPoints"));
     clusterDumper.printClusters(null);
 
   }

Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/CDFitnessEvaluator.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/CDFitnessEvaluator.java?rev=942203&r1=942202&r2=942203&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/CDFitnessEvaluator.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/CDFitnessEvaluator.java Fri May  7 20:21:49 2010
@@ -60,9 +60,10 @@ public class CDFitnessEvaluator extends 
   protected void evaluate(List<? extends Rule> population,
                           List<Double> evaluations) {
     evals.clear();
-    
+
+    Path output = new Path("output");
     try {
-      CDMahoutEvaluator.evaluate(population, target, dataset, evals, split);
+      CDMahoutEvaluator.evaluate(population, target, dataset, output, evals, split);
     } catch (IOException e) {
       throw new IllegalStateException("Exception while evaluating the population", e);
     }

Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/CDGA.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/CDGA.java?rev=942203&r1=942202&r2=942203&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/CDGA.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/CDGA.java Fri May  7 20:21:49 2010
@@ -200,13 +200,15 @@ public class CDGA {
     
     // evolve the rules over the training set
     Rule solution = engine.evolve(popSize, 1, new GenerationCount(genCount));
-    
+
+    Path output = new Path("output");
+
     // fitness over the training set
-    CDFitness bestTrainFit = CDMahoutEvaluator.evaluate(solution, target, inpath, split);
+    CDFitness bestTrainFit = CDMahoutEvaluator.evaluate(solution, target, inpath, output, split);
     
     // fitness over the testing set
     split.setTraining(false);
-    CDFitness bestTestFit = CDMahoutEvaluator.evaluate(solution, target, inpath, split);
+    CDFitness bestTestFit = CDMahoutEvaluator.evaluate(solution, target, inpath, output, split);
     
     // evaluate the solution over the testing set
     log.info("Best solution fitness (train set) : {}", bestTrainFit);

Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/FileInfoParser.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/FileInfoParser.java?rev=942203&r1=942202&r2=942203&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/FileInfoParser.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/FileInfoParser.java Fri May  7 20:21:49 2010
@@ -117,10 +117,10 @@ public class FileInfoParser {
       throw new IllegalArgumentException("null inpath parameter");
     }
     if (!fs.exists(inpath)) {
-      throw new IllegalArgumentException("Input path does not exist");
+      throw new IllegalArgumentException("Input path does not exist: " + inpath);
     }
     if (!fs.getFileStatus(inpath).isDir()) {
-      throw new IllegalArgumentException("Input path should be a directory");
+      throw new IllegalArgumentException("Input path should be a directory: " + inpath);
     }
     
     // info file name

Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/hadoop/CDMahoutEvaluator.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/hadoop/CDMahoutEvaluator.java?rev=942203&r1=942202&r2=942203&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/hadoop/CDMahoutEvaluator.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/hadoop/CDMahoutEvaluator.java Fri May  7 20:21:49 2010
@@ -63,7 +63,7 @@ public class CDMahoutEvaluator {
    * @throws IOException
    */
   public static void evaluate(List<? extends Rule> rules, int target,
-                              Path inpath, List<CDFitness> evaluations, DatasetSplit split)
+                              Path inpath, Path output, List<CDFitness> evaluations, DatasetSplit split)
   throws IOException {
     JobConf conf = new JobConf(CDMahoutEvaluator.class);
     FileSystem fs = FileSystem.get(inpath.toUri(), conf);
@@ -72,13 +72,11 @@ public class CDMahoutEvaluator {
     if (!fs.exists(inpath) || !fs.getFileStatus(inpath).isDir()) {
       throw new IllegalArgumentException("Input path not found or is not a directory");
     }
-    
-    Path outpath = new Path("output");
-    
-    configureJob(conf, rules, target, inpath, outpath, split);
+
+    configureJob(conf, rules, target, inpath, output, split);
     JobClient.runJob(conf);
     
-    importEvaluations(fs, conf, outpath, evaluations);
+    importEvaluations(fs, conf, output, evaluations);
   }
   
   /**
@@ -105,11 +103,11 @@ public class CDMahoutEvaluator {
    * @return the evaluation
    * @throws IOException
    */
-  public static CDFitness evaluate(Rule rule, int target, Path inpath,
+  public static CDFitness evaluate(Rule rule, int target, Path inpath, Path output,
                                    DatasetSplit split) throws IOException {
     List<CDFitness> evals = new ArrayList<CDFitness>();
     
-    evaluate(Arrays.asList(rule), target, inpath, evals, split);
+    evaluate(Arrays.asList(rule), target, inpath, output, evals, split);
     
     return evals.get(0);
   }
@@ -126,8 +124,8 @@ public class CDMahoutEvaluator {
    * @throws IOException
    */
   public static void evaluate(List<? extends Rule> rules, int target,
-                              Path inpath, List<CDFitness> evaluations) throws IOException {
-    evaluate(rules, target, inpath, evaluations, new DatasetSplit(1));
+                              Path inpath, Path output, List<CDFitness> evaluations) throws IOException {
+    evaluate(rules, target, inpath, output, evaluations, new DatasetSplit(1));
   }
   
   /**

Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/tool/CDInfosTool.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/tool/CDInfosTool.java?rev=942203&r1=942202&r2=942203&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/tool/CDInfosTool.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/tool/CDInfosTool.java Fri May  7 20:21:49 2010
@@ -75,7 +75,7 @@ public class CDInfosTool {
    *        generated descriptions for each non ignored attribute
    * @throws IOException
    */
-  public static void gatherInfos(Descriptors descriptors, Path inpath,
+  public static void gatherInfos(Descriptors descriptors, Path inpath, Path output,
                                  List<String> descriptions) throws IOException {
     JobConf conf = new JobConf(CDInfosTool.class);
     FileSystem fs = FileSystem.get(inpath.toUri(), conf);
@@ -84,13 +84,11 @@ public class CDInfosTool {
     if (!fs.exists(inpath) || !fs.getFileStatus(inpath).isDir()) {
       throw new IllegalArgumentException("Input path not found or is not a directory");
     }
-    
-    Path outpath = new Path("output");
-    
-    configureJob(conf, descriptors, inpath, outpath);
+
+    configureJob(conf, descriptors, inpath, output);
     JobClient.runJob(conf);
     
-    importDescriptions(fs, conf, outpath, descriptions);
+    importDescriptions(fs, conf, output, descriptions);
   }
   
   /**
@@ -240,20 +238,20 @@ public class CDInfosTool {
         return;
       }
       
-      String input = cmdLine.getValue(inputOpt).toString();
+      Path input = new Path(cmdLine.getValue(inputOpt).toString());
+      Path output = new Path("output"); // TODO surely this should be configurable?
       
-      Path inpath = new Path(input);
-      FileSystem fs = FileSystem.get(inpath.toUri(), new Configuration());
+      FileSystem fs = FileSystem.get(input.toUri(), new Configuration());
       
       log.info("Loading Descriptors...");
-      Descriptors descriptors = loadDescriptors(fs, inpath);
+      Descriptors descriptors = loadDescriptors(fs, input);
       
       log.info("Gathering informations...");
       List<String> descriptions = new ArrayList<String>();
-      gatherInfos(descriptors, inpath, descriptions);
+      gatherInfos(descriptors, input, output, descriptions);
       
       log.info("Storing Descriptions...");
-      storeDescriptions(fs, inpath, descriptors, descriptions);
+      storeDescriptions(fs, input, descriptors, descriptions);
     } catch (OptionException e) {
       log.error("Error while parsing options", e);
       CommandLineUtil.printHelp(group);

Modified: lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/hadoop/CDMahoutEvaluatorTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/hadoop/CDMahoutEvaluatorTest.java?rev=942203&r1=942202&r2=942203&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/hadoop/CDMahoutEvaluatorTest.java (original)
+++ lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/hadoop/CDMahoutEvaluatorTest.java Fri May  7 20:21:49 2010
@@ -17,6 +17,8 @@
 
 package org.apache.mahout.ga.watchmaker.cd.hadoop;
 
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.mahout.common.MahoutTestCase;
 import org.apache.mahout.ga.watchmaker.cd.CDFitness;
@@ -43,12 +45,16 @@ public class CDMahoutEvaluatorTest exten
     }
 
     // dataset
+    // This is sensitive to the working directory where the test is run:
     Path input = new Path("target/test-classes/wdbc");
     CDMahoutEvaluator.initializeDataSet(input);
 
     // evaluate the rules
     List<CDFitness> results = new ArrayList<CDFitness>();
-    CDMahoutEvaluator.evaluate(rules, target, input, results);
+    Path output = getTestTempDirPath("output");
+    FileSystem fs = output.getFileSystem(new Configuration());
+    fs.delete(output, true); // It's unhappy if this directory exists
+    CDMahoutEvaluator.evaluate(rules, target, input, output, results);
 
     // check the results
     for (int index = 0; index < nbrules; index++) {

Modified: lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/tool/CDInfosToolTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/tool/CDInfosToolTest.java?rev=942203&r1=942202&r2=942203&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/tool/CDInfosToolTest.java (original)
+++ lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/tool/CDInfosToolTest.java Fri May  7 20:21:49 2010
@@ -210,6 +210,7 @@ public class CDInfosToolTest extends Mah
 
       // random dataset
       Path inpath = getTestTempDirPath("input");
+      Path output = getTestTempDirPath("output");
       FileSystem fs = FileSystem.get(inpath.toUri(), new Configuration());
       HadoopUtil.overwriteOutput(inpath);
 
@@ -217,7 +218,8 @@ public class CDInfosToolTest extends Mah
 
       // Start the tool
       List<String> result = new ArrayList<String>();
-      CDInfosTool.gatherInfos(descriptors, inpath, result);
+      fs.delete(output, true); // It's unhappy if this directory exists
+      CDInfosTool.gatherInfos(descriptors, inpath, output, result);
 
       // check the results
       Collection<String> target = new ArrayList<String>();

Modified: lucene/mahout/trunk/utils/pom.xml
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/pom.xml?rev=942203&r1=942202&r2=942203&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/pom.xml (original)
+++ lucene/mahout/trunk/utils/pom.xml Fri May  7 20:21:49 2010
@@ -125,6 +125,18 @@
       <type>test-jar</type>
       <scope>test</scope>
     </dependency>
+    <dependency>
+      <groupId>org.apache.mahout</groupId>
+      <artifactId>mahout-math</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.mahout</groupId>
+      <artifactId>mahout-math</artifactId>
+      <version>${project.version}</version>
+      <type>test-jar</type>
+      <scope>test</scope>
+    </dependency>
 
     <dependency>
       <groupId>junit</groupId>