You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by jm...@apache.org on 2010/03/06 06:10:54 UTC

svn commit: r919704 - in /lucene/mahout/trunk: conf/ core/src/main/java/org/apache/mahout/math/hadoop/decomposer/ core/src/test/java/org/apache/mahout/clustering/ core/src/test/java/org/apache/mahout/math/hadoop/decomposer/

Author: jmannix
Date: Sat Mar  6 05:10:54 2010
New Revision: 919704

URL: http://svn.apache.org/viewvc?rev=919704&view=rev
Log:
Last few commits for MAHOUT-310

Added:
    lucene/mahout/trunk/conf/transpose.props
Modified:
    lucene/mahout/trunk/conf/driver.classes.props
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/decomposer/DistributedLanczosSolver.java
    lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/ClusteringTestUtils.java
    lucene/mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/decomposer/TestDistributedLanczosSolver.java

Modified: lucene/mahout/trunk/conf/driver.classes.props
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/conf/driver.classes.props?rev=919704&r1=919703&r2=919704&view=diff
==============================================================================
--- lucene/mahout/trunk/conf/driver.classes.props (original)
+++ lucene/mahout/trunk/conf/driver.classes.props Sat Mar  6 05:10:54 2010
@@ -8,6 +8,7 @@
 org.apache.mahout.clustering.dirichlet.DirichletDriver = dirichlet : Dirichlet Clustering
 org.apache.mahout.clustering.meanshift.MeanShiftCanopyDriver = meanshift : Mean Shift clustering
 org.apache.mahout.clustering.canopy.CanopyDriver = canopy : Canopy clustering
+org.apache.mahout.math.hadoop.TransposeJob = transpose : Take the transpose of a matrix
 org.apache.mahout.utils.vectors.lucene.Driver = lucene.vector : Generate Vectors from a Lucene index
 org.apache.mahout.text.SequenceFilesFromDirectory = seqdirectory : Generate sequence files (of Text) from a directory
 org.apache.mahout.text.SparseVectorsFromSequenceFiles = seq2sparse: Sparse Vector generation from Text sequence files

Added: lucene/mahout/trunk/conf/transpose.props
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/conf/transpose.props?rev=919704&view=auto
==============================================================================
--- lucene/mahout/trunk/conf/transpose.props (added)
+++ lucene/mahout/trunk/conf/transpose.props Sat Mar  6 05:10:54 2010
@@ -0,0 +1,2 @@
+#i|input =
+#o|output =

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/decomposer/DistributedLanczosSolver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/decomposer/DistributedLanczosSolver.java?rev=919704&r1=919703&r2=919704&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/decomposer/DistributedLanczosSolver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/decomposer/DistributedLanczosSolver.java Sat Mar  6 05:10:54 2010
@@ -71,6 +71,7 @@
     String outputTmpPathString = parsedArgs.get("--tempDir");
     int numRows = Integer.parseInt(parsedArgs.get("--numRows"));
     int numCols = Integer.parseInt(parsedArgs.get("--numCols"));
+    boolean isSymmetric = Boolean.parseBoolean(parsedArgs.get("--symmetric"));
     int desiredRank = Integer.parseInt(parsedArgs.get("--rank"));
     Matrix eigenVectors = new DenseMatrix(desiredRank, numCols);
     List<Double> eigenValues = new ArrayList<Double>();
@@ -81,7 +82,7 @@
                                                            numRows,
                                                            numCols);
     matrix.configure(new JobConf(getConf()));
-    solve(matrix, desiredRank, eigenVectors, eigenValues);
+    solve(matrix, desiredRank, eigenVectors, eigenValues, isSymmetric);
 
     serializeOutput(eigenVectors, eigenValues, outputEigenVectorPath);  
     return 0;
@@ -154,9 +155,20 @@
                                           "r",
                                           "Desired decomposition rank (note: only roughly 1/4 to 1/3 "
                                         + "of these will have the top portion of the spectrum)");
-
-      DistributedLanczosSolver.this.parsedArgs = parseArguments(args, numRowsOpt, numColsOpt, desiredRankOpt);
-      return DistributedLanczosSolver.this.run(args);
+      Option isSymmetricOpt = buildOption("symmetric",
+                                          "sym",
+                                          "Is the input matrix square and symmetric?");
+
+      DistributedLanczosSolver.this.parsedArgs = parseArguments(args,
+                                                                numRowsOpt,
+                                                                numColsOpt,
+                                                                desiredRankOpt,
+                                                                isSymmetricOpt);
+      if (DistributedLanczosSolver.this.parsedArgs == null) {
+        return -1;
+      } else {
+        return DistributedLanczosSolver.this.run(args);
+      }
     }
   }
 

Modified: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/ClusteringTestUtils.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/ClusteringTestUtils.java?rev=919704&r1=919703&r2=919704&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/ClusteringTestUtils.java (original)
+++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/ClusteringTestUtils.java Sat Mar  6 05:10:54 2010
@@ -20,6 +20,7 @@
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.LongWritable;
 import org.apache.hadoop.io.SequenceFile;
 import org.apache.mahout.math.VectorWritable;
@@ -31,13 +32,27 @@
   private ClusteringTestUtils() {
   }
 
-  public static void writePointsToFile(Iterable<VectorWritable> points, String fileName, FileSystem fs, Configuration conf)
-      throws IOException {
+  public static void writePointsToFile(Iterable<VectorWritable> points,
+                                       String fileName,
+                                       FileSystem fs,
+                                       Configuration conf) throws IOException {
+    writePointsToFile(points, false, fileName, fs, conf);
+  }
+
+  public static void writePointsToFile(Iterable<VectorWritable> points,
+                                       boolean intWritable,
+                                       String fileName,
+                                       FileSystem fs,
+                                       Configuration conf) throws IOException {
     Path path = new Path(fileName);
-    SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path, LongWritable.class, VectorWritable.class);
-    long recNum = 0;
+    SequenceFile.Writer writer = new SequenceFile.Writer(fs,
+                                                         conf,
+                                                         path,
+                                                         intWritable ? IntWritable.class : LongWritable.class,
+                                                         VectorWritable.class);
+    int recNum = 0;
     for (VectorWritable point : points) {
-      writer.append(new LongWritable(recNum++), point);
+      writer.append(intWritable ? new IntWritable(recNum++) : new LongWritable(recNum++), point);
     }
     writer.close();
   }

Modified: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/decomposer/TestDistributedLanczosSolver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/decomposer/TestDistributedLanczosSolver.java?rev=919704&r1=919703&r2=919704&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/decomposer/TestDistributedLanczosSolver.java (original)
+++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/decomposer/TestDistributedLanczosSolver.java Sat Mar  6 05:10:54 2010
@@ -1,21 +1,15 @@
 package org.apache.mahout.math.hadoop.decomposer;
 
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.mapred.JobConf;
-import org.apache.mahout.clustering.ClusteringTestUtils;
 import org.apache.mahout.clustering.canopy.TestCanopyCreation;
 import org.apache.mahout.math.DenseMatrix;
 import org.apache.mahout.math.Matrix;
-import org.apache.mahout.math.MatrixSlice;
-import org.apache.mahout.math.VectorWritable;
 import org.apache.mahout.math.decomposer.SolverTest;
 import org.apache.mahout.math.hadoop.DistributedRowMatrix;
+import org.apache.mahout.math.hadoop.TestDistributedRowMatrix;
 
 import java.io.File;
-import java.io.IOException;
 import java.util.ArrayList;
-import java.util.Iterator;
 import java.util.List;
 
 
@@ -25,20 +19,27 @@
     super(name);
   }
 
-  public void testDistributedLanczosSolver() throws Exception {
+  public void doTestDistributedLanczosSolver(boolean symmetric) throws Exception {
     File testData = new File("testdata");
     if (!testData.exists()) {
       testData.mkdir();
     }
-    DistributedRowMatrix corpus = randomDistributedMatrix(1000, 900, 800, 100, 10.0, "testdata");
+    DistributedRowMatrix corpus = TestDistributedRowMatrix.randomDistributedMatrix(500,
+        450, 400, 10, 10.0, symmetric, "testdata");
     corpus.configure(new JobConf());
     DistributedLanczosSolver solver = new DistributedLanczosSolver();
     int desiredRank = 30;
-    Matrix eigenVectors = new DenseMatrix(desiredRank, 800);
+    Matrix eigenVectors = new DenseMatrix(desiredRank, corpus.numCols());
     List<Double> eigenValues = new ArrayList<Double>();
-    solver.solve(corpus, desiredRank, eigenVectors, eigenValues);
+    solver.solve(corpus, desiredRank, eigenVectors, eigenValues, symmetric);
     assertOrthonormal(eigenVectors);
-    assertEigen(eigenVectors, corpus, eigenVectors.numRows() / 2, 0.01);
+    assertEigen(eigenVectors, corpus, eigenVectors.numRows() / 2, 0.01, symmetric);
+  }
+
+  public void testDistributedLanczosSolver() throws Exception {
+  //  doTestDistributedLanczosSolver(false);
+  //  TestCanopyCreation.rmr("testData");
+    doTestDistributedLanczosSolver(true);
   }
 
   @Override
@@ -48,41 +49,4 @@
   }
 
 
-  public static DistributedRowMatrix randomDistributedMatrix(int numRows,
-                                                          int nonNullRows,
-                                                          int numCols,
-                                                          int entriesPerRow,
-                                                          double entryMean,
-                                                          String baseTmpDir) throws IOException {
-    final Matrix m = randomSequentialAccessSparseMatrix(numRows, nonNullRows, numCols, entriesPerRow, entryMean);
-    Configuration conf = new Configuration();
-    FileSystem fs = FileSystem.get(conf);
-
-    ClusteringTestUtils.writePointsToFile(new Iterable<VectorWritable>() {
-      @Override
-      public Iterator<VectorWritable> iterator() {
-        final Iterator<MatrixSlice> it = m.iterator();
-        final VectorWritable v = new VectorWritable();
-        return new Iterator<VectorWritable>() {
-          @Override
-          public boolean hasNext() { return it.hasNext(); }
-          @Override
-          public VectorWritable next() {
-            MatrixSlice slice = it.next();
-            v.set(slice.vector());
-            return v;
-          }
-          @Override
-          public void remove() { it.remove(); }
-        };
-      }
-    }, baseTmpDir + "/distMatrix", fs, conf);
-
-    DistributedRowMatrix distMatrix = new DistributedRowMatrix(baseTmpDir + "/distMatrix",
-                                                               baseTmpDir + "/tmpOut",
-                                                               m.numRows(),
-                                                               m.numCols());
-
-    return distMatrix;
-  }
 }