You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by jm...@apache.org on 2010/03/06 06:10:54 UTC
svn commit: r919704 - in /lucene/mahout/trunk: conf/
core/src/main/java/org/apache/mahout/math/hadoop/decomposer/
core/src/test/java/org/apache/mahout/clustering/
core/src/test/java/org/apache/mahout/math/hadoop/decomposer/
Author: jmannix
Date: Sat Mar 6 05:10:54 2010
New Revision: 919704
URL: http://svn.apache.org/viewvc?rev=919704&view=rev
Log:
Last few commits for MAHOUT-310
Added:
lucene/mahout/trunk/conf/transpose.props
Modified:
lucene/mahout/trunk/conf/driver.classes.props
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/decomposer/DistributedLanczosSolver.java
lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/ClusteringTestUtils.java
lucene/mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/decomposer/TestDistributedLanczosSolver.java
Modified: lucene/mahout/trunk/conf/driver.classes.props
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/conf/driver.classes.props?rev=919704&r1=919703&r2=919704&view=diff
==============================================================================
--- lucene/mahout/trunk/conf/driver.classes.props (original)
+++ lucene/mahout/trunk/conf/driver.classes.props Sat Mar 6 05:10:54 2010
@@ -8,6 +8,7 @@
org.apache.mahout.clustering.dirichlet.DirichletDriver = dirichlet : Dirichlet Clustering
org.apache.mahout.clustering.meanshift.MeanShiftCanopyDriver = meanshift : Mean Shift clustering
org.apache.mahout.clustering.canopy.CanopyDriver = canopy : Canopy clustering
+org.apache.mahout.math.hadoop.TransposeJob = transpose : Take the transpose of a matrix
org.apache.mahout.utils.vectors.lucene.Driver = lucene.vector : Generate Vectors from a Lucene index
org.apache.mahout.text.SequenceFilesFromDirectory = seqdirectory : Generate sequence files (of Text) from a directory
org.apache.mahout.text.SparseVectorsFromSequenceFiles = seq2sparse: Sparse Vector generation from Text sequence files
Added: lucene/mahout/trunk/conf/transpose.props
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/conf/transpose.props?rev=919704&view=auto
==============================================================================
--- lucene/mahout/trunk/conf/transpose.props (added)
+++ lucene/mahout/trunk/conf/transpose.props Sat Mar 6 05:10:54 2010
@@ -0,0 +1,2 @@
+#i|input =
+#o|output =
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/decomposer/DistributedLanczosSolver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/decomposer/DistributedLanczosSolver.java?rev=919704&r1=919703&r2=919704&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/decomposer/DistributedLanczosSolver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/decomposer/DistributedLanczosSolver.java Sat Mar 6 05:10:54 2010
@@ -71,6 +71,7 @@
String outputTmpPathString = parsedArgs.get("--tempDir");
int numRows = Integer.parseInt(parsedArgs.get("--numRows"));
int numCols = Integer.parseInt(parsedArgs.get("--numCols"));
+ boolean isSymmetric = Boolean.parseBoolean(parsedArgs.get("--symmetric"));
int desiredRank = Integer.parseInt(parsedArgs.get("--rank"));
Matrix eigenVectors = new DenseMatrix(desiredRank, numCols);
List<Double> eigenValues = new ArrayList<Double>();
@@ -81,7 +82,7 @@
numRows,
numCols);
matrix.configure(new JobConf(getConf()));
- solve(matrix, desiredRank, eigenVectors, eigenValues);
+ solve(matrix, desiredRank, eigenVectors, eigenValues, isSymmetric);
serializeOutput(eigenVectors, eigenValues, outputEigenVectorPath);
return 0;
@@ -154,9 +155,20 @@
"r",
"Desired decomposition rank (note: only roughly 1/4 to 1/3 "
+ "of these will have the top portion of the spectrum)");
-
- DistributedLanczosSolver.this.parsedArgs = parseArguments(args, numRowsOpt, numColsOpt, desiredRankOpt);
- return DistributedLanczosSolver.this.run(args);
+ Option isSymmetricOpt = buildOption("symmetric",
+ "sym",
+ "Is the input matrix square and symmetric?");
+
+ DistributedLanczosSolver.this.parsedArgs = parseArguments(args,
+ numRowsOpt,
+ numColsOpt,
+ desiredRankOpt,
+ isSymmetricOpt);
+ if (DistributedLanczosSolver.this.parsedArgs == null) {
+ return -1;
+ } else {
+ return DistributedLanczosSolver.this.run(args);
+ }
}
}
Modified: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/ClusteringTestUtils.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/ClusteringTestUtils.java?rev=919704&r1=919703&r2=919704&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/ClusteringTestUtils.java (original)
+++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/ClusteringTestUtils.java Sat Mar 6 05:10:54 2010
@@ -20,6 +20,7 @@
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.mahout.math.VectorWritable;
@@ -31,13 +32,27 @@
private ClusteringTestUtils() {
}
- public static void writePointsToFile(Iterable<VectorWritable> points, String fileName, FileSystem fs, Configuration conf)
- throws IOException {
+ public static void writePointsToFile(Iterable<VectorWritable> points,
+ String fileName,
+ FileSystem fs,
+ Configuration conf) throws IOException {
+ writePointsToFile(points, false, fileName, fs, conf);
+ }
+
+ public static void writePointsToFile(Iterable<VectorWritable> points,
+ boolean intWritable,
+ String fileName,
+ FileSystem fs,
+ Configuration conf) throws IOException {
Path path = new Path(fileName);
- SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path, LongWritable.class, VectorWritable.class);
- long recNum = 0;
+ SequenceFile.Writer writer = new SequenceFile.Writer(fs,
+ conf,
+ path,
+ intWritable ? IntWritable.class : LongWritable.class,
+ VectorWritable.class);
+ int recNum = 0;
for (VectorWritable point : points) {
- writer.append(new LongWritable(recNum++), point);
+ writer.append(intWritable ? new IntWritable(recNum++) : new LongWritable(recNum++), point);
}
writer.close();
}
Modified: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/decomposer/TestDistributedLanczosSolver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/decomposer/TestDistributedLanczosSolver.java?rev=919704&r1=919703&r2=919704&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/decomposer/TestDistributedLanczosSolver.java (original)
+++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/decomposer/TestDistributedLanczosSolver.java Sat Mar 6 05:10:54 2010
@@ -1,21 +1,15 @@
package org.apache.mahout.math.hadoop.decomposer;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.mapred.JobConf;
-import org.apache.mahout.clustering.ClusteringTestUtils;
import org.apache.mahout.clustering.canopy.TestCanopyCreation;
import org.apache.mahout.math.DenseMatrix;
import org.apache.mahout.math.Matrix;
-import org.apache.mahout.math.MatrixSlice;
-import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.math.decomposer.SolverTest;
import org.apache.mahout.math.hadoop.DistributedRowMatrix;
+import org.apache.mahout.math.hadoop.TestDistributedRowMatrix;
import java.io.File;
-import java.io.IOException;
import java.util.ArrayList;
-import java.util.Iterator;
import java.util.List;
@@ -25,20 +19,27 @@
super(name);
}
- public void testDistributedLanczosSolver() throws Exception {
+ public void doTestDistributedLanczosSolver(boolean symmetric) throws Exception {
File testData = new File("testdata");
if (!testData.exists()) {
testData.mkdir();
}
- DistributedRowMatrix corpus = randomDistributedMatrix(1000, 900, 800, 100, 10.0, "testdata");
+ DistributedRowMatrix corpus = TestDistributedRowMatrix.randomDistributedMatrix(500,
+ 450, 400, 10, 10.0, symmetric, "testdata");
corpus.configure(new JobConf());
DistributedLanczosSolver solver = new DistributedLanczosSolver();
int desiredRank = 30;
- Matrix eigenVectors = new DenseMatrix(desiredRank, 800);
+ Matrix eigenVectors = new DenseMatrix(desiredRank, corpus.numCols());
List<Double> eigenValues = new ArrayList<Double>();
- solver.solve(corpus, desiredRank, eigenVectors, eigenValues);
+ solver.solve(corpus, desiredRank, eigenVectors, eigenValues, symmetric);
assertOrthonormal(eigenVectors);
- assertEigen(eigenVectors, corpus, eigenVectors.numRows() / 2, 0.01);
+ assertEigen(eigenVectors, corpus, eigenVectors.numRows() / 2, 0.01, symmetric);
+ }
+
+ public void testDistributedLanczosSolver() throws Exception {
+ // doTestDistributedLanczosSolver(false);
+ // TestCanopyCreation.rmr("testData");
+ doTestDistributedLanczosSolver(true);
}
@Override
@@ -48,41 +49,4 @@
}
- public static DistributedRowMatrix randomDistributedMatrix(int numRows,
- int nonNullRows,
- int numCols,
- int entriesPerRow,
- double entryMean,
- String baseTmpDir) throws IOException {
- final Matrix m = randomSequentialAccessSparseMatrix(numRows, nonNullRows, numCols, entriesPerRow, entryMean);
- Configuration conf = new Configuration();
- FileSystem fs = FileSystem.get(conf);
-
- ClusteringTestUtils.writePointsToFile(new Iterable<VectorWritable>() {
- @Override
- public Iterator<VectorWritable> iterator() {
- final Iterator<MatrixSlice> it = m.iterator();
- final VectorWritable v = new VectorWritable();
- return new Iterator<VectorWritable>() {
- @Override
- public boolean hasNext() { return it.hasNext(); }
- @Override
- public VectorWritable next() {
- MatrixSlice slice = it.next();
- v.set(slice.vector());
- return v;
- }
- @Override
- public void remove() { it.remove(); }
- };
- }
- }, baseTmpDir + "/distMatrix", fs, conf);
-
- DistributedRowMatrix distMatrix = new DistributedRowMatrix(baseTmpDir + "/distMatrix",
- baseTmpDir + "/tmpOut",
- m.numRows(),
- m.numCols());
-
- return distMatrix;
- }
}