You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by jm...@apache.org on 2010/01/13 09:01:42 UTC
svn commit: r898669 [3/3] - in /lucene/mahout/trunk:
core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/
core/src/main/java/org/apache/mahout/clustering/canopy/
core/src/main/java/org/apache/mahout/clustering/dirichlet/
core/src/main/java/org/ap...
Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/meanshift/DisplayMeanShift.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/meanshift/DisplayMeanShift.java?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/meanshift/DisplayMeanShift.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/meanshift/DisplayMeanShift.java Wed Jan 13 08:01:34 2010
@@ -30,6 +30,7 @@
import org.apache.mahout.math.Vector;
import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.math.VectorWritable;
class DisplayMeanShift extends DisplayDirichlet {
private DisplayMeanShift() {
@@ -60,8 +61,8 @@
// plot the sample data
g2.setColor(Color.DARK_GRAY);
dv.assign(0.03);
- for (Vector v : sampleData)
- plotRectangle(g2, v, dv);
+ for (VectorWritable v : sampleData)
+ plotRectangle(g2, v.get(), dv);
int i = 0;
for (MeanShiftCanopy canopy : canopies)
if (canopy.getBoundPoints().size() > 0.015 * sampleData.size()) {
@@ -76,8 +77,8 @@
private static void testReferenceImplementation() {
// add all points to the canopies
int nextCanopyId = 0;
- for (Vector aRaw : sampleData) {
- clusterer.mergeCanopy(new MeanShiftCanopy(aRaw, nextCanopyId++), canopies);
+ for (VectorWritable aRaw : sampleData) {
+ clusterer.mergeCanopy(new MeanShiftCanopy(aRaw.get(), nextCanopyId++), canopies);
}
boolean done = false;
while (!done) {// shift canopies to their centroids
Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/InputDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/InputDriver.java?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/InputDriver.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/InputDriver.java Wed Jan 13 08:01:34 2010
@@ -36,6 +36,7 @@
import org.apache.mahout.common.CommandLineUtil;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -75,20 +76,19 @@
String input = cmdLine.getValue(inputOpt, "testdata").toString();
String output = cmdLine.getValue(outputOpt, "output").toString();
String vectorClassName = cmdLine.getValue(vectorOpt, "org.apache.mahout.math.SparseVector").toString();
- Class<? extends Vector> vectorClass = (Class<? extends Vector>) Class.forName(vectorClassName);
- runJob(input, output, vectorClass);
+ runJob(input, output);
} catch (OptionException e) {
LOG.error("Exception parsing command line: ", e);
CommandLineUtil.printHelp(group);
}
}
- public static void runJob(String input, String output, Class<? extends Vector> vectorClass) throws IOException {
+ public static void runJob(String input, String output) throws IOException {
JobClient client = new JobClient();
JobConf conf = new JobConf(InputDriver.class);
conf.setOutputKeyClass(Text.class);
- conf.setOutputValueClass(vectorClass);
+ conf.setOutputValueClass(VectorWritable.class);
conf.setOutputFormat(SequenceFileOutputFormat.class);
FileInputFormat.setInputPaths(conf, new Path(input));
FileOutputFormat.setOutputPath(conf, new Path(output));
Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java Wed Jan 13 08:01:34 2010
@@ -101,7 +101,7 @@
double t1 = Double.parseDouble(cmdLine.getValue(t1Opt, "80").toString());
double t2 = Double.parseDouble(cmdLine.getValue(t2Opt, "55").toString());
- runJob(input, output, measureClass, t1, t2, vectorClass);
+ runJob(input, output, measureClass, t1, t2);
} catch (OptionException e) {
LOG.error("Exception", e);
CommandLineUtil.printHelp(group);
@@ -131,8 +131,7 @@
* the canopy T2 threshold
*/
private static void runJob(String input, String output,
- String measureClassName, double t1, double t2,
- Class<? extends Vector> vectorClass) throws IOException {
+ String measureClassName, double t1, double t2) throws IOException {
JobClient client = new JobClient();
JobConf conf = new JobConf(Job.class);
@@ -143,9 +142,9 @@
dfs.delete(outPath, true);
String directoryContainingConvertedInput = output
+ Constants.DIRECTORY_CONTAINING_CONVERTED_INPUT;
- InputDriver.runJob(input, directoryContainingConvertedInput, vectorClass);
+ InputDriver.runJob(input, directoryContainingConvertedInput);
CanopyClusteringJob.runJob(directoryContainingConvertedInput, output,
- measureClassName, t1, t2, vectorClass);
+ measureClassName, t1, t2);
}
}
Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/Job.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/Job.java?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/Job.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/Job.java Wed Jan 13 08:01:34 2010
@@ -42,6 +42,7 @@
import org.apache.mahout.common.CommandLineUtil;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -139,7 +140,7 @@
}
fs.mkdirs(outPath);
final String directoryContainingConvertedInput = output + DIRECTORY_CONTAINING_CONVERTED_INPUT;
- InputDriver.runJob(input, directoryContainingConvertedInput, vectorClass);
+ InputDriver.runJob(input, directoryContainingConvertedInput);
DirichletDriver.runJob(directoryContainingConvertedInput, output + "/state", modelFactory,
numModels, maxIterations, alpha_0, numReducers);
printResults(output + "/state", modelFactory, maxIterations, numModels,
@@ -156,7 +157,7 @@
*/
public static void printResults(String output, String modelDistribution,
int numIterations, int numModels, double alpha_0) {
- List<List<DirichletCluster<Vector>>> clusters = new ArrayList<List<DirichletCluster<Vector>>>();
+ List<List<DirichletCluster<VectorWritable>>> clusters = new ArrayList<List<DirichletCluster<VectorWritable>>>();
JobConf conf = new JobConf(KMeansDriver.class);
conf.set(DirichletDriver.MODEL_FACTORY_KEY, modelDistribution);
conf.set(DirichletDriver.NUM_CLUSTERS_KEY, Integer.toString(numModels));
@@ -175,12 +176,12 @@
* @param significant the minimum number of samples to enable printing a model
*/
private static void printResults(
- List<List<DirichletCluster<Vector>>> clusters, int significant) {
+ List<List<DirichletCluster<VectorWritable>>> clusters, int significant) {
int row = 0;
- for (List<DirichletCluster<Vector>> r : clusters) {
+ for (List<DirichletCluster<VectorWritable>> r : clusters) {
System.out.print("sample[" + row++ + "]= ");
for (int k = 0; k < r.size(); k++) {
- Model<Vector> model = r.get(k).getModel();
+ Model<VectorWritable> model = r.get(k).getModel();
if (model.count() > significant) {
int total = (int) r.get(k).getTotalCount();
System.out.print("m" + k + '(' + total + ')' + model.toString()
Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/NormalScModelDistribution.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/NormalScModelDistribution.java?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/NormalScModelDistribution.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/NormalScModelDistribution.java Wed Jan 13 08:01:34 2010
@@ -23,16 +23,17 @@
import org.apache.mahout.clustering.dirichlet.models.NormalModel;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
/**
* An implementation of the ModelDistribution interface suitable for testing the
* DirichletCluster algorithm. Uses a Normal Distribution
*/
-public class NormalScModelDistribution implements ModelDistribution<Vector> {
+public class NormalScModelDistribution implements ModelDistribution<VectorWritable> {
@Override
- public Model<Vector>[] sampleFromPrior(int howMany) {
- Model<Vector>[] result = new NormalModel[howMany];
+ public Model<VectorWritable>[] sampleFromPrior(int howMany) {
+ Model<VectorWritable>[] result = new NormalModel[howMany];
for (int i = 0; i < howMany; i++) {
DenseVector mean = new DenseVector(60);
for (int j = 0; j < 60; j++)
@@ -43,8 +44,8 @@
}
@Override
- public Model<Vector>[] sampleFromPosterior(Model<Vector>[] posterior) {
- Model<Vector>[] result = new NormalModel[posterior.length];
+ public Model<VectorWritable>[] sampleFromPosterior(Model<VectorWritable>[] posterior) {
+ Model<VectorWritable>[] result = new NormalModel[posterior.length];
for (int i = 0; i < posterior.length; i++) {
NormalModel m = (NormalModel) posterior[i];
result[i] = m.sample();
Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java Wed Jan 13 08:01:34 2010
@@ -100,7 +100,7 @@
String className = cmdLine.getValue(vectorClassOpt, "org.apache.mahout.math.SparseVector").toString();
Class<? extends Vector> vectorClass = Class.forName(className).asSubclass(Vector.class);
- runJob(input, output, measureClass, t1, t2, convergenceDelta, maxIterations, vectorClass);
+ runJob(input, output, measureClass, t1, t2, convergenceDelta, maxIterations);
} catch (OptionException e) {
LOG.error("Exception", e);
CommandLineUtil.printHelp(group);
@@ -127,8 +127,7 @@
* @param maxIterations the int maximum number of iterations
*/
private static void runJob(String input, String output, String measureClass,
- double t1, double t2, double convergenceDelta, int maxIterations,
- Class<? extends Vector> vectorClass) throws IOException {
+ double t1, double t2, double convergenceDelta, int maxIterations) throws IOException {
JobClient client = new JobClient();
JobConf conf = new JobConf(Job.class);
@@ -140,14 +139,14 @@
final String directoryContainingConvertedInput = output
+ DIRECTORY_CONTAINING_CONVERTED_INPUT;
System.out.println("Preparing Input");
- InputDriver.runJob(input, directoryContainingConvertedInput, vectorClass);
+ InputDriver.runJob(input, directoryContainingConvertedInput);
System.out.println("Running Canopy to get initial clusters");
CanopyDriver.runJob(directoryContainingConvertedInput, output
+ CanopyClusteringJob.DEFAULT_CANOPIES_OUTPUT_DIRECTORY, measureClass,
- t1, t2, vectorClass);
+ t1, t2);
System.out.println("Running KMeans");
KMeansDriver.runJob(directoryContainingConvertedInput, output
+ CanopyClusteringJob.DEFAULT_CANOPIES_OUTPUT_DIRECTORY, output,
- measureClass, convergenceDelta, maxIterations, 1, vectorClass);
+ measureClass, convergenceDelta, maxIterations, 1);
}
}
Modified: lucene/mahout/trunk/math/pom.xml
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/math/pom.xml?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/math/pom.xml (original)
+++ lucene/mahout/trunk/math/pom.xml Wed Jan 13 08:01:34 2010
@@ -108,11 +108,6 @@
<dependencies>
<dependency>
- <groupId>org.apache.hadoop</groupId>
- <artifactId>hadoop-core</artifactId>
- </dependency>
-
- <dependency>
<groupId>concurrent</groupId>
<artifactId>concurrent</artifactId>
<version>1.3.4</version>
Modified: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/AbstractMatrix.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/AbstractMatrix.java?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/AbstractMatrix.java (original)
+++ lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/AbstractMatrix.java Wed Jan 13 08:01:34 2010
@@ -31,9 +31,9 @@
/** A few universal implementations of convenience functions */
public abstract class AbstractMatrix implements Matrix {
- private Map<String, Integer> columnLabelBindings;
+ protected Map<String, Integer> columnLabelBindings;
- private Map<String, Integer> rowLabelBindings;
+ protected Map<String, Integer> rowLabelBindings;
@Override
public double get(String rowLabel, String columnLabel) throws IndexException,
@@ -460,71 +460,4 @@
return result;
}
- @Override
- public void readFields(DataInput in) throws IOException {
- // read the label bindings
- int colSize = in.readInt();
- if (colSize > 0) {
- columnLabelBindings = new HashMap<String, Integer>();
- for (int i = 0; i < colSize; i++) {
- columnLabelBindings.put(in.readUTF(), in.readInt());
- }
- }
- int rowSize = in.readInt();
- if (rowSize > 0) {
- rowLabelBindings = new HashMap<String, Integer>();
- for (int i = 0; i < rowSize; i++) {
- rowLabelBindings.put(in.readUTF(), in.readInt());
- }
- }
- }
-
- @Override
- public void write(DataOutput out) throws IOException {
- // write the label bindings
- if (columnLabelBindings == null) {
- out.writeInt(0);
- } else {
- out.writeInt(columnLabelBindings.size());
- for (Map.Entry<String, Integer> stringIntegerEntry : columnLabelBindings.entrySet()) {
- out.writeUTF(stringIntegerEntry.getKey());
- out.writeInt(stringIntegerEntry.getValue());
- }
- }
- if (rowLabelBindings == null) {
- out.writeInt(0);
- } else {
- out.writeInt(rowLabelBindings.size());
- for (Map.Entry<String, Integer> stringIntegerEntry : rowLabelBindings.entrySet()) {
- out.writeUTF(stringIntegerEntry.getKey());
- out.writeInt(stringIntegerEntry.getValue());
- }
- }
- }
-
- /** Reads a typed Matrix instance from the input stream */
- public static Matrix readMatrix(DataInput in) throws IOException {
- String matrixClassName = in.readUTF();
- Matrix matrix;
- try {
- matrix = Class.forName(matrixClassName).asSubclass(Matrix.class)
- .newInstance();
- } catch (ClassNotFoundException e) {
- throw new IllegalStateException(e);
- } catch (IllegalAccessException e) {
- throw new IllegalStateException(e);
- } catch (InstantiationException e) {
- throw new IllegalStateException(e);
- }
- matrix.readFields(in);
- return matrix;
- }
-
- /** Writes a typed Matrix instance to the output stream */
- public static void writeMatrix(DataOutput out, Matrix matrix)
- throws IOException {
- out.writeUTF(matrix.getClass().getName());
- matrix.write(out);
- }
-
}
Modified: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/AbstractVector.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/AbstractVector.java?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/AbstractVector.java (original)
+++ lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/AbstractVector.java Wed Jan 13 08:01:34 2010
@@ -20,11 +20,6 @@
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.gson.reflect.TypeToken;
-import org.apache.hadoop.io.WritableComparable;
-
-import java.io.DataInput;
-import java.io.DataOutput;
-import java.io.IOException;
import java.lang.reflect.Type;
import java.util.HashMap;
import java.util.Iterator;
@@ -332,17 +327,6 @@
}
/**
- * Decodes a point from its WritableComparable<?> representation.
- *
- * @param writableComparable a WritableComparable<?> produced by asWritableComparable. Note the payload remainder: it
- * is optional, but can be present.
- * @return the n-dimensional point
- */
- public static Vector decodeVector(WritableComparable<?> writableComparable) {
- return decodeVector(writableComparable.toString());
- }
-
- /**
* Decodes a point from its string representation.
*
* @param formattedString a formatted String produced by asFormatString. Note the payload remainder: it is optional,
@@ -506,40 +490,4 @@
set(index, value);
}
- // cache most recent vector instance class name
- private static String instanceClassName;
-
- // cache most recent vector instance class
- private static Class<? extends Vector> instanceClass;
-
- /** Read and return a vector from the input */
- public static Vector readVector(DataInput in) throws IOException {
- String vectorClassName = in.readUTF();
- Vector vector;
- try {
- if (!vectorClassName.equals(instanceClassName)) {
- instanceClassName = vectorClassName;
- instanceClass = Class.forName(vectorClassName).asSubclass(Vector.class);
- }
- vector = instanceClass.newInstance();
- } catch (ClassNotFoundException e) {
- throw new IllegalStateException(e);
- } catch (IllegalAccessException e) {
- throw new IllegalStateException(e);
- } catch (InstantiationException e) {
- throw new IllegalStateException(e);
- }
- vector.readFields(in);
- return vector;
- }
-
- /** Write the vector to the output */
- public static void writeVector(DataOutput out, Vector vector)
- throws IOException {
- String vectorClassName = vector.getClass().getName();
- out.writeUTF(vectorClassName);
- vector.write(out);
-
- }
-
}
Modified: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/DenseMatrix.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/DenseMatrix.java?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/DenseMatrix.java (original)
+++ lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/DenseMatrix.java Wed Jan 13 08:01:34 2010
@@ -17,24 +17,21 @@
package org.apache.mahout.math;
-import java.io.DataInput;
-import java.io.DataOutput;
-import java.io.IOException;
/** Matrix of doubles implemented using a 2-d array */
public class DenseMatrix extends AbstractMatrix {
- private double[][] values;
+ protected double[][] values;
public DenseMatrix() {
super();
}
- private int columnSize() {
+ protected int columnSize() {
return values[0].length;
}
- private int rowSize() {
+ protected int rowSize() {
return values.length;
}
@@ -153,30 +150,5 @@
}
return new DenseVector(values[row]);
}
-
- @Override
- public void readFields(DataInput in) throws IOException {
- super.readFields(in);
- int rows = in.readInt();
- int columns = in.readInt();
- this.values = new double[rows][columns];
- for (int row = 0; row < rows; row++) {
- for (int column = 0; column < columns; column++) {
- this.values[row][column] = in.readDouble();
- }
- }
- }
-
- @Override
- public void write(DataOutput out) throws IOException {
- super.write(out);
- out.writeInt(rowSize());
- out.writeInt(columnSize());
- for (double[] row : values) {
- for (double value : row) {
- out.writeDouble(value);
- }
- }
- }
-
+
}
Modified: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/DenseVector.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/DenseVector.java?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/DenseVector.java (original)
+++ lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/DenseVector.java Wed Jan 13 08:01:34 2010
@@ -27,8 +27,8 @@
/** Implements vector as an array of doubles */
public class DenseVector extends AbstractVector {
- private double[] values;
- private double lengthSquared = -1.0;
+ protected double[] values;
+ protected double lengthSquared = -1.0;
/** For serialization purposes only */
public DenseVector() {
@@ -58,6 +58,21 @@
this.values = new double[cardinality];
}
+ /**
+ * Copy-constructor (for use in turning a SparseVector into a dense one, for example)
+ * @param vector
+ */
+ public DenseVector(Vector vector) {
+ super(vector.getName());
+ values = new double[vector.size()];
+ Iterator<Vector.Element> it = vector.iterateNonZero();
+ Vector.Element e = null;
+ while(it.hasNext()) {
+ e = it.next();
+ values[e.index()] = e.get();
+ }
+ }
+
@Override
protected Matrix matrixLike(int rows, int columns) {
return new DenseMatrix(rows, columns);
@@ -227,28 +242,6 @@
return new Element(index);
}
- @Override
- public void write(DataOutput dataOutput) throws IOException {
- dataOutput.writeUTF(this.getName() == null ? "" : this.getName());
- dataOutput.writeInt(size());
- Iterator<Vector.Element> iter = iterateAll();
- while (iter.hasNext()) {
- Vector.Element element = iter.next();
- dataOutput.writeDouble(element.get());
- }
- }
-
- @Override
- public void readFields(DataInput dataInput) throws IOException {
- this.setName(dataInput.readUTF());
- double[] values = new double[dataInput.readInt()];
- for (int i = 0; i < values.length; i++) {
- values[i] = dataInput.readDouble();
- }
- this.values = values;
- lengthSquared = -1.0;
- }
-
/**
* Indicate whether the two objects are the same or not. Two {@link org.apache.mahout.math.Vector}s can be equal
* even if the underlying implementation is not equal.
Modified: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/Matrix.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/Matrix.java?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/Matrix.java (original)
+++ lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/Matrix.java Wed Jan 13 08:01:34 2010
@@ -17,12 +17,10 @@
package org.apache.mahout.math;
-import org.apache.hadoop.io.Writable;
-
import java.util.Map;
/** The basic interface including numerous convenience functions */
-public interface Matrix extends Cloneable, Writable {
+public interface Matrix extends Cloneable {
/** @return a formatted String suitable for output */
String asFormatString();
Modified: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/MatrixView.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/MatrixView.java?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/MatrixView.java (original)
+++ lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/MatrixView.java Wed Jan 13 08:01:34 2010
@@ -146,21 +146,4 @@
cardinality[COL]);
}
- @Override
- public void readFields(DataInput in) throws IOException {
- super.readFields(in);
- this.offset = new int[]{in.readInt(), in.readInt()};
- this.cardinality = new int[]{in.readInt(), in.readInt()};
- this.matrix = readMatrix(in);
- }
-
- @Override
- public void write(DataOutput out) throws IOException {
- super.write(out);
- out.writeInt(offset[ROW]);
- out.writeInt(offset[COL]);
- out.writeInt(cardinality[ROW]);
- out.writeInt(cardinality[COL]);
- writeMatrix(out, this.matrix);
- }
}
Modified: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/SparseColumnMatrix.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/SparseColumnMatrix.java?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/SparseColumnMatrix.java (original)
+++ lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/SparseColumnMatrix.java Wed Jan 13 08:01:34 2010
@@ -171,26 +171,4 @@
return new DenseVector(d);
}
- @Override
- public void readFields(DataInput in) throws IOException {
- super.readFields(in);
- this.cardinality = new int[]{in.readInt(), in.readInt()};
- int colSize = in.readInt();
- this.columns = new Vector[colSize];
- for (int col = 0; col < colSize; col++) {
- columns[col] = AbstractVector.readVector(in);
- }
- }
-
- @Override
- public void write(DataOutput out) throws IOException {
- super.write(out);
- out.writeInt(cardinality[ROW]);
- out.writeInt(cardinality[COL]);
- out.writeInt(columns.length);
- for (Vector col : columns) {
- AbstractVector.writeVector(out, col);
- }
- }
-
}
Modified: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/SparseMatrix.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/SparseMatrix.java?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/SparseMatrix.java (original)
+++ lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/SparseMatrix.java Wed Jan 13 08:01:34 2010
@@ -176,28 +176,4 @@
return res;
}
- @Override
- public void readFields(DataInput in) throws IOException {
- super.readFields(in);
- this.cardinality = new int[]{in.readInt(), in.readInt()};
- int rowsize = in.readInt();
- this.rows = new HashMap<Integer, Vector>();
- for (int row = 0; row < rowsize; row++) {
- int key = in.readInt();
- rows.put(key, AbstractVector.readVector(in));
- }
- }
-
- @Override
- public void write(DataOutput out) throws IOException {
- super.write(out);
- out.writeInt(cardinality[ROW]);
- out.writeInt(cardinality[COL]);
- out.writeInt(rows.size());
- for (Map.Entry<Integer, Vector> integerVectorEntry : rows.entrySet()) {
- out.writeInt(integerVectorEntry.getKey());
- AbstractVector.writeVector(out, integerVectorEntry.getValue());
- }
- }
-
}
Modified: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/SparseRowMatrix.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/SparseRowMatrix.java?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/SparseRowMatrix.java (original)
+++ lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/SparseRowMatrix.java Wed Jan 13 08:01:34 2010
@@ -167,26 +167,4 @@
return rows[row];
}
- @Override
- public void readFields(DataInput in) throws IOException {
- super.readFields(in);
- this.cardinality = new int[]{in.readInt(), in.readInt()};
- int rowsize = in.readInt();
- this.rows = new Vector[rowsize];
- for (int row = 0; row < rowsize; row++) {
- rows[row] = AbstractVector.readVector(in);
- }
- }
-
- @Override
- public void write(DataOutput out) throws IOException {
- super.write(out);
- out.writeInt(cardinality[ROW]);
- out.writeInt(cardinality[COL]);
- out.writeInt(rows.length);
- for (Vector row : rows) {
- AbstractVector.writeVector(out, row);
- }
- }
-
}
Modified: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/SparseVector.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/SparseVector.java?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/SparseVector.java (original)
+++ lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/SparseVector.java Wed Jan 13 08:01:34 2010
@@ -32,10 +32,10 @@
/** Implements vector that only stores non-zero doubles */
public class SparseVector extends AbstractVector {
- private OpenIntDoubleHashMap values;
+ protected OpenIntDoubleHashMap values;
- private int cardinality;
- private double lengthSquared = -1.0;
+ protected int cardinality;
+ protected double lengthSquared = -1.0;
/** For serialization purposes only. */
public SparseVector() {
@@ -278,43 +278,6 @@
}
}
-
- @Override
- public void write(DataOutput dataOutput) throws IOException {
- dataOutput.writeUTF(this.getName() == null ? "" : this.getName());
- dataOutput.writeInt(size());
- int nde = getNumNondefaultElements();
- dataOutput.writeInt(nde);
- Iterator<Vector.Element> iter = iterateNonZero();
- int count = 0;
- while (iter.hasNext()) {
- Vector.Element element = iter.next();
- dataOutput.writeInt(element.index());
- dataOutput.writeDouble(element.get());
- count++;
- }
- assert (nde == count);
- }
-
- @Override
- public void readFields(DataInput dataInput) throws IOException {
- this.setName(dataInput.readUTF());
- this.cardinality = dataInput.readInt();
- int size = dataInput.readInt();
- OpenIntDoubleHashMap values = new OpenIntDoubleHashMap((int) (size * 1.5));
- int i = 0;
- while (i < size) {
- int index = dataInput.readInt();
- double value = dataInput.readDouble();
- values.put(index, value);
- i++;
- }
- assert (i == size);
- this.values = values;
- this.lengthSquared = -1.0;
- }
-
-
@Override
public double getLengthSquared() {
if (lengthSquared >= 0.0) {
Modified: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/Vector.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/Vector.java?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/Vector.java (original)
+++ lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/Vector.java Wed Jan 13 08:01:34 2010
@@ -17,7 +17,6 @@
package org.apache.mahout.math;
-import org.apache.hadoop.io.Writable;
import java.util.Iterator;
import java.util.Map;
@@ -27,7 +26,7 @@
* constructor that takes an int for cardinality and a no-arg constructor that can be used for marshalling the Writable
* instance <p/> NOTE: Implementations may choose to reuse the Vector.Element in the Iterable methods
*/
-public interface Vector extends Cloneable, Writable {
+public interface Vector extends Cloneable {
/**
* Vectors may have a name associated with them, which makes them easy to identify
Modified: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/VectorView.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/VectorView.java?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/VectorView.java (original)
+++ lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/VectorView.java Wed Jan 13 08:01:34 2010
@@ -280,22 +280,6 @@
}
@Override
- public void write(DataOutput dataOutput) throws IOException {
- dataOutput.writeUTF(this.getName() == null ? "" : this.getName());
- dataOutput.writeInt(offset);
- dataOutput.writeInt(cardinality);
- writeVector(dataOutput, vector);
- }
-
- @Override
- public void readFields(DataInput dataInput) throws IOException {
- this.setName(dataInput.readUTF());
- this.offset = dataInput.readInt();
- this.cardinality = dataInput.readInt();
- this.vector = readVector(dataInput);
- }
-
- @Override
public boolean equals(Object o) {
return this == o || (o instanceof Vector && equivalent(this, (Vector) o));
Modified: lucene/mahout/trunk/math/src/test/java/org/apache/mahout/math/MatrixTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/math/src/test/java/org/apache/mahout/math/MatrixTest.java?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/math/src/test/java/org/apache/mahout/math/MatrixTest.java (original)
+++ lucene/mahout/trunk/math/src/test/java/org/apache/mahout/math/MatrixTest.java Wed Jan 13 08:01:34 2010
@@ -18,21 +18,6 @@
package org.apache.mahout.math;
import junit.framework.TestCase;
-import org.apache.hadoop.io.DataOutputBuffer;
-import org.apache.mahout.math.AbstractMatrix;
-import org.apache.mahout.math.CardinalityException;
-import org.apache.mahout.math.DenseMatrix;
-import org.apache.mahout.math.DenseVector;
-import org.apache.mahout.math.IndexException;
-import org.apache.mahout.math.Matrix;
-import org.apache.mahout.math.NegateFunction;
-import org.apache.mahout.math.PlusFunction;
-import org.apache.mahout.math.UnboundLabelException;
-import org.apache.mahout.math.Vector;
-
-import java.io.ByteArrayInputStream;
-import java.io.DataInputStream;
-import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
@@ -640,20 +625,4 @@
Matrix mm = AbstractMatrix.decodeMatrix(json);
assertEquals("Fee", m.get(0, 1), mm.get("Fee", "Bar"));
}
-
- public void testMatrixWritable() throws IOException {
- Matrix m = matrixFactory(new double[][]{{1, 3, 4}, {5, 2, 3},
- {1, 4, 2}});
- DataOutputBuffer out = new DataOutputBuffer();
- m.write(out);
- out.close();
-
- DataInputStream in = new DataInputStream(new ByteArrayInputStream(out
- .getData()));
- Matrix m2 = m.like();
- m2.readFields(in);
- in.close();
- assertEquals("row size", m.size()[ROW], m2.size()[ROW]);
- assertEquals("col size", m.size()[COL], m2.size()[COL]);
- }
}
Modified: lucene/mahout/trunk/math/src/test/java/org/apache/mahout/math/TestMatrixView.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/math/src/test/java/org/apache/mahout/math/TestMatrixView.java?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/math/src/test/java/org/apache/mahout/math/TestMatrixView.java (original)
+++ lucene/mahout/trunk/math/src/test/java/org/apache/mahout/math/TestMatrixView.java Wed Jan 13 08:01:34 2010
@@ -18,22 +18,6 @@
package org.apache.mahout.math;
import junit.framework.TestCase;
-import org.apache.hadoop.io.DataOutputBuffer;
-import org.apache.mahout.math.AbstractMatrix;
-import org.apache.mahout.math.CardinalityException;
-import org.apache.mahout.math.DenseMatrix;
-import org.apache.mahout.math.DenseVector;
-import org.apache.mahout.math.IndexException;
-import org.apache.mahout.math.Matrix;
-import org.apache.mahout.math.MatrixView;
-import org.apache.mahout.math.NegateFunction;
-import org.apache.mahout.math.PlusFunction;
-import org.apache.mahout.math.UnboundLabelException;
-import org.apache.mahout.math.Vector;
-
-import java.io.ByteArrayInputStream;
-import java.io.DataInputStream;
-import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
@@ -522,20 +506,6 @@
}
}
- public void testMatrixWritable() throws IOException {
- DataOutputBuffer out = new DataOutputBuffer();
- test.write(out);
- out.close();
-
- DataInputStream in = new DataInputStream(new ByteArrayInputStream(out
- .getData()));
- Matrix m2 = test.clone();
- m2.readFields(in);
- in.close();
- assertEquals("row size", test.size()[ROW], m2.size()[ROW]);
- assertEquals("col size", test.size()[COL], m2.size()[COL]);
- }
-
public void testLabelBindings() {
assertNull("row bindings", test.getRowLabelBindings());
assertNull("col bindings", test.getColumnLabelBindings());
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/SequenceFileVectorIterable.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/SequenceFileVectorIterable.java?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/SequenceFileVectorIterable.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/SequenceFileVectorIterable.java Wed Jan 13 08:01:34 2010
@@ -20,6 +20,7 @@
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
import java.util.Iterator;
import java.io.IOException;
@@ -82,7 +83,7 @@
@Override
public Vector next() {
- return transpose ? (Vector)key : (Vector)value;
+ return (transpose ? (VectorWritable)key : (VectorWritable)value).get();
}
/**
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java Wed Jan 13 08:01:34 2010
@@ -20,6 +20,7 @@
import org.apache.mahout.math.Vector;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.LongWritable;
+import org.apache.mahout.math.VectorWritable;
import java.io.IOException;
@@ -42,7 +43,7 @@
break;
}
if (point != null) {
- writer.append(new LongWritable(recNum++), point);
+ writer.append(new LongWritable(recNum++), new VectorWritable(point));
}
}
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java Wed Jan 13 08:01:34 2010
@@ -44,6 +44,7 @@
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.mahout.math.SparseVector;
+import org.apache.mahout.math.VectorWritable;
/**
* This class converts a set of input documents in the sequence file format to
@@ -157,8 +158,8 @@
* the speed of your disk read
*
* @param minSupport
- * @param filePath
- * @param dictionaryPath
+ * @param wordCountPath
+ * @param dictionaryPathBase
* @throws IOException
*/
private static List<Path> createDictionaryChunks(int minSupport,
@@ -267,9 +268,9 @@
.setJobName("DictionaryVectorizer Vector generator to group Partial Vectors");
conf.setMapOutputKeyClass(Text.class);
- conf.setMapOutputValueClass(SparseVector.class);
+ conf.setMapOutputValueClass(VectorWritable.class);
conf.setOutputKeyClass(Text.class);
- conf.setOutputValueClass(SparseVector.class);
+ conf.setOutputValueClass(VectorWritable.class);
FileInputFormat.setInputPaths(conf,
getCommaSeparatedPaths(partialVectorPaths));
@@ -346,7 +347,7 @@
conf.setMapOutputKeyClass(Text.class);
conf.setMapOutputValueClass(Text.class);
conf.setOutputKeyClass(Text.class);
- conf.setOutputValueClass(SparseVector.class);
+ conf.setOutputValueClass(VectorWritable.class);
DistributedCache
.setCacheFiles(new URI[] {dictionaryFilePath.toUri()}, conf);
FileInputFormat.setInputPaths(conf, new Path(input));
@@ -371,7 +372,9 @@
* Count the frequencies of words in parallel using Map/Reduce. The input
* documents have to be in {@link SequenceFile} format
*
- * @param params
+ * @param input
+ * @param output
+ * @param analyzer
* @throws IOException
* @throws InterruptedException
* @throws ClassNotFoundException
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/PartialVectorGenerator.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/PartialVectorGenerator.java?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/PartialVectorGenerator.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/PartialVectorGenerator.java Wed Jan 13 08:01:34 2010
@@ -42,21 +42,24 @@
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.mahout.math.SparseVector;
+import org.apache.mahout.math.VectorWritable;
/**
* Converts a document in to a SparseVector
*/
public class PartialVectorGenerator extends MapReduceBase implements
- Reducer<Text,Text,Text,SparseVector> {
+ Reducer<Text,Text,Text, VectorWritable> {
private Analyzer analyzer;
private Map<String,Integer> dictionary = new HashMap<String,Integer>();
private FileSystem fs; // local filesystem
private URI[] localFiles; // local filenames from the distributed cache
-
+
+ private VectorWritable vectorWritable = new VectorWritable();
+
@Override
public void reduce(Text key,
Iterator<Text> values,
- OutputCollector<Text,SparseVector> output,
+ OutputCollector<Text,VectorWritable> output,
Reporter reporter) throws IOException {
if (values.hasNext()) {
@@ -71,6 +74,7 @@
int count = 0;
while ((token = ts.next(token)) != null) {
String tk = new String(token.termBuffer(), 0, token.termLength());
+ if(dictionary.containsKey(tk) == false) continue;
if (termFrequency.containsKey(tk) == false) {
count += tk.length() + 1;
termFrequency.put(tk, new MutableInt(0));
@@ -88,8 +92,8 @@
vector.setQuick(dictionary.get(tk).intValue(), pair.getValue()
.doubleValue());
}
-
- output.collect(key, vector);
+ vectorWritable.set(vector);
+ output.collect(key, vectorWritable);
}
}
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/PartialVectorMerger.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/PartialVectorMerger.java?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/PartialVectorMerger.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/PartialVectorMerger.java Wed Jan 13 08:01:34 2010
@@ -26,26 +26,30 @@
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.mahout.math.SparseVector;
+import org.apache.mahout.math.VectorWritable;
/**
* Converts a document in to a SparseVector
*/
public class PartialVectorMerger extends MapReduceBase implements
- Reducer<Text,SparseVector,Text,SparseVector> {
-
+ Reducer<Text,VectorWritable,Text, VectorWritable> {
+
+ private VectorWritable vectorWritable = new VectorWritable();
+
@Override
public void reduce(Text key,
- Iterator<SparseVector> values,
- OutputCollector<Text,SparseVector> output,
+ Iterator<VectorWritable> values,
+ OutputCollector<Text,VectorWritable> output,
Reporter reporter) throws IOException {
SparseVector vector =
new SparseVector(key.toString(), Integer.MAX_VALUE, 10);
while (values.hasNext()) {
- SparseVector value = values.next();
- value.addTo(vector);
+ VectorWritable value = values.next();
+ value.get().addTo(vector);
}
- output.collect(key, vector);
+ vectorWritable.set(vector);
+ output.collect(key, vectorWritable);
}
Modified: lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/SequenceFileVectorIterableTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/SequenceFileVectorIterableTest.java?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/SequenceFileVectorIterableTest.java (original)
+++ lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/SequenceFileVectorIterableTest.java Wed Jan 13 08:01:34 2010
@@ -25,6 +25,7 @@
import org.apache.hadoop.io.SequenceFile;
import org.apache.mahout.math.SparseVector;
import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.utils.vectors.io.SequenceFileVectorWriter;
import java.io.File;
@@ -39,7 +40,7 @@
Path path = new Path(tmpFile.getAbsolutePath());
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
- SequenceFile.Writer seqWriter = new SequenceFile.Writer(fs, conf, path, LongWritable.class, SparseVector.class);
+ SequenceFile.Writer seqWriter = new SequenceFile.Writer(fs, conf, path, LongWritable.class, VectorWritable.class);
SequenceFileVectorWriter writer = new SequenceFileVectorWriter(seqWriter);
RandomVectorIterable iter = new RandomVectorIterable(50);
writer.write(iter);
Modified: lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/io/VectorWriterTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/io/VectorWriterTest.java?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/io/VectorWriterTest.java (original)
+++ lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/io/VectorWriterTest.java Wed Jan 13 08:01:34 2010
@@ -26,6 +26,7 @@
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.SparseVector;
import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.utils.vectors.RandomVectorIterable;
import java.io.File;
@@ -44,7 +45,7 @@
Path path = new Path(tmpFile.getAbsolutePath());
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
- SequenceFile.Writer seqWriter = new SequenceFile.Writer(fs, conf, path, LongWritable.class, SparseVector.class);
+ SequenceFile.Writer seqWriter = new SequenceFile.Writer(fs, conf, path, LongWritable.class, VectorWritable.class);
SequenceFileVectorWriter writer = new SequenceFileVectorWriter(seqWriter);
RandomVectorIterable iter = new RandomVectorIterable(50);
writer.write(iter);
@@ -52,7 +53,7 @@
SequenceFile.Reader seqReader = new SequenceFile.Reader(fs, path, conf);
LongWritable key = new LongWritable();
- SparseVector value = new SparseVector();
+ VectorWritable value = new VectorWritable();
int count = 0;
while (seqReader.next(key, value)){
count++;
Modified: lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java (original)
+++ lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java Wed Jan 13 08:01:34 2010
@@ -19,17 +19,25 @@
import java.io.File;
import java.io.IOException;
+import java.io.StringReader;
import java.net.URISyntaxException;
-import java.util.Random;
+import java.util.*;
import junit.framework.TestCase;
+import org.apache.commons.lang.StringUtils;
+import org.apache.commons.lang.mutable.MutableInt;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.SimpleAnalyzer;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.mahout.math.SparseVector;
/**
* Test the dictionary Vector
@@ -132,5 +140,112 @@
"output/wordcount", new StandardAnalyzer(), 2, 100);
- }
+ }
+
+ public void testPerf() throws Exception {
+ Analyzer analyzer = new SimpleAnalyzer();
+ String key = "key";
+ String value = "";
+ for(String doc : DOCS) value += doc + " ";
+ Map<String, Integer> dictionary = new HashMap<String,Integer>();
+
+ TokenStream ts = analyzer.tokenStream(key.toString(), new StringReader(value.toString()));
+
+ Token token = new Token();
+ int count = 0;
+ while ((token = ts.next(token)) != null) {
+ String tk = new String(token.termBuffer(), 0, token.termLength());
+ if(dictionary.containsKey(tk)) continue;
+ dictionary.put(tk, count++);
+ }
+
+
+ long vectorOnlyTotal = 0;
+ long total = 0;
+
+ Random rand = new Random(12345);
+ String[] docs = generateRandomText(1000);
+
+ for(int i=0; i<21000; i++) {
+
+ long time = System.nanoTime();
+
+ value = docs[rand.nextInt(docs.length)];
+ ts = analyzer.tokenStream(key.toString(), new StringReader(value.toString()));
+
+ SparseVector vector;
+ Map<String,MutableInt> termFrequency = new HashMap<String,MutableInt>();
+
+ token = new Token();
+ ts.reset();
+ while ((token = ts.next(token)) != null) {
+ String tk = new String(token.termBuffer(), 0, token.termLength());
+ if(dictionary.containsKey(tk) == false) continue;
+ if (termFrequency.containsKey(tk) == false) {
+ count += tk.length() + 1;
+ termFrequency.put(tk, new MutableInt(0));
+ }
+ termFrequency.get(tk).increment();
+ }
+
+ vector =
+ new SparseVector(key.toString(), Integer.MAX_VALUE, termFrequency.size());
+
+ for (Map.Entry<String,MutableInt> pair : termFrequency.entrySet()) {
+ String tk = pair.getKey();
+ if (dictionary.containsKey(tk) == false) continue;
+ vector.setQuick(dictionary.get(tk).intValue(), pair.getValue()
+ .doubleValue());
+ }
+ total += (i<1000?0:1)*(System.nanoTime() - time);
+
+ time = System.nanoTime();
+
+
+ value = docs[rand.nextInt(docs.length)];
+ ts = analyzer.tokenStream(key.toString(), new StringReader(value.toString()));
+
+ vector =
+ new SparseVector(key.toString(), Integer.MAX_VALUE, 10);
+
+ token = new Token();
+ ts.reset();
+ while ((token = ts.next(token)) != null) {
+ String tk = new String(token.termBuffer(), 0, token.termLength());
+ if(dictionary.containsKey(tk) == false) continue;
+ int tokenKey = dictionary.get(tk);
+ vector.setQuick(tokenKey, vector.getQuick(tokenKey) + 1);
+ }
+ vectorOnlyTotal += (i<1000?0:1)*(System.nanoTime() - time);
+
+
+ }
+
+ System.out.println("With map: " + (total / 1e6) + "ms/KVect, with vector only: " + (vectorOnlyTotal/1e6) + "ms/KVect");
+
+ }
+ private static final String [] DOCS = {
+ "The quick red fox jumped over the lazy brown dogs.",
+ "Mary had a little lamb whose fleece was white as snow.",
+ "Moby Dick is a story of a whale and a man obsessed.",
+ "The robber wore a black fleece jacket and a baseball cap.",
+ "The English Springer Spaniel is the best of all dogs."
+ };
+
+ public static String[] generateRandomText(int docs) throws Exception {
+ String[] s = new String[docs];
+ Random r = new Random(1234);
+ for(int i=0; i<s.length; i++) {
+ String str = DOCS[i % DOCS.length];
+ String[] tokens = str.split(" ");
+ String[] other = DOCS[r.nextInt(DOCS.length)].split(" ");
+ List<String> l = new ArrayList<String>();
+ for(String t : tokens) {
+ l.add(r.nextBoolean() ? t : other[r.nextInt(other.length)]);
+ }
+ s[i] = StringUtils.join(l.toArray(new String[l.size()]), " ");
+ }
+ return s;
+ }
+
}