You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by sr...@apache.org on 2012/06/20 14:07:58 UTC
svn commit: r1352052 [4/7] - in /mahout/trunk: ./ buildtools/
buildtools/src/main/resources/ core/
core/src/main/java/org/apache/mahout/cf/taste/hadoop/
core/src/main/java/org/apache/mahout/cf/taste/hadoop/als/
core/src/main/java/org/apache/mahout/cf/t...
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/BtJob.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/BtJob.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/BtJob.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/BtJob.java Wed Jun 20 12:07:50 2012
@@ -442,8 +442,9 @@ public final class BtJob {
double xii = xi.size() > btIndex ? xi.getQuick(btIndex) : 0.0;
// compute s_b
pmult.setMultiplicator(xii);
- if (sbAccum == null)
+ if (sbAccum == null) {
sbAccum = new DenseVector(btRow.size());
+ }
sbAccum.assign(btRow, pmult);
}
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/DenseBlockWritable.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/DenseBlockWritable.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/DenseBlockWritable.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/DenseBlockWritable.java Wed Jun 20 12:07:50 2012
@@ -73,9 +73,9 @@ public class DenseBlockWritable implemen
out.writeInt(m);
out.writeInt(n);
- for (int i = 0; i < m; i++) {
+ for (double[] aBlock : block) {
for (int j = 0; j < n; j++) {
- out.writeDouble(block[i][j]);
+ out.writeDouble(aBlock[j]);
}
}
}
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/Omega.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/Omega.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/Omega.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/Omega.java Wed Jun 20 12:07:50 2012
@@ -135,7 +135,9 @@ public class Omega {
if (v.isDense()) {
for (int k = 0; k < v.size(); k++)
// it's ok, this is reentrant
+ {
result += getQuick(k, index) * v.getQuick(k);
+ }
} else {
for (Iterator<Vector.Element> iter = v.iterateNonZero(); iter.hasNext();) {
@@ -157,12 +159,13 @@ public class Omega {
}
return res;
} catch (InterruptedException exc) {
- throw new RuntimeException("Interrupted", exc);
+ throw new IllegalStateException("Interrupted", exc);
} catch (ExecutionException exc) {
- if (exc.getCause() instanceof RuntimeException)
+ if (exc.getCause() instanceof RuntimeException) {
throw (RuntimeException) exc.getCause();
- else
- throw new RuntimeException(exc.getCause());
+ } else {
+ throw new IllegalStateException(exc.getCause());
+ }
}
} finally {
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDCli.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDCli.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDCli.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDCli.java Wed Jun 20 12:07:50 2012
@@ -115,7 +115,7 @@ public class SSVDCli extends AbstractJob
throw new IOException("No Hadoop configuration present");
}
- Path[] inputPaths = new Path[] { getInputPath() };
+ Path[] inputPaths = { getInputPath() };
// MAHOUT-817
if (pca && xiPath == null) {
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDHelper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDHelper.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDHelper.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDHelper.java Wed Jun 20 12:07:50 2012
@@ -35,7 +35,6 @@ import org.apache.mahout.common.iterator
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterator;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterable;
-import org.apache.mahout.math.DenseMatrix;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Matrix;
import org.apache.mahout.math.Vector;
@@ -49,8 +48,12 @@ import com.google.common.io.Closeables;
* set of small file manipulation helpers.
*
*/
+public final class SSVDHelper {
-public class SSVDHelper {
+ private static final Pattern OUTPUT_FILE_PATTERN = Pattern.compile("(\\w+)-(m|r)-(\\d+)(\\.\\w+)?");
+
+ private SSVDHelper() {
+ }
/**
* load single vector from an hdfs file (possibly presented as glob).
@@ -66,12 +69,14 @@ public class SSVDHelper {
conf);
try {
- if (!iter.hasNext())
+ if (!iter.hasNext()) {
throw new IOException("Empty input while reading vector");
+ }
VectorWritable vw = iter.next();
- if (iter.hasNext())
+ if (iter.hasNext()) {
throw new IOException("Unexpected data after the end of vector file");
+ }
return vw.get();
@@ -83,11 +88,7 @@ public class SSVDHelper {
/**
* save single vector into hdfs file.
*
- * @param v
- * vector to save
- * @param vectorFilePath
- * @param conf
- * @throws IOException
+ * @param v vector to save
*/
public static void saveVector(Vector v,
Path vectorFilePath,
@@ -125,11 +126,10 @@ public class SSVDHelper {
}
FileStatus firstSeqFile;
- if (!fstats[0].isDir()) {
- firstSeqFile = fstats[0];
+ if (fstats[0].isDir()) {
+ firstSeqFile = fs.listStatus(fstats[0].getPath(), PathFilters.logsCRCFilter())[0];
} else {
- firstSeqFile =
- fs.listStatus(fstats[0].getPath(), PathFilters.logsCRCFilter())[0];
+ firstSeqFile = fstats[0];
}
SequenceFile.Reader r = null;
@@ -143,9 +143,6 @@ public class SSVDHelper {
throw new IOException("Unable to open input files to determine input label type.");
}
- private static final Pattern OUTPUT_FILE_PATTERN =
- Pattern.compile("(\\w+)-(m|r)-(\\d+)(\\.\\w+)?");
-
static final Comparator<FileStatus> PARTITION_COMPARATOR =
new Comparator<FileStatus>() {
private final Matcher matcher = OUTPUT_FILE_PATTERN.matcher("");
@@ -181,13 +178,8 @@ public class SSVDHelper {
* @param conf
* configuration
* @return Dense matrix array
- * @throws IOException
- * when I/O occurs.
*/
- public static double[][] loadDistributedRowMatrix(FileSystem fs,
- Path glob,
- Configuration conf)
- throws IOException {
+ public static double[][] loadDistributedRowMatrix(FileSystem fs, Path glob, Configuration conf) throws IOException {
FileStatus[] files = fs.globStatus(glob);
if (files == null) {
@@ -221,28 +213,17 @@ public class SSVDHelper {
}
/**
- * Load multiplel upper triangular matrices and sum them up.
+ * Load multiple upper triangular matrices and sum them up.
*
- * @param fs
- * @param glob
- * @param conf
* @return the sum of upper triangular inputs.
- * @throws IOException
*/
- public static UpperTriangular
- loadAndSumUpperTriangularMatrices(Path glob, Configuration conf)
- throws IOException {
+ public static UpperTriangular loadAndSumUpperTriangularMatrices(Path glob, Configuration conf) throws IOException {
Vector v = loadAndSumUpVectors(glob, conf);
return v == null ? null : new UpperTriangular(v);
}
/**
- * returns sum of all vectors in different files specified by glob
- *
- * @param glob
- * @param conf
- * @return
- * @throws IOException
+ * @return sum of all vectors in different files specified by glob
*/
public static Vector loadAndSumUpVectors(Path glob, Configuration conf)
throws IOException {
@@ -258,10 +239,11 @@ public class SSVDHelper {
try {
Vector v = null;
while (iter.hasNext()) {
- if (v == null)
+ if (v == null) {
v = new DenseVector(iter.next().get());
- else
+ } else {
v.assign(iter.next().get(), Functions.PLUS);
+ }
}
return v;
@@ -274,17 +256,8 @@ public class SSVDHelper {
/**
* Load only one upper triangular matrix and issue error if more than one is
* found.
- *
- * @param fs
- * @param glob
- * @param conf
- * @return
- * @throws IOException
*/
- public static UpperTriangular loadUpperTriangularMatrix(FileSystem fs,
- Path glob,
- Configuration conf)
- throws IOException {
+ public static UpperTriangular loadUpperTriangularMatrix(Path glob, Configuration conf) throws IOException {
/*
* there still may be more than one file in glob and only one of them must
@@ -299,12 +272,14 @@ public class SSVDHelper {
true,
conf);
try {
- if (!iter.hasNext())
+ if (!iter.hasNext()) {
throw new IOException("No triangular matrices found");
+ }
Vector v = iter.next().get();
UpperTriangular result = new UpperTriangular(v);
- if (iter.hasNext())
+ if (iter.hasNext()) {
throw new IOException("Unexpected overrun in upper triangular matrix files");
+ }
return result;
} finally {
@@ -314,11 +289,8 @@ public class SSVDHelper {
/**
* extracts row-wise raw data from a Mahout matrix for 3rd party solvers.
- * Unfortunately values member is 100% encapsulated in {@link DenseMatrix} at
+ * Unfortunately values member is 100% encapsulated in {@link org.apache.mahout.math.DenseMatrix} at
* this point, so we have to resort to abstract element-wise copying.
- *
- * @param m
- * @return
*/
public static double[][] extractRawData(Matrix m) {
int rows = m.numRows();
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDSolver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDSolver.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDSolver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDSolver.java Wed Jun 20 12:07:50 2012
@@ -337,16 +337,15 @@ public class SSVDSolver {
Path vPath = new Path(outputPath, "V");
Path pcaBasePath = new Path(outputPath, "pca");
- Path sbPath = null;
- Path sqPath = null;
-
- double xisquaredlen = 0;
- if (pcaMeanPath != null)
+ if (pcaMeanPath != null) {
fs.mkdirs(pcaBasePath);
+ }
Random rnd = RandomUtils.getRandom();
long seed = rnd.nextLong();
+ Path sbPath = null;
+ double xisquaredlen = 0.0;
if (pcaMeanPath != null) {
/*
* compute s_b0 if pca offset present.
@@ -392,8 +391,8 @@ public class SSVDSolver {
* bit too many (I would be happy if that were ever the case though).
*/
- sbPath = new Path(pcaBasePath, "sb0");
- sqPath = new Path(pcaBasePath, "sq0");
+ //sbPath = new Path(pcaBasePath, "sb0");
+ //sqPath = new Path(pcaBasePath, "sq0");
BtJob.run(conf,
inputPath,
@@ -410,7 +409,7 @@ public class SSVDSolver {
q <= 0);
sbPath = new Path(btPath, BtJob.OUTPUT_SB + "-*");
- sqPath = new Path(btPath, BtJob.OUTPUT_SQ + "-*");
+ Path sqPath = new Path(btPath, BtJob.OUTPUT_SQ + "-*");
// power iterations
for (int i = 0; i < q; i++) {
@@ -481,7 +480,6 @@ public class SSVDSolver {
bbtSquare.assign(mC, Functions.MINUS);
bbtSquare.assign(mC.transpose(), Functions.MINUS);
- mC = null;
Matrix outerSq = sq.cross(sq);
outerSq.assign(Functions.mult(xisquaredlen));
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SplitPartitionedWritable.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SplitPartitionedWritable.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SplitPartitionedWritable.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SplitPartitionedWritable.java Wed Jun 20 12:07:50 2012
@@ -19,6 +19,7 @@ package org.apache.mahout.math.hadoop.st
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
+import java.io.Serializable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
@@ -125,7 +126,7 @@ public class SplitPartitionedWritable im
return 0;
}
- public static final class SplitGroupingComparator extends WritableComparator {
+ public static final class SplitGroupingComparator extends WritableComparator implements Serializable {
public SplitGroupingComparator() {
super(SplitPartitionedWritable.class, true);
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/VJob.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/VJob.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/VJob.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/VJob.java Wed Jun 20 12:07:50 2012
@@ -62,9 +62,9 @@ public class VJob {
/*
* xi and s_q are PCA-related corrections, per MAHOUT-817
*/
- protected Vector xi;
- protected Vector sq;
- protected PlusMult plusMult = new PlusMult(0);
+ Vector xi;
+ Vector sq;
+ PlusMult plusMult = new PlusMult(0);
@Override
protected void map(IntWritable key, VectorWritable value, Context context)
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/YtYJob.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/YtYJob.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/YtYJob.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/YtYJob.java Wed Jun 20 12:07:50 2012
@@ -40,14 +40,14 @@ import org.apache.mahout.math.VectorWrit
* Job that accumulates Y'Y output
*
*/
-public class YtYJob {
+public final class YtYJob {
public static final String PROP_OMEGA_SEED = "ssvd.omegaseed";
public static final String PROP_K = "ssvd.k";
public static final String PROP_P = "ssvd.p";
// we have single output, so we use standard output
- public static final String OUTPUT_YtY = "part-";
+ public static final String OUTPUT_YT_Y = "part-";
private YtYJob() {
}
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/qr/GramSchmidt.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/qr/GramSchmidt.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/qr/GramSchmidt.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/qr/GramSchmidt.java Wed Jun 20 12:07:50 2012
@@ -22,10 +22,8 @@ import org.apache.mahout.math.function.D
/**
* Gram Schmidt quick helper.
- *
- *
*/
-public class GramSchmidt {
+public final class GramSchmidt {
private GramSchmidt() {
}
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/ssvd/SequentialOutOfCoreSvd.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/ssvd/SequentialOutOfCoreSvd.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/ssvd/SequentialOutOfCoreSvd.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/ssvd/SequentialOutOfCoreSvd.java Wed Jun 20 12:07:50 2012
@@ -94,7 +94,8 @@ public class SequentialOutOfCoreSvd {
private final int seed;
private final int dim;
- public SequentialOutOfCoreSvd(Iterable<File> partsOfA, File tmpDir, int internalDimension, int columnsPerSlice) throws IOException {
+ public SequentialOutOfCoreSvd(Iterable<File> partsOfA, File tmpDir, int internalDimension, int columnsPerSlice)
+ throws IOException {
this.columnsPerSlice = columnsPerSlice;
this.dim = internalDimension;
@@ -127,7 +128,7 @@ public class SequentialOutOfCoreSvd {
int ncols = 0;
for (File file : partsOfA) {
MatrixWritable m = new MatrixWritable();
- final DataInputStream in = new DataInputStream(new FileInputStream(file));
+ DataInputStream in = new DataInputStream(new FileInputStream(file));
try {
m.readFields(in);
} finally {
@@ -150,7 +151,7 @@ public class SequentialOutOfCoreSvd {
MatrixWritable bTmp = new MatrixWritable();
for (int j = 0; j < ncols; j += columnsPerSlice) {
if (bFile(tmpDir, j).exists()) {
- final DataInputStream in = new DataInputStream(new FileInputStream(bFile(tmpDir, j)));
+ DataInputStream in = new DataInputStream(new FileInputStream(bFile(tmpDir, j)));
try {
bTmp.readFields(in);
} finally {
@@ -167,17 +168,18 @@ public class SequentialOutOfCoreSvd {
public void computeV(File tmpDir, int ncols) throws IOException {
// step 5, compute pieces of V
for (int j = 0; j < ncols; j += columnsPerSlice) {
- final File bPath = bFile(tmpDir, j);
+ File bPath = bFile(tmpDir, j);
if (bPath.exists()) {
MatrixWritable m = new MatrixWritable();
- final DataInputStream in = new DataInputStream(new FileInputStream(bPath));
+ DataInputStream in = new DataInputStream(new FileInputStream(bPath));
try {
m.readFields(in);
} finally {
in.close();
}
m.set(l2.solveRight(m.get().transpose()).times(svd.getV()));
- final DataOutputStream out = new DataOutputStream(new FileOutputStream(new File(tmpDir, String.format("V-%s", bPath.getName().replaceAll(".*-", "")))));
+ DataOutputStream out = new DataOutputStream(new FileOutputStream(
+ new File(tmpDir, String.format("V-%s", bPath.getName().replaceAll(".*-", "")))));
try {
m.write(out);
} finally {
@@ -197,7 +199,8 @@ public class SequentialOutOfCoreSvd {
Matrix y = aI.times(new RandomTrinaryMatrix(seed, aI.numCols(), dim, false));
Matrix uI = r2.solveRight(y).times(svd.getU());
m.set(uI);
- final DataOutputStream out = new DataOutputStream(new FileOutputStream(new File(tmpDir, String.format("U-%s", file.getName().replaceAll(".*-", "")))));
+ DataOutputStream out = new DataOutputStream(new FileOutputStream(
+ new File(tmpDir, String.format("U-%s", file.getName().replaceAll(".*-", "")))));
try {
m.write(out);
} finally {
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/Sampler.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/Sampler.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/Sampler.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/Sampler.java Wed Jun 20 12:07:50 2012
@@ -17,7 +17,7 @@
package org.apache.mahout.math.stats;
-import org.apache.mahout.math.DenseVector;
+import com.google.common.base.Preconditions;
import org.apache.mahout.math.Vector;
import java.util.Arrays;
@@ -55,17 +55,11 @@ public class Sampler {
}
public int sample() {
- if (sampler == null) {
- throw new NullPointerException("Sampler must have been constructed with a distribution, or"
- + " else sample(Vector) should be used to sample");
- }
+ Preconditions.checkNotNull(sampler,
+ "Sampler must have been constructed with a distribution, or else sample(Vector) should be used to sample");
return sample(sampler);
}
- private static double[] samplerFor(double[] distribution) {
- return samplerFor(new DenseVector(distribution));
- }
-
private static double[] samplerFor(Vector vectorDistribution) {
int size = vectorDistribution.size();
double[] partition = new double[size];
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/CalculateEntropyReducer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/CalculateEntropyReducer.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/CalculateEntropyReducer.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/CalculateEntropyReducer.java Wed Jun 20 12:07:50 2012
@@ -42,7 +42,7 @@ public final class CalculateEntropyReduc
@Override
protected void reduce(NullWritable key, Iterable<DoubleWritable> values, Context context)
- throws IOException, InterruptedException {
+ throws IOException, InterruptedException {
double entropy = 0.0;
for (DoubleWritable value : values) {
entropy += value.get();
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/DoubleSumReducer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/DoubleSumReducer.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/DoubleSumReducer.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/DoubleSumReducer.java Wed Jun 20 12:07:50 2012
@@ -32,7 +32,7 @@ public final class DoubleSumReducer exte
@Override
protected void reduce(Writable key, Iterable<DoubleWritable> values, Context context)
- throws IOException, InterruptedException {
+ throws IOException, InterruptedException {
double sum = 0.0;
for (DoubleWritable value : values) {
sum += value.get();
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/Entropy.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/Entropy.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/Entropy.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/Entropy.java Wed Jun 20 12:07:50 2012
@@ -20,7 +20,6 @@ package org.apache.mahout.math.stats.ent
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.NullWritable;
-import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
@@ -35,8 +34,8 @@ import java.util.List;
import java.util.Map;
/**
- * A Hadoop job to compute the entropy of keys or values in a {@link SequenceFile}. Format has to be {@link Text} for
- * key or value.
+ * A Hadoop job to compute the entropy of keys or values in a {@link org.apache.hadoop.io.SequenceFile}.
+ * Format has to be {@link Text} for key or value.
* <p/>
* <ul>
* <li>-i The input sequence file</li>
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/InformationGain.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/InformationGain.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/InformationGain.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/InformationGain.java Wed Jun 20 12:07:50 2012
@@ -20,7 +20,6 @@ package org.apache.mahout.math.stats.ent
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
-import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.iterator.sequencefile.PathFilters;
@@ -31,7 +30,7 @@ import java.io.IOException;
import java.util.Iterator;
/**
- * Calculates the information gain for a {@link SequenceFile}.
+ * Calculates the information gain for a {@link org.apache.hadoop.io.SequenceFile}.
* Computes, how 'useful' are the keys when predicting the values.
* <ul>
* <li>-i The input sequence file</li>
@@ -84,15 +83,22 @@ public final class InformationGain exten
}
private void calculateEntropy() throws Exception {
- String[] args = { "-i", getInputPath().toString(), "-o", entropyPath.toString(), "-s", "value",
- "--tempDir", getTempPath().toString() };
+ String[] args = {
+ "-i", getInputPath().toString(),
+ "-o", entropyPath.toString(),
+ "-s", "value",
+ "--tempDir", getTempPath().toString(),
+ };
ToolRunner.run(new Entropy(), args);
entropy = readDoubleFromPath(entropyPath);
}
private void calculateConditionalEntropy() throws Exception {
- String[] args = { "-i", getInputPath().toString(), "-o", conditionalEntropyPath.toString(),
- "--tempDir", getTempPath().toString() };
+ String[] args = {
+ "-i", getInputPath().toString(),
+ "-o", conditionalEntropyPath.toString(),
+ "--tempDir", getTempPath().toString(),
+ };
ToolRunner.run(new ConditionalEntropy(), args);
conditionalEntropy = readDoubleFromPath(conditionalEntropyPath);
}
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/DictionaryVectorizer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/DictionaryVectorizer.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/DictionaryVectorizer.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/DictionaryVectorizer.java Wed Jun 20 12:07:50 2012
@@ -59,29 +59,20 @@ import org.apache.mahout.vectorizer.term
* value containing the tokenized document. You may use {@link DocumentProcessor} to tokenize the document.
* This is a dictionary based Vectorizer.
*/
-public final class DictionaryVectorizer implements Vectorizer{
+public final class DictionaryVectorizer implements Vectorizer {
public static final String DOCUMENT_VECTOR_OUTPUT_FOLDER = "tf-vectors";
-
public static final String MIN_SUPPORT = "min.support";
-
public static final String MAX_NGRAMS = "max.ngrams";
-
public static final int DEFAULT_MIN_SUPPORT = 2;
private static final String DICTIONARY_FILE = "dictionary.file-";
-
private static final int MAX_CHUNKSIZE = 10000;
-
private static final int MIN_CHUNKSIZE = 100;
-
private static final String OUTPUT_FILES_PATTERN = "part-*";
-
// 4 byte overhead for each entry in the OpenObjectIntHashMap
private static final int DICTIONARY_BYTE_OVERHEAD = 4;
-
private static final String VECTOR_OUTPUT_FOLDER = "partial-vectors-";
-
private static final String DICTIONARY_JOB_FOLDER = "wordcount";
/**
@@ -90,7 +81,8 @@ public final class DictionaryVectorizer
private DictionaryVectorizer() {
}
- //TODO: move more of SparseVectorsFromSequenceFile in here, and then fold SparseVectorsFrom with EncodedVectorsFrom to have one framework.
+ //TODO: move more of SparseVectorsFromSequenceFile in here, and then fold SparseVectorsFrom with
+ // EncodedVectorsFrom to have one framework.
@Override
public void createVectors(Path input, Path output, VectorizerConfig config)
@@ -325,8 +317,9 @@ public final class DictionaryVectorizer
HadoopUtil.delete(conf, output);
boolean succeeded = job.waitForCompletion(true);
- if (!succeeded)
+ if (!succeeded) {
throw new IllegalStateException("Job failed!");
+ }
}
/**
@@ -363,7 +356,8 @@ public final class DictionaryVectorizer
HadoopUtil.delete(conf, output);
boolean succeeded = job.waitForCompletion(true);
- if (!succeeded)
+ if (!succeeded) {
throw new IllegalStateException("Job failed!");
+ }
}
}
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/DocumentProcessor.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/DocumentProcessor.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/DocumentProcessor.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/DocumentProcessor.java Wed Jun 20 12:07:50 2012
@@ -91,8 +91,9 @@ public final class DocumentProcessor {
HadoopUtil.delete(conf, output);
boolean succeeded = job.waitForCompletion(true);
- if (!succeeded)
+ if (!succeeded) {
throw new IllegalStateException("Job failed!");
+ }
}
}
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/EncodedVectorsFromSequenceFiles.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/EncodedVectorsFromSequenceFiles.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/EncodedVectorsFromSequenceFiles.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/EncodedVectorsFromSequenceFiles.java Wed Jun 20 12:07:50 2012
@@ -27,14 +27,11 @@ import org.apache.mahout.common.HadoopUt
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.vectorizer.encoders.FeatureVectorEncoder;
import org.apache.mahout.vectorizer.encoders.LuceneTextValueEncoder;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
/**
* Converts a given set of sequence files into SparseVectors
*/
public final class EncodedVectorsFromSequenceFiles extends AbstractJob {
- private static final Logger log = LoggerFactory.getLogger(EncodedVectorsFromSequenceFiles.class);
public static void main(String[] args) throws Exception {
ToolRunner.run(new Configuration(), new EncodedVectorsFromSequenceFiles(), args);
@@ -45,11 +42,21 @@ public final class EncodedVectorsFromSeq
addInputOption();
addOutputOption();
addOption(DefaultOptionCreator.analyzerOption().create());
- addOption(buildOption("sequentialAccessVector", "seq", "(Optional) Whether output vectors should be SequentialAccessVectors. If set true else false", false, false, null));
- addOption(buildOption("namedVector", "nv", "Create named vectors using the key. False by default", false, false, null));
- addOption("cardinality", "c", "The cardinality to use for creating the vectors. Default is 5000", String.valueOf(5000));
- addOption("encoderFieldName", "en", "The name of the encoder to be passed to the FeatureVectorEncoder constructor. Default is text. Note this is not the class name of a FeatureValueEncoder, but is instead the construction argument.", "text");
- addOption("encoderClass", "ec", "The class name of the encoder to be used. Default is " + LuceneTextValueEncoder.class.getName(), LuceneTextValueEncoder.class.getName());
+ addOption(buildOption("sequentialAccessVector", "seq",
+ "(Optional) Whether output vectors should be SequentialAccessVectors. " +
+ "If set true else false",
+ false, false, null));
+ addOption(buildOption("namedVector", "nv",
+ "Create named vectors using the key. False by default", false, false, null));
+ addOption("cardinality", "c",
+ "The cardinality to use for creating the vectors. Default is 5000", "5000");
+ addOption("encoderFieldName", "en",
+ "The name of the encoder to be passed to the FeatureVectorEncoder constructor. Default is text. " +
+ "Note this is not the class name of a FeatureValueEncoder, but is instead the construction argument.",
+ "text");
+ addOption("encoderClass", "ec",
+ "The class name of the encoder to be used. Default is " + LuceneTextValueEncoder.class.getName(),
+ LuceneTextValueEncoder.class.getName());
addOption(DefaultOptionCreator.overwriteOption().create());
if (parseArguments(args) == null) {
return -1;
@@ -64,12 +71,10 @@ public final class EncodedVectorsFromSeq
Class<? extends Analyzer> analyzerClass = getAnalyzerClassFromOption();
-
Configuration conf = getConf();
boolean sequentialAccessOutput = hasOption("sequentialAccessVector");
-
boolean namedVectors = hasOption("namedVector");
int cardinality = 5000;
if (hasOption("cardinality")) {
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/HighDFWordsPruner.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/HighDFWordsPruner.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/HighDFWordsPruner.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/HighDFWordsPruner.java Wed Jun 20 12:07:50 2012
@@ -1,5 +1,4 @@
-package org.apache.mahout.vectorizer;
-/**
+/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
@@ -16,6 +15,8 @@ package org.apache.mahout.vectorizer;
* limitations under the License.
*/
+package org.apache.mahout.vectorizer;
+
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
@@ -38,7 +39,7 @@ import java.net.URI;
import java.util.ArrayList;
import java.util.List;
-public class HighDFWordsPruner {
+public final class HighDFWordsPruner {
public static final String OUT_DIR_SUFFIX = "-pruned";
public static final String STD_CALC_DIR = "stdcalc";
@@ -47,14 +48,12 @@ public class HighDFWordsPruner {
private HighDFWordsPruner() {
}
-
public static void pruneVectors(Path tfDir, Path prunedTFDir, Path prunedPartialTFDir, long maxDF,
Configuration baseConf,
Pair<Long[], List<Path>> docFrequenciesFeatures,
float normPower,
boolean logNormalize,
- int numReducers) throws IOException, InterruptedException,
- ClassNotFoundException {
+ int numReducers) throws IOException, InterruptedException, ClassNotFoundException {
int partialVectorIndex = 0;
List<Path> partialVectorPaths = new ArrayList<Path>();
@@ -64,8 +63,7 @@ public class HighDFWordsPruner {
pruneVectorsPartial(tfDir, partialVectorOutputPath, path, maxDF, baseConf);
}
- mergePartialVectors(partialVectorPaths, prunedTFDir, baseConf, normPower, logNormalize,
- numReducers);
+ mergePartialVectors(partialVectorPaths, prunedTFDir, baseConf, normPower, logNormalize, numReducers);
HadoopUtil.delete(new Configuration(baseConf), prunedPartialTFDir);
}
@@ -93,8 +91,9 @@ public class HighDFWordsPruner {
HadoopUtil.delete(conf, output);
boolean succeeded = job.waitForCompletion(true);
- if (!succeeded)
+ if (!succeeded) {
throw new IllegalStateException("Job failed!");
+ }
}
public static void mergePartialVectors(Iterable<Path> partialVectorPaths,
@@ -103,7 +102,7 @@ public class HighDFWordsPruner {
float normPower,
boolean logNormalize,
int numReducers)
- throws IOException, InterruptedException, ClassNotFoundException {
+ throws IOException, InterruptedException, ClassNotFoundException {
Configuration conf = new Configuration(baseConf);
// this conf parameter needs to be set to enable serialisation of conf values
@@ -132,8 +131,9 @@ public class HighDFWordsPruner {
HadoopUtil.delete(conf, output);
boolean succeeded = job.waitForCompletion(true);
- if (!succeeded)
+ if (!succeeded) {
throw new IllegalStateException("Job failed!");
+ }
}
private static String getCommaSeparatedPaths(Iterable<Path> paths) {
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SimpleTextEncodingVectorizer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SimpleTextEncodingVectorizer.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SimpleTextEncodingVectorizer.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SimpleTextEncodingVectorizer.java Wed Jun 20 12:07:50 2012
@@ -63,8 +63,9 @@ public class SimpleTextEncodingVectorize
boolean finished = job.waitForCompletion(true);
log.info("result of run: {}", finished);
- if (!finished)
+ if (!finished) {
throw new IllegalStateException("Job failed!");
+ }
}
}
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFiles.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFiles.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFiles.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFiles.java Wed Jun 20 12:07:50 2012
@@ -86,16 +86,17 @@ public final class SparseVectorsFromSequ
"The minimum document frequency. Default is 1").withShortName("md").create();
Option maxDFPercentOpt = obuilder.withLongName("maxDFPercent").withRequired(false).withArgument(
- abuilder.withName("maxDFPercent").withMinimum(1).withMaximum(1).create()).withDescription(
- "The max percentage of docs for the DF. Can be used to remove really high frequency terms."
- + " Expressed as an integer between 0 and 100. Default is 99. If maxDFSigma is also set, it will override this value.").withShortName("x").create();
+ abuilder.withName("maxDFPercent").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The max percentage of docs for the DF. Can be used to remove really high frequency terms."
+ + " Expressed as an integer between 0 and 100. Default is 99. If maxDFSigma is also set, "
+ + "it will override this value.").withShortName("x").create();
Option maxDFSigmaOpt = obuilder.withLongName("maxDFSigma").withRequired(false).withArgument(
abuilder.withName("maxDFSigma").withMinimum(1).withMaximum(1).create()).withDescription(
- "What portion of the tf (tf-idf) vectors to be used, expressed in times the standard deviation (sigma) of the document frequencies of these vectors."
- + " Can be used to remove really high frequency terms."
- + " Expressed as a double value. Good value to be specified is 3.0. In case the value is less then 0 no vectors "
- + "will be filtered out. Default is -1.0. Overrides maxDFPercent").withShortName("xs").create();
+ "What portion of the tf (tf-idf) vectors to be used, expressed in times the standard deviation (sigma) "
+ + "of the document frequencies of these vectors. Can be used to remove really high frequency terms."
+ + " Expressed as a double value. Good value to be specified is 3.0. In case the value is less than 0 "
+ + "no vectors will be filtered out. Default is -1.0. Overrides maxDFPercent").withShortName("xs").create();
Option minLLROpt = obuilder.withLongName("minLLR").withRequired(false).withArgument(
abuilder.withName("minLLR").withMinimum(1).withMaximum(1).create()).withDescription(
@@ -139,8 +140,8 @@ public final class SparseVectorsFromSequ
Group group = gbuilder.withName("Options").withOption(minSupportOpt).withOption(analyzerNameOpt)
.withOption(chunkSizeOpt).withOption(outputDirOpt).withOption(inputDirOpt).withOption(minDFOpt)
- .withOption(maxDFSigmaOpt).withOption(maxDFPercentOpt).withOption(weightOpt).withOption(powerOpt).withOption(minLLROpt)
- .withOption(numReduceTasksOpt).withOption(maxNGramSizeOpt).withOption(overwriteOutput)
+ .withOption(maxDFSigmaOpt).withOption(maxDFPercentOpt).withOption(weightOpt).withOption(powerOpt)
+ .withOption(minLLROpt).withOption(numReduceTasksOpt).withOption(maxNGramSizeOpt).withOption(overwriteOutput)
.withOption(helpOpt).withOption(sequentialAccessVectorOpt).withOption(namedVectorOpt)
.withOption(logNormalizeOpt)
.create();
@@ -229,7 +230,7 @@ public final class SparseVectorsFromSequ
}
double maxDFSigma = -1.0;
if (cmdLine.hasOption(maxDFSigmaOpt)) {
- maxDFSigma = Double.parseDouble(cmdLine.getValue(maxDFSigmaOpt).toString());
+ maxDFSigma = Double.parseDouble(cmdLine.getValue(maxDFSigmaOpt).toString());
}
float norm = PartialVectorMerger.NO_NORMALIZING;
@@ -249,7 +250,8 @@ public final class SparseVectorsFromSequ
Configuration conf = getConf();
Path tokenizedPath = new Path(outputDir, DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER);
- //TODO: move this into DictionaryVectorizer , and then fold SparseVectorsFrom with EncodedVectorsFrom to have one framework for all of this.
+ //TODO: move this into DictionaryVectorizer , and then fold SparseVectorsFrom with EncodedVectorsFrom
+ // to have one framework for all of this.
DocumentProcessor.tokenizeDocuments(inputDir, analyzerClass, tokenizedPath, conf);
boolean sequentialAccessOutput = false;
@@ -262,65 +264,91 @@ public final class SparseVectorsFromSequ
namedVectors = true;
}
boolean shouldPrune = maxDFSigma >=0.0;
- String tfDirName = shouldPrune ? DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER+"-toprune" : DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER;
+ String tfDirName = shouldPrune
+ ? DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER+"-toprune"
+ : DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER;
- if (!processIdf) {
- DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir, tfDirName, conf, minSupport, maxNGramSize,
- minLLRValue, norm, logNormalize, reduceTasks, chunkSize, sequentialAccessOutput, namedVectors);
+ if (processIdf) {
+ DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath,
+ outputDir,
+ tfDirName,
+ conf,
+ minSupport,
+ maxNGramSize,
+ minLLRValue,
+ -1.0f,
+ false,
+ reduceTasks,
+ chunkSize,
+ sequentialAccessOutput,
+ namedVectors);
} else {
- DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir, tfDirName, conf, minSupport, maxNGramSize,
- minLLRValue, -1.0f, false, reduceTasks, chunkSize, sequentialAccessOutput, namedVectors);
+ DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath,
+ outputDir,
+ tfDirName,
+ conf,
+ minSupport,
+ maxNGramSize,
+ minLLRValue,
+ norm,
+ logNormalize,
+ reduceTasks,
+ chunkSize,
+ sequentialAccessOutput,
+ namedVectors);
}
+
Pair<Long[], List<Path>> docFrequenciesFeatures = null;
- // Should document frequency features be processed
- if (shouldPrune || processIdf) {
- docFrequenciesFeatures = TFIDFConverter.calculateDF(new Path(outputDir, tfDirName),
- outputDir, conf, chunkSize);
- }
-
- long maxDF = maxDFPercent; //if we are pruning by std dev, then this will get changed
- if (shouldPrune) {
- Path dfDir = new Path(outputDir, TFIDFConverter.WORDCOUNT_OUTPUT_FOLDER);
- Path stdCalcDir = new Path(outputDir, HighDFWordsPruner.STD_CALC_DIR);
-
- // Calculate the standard deviation
- double stdDev = BasicStats.stdDevForGivenMean(dfDir, stdCalcDir, 0.0, conf);
- long vectorCount = docFrequenciesFeatures.getFirst()[1];
- maxDF = (int) (100.0 * maxDFSigma * stdDev / vectorCount);
-
- // Prune the term frequency vectors
- Path tfDir = new Path(outputDir, tfDirName);
- Path prunedTFDir = new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER);
- Path prunedPartialTFDir = new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER
- + "-partial");
- if (processIdf) {
- HighDFWordsPruner.pruneVectors(tfDir,
- prunedTFDir,
- prunedPartialTFDir,
- maxDF,
- conf,
- docFrequenciesFeatures,
- -1.0f,
- false,
- reduceTasks);
- } else {
- HighDFWordsPruner.pruneVectors(tfDir,
- prunedTFDir,
- prunedPartialTFDir,
- maxDF,
- conf,
- docFrequenciesFeatures,
- norm,
- logNormalize,
- reduceTasks);
- }
- HadoopUtil.delete(new Configuration(conf), tfDir);
- }
+ // Should document frequency features be processed
+ if (shouldPrune || processIdf) {
+ docFrequenciesFeatures =
+ TFIDFConverter.calculateDF(new Path(outputDir, tfDirName),outputDir, conf, chunkSize);
+ }
+
+ long maxDF = maxDFPercent; //if we are pruning by std dev, then this will get changed
+ if (shouldPrune) {
+ Path dfDir = new Path(outputDir, TFIDFConverter.WORDCOUNT_OUTPUT_FOLDER);
+ Path stdCalcDir = new Path(outputDir, HighDFWordsPruner.STD_CALC_DIR);
+
+ // Calculate the standard deviation
+ double stdDev = BasicStats.stdDevForGivenMean(dfDir, stdCalcDir, 0.0, conf);
+ long vectorCount = docFrequenciesFeatures.getFirst()[1];
+ maxDF = (int) (100.0 * maxDFSigma * stdDev / vectorCount);
+
+ // Prune the term frequency vectors
+ Path tfDir = new Path(outputDir, tfDirName);
+ Path prunedTFDir = new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER);
+ Path prunedPartialTFDir =
+ new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER + "-partial");
+
+ if (processIdf) {
+ HighDFWordsPruner.pruneVectors(tfDir,
+ prunedTFDir,
+ prunedPartialTFDir,
+ maxDF,
+ conf,
+ docFrequenciesFeatures,
+ -1.0f,
+ false,
+ reduceTasks);
+ } else {
+ HighDFWordsPruner.pruneVectors(tfDir,
+ prunedTFDir,
+ prunedPartialTFDir,
+ maxDF,
+ conf,
+ docFrequenciesFeatures,
+ norm,
+ logNormalize,
+ reduceTasks);
+ }
+ HadoopUtil.delete(new Configuration(conf), tfDir);
+ }
if (processIdf) {
- TFIDFConverter.processTfIdf(
- new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER),
- outputDir, conf, docFrequenciesFeatures, minDf, maxDF, norm, logNormalize,
- sequentialAccessOutput, namedVectors, reduceTasks);
+ TFIDFConverter.processTfIdf(
+ new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER),
+ outputDir, conf, docFrequenciesFeatures, minDf, maxDF, norm, logNormalize,
+ sequentialAccessOutput, namedVectors, reduceTasks);
}
} catch (OptionException e) {
log.error("Exception", e);
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/Vectorizer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/Vectorizer.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/Vectorizer.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/Vectorizer.java Wed Jun 20 12:07:50 2012
@@ -23,6 +23,7 @@ import java.io.IOException;
public interface Vectorizer {
- void createVectors(Path input, Path output, VectorizerConfig config) throws IOException, ClassNotFoundException, InterruptedException;
+ void createVectors(Path input, Path output, VectorizerConfig config)
+ throws IOException, ClassNotFoundException, InterruptedException;
}
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/collocations/llr/CollocDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/collocations/llr/CollocDriver.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/collocations/llr/CollocDriver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/collocations/llr/CollocDriver.java Wed Jun 20 12:07:50 2012
@@ -235,8 +235,9 @@ public final class CollocDriver extends
job.setNumReduceTasks(reduceTasks);
boolean succeeded = job.waitForCompletion(true);
- if (!succeeded)
+ if (!succeeded) {
throw new IllegalStateException("Job failed!");
+ }
return job.getCounters().findCounter(CollocMapper.Count.NGRAM_TOTAL).getValue();
}
@@ -277,7 +278,8 @@ public final class CollocDriver extends
job.setNumReduceTasks(reduceTasks);
boolean succeeded = job.waitForCompletion(true);
- if (!succeeded)
+ if (!succeeded) {
throw new IllegalStateException("Job failed!");
+ }
}
}
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/collocations/llr/Gram.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/collocations/llr/Gram.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/collocations/llr/Gram.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/collocations/llr/Gram.java Wed Jun 20 12:07:50 2012
@@ -23,6 +23,7 @@ import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.charset.CharacterCodingException;
+import com.google.common.base.Preconditions;
import org.apache.hadoop.io.BinaryComparable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
@@ -96,11 +97,7 @@ public class Gram extends BinaryComparab
* whether the gram is at the head of its text unit or tail or unigram
*/
public Gram(String ngram, int frequency, Type type) {
-
- if (ngram == null) {
- throw new NullPointerException();
- }
-
+ Preconditions.checkNotNull(ngram);
try {
// extra character is used for storing type which is part
// of the sort key.
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/common/PartialVectorMerger.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/common/PartialVectorMerger.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/common/PartialVectorMerger.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/common/PartialVectorMerger.java Wed Jun 20 12:07:50 2012
@@ -127,8 +127,9 @@ public final class PartialVectorMerger {
HadoopUtil.delete(conf, output);
boolean succeeded = job.waitForCompletion(true);
- if (!succeeded)
+ if (!succeeded) {
throw new IllegalStateException("Job failed!");
+ }
}
private static String getCommaSeparatedPaths(Iterable<Path> paths) {
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/AdaptiveWordValueEncoder.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/AdaptiveWordValueEncoder.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/AdaptiveWordValueEncoder.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/AdaptiveWordValueEncoder.java Wed Jun 20 12:07:50 2012
@@ -17,6 +17,7 @@
package org.apache.mahout.vectorizer.encoders;
+import com.google.common.base.Charsets;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multiset;
import org.apache.mahout.math.Vector;
@@ -57,7 +58,7 @@ public class AdaptiveWordValueEncoder ex
// the counts here are adjusted so that every observed value has an extra 0.5 count
// as does a hypothetical unobserved value. This smooths our estimates a bit and
// allows the first word seen to have a non-zero weight of -log(1.5 / 2)
- double thisWord = dictionary.count(new String(originalForm)) + 0.5;
+ double thisWord = dictionary.count(new String(originalForm, Charsets.UTF_8)) + 0.5;
double allWords = dictionary.size() + dictionary.elementSet().size() * 0.5 + 0.5;
return -Math.log(thisWord / allWords);
}
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/CachingContinuousValueEncoder.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/CachingContinuousValueEncoder.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/CachingContinuousValueEncoder.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/CachingContinuousValueEncoder.java Wed Jun 20 12:07:50 2012
@@ -17,6 +17,7 @@
package org.apache.mahout.vectorizer.encoders;
+import com.google.common.base.Charsets;
import org.apache.mahout.math.map.OpenIntIntHashMap;
import com.google.common.base.Preconditions;
@@ -54,7 +55,7 @@ public class CachingContinuousValueEncod
if (caches[probe].containsKey(originalForm.hashCode())) {
return caches[probe].get(originalForm.hashCode());
}
- int hash = hashForProbe(originalForm.getBytes(), dataSize, name, probe);
+ int hash = hashForProbe(originalForm.getBytes(Charsets.UTF_8), dataSize, name, probe);
caches[probe].put(originalForm.hashCode(), hash);
return hash;
}
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/CachingStaticWordValueEncoder.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/CachingStaticWordValueEncoder.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/CachingStaticWordValueEncoder.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/CachingStaticWordValueEncoder.java Wed Jun 20 12:07:50 2012
@@ -17,6 +17,7 @@
package org.apache.mahout.vectorizer.encoders;
+import com.google.common.base.Charsets;
import org.apache.mahout.math.map.OpenIntIntHashMap;
import com.google.common.base.Preconditions;
@@ -55,7 +56,7 @@ public class CachingStaticWordValueEncod
if (caches[probe].containsKey(originalForm.hashCode())) {
return caches[probe].get(originalForm.hashCode());
}
- int hash = hashForProbe(originalForm.getBytes(), dataSize, name, probe);
+ int hash = hashForProbe(originalForm.getBytes(Charsets.UTF_8), dataSize, name, probe);
caches[probe].put(originalForm.hashCode(), hash);
return hash;
}
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/ContinuousValueEncoder.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/ContinuousValueEncoder.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/ContinuousValueEncoder.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/ContinuousValueEncoder.java Wed Jun 20 12:07:50 2012
@@ -17,12 +17,14 @@
package org.apache.mahout.vectorizer.encoders;
+import com.google.common.base.Charsets;
import org.apache.mahout.math.Vector;
/**
* Continuous values are stored in fixed randomized location in the feature vector.
*/
public class ContinuousValueEncoder extends CachingValueEncoder {
+
public ContinuousValueEncoder(String name) {
super(name, CONTINUOUS_VALUE_HASH_SEED);
}
@@ -48,11 +50,10 @@ public class ContinuousValueEncoder exte
@Override
protected double getWeight(byte[] originalForm, double w) {
- if (originalForm != null) {
- return w * Double.parseDouble(new String(originalForm));
- } else {
+ if (originalForm == null) {
return w;
}
+ return w * Double.parseDouble(new String(originalForm, Charsets.UTF_8));
}
/**
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/term/TFPartialVectorReducer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/term/TFPartialVectorReducer.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/term/TFPartialVectorReducer.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/term/TFPartialVectorReducer.java Wed Jun 20 12:07:50 2012
@@ -62,7 +62,7 @@ public class TFPartialVectorReducer exte
@Override
protected void reduce(Text key, Iterable<StringTuple> values, Context context)
- throws IOException, InterruptedException {
+ throws IOException, InterruptedException {
Iterator<StringTuple> it = values.iterator();
if (!it.hasNext()) {
return;
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/tfidf/TFIDFConverter.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/tfidf/TFIDFConverter.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/tfidf/TFIDFConverter.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/tfidf/TFIDFConverter.java Wed Jun 20 12:07:50 2012
@@ -58,36 +58,24 @@ import org.apache.mahout.vectorizer.term
public final class TFIDFConverter {
public static final String VECTOR_COUNT = "vector.count";
-
public static final String FEATURE_COUNT = "feature.count";
-
public static final String MIN_DF = "min.df";
-
public static final String MAX_DF = "max.df";
-
//public static final String TFIDF_OUTPUT_FOLDER = "tfidf";
private static final String DOCUMENT_VECTOR_OUTPUT_FOLDER = "tfidf-vectors";
-
private static final String FREQUENCY_FILE = "frequency.file-";
-
private static final int MAX_CHUNKSIZE = 10000;
-
private static final int MIN_CHUNKSIZE = 100;
-
private static final String OUTPUT_FILES_PATTERN = "part-*";
-
private static final int SEQUENCEFILE_BYTE_OVERHEAD = 45;
-
private static final String VECTOR_OUTPUT_FOLDER = "partial-vectors-";
-
public static final String WORDCOUNT_OUTPUT_FOLDER = "df-count";
/**
* Cannot be instantiated. Use the static functions.
*/
private TFIDFConverter() {
-
}
/**
@@ -182,10 +170,11 @@ public final class TFIDFConverter {
* recommend you use a split size of around 400-500MB so that two simultaneous reducers can create
* partial vectors without thrashing the system due to increased swapping
*/
- public static Pair<Long[], List<Path>> calculateDF(Path input,
- Path output,
- Configuration baseConf,
- int chunkSizeInMegabytes) throws IOException, InterruptedException, ClassNotFoundException {
+ public static Pair<Long[],List<Path>> calculateDF(Path input,
+ Path output,
+ Configuration baseConf,
+ int chunkSizeInMegabytes)
+ throws IOException, InterruptedException, ClassNotFoundException {
if (chunkSizeInMegabytes < MIN_CHUNKSIZE) {
chunkSizeInMegabytes = MIN_CHUNKSIZE;
@@ -330,8 +319,9 @@ public final class TFIDFConverter {
HadoopUtil.delete(conf, output);
boolean succeeded = job.waitForCompletion(true);
- if (!succeeded)
+ if (!succeeded) {
throw new IllegalStateException("Job failed!");
+ }
}
/**
@@ -366,7 +356,8 @@ public final class TFIDFConverter {
HadoopUtil.delete(conf, output);
boolean succeeded = job.waitForCompletion(true);
- if (!succeeded)
+ if (!succeeded) {
throw new IllegalStateException("Job failed!");
+ }
}
}
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/recommender/svd/ALSWRFactorizerTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/recommender/svd/ALSWRFactorizerTest.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/recommender/svd/ALSWRFactorizerTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/recommender/svd/ALSWRFactorizerTest.java Wed Jun 20 12:07:50 2012
@@ -91,7 +91,7 @@ public class ALSWRFactorizerTest extends
public void ratingVector() throws Exception {
PreferenceArray prefs = dataModel.getPreferencesFromUser(1);
- Vector ratingVector = factorizer.ratingVector(prefs);
+ Vector ratingVector = ALSWRFactorizer.ratingVector(prefs);
assertEquals(prefs.length(), ratingVector.getNumNondefaultElements());
assertEquals(prefs.get(0).getValue(), ratingVector.get(0), EPSILON);
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/Step1MapperTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/Step1MapperTest.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/Step1MapperTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/Step1MapperTest.java Wed Jun 20 12:07:50 2012
@@ -17,13 +17,7 @@
package org.apache.mahout.classifier.df.mapreduce.partial;
-import static org.easymock.EasyMock.anyObject;
-import static org.easymock.EasyMock.capture;
-import static org.easymock.EasyMock.createMock;
-import static org.easymock.EasyMock.expectLastCall;
-import static org.easymock.EasyMock.replay;
-import static org.easymock.EasyMock.verify;
-
+import static org.easymock.EasyMock.*;
import java.util.Random;
import org.apache.hadoop.io.LongWritable;
@@ -35,7 +29,6 @@ import org.apache.mahout.classifier.df.d
import org.apache.mahout.classifier.df.data.DataLoader;
import org.apache.mahout.classifier.df.data.Dataset;
import org.apache.mahout.classifier.df.data.Utils;
-import org.apache.mahout.classifier.df.mapreduce.MapredOutput;
import org.apache.mahout.classifier.df.node.Leaf;
import org.apache.mahout.classifier.df.node.Node;
import org.apache.mahout.common.MahoutTestCase;
@@ -82,10 +75,11 @@ public final class Step1MapperTest exten
private static class TreeIDCapture extends Capture<TreeID> {
- public TreeIDCapture() {
+ private TreeIDCapture() {
super(CaptureType.ALL);
}
+ @Override
public void setValue(final TreeID value) {
super.setValue(value.clone());
}
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/ClusteringTestUtils.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/ClusteringTestUtils.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/ClusteringTestUtils.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/ClusteringTestUtils.java Wed Jun 20 12:07:50 2012
@@ -121,7 +121,7 @@ public final class ClusteringTestUtils {
private final Random random;
private final Sampler[] samplers;
- public LDASampler(Matrix model, Random random) {
+ LDASampler(Matrix model, Random random) {
this.random = random;
samplers = new Sampler[model.numRows()];
for (int i = 0; i < samplers.length; i++) {
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java Wed Jun 20 12:07:50 2012
@@ -17,6 +17,7 @@
package org.apache.mahout.clustering.canopy;
+import java.util.Collection;
import java.util.List;
import java.util.Set;
@@ -352,8 +353,9 @@ public final class TestCanopyCreation ex
}
}
- boolean findAndRemove(Pair<Double,Double> target,
- List<Pair<Double,Double>> list, double epsilon) {
+ static boolean findAndRemove(Pair<Double, Double> target,
+ Collection<Pair<Double, Double>> list,
+ double epsilon) {
for (Pair<Double,Double> curr : list) {
if ( (Math.abs(target.getFirst() - curr.getFirst()) < epsilon)
&& (Math.abs(target.getSecond() - curr.getSecond()) < epsilon) ) {
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/classify/ClusterClassificationDriverTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/classify/ClusterClassificationDriverTest.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/classify/ClusterClassificationDriverTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/classify/ClusterClassificationDriverTest.java Wed Jun 20 12:07:50 2012
@@ -24,8 +24,6 @@ import java.util.Set;
import com.google.common.collect.Sets;
-import junit.framework.Assert;
-
import org.apache.commons.lang.ArrayUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
@@ -45,6 +43,7 @@ import org.apache.mahout.common.iterator
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
+import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
@@ -56,19 +55,12 @@ public class ClusterClassificationDriver
{5, 4}, {4, 5}, {5, 5}, {9, 9}, {8, 8}};
private FileSystem fs;
-
private Path clusteringOutputPath;
-
private Configuration conf;
-
private Path pointsPath;
-
private Path classifiedOutputPath;
-
private List<Vector> firstCluster;
-
private List<Vector> secondCluster;
-
private List<Vector> thirdCluster;
@Override
@@ -107,7 +99,7 @@ public class ClusterClassificationDriver
ClusteringTestUtils.writePointsToFile(points, true,
new Path(pointsPath, "file1"), fs, conf);
runClustering(pointsPath, conf, false);
- runClassificationWithOutlierRemoval(conf, false);
+ runClassificationWithOutlierRemoval(false);
collectVectorsForAssertion();
assertVectorsWithOutlierRemoval();
}
@@ -125,7 +117,7 @@ public class ClusterClassificationDriver
ClusteringTestUtils.writePointsToFile(points,
new Path(pointsPath, "file1"), fs, conf);
runClustering(pointsPath, conf, true);
- runClassificationWithoutOutlierRemoval(conf);
+ runClassificationWithoutOutlierRemoval();
collectVectorsForAssertion();
assertVectorsWithoutOutlierRemoval();
}
@@ -143,7 +135,7 @@ public class ClusterClassificationDriver
ClusteringTestUtils.writePointsToFile(points,
new Path(pointsPath, "file1"), fs, conf);
runClustering(pointsPath, conf, true);
- runClassificationWithOutlierRemoval(conf, true);
+ runClassificationWithOutlierRemoval(true);
collectVectorsForAssertion();
assertVectorsWithOutlierRemoval();
}
@@ -158,17 +150,14 @@ public class ClusterClassificationDriver
finalClustersPath);
}
- private void runClassificationWithoutOutlierRemoval(Configuration conf)
- throws IOException, InterruptedException, ClassNotFoundException {
- ClusterClassificationDriver.run(pointsPath, clusteringOutputPath,
- classifiedOutputPath, 0.0, true, true);
+ private void runClassificationWithoutOutlierRemoval()
+ throws IOException, InterruptedException, ClassNotFoundException {
+ ClusterClassificationDriver.run(pointsPath, clusteringOutputPath, classifiedOutputPath, 0.0, true, true);
}
- private void runClassificationWithOutlierRemoval(Configuration conf2,
- boolean runSequential) throws IOException, InterruptedException,
- ClassNotFoundException {
- ClusterClassificationDriver.run(pointsPath, clusteringOutputPath,
- classifiedOutputPath, 0.73, true, runSequential);
+ private void runClassificationWithOutlierRemoval(boolean runSequential)
+ throws IOException, InterruptedException, ClassNotFoundException {
+ ClusterClassificationDriver.run(pointsPath, clusteringOutputPath, classifiedOutputPath, 0.73, true, runSequential);
}
private void collectVectorsForAssertion() throws IOException {
@@ -188,11 +177,11 @@ public class ClusterClassificationDriver
}
private void collectVector(String clusterId, Vector vector) {
- if (clusterId.equals("0")) {
+ if ("0".equals(clusterId)) {
firstCluster.add(vector);
- } else if (clusterId.equals("1")) {
+ } else if ("1".equals(clusterId)) {
secondCluster.add(vector);
- } else if (clusterId.equals("2")) {
+ } else if ("2".equals(clusterId)) {
thirdCluster.add(vector);
}
}
@@ -233,23 +222,21 @@ public class ClusterClassificationDriver
}
private void checkClustersWithOutlierRemoval() {
- Set<String> reference = Sets.newHashSet(new String[] {"{1:9.0,0:9.0}",
- "{1:1.0,0:1.0}"});
- int singletonCnt = 0;
- int emptyCnt = 0;
+ Set<String> reference = Sets.newHashSet("{1:9.0,0:9.0}", "{1:1.0,0:1.0}");
List<List<Vector>> clusters = Lists.newArrayList();
clusters.add(firstCluster);
clusters.add(secondCluster);
clusters.add(thirdCluster);
+ int singletonCnt = 0;
+ int emptyCnt = 0;
for (List<Vector> vList : clusters) {
- if (vList.size() == 0) {
+ if (vList.isEmpty()) {
emptyCnt++;
} else {
singletonCnt++;
- Assert.assertTrue("expecting only singleton clusters; got size=" + vList.size(),
- vList.size() == 1);
+ assertEquals("expecting only singleton clusters; got size=" + vList.size(), 1, vList.size());
Assert.assertTrue("not expecting cluster:" + vList.get(0).asFormatString(),
reference.contains(vList.get(0).asFormatString()));
reference.remove(vList.get(0).asFormatString());
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/dirichlet/TestMapReduce.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/dirichlet/TestMapReduce.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/dirichlet/TestMapReduce.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/dirichlet/TestMapReduce.java Wed Jun 20 12:07:50 2012
@@ -50,7 +50,7 @@ import com.google.common.io.Closeables;
public final class TestMapReduce extends MahoutTestCase {
- private Collection<VectorWritable> sampleData = Lists.newArrayList();
+ private final Collection<VectorWritable> sampleData = Lists.newArrayList();
private FileSystem fs;
@@ -291,7 +291,7 @@ public final class TestMapReduce extends
printModels(getClusters(outputPath, maxIterations));
}
- private void printModels(List<List<Cluster>> result) {
+ private static void printModels(Iterable<List<Cluster>> result) {
int row = 0;
StringBuilder models = new StringBuilder(100);
for (List<Cluster> r : result) {
@@ -306,7 +306,7 @@ public final class TestMapReduce extends
System.out.println(models.toString());
}
- private List<List<Cluster>> getClusters(Path output, int numIterations) throws IOException {
+ private Iterable<List<Cluster>> getClusters(Path output, int numIterations) throws IOException {
List<List<Cluster>> result = new ArrayList<List<Cluster>>();
for (int i = 1; i <= numIterations; i++) {
ClusterClassifier posterior = new ClusterClassifier();
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java Wed Jun 20 12:07:50 2012
@@ -56,21 +56,6 @@ public final class TestFuzzyKmeansCluste
fs = FileSystem.get(conf);
}
- private static double round(double val, int places) {
- long factor = (long) Math.pow(10, places);
-
- // Shift the decimal the correct number of places
- // to the right.
- val *= factor;
-
- // Round to the nearest integer.
- long tmp = Math.round(val);
-
- // Shift the decimal the correct number of places
- // back to the left.
- return (double) tmp / factor;
- }
-
private static Vector tweakValue(Vector point) {
return point.plus(0.1);
}
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/iterator/TestClusterClassifier.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/iterator/TestClusterClassifier.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/iterator/TestClusterClassifier.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/iterator/TestClusterClassifier.java Wed Jun 20 12:07:50 2012
@@ -198,8 +198,7 @@ public final class TestClusterClassifier
public void testClusterIteratorKMeans() {
List<Vector> data = TestKmeansClustering.getPoints(TestKmeansClustering.REFERENCE);
ClusterClassifier prior = newKlusterClassifier();
- ClusterIterator iterator = new ClusterIterator();
- ClusterClassifier posterior = iterator.iterate(data, prior, 5);
+ ClusterClassifier posterior = ClusterIterator.iterate(data, prior, 5);
assertEquals(3, posterior.getModels().size());
for (Cluster cluster : posterior.getModels()) {
System.out.println(cluster.asFormatString(null));
@@ -210,8 +209,7 @@ public final class TestClusterClassifier
public void testClusterIteratorDirichlet() {
List<Vector> data = TestKmeansClustering.getPoints(TestKmeansClustering.REFERENCE);
ClusterClassifier prior = newKlusterClassifier();
- ClusterIterator iterator = new ClusterIterator();
- ClusterClassifier posterior = iterator.iterate(data, prior, 5);
+ ClusterClassifier posterior = ClusterIterator.iterate(data, prior, 5);
assertEquals(3, posterior.getModels().size());
for (Cluster cluster : posterior.getModels()) {
System.out.println(cluster.asFormatString(null));
@@ -235,7 +233,7 @@ public final class TestClusterClassifier
for (Cluster cluster : prior.getModels()) {
System.out.println(cluster.asFormatString(null));
}
- new ClusterIterator().iterateSeq(conf, pointsPath, path, outPath, 5);
+ ClusterIterator.iterateSeq(conf, pointsPath, path, outPath, 5);
for (int i = 1; i <= 4; i++) {
System.out.println("Classifier-" + i);
@@ -251,7 +249,7 @@ public final class TestClusterClassifier
}
@Test
- public void testMRFileClusterIteratorKMeans() throws IOException, InterruptedException, ClassNotFoundException {
+ public void testMRFileClusterIteratorKMeans() throws Exception {
Path pointsPath = getTestTempDirPath("points");
Path priorPath = getTestTempDirPath("prior");
Path outPath = getTestTempDirPath("output");
@@ -269,7 +267,7 @@ public final class TestClusterClassifier
for (Cluster cluster : prior.getModels()) {
System.out.println(cluster.asFormatString(null));
}
- new ClusterIterator().iterateMR(conf, pointsPath, path, outPath, 5);
+ ClusterIterator.iterateMR(conf, pointsPath, path, outPath, 5);
for (int i = 1; i <= 4; i++) {
System.out.println("Classifier-" + i);
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java Wed Jun 20 12:07:50 2012
@@ -281,7 +281,7 @@ public final class TestKmeansClustering
&& !got43) {
got43 = true;
} else {
- assertTrue("got unexpected center: "+v+" ["+v.getClass().toString()+"]", false);
+ fail("got unexpected center: " + v + " [" + v.getClass().toString() + ']');
}
}
assertEquals("got unexpected number of centers", 2, count);
@@ -301,11 +301,11 @@ public final class TestKmeansClustering
collector.collect(record.getFirst(), record.getSecond());
}
- boolean gotLowClust = false; // clusters should be [1, *] and [2, *]
- boolean gotHighClust = false; // vs [3 , *], [4 , *] and [5, *]
+ //boolean gotLowClust = false; // clusters should be [1, *] and [2, *]
+ //boolean gotHighClust = false; // vs [3 , *], [4 , *] and [5, *]
for (IntWritable k : collector.getKeys()) {
List<WeightedVectorWritable> wvList = collector.getValue(k);
- assertTrue("empty cluster!", wvList.size() != 0);
+ assertTrue("empty cluster!", !wvList.isEmpty());
if (wvList.get(0).getVector().get(0) <= 2.0) {
for (WeightedVectorWritable wv : wvList) {
Vector v = wv.getVector();
@@ -313,7 +313,7 @@ public final class TestKmeansClustering
assertTrue("bad cluster!", v.get(idx) <= 2.0);
}
assertEquals("Wrong size cluster", 4, wvList.size());
- gotLowClust= true;
+ //gotLowClust= true;
} else {
for (WeightedVectorWritable wv : wvList) {
Vector v = wv.getVector();
@@ -321,7 +321,7 @@ public final class TestKmeansClustering
assertTrue("bad cluster!", v.get(idx) > 2.0);
}
assertEquals("Wrong size cluster", 5, wvList.size());
- gotHighClust= true;
+ //gotHighClust= true;
}
}
}
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/lda/cvb/TestCVBModelTrainer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/lda/cvb/TestCVBModelTrainer.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/lda/cvb/TestCVBModelTrainer.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/lda/cvb/TestCVBModelTrainer.java Wed Jun 20 12:07:50 2012
@@ -64,8 +64,7 @@ public class TestCVBModelTrainer extends
double[] perps = new double[numTrials];
for (int trial = 0; trial < numTrials; trial++) {
InMemoryCollapsedVariationalBayes0 cvb =
- new InMemoryCollapsedVariationalBayes0(sampledCorpus, terms, numTestTopics, ALPHA, ETA,
- 2, 1, 0, (trial+1) * 123456L);
+ new InMemoryCollapsedVariationalBayes0(sampledCorpus, terms, numTestTopics, ALPHA, ETA, 2, 1, 0);
cvb.setVerbose(true);
perps[trial] = cvb.iterateUntilConvergence(0, 5, 0, 0.2);
System.out.println(perps[trial]);
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java Wed Jun 20 12:07:50 2012
@@ -360,12 +360,13 @@ public final class TestMeanShift extends
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(input.toUri(), conf);
Collection<VectorWritable> points = Lists.newArrayList();
+ // TODO fix test so it doesn't need this random seed!
Random r = new Random(123);
Vector[] permutedRaw = new Vector[raw.length];
- for (int i = 0; i < raw.length; i++)
- permutedRaw = raw;
- for (int i = 0; i < permutedRaw.length; i++)
+ System.arraycopy(raw, 0, permutedRaw, 0, raw.length);
+ for (int i = 0; i < permutedRaw.length; i++) {
permutedRaw[i] = permutedRaw[i + r.nextInt(raw.length - i)];
+ }
for (Vector v : permutedRaw) {
points.add(new VectorWritable(v));
}