Posted to commits@mahout.apache.org by sr...@apache.org on 2012/06/20 14:07:58 UTC

svn commit: r1352052 [4/7] - in /mahout/trunk: ./ buildtools/ buildtools/src/main/resources/ core/ core/src/main/java/org/apache/mahout/cf/taste/hadoop/ core/src/main/java/org/apache/mahout/cf/taste/hadoop/als/ core/src/main/java/org/apache/mahout/cf/t...

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/BtJob.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/BtJob.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/BtJob.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/BtJob.java Wed Jun 20 12:07:50 2012
@@ -442,8 +442,9 @@ public final class BtJob {
           double xii = xi.size() > btIndex ? xi.getQuick(btIndex) : 0.0;
           // compute s_b
           pmult.setMultiplicator(xii);
-          if (sbAccum == null)
+          if (sbAccum == null) {
             sbAccum = new DenseVector(btRow.size());
+          }
           sbAccum.assign(btRow, pmult);
         }
 

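Aside for readers of this hunk: the loop above lazily allocates the s_b accumulator and folds each scaled B' row into it through a reused PlusMult functor. A minimal, self-contained sketch of that idiom (class and method names are illustrative, not part of this commit):

    import org.apache.mahout.math.DenseVector;
    import org.apache.mahout.math.Vector;
    import org.apache.mahout.math.function.PlusMult;

    public final class ScaledRowAccumulator {
      private ScaledRowAccumulator() {
      }

      /** Computes sum_i scale[i] * rows[i], allocating the accumulator on first use. */
      public static Vector accumulate(Vector[] rows, double[] scale) {
        PlusMult pmult = new PlusMult(0);       // f(a, b) = a + multiplicator * b
        Vector acc = null;
        for (int i = 0; i < rows.length; i++) {
          pmult.setMultiplicator(scale[i]);     // reuse one functor, as BtJob does
          if (acc == null) {
            acc = new DenseVector(rows[i].size());
          }
          acc.assign(rows[i], pmult);
        }
        return acc;
      }
    }
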
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/DenseBlockWritable.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/DenseBlockWritable.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/DenseBlockWritable.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/DenseBlockWritable.java Wed Jun 20 12:07:50 2012
@@ -73,9 +73,9 @@ public class DenseBlockWritable implemen
 
     out.writeInt(m);
     out.writeInt(n);
-    for (int i = 0; i < m; i++) {
+    for (double[] aBlock : block) {
       for (int j = 0; j < n; j++) {
-        out.writeDouble(block[i][j]);
+        out.writeDouble(aBlock[j]);
       }
     }
   }

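The hunk above switches the write loop to iterate rows directly. For context, a hedged sketch of the matching read side, dimensions first and then row-major doubles (returning the block is an assumption; the real class stores m, n and block as fields):

    import java.io.DataInput;
    import java.io.IOException;

    public final class DenseBlockReadSketch {
      private DenseBlockReadSketch() {
      }

      /** Reads a block written as: int m, int n, then m*n doubles in row-major order. */
      public static double[][] readBlock(DataInput in) throws IOException {
        int m = in.readInt();
        int n = in.readInt();
        double[][] block = new double[m][n];
        for (double[] row : block) {
          for (int j = 0; j < n; j++) {
            row[j] = in.readDouble();
          }
        }
        return block;
      }
    }
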
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/Omega.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/Omega.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/Omega.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/Omega.java Wed Jun 20 12:07:50 2012
@@ -135,7 +135,9 @@ public class Omega {
             if (v.isDense()) {
               for (int k = 0; k < v.size(); k++)
                 // it's ok, this is reentrant
+              {
                 result += getQuick(k, index) * v.getQuick(k);
+              }
 
             } else {
               for (Iterator<Vector.Element> iter = v.iterateNonZero(); iter.hasNext();) {
@@ -157,12 +159,13 @@ public class Omega {
         }
         return res;
       } catch (InterruptedException exc) {
-        throw new RuntimeException("Interrupted", exc);
+        throw new IllegalStateException("Interrupted", exc);
       } catch (ExecutionException exc) {
-        if (exc.getCause() instanceof RuntimeException)
+        if (exc.getCause() instanceof RuntimeException) {
           throw (RuntimeException) exc.getCause();
-        else
-          throw new RuntimeException(exc.getCause());
+        } else {
+          throw new IllegalStateException(exc.getCause());
+        }
       }
 
     } finally {

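The second hunk above narrows the thrown types: a RuntimeException cause is rethrown as-is, anything else is wrapped in IllegalStateException rather than a bare RuntimeException. A self-contained illustration of the same unwrapping (names are illustrative):

    import java.util.concurrent.Callable;
    import java.util.concurrent.ExecutionException;
    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Executors;
    import java.util.concurrent.Future;

    public final class UnwrapExecutionException {
      private UnwrapExecutionException() {
      }

      public static int await(Future<Integer> f) {
        try {
          return f.get();
        } catch (InterruptedException exc) {
          Thread.currentThread().interrupt();
          throw new IllegalStateException("Interrupted", exc);
        } catch (ExecutionException exc) {
          if (exc.getCause() instanceof RuntimeException) {
            throw (RuntimeException) exc.getCause();   // preserve the original type
          } else {
            throw new IllegalStateException(exc.getCause());
          }
        }
      }

      public static void main(String[] args) {
        ExecutorService pool = Executors.newSingleThreadExecutor();
        try {
          System.out.println(await(pool.submit(new Callable<Integer>() {
            @Override
            public Integer call() {
              return 42;
            }
          })));
        } finally {
          pool.shutdown();
        }
      }
    }
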
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDCli.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDCli.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDCli.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDCli.java Wed Jun 20 12:07:50 2012
@@ -115,7 +115,7 @@ public class SSVDCli extends AbstractJob
       throw new IOException("No Hadoop configuration present");
     }
 
-    Path[] inputPaths = new Path[] { getInputPath() };
+    Path[] inputPaths = { getInputPath() };
 
     // MAHOUT-817
     if (pca && xiPath == null) {

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDHelper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDHelper.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDHelper.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDHelper.java Wed Jun 20 12:07:50 2012
@@ -35,7 +35,6 @@ import org.apache.mahout.common.iterator
 import org.apache.mahout.common.iterator.sequencefile.PathType;
 import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterator;
 import org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterable;
-import org.apache.mahout.math.DenseMatrix;
 import org.apache.mahout.math.DenseVector;
 import org.apache.mahout.math.Matrix;
 import org.apache.mahout.math.Vector;
@@ -49,8 +48,12 @@ import com.google.common.io.Closeables;
  * set of small file manipulation helpers.
  *
  */
+public final class SSVDHelper {
 
-public class SSVDHelper {
+  private static final Pattern OUTPUT_FILE_PATTERN = Pattern.compile("(\\w+)-(m|r)-(\\d+)(\\.\\w+)?");
+
+  private SSVDHelper() {
+  }
 
   /**
   * load a single vector from an hdfs file (possibly presented as a glob).
@@ -66,12 +69,14 @@ public class SSVDHelper {
                                                        conf);
 
     try {
-      if (!iter.hasNext())
+      if (!iter.hasNext()) {
         throw new IOException("Empty input while reading vector");
+      }
       VectorWritable vw = iter.next();
 
-      if (iter.hasNext())
+      if (iter.hasNext()) {
         throw new IOException("Unexpected data after the end of vector file");
+      }
 
       return vw.get();
 
@@ -83,11 +88,7 @@ public class SSVDHelper {
   /**
   * save a single vector into an hdfs file.
    *
-   * @param v
-   *          vector to save
-   * @param vectorFilePath
-   * @param conf
-   * @throws IOException
+   * @param v vector to save
    */
   public static void saveVector(Vector v,
                                 Path vectorFilePath,
@@ -125,11 +126,10 @@ public class SSVDHelper {
       }
 
       FileStatus firstSeqFile;
-      if (!fstats[0].isDir()) {
-        firstSeqFile = fstats[0];
+      if (fstats[0].isDir()) {
+        firstSeqFile = fs.listStatus(fstats[0].getPath(), PathFilters.logsCRCFilter())[0];
       } else {
-        firstSeqFile =
-          fs.listStatus(fstats[0].getPath(), PathFilters.logsCRCFilter())[0];
+        firstSeqFile = fstats[0];
       }
 
       SequenceFile.Reader r = null;
@@ -143,9 +143,6 @@ public class SSVDHelper {
     throw new IOException("Unable to open input files to determine input label type.");
   }
 
-  private static final Pattern OUTPUT_FILE_PATTERN =
-    Pattern.compile("(\\w+)-(m|r)-(\\d+)(\\.\\w+)?");
-
   static final Comparator<FileStatus> PARTITION_COMPARATOR =
     new Comparator<FileStatus>() {
       private final Matcher matcher = OUTPUT_FILE_PATTERN.matcher("");
@@ -181,13 +178,8 @@ public class SSVDHelper {
    * @param conf
    *          configuration
    * @return Dense matrix array
-   * @throws IOException
-   *           when I/O occurs.
    */
-  public static double[][] loadDistributedRowMatrix(FileSystem fs,
-                                                    Path glob,
-                                                    Configuration conf)
-    throws IOException {
+  public static double[][] loadDistributedRowMatrix(FileSystem fs, Path glob, Configuration conf) throws IOException {
 
     FileStatus[] files = fs.globStatus(glob);
     if (files == null) {
@@ -221,28 +213,17 @@ public class SSVDHelper {
   }
 
   /**
-   * Load multiplel upper triangular matrices and sum them up.
+   * Load multiple upper triangular matrices and sum them up.
    *
-   * @param fs
-   * @param glob
-   * @param conf
    * @return the sum of upper triangular inputs.
-   * @throws IOException
    */
-  public static UpperTriangular
-      loadAndSumUpperTriangularMatrices(Path glob, Configuration conf)
-        throws IOException {
+  public static UpperTriangular loadAndSumUpperTriangularMatrices(Path glob, Configuration conf) throws IOException {
     Vector v = loadAndSumUpVectors(glob, conf);
     return v == null ? null : new UpperTriangular(v);
   }
 
   /**
-   * returns sum of all vectors in different files specified by glob
-   *
-   * @param glob
-   * @param conf
-   * @return
-   * @throws IOException
+   * @return sum of all vectors in different files specified by glob
    */
   public static Vector loadAndSumUpVectors(Path glob, Configuration conf)
     throws IOException {
@@ -258,10 +239,11 @@ public class SSVDHelper {
     try {
       Vector v = null;
       while (iter.hasNext()) {
-        if (v == null)
+        if (v == null) {
           v = new DenseVector(iter.next().get());
-        else
+        } else {
           v.assign(iter.next().get(), Functions.PLUS);
+        }
       }
       return v;
 
@@ -274,17 +256,8 @@ public class SSVDHelper {
   /**
   * Load only one upper triangular matrix and issue error if more than one is
    * found.
-   *
-   * @param fs
-   * @param glob
-   * @param conf
-   * @return
-   * @throws IOException
    */
-  public static UpperTriangular loadUpperTriangularMatrix(FileSystem fs,
-                                                          Path glob,
-                                                          Configuration conf)
-    throws IOException {
+  public static UpperTriangular loadUpperTriangularMatrix(Path glob, Configuration conf) throws IOException {
 
     /*
      * there still may be more than one file in glob and only one of them must
@@ -299,12 +272,14 @@ public class SSVDHelper {
                                                        true,
                                                        conf);
     try {
-      if (!iter.hasNext())
+      if (!iter.hasNext()) {
         throw new IOException("No triangular matrices found");
+      }
       Vector v = iter.next().get();
       UpperTriangular result = new UpperTriangular(v);
-      if (iter.hasNext())
+      if (iter.hasNext()) {
         throw new IOException("Unexpected overrun in upper triangular matrix files");
+      }
       return result;
 
     } finally {
@@ -314,11 +289,8 @@ public class SSVDHelper {
 
   /**
    * extracts row-wise raw data from a Mahout matrix for 3rd party solvers.
-   * Unfortunately values member is 100% encapsulated in {@link DenseMatrix} at
+   * Unfortunately values member is 100% encapsulated in {@link org.apache.mahout.math.DenseMatrix} at
    * this point, so we have to resort to abstract element-wise copying.
-   *
-   * @param m
-   * @return
    */
   public static double[][] extractRawData(Matrix m) {
     int rows = m.numRows();

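Among the reshuffled helpers above, loadAndSumUpVectors keeps a simple accumulation idiom: copy the first vector into a dense accumulator, then fold the rest in with Functions.PLUS. Isolated as a sketch (the iterator source and names are assumptions):

    import java.util.Iterator;
    import org.apache.mahout.math.DenseVector;
    import org.apache.mahout.math.Vector;
    import org.apache.mahout.math.function.Functions;

    public final class VectorSumSketch {
      private VectorSumSketch() {
      }

      /** Returns the element-wise sum of all vectors, or null if the iterator is empty. */
      public static Vector sum(Iterator<Vector> vectors) {
        Vector v = null;
        while (vectors.hasNext()) {
          if (v == null) {
            v = new DenseVector(vectors.next());   // dense copy seeds the accumulator
          } else {
            v.assign(vectors.next(), Functions.PLUS);
          }
        }
        return v;
      }
    }
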
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDSolver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDSolver.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDSolver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDSolver.java Wed Jun 20 12:07:50 2012
@@ -337,16 +337,15 @@ public class SSVDSolver {
       Path vPath = new Path(outputPath, "V");
 
       Path pcaBasePath = new Path(outputPath, "pca");
-      Path sbPath = null;
-      Path sqPath = null;
-
-      double xisquaredlen = 0;
 
-      if (pcaMeanPath != null)
+      if (pcaMeanPath != null) {
         fs.mkdirs(pcaBasePath);
+      }
       Random rnd = RandomUtils.getRandom();
       long seed = rnd.nextLong();
 
+      Path sbPath = null;
+      double xisquaredlen = 0.0;
       if (pcaMeanPath != null) {
         /*
          * compute s_b0 if pca offset present.
@@ -392,8 +391,8 @@ public class SSVDSolver {
        * bit too many (I would be happy if that were ever the case though).
        */
 
-      sbPath = new Path(pcaBasePath, "sb0");
-      sqPath = new Path(pcaBasePath, "sq0");
+      //sbPath = new Path(pcaBasePath, "sb0");
+      //sqPath = new Path(pcaBasePath, "sq0");
 
       BtJob.run(conf,
                 inputPath,
@@ -410,7 +409,7 @@ public class SSVDSolver {
                 q <= 0);
 
       sbPath = new Path(btPath, BtJob.OUTPUT_SB + "-*");
-      sqPath = new Path(btPath, BtJob.OUTPUT_SQ + "-*");
+      Path sqPath = new Path(btPath, BtJob.OUTPUT_SQ + "-*");
 
       // power iterations
       for (int i = 0; i < q; i++) {
@@ -481,7 +480,6 @@ public class SSVDSolver {
 
         bbtSquare.assign(mC, Functions.MINUS);
         bbtSquare.assign(mC.transpose(), Functions.MINUS);
-        mC = null;
 
         Matrix outerSq = sq.cross(sq);
         outerSq.assign(Functions.mult(xisquaredlen));

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SplitPartitionedWritable.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SplitPartitionedWritable.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SplitPartitionedWritable.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SplitPartitionedWritable.java Wed Jun 20 12:07:50 2012
@@ -19,6 +19,7 @@ package org.apache.mahout.math.hadoop.st
 import java.io.DataInput;
 import java.io.DataOutput;
 import java.io.IOException;
+import java.io.Serializable;
 
 import org.apache.hadoop.io.WritableComparable;
 import org.apache.hadoop.io.WritableComparator;
@@ -125,7 +126,7 @@ public class SplitPartitionedWritable im
     return 0;
   }
 
-  public static final class SplitGroupingComparator extends WritableComparator {
+  public static final class SplitGroupingComparator extends WritableComparator implements Serializable {
 
     public SplitGroupingComparator() {
       super(SplitPartitionedWritable.class, true);

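SplitGroupingComparator is a secondary-sort grouping comparator; adding Serializable above likely satisfies the common static-analysis rule that comparators be serializable. A hedged sketch of how such a comparator gets wired into a job (driver code is an assumption, not part of this commit):

    import org.apache.hadoop.mapreduce.Job;
    import org.apache.mahout.math.hadoop.stochasticsvd.SplitPartitionedWritable;

    public final class GroupingComparatorWiring {
      private GroupingComparatorWiring() {
      }

      /** Group reducer input by split, so one reduce() call sees a whole split's rows. */
      public static void configure(Job job) {
        job.setGroupingComparatorClass(
            SplitPartitionedWritable.SplitGroupingComparator.class);
      }
    }
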
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/VJob.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/VJob.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/VJob.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/VJob.java Wed Jun 20 12:07:50 2012
@@ -62,9 +62,9 @@ public class VJob {
     /*
      * xi and s_q are PCA-related corrections, per MAHOUT-817
      */
-    protected Vector xi;
-    protected Vector sq;
-    protected PlusMult plusMult = new PlusMult(0);
+    Vector xi;
+    Vector sq;
+    PlusMult plusMult = new PlusMult(0);
 
     @Override
     protected void map(IntWritable key, VectorWritable value, Context context)

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/YtYJob.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/YtYJob.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/YtYJob.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/YtYJob.java Wed Jun 20 12:07:50 2012
@@ -40,14 +40,14 @@ import org.apache.mahout.math.VectorWrit
  * Job that accumulates Y'Y output
  * 
  */
-public class YtYJob {
+public final class YtYJob {
 
   public static final String PROP_OMEGA_SEED = "ssvd.omegaseed";
   public static final String PROP_K = "ssvd.k";
   public static final String PROP_P = "ssvd.p";
 
   // we have single output, so we use standard output
-  public static final String OUTPUT_YtY = "part-";
+  public static final String OUTPUT_YT_Y = "part-";
 
   private YtYJob() {
   }

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/qr/GramSchmidt.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/qr/GramSchmidt.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/qr/GramSchmidt.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/qr/GramSchmidt.java Wed Jun 20 12:07:50 2012
@@ -22,10 +22,8 @@ import org.apache.mahout.math.function.D
 
 /**
  * Gram Schmidt quick helper.
- * 
- * 
  */
-public class GramSchmidt {
+public final class GramSchmidt {
 
   private GramSchmidt() {
   }

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/ssvd/SequentialOutOfCoreSvd.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/ssvd/SequentialOutOfCoreSvd.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/ssvd/SequentialOutOfCoreSvd.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/ssvd/SequentialOutOfCoreSvd.java Wed Jun 20 12:07:50 2012
@@ -94,7 +94,8 @@ public class SequentialOutOfCoreSvd {
   private final int seed;
   private final int dim;
 
-  public SequentialOutOfCoreSvd(Iterable<File> partsOfA, File tmpDir, int internalDimension, int columnsPerSlice) throws IOException {
+  public SequentialOutOfCoreSvd(Iterable<File> partsOfA, File tmpDir, int internalDimension, int columnsPerSlice)
+    throws IOException {
     this.columnsPerSlice = columnsPerSlice;
     this.dim = internalDimension;
 
@@ -127,7 +128,7 @@ public class SequentialOutOfCoreSvd {
     int ncols = 0;
     for (File file : partsOfA) {
       MatrixWritable m = new MatrixWritable();
-      final DataInputStream in = new DataInputStream(new FileInputStream(file));
+      DataInputStream in = new DataInputStream(new FileInputStream(file));
       try {
         m.readFields(in);
       } finally {
@@ -150,7 +151,7 @@ public class SequentialOutOfCoreSvd {
     MatrixWritable bTmp = new MatrixWritable();
     for (int j = 0; j < ncols; j += columnsPerSlice) {
       if (bFile(tmpDir, j).exists()) {
-        final DataInputStream in = new DataInputStream(new FileInputStream(bFile(tmpDir, j)));
+        DataInputStream in = new DataInputStream(new FileInputStream(bFile(tmpDir, j)));
         try {
           bTmp.readFields(in);
         } finally {
@@ -167,17 +168,18 @@ public class SequentialOutOfCoreSvd {
   public void computeV(File tmpDir, int ncols) throws IOException {
     // step 5, compute pieces of V
     for (int j = 0; j < ncols; j += columnsPerSlice) {
-      final File bPath = bFile(tmpDir, j);
+      File bPath = bFile(tmpDir, j);
       if (bPath.exists()) {
         MatrixWritable m = new MatrixWritable();
-        final DataInputStream in = new DataInputStream(new FileInputStream(bPath));
+        DataInputStream in = new DataInputStream(new FileInputStream(bPath));
         try {
           m.readFields(in);
         } finally {
           in.close();
         }
         m.set(l2.solveRight(m.get().transpose()).times(svd.getV()));
-        final DataOutputStream out = new DataOutputStream(new FileOutputStream(new File(tmpDir, String.format("V-%s", bPath.getName().replaceAll(".*-", "")))));
+        DataOutputStream out = new DataOutputStream(new FileOutputStream(
+            new File(tmpDir, String.format("V-%s", bPath.getName().replaceAll(".*-", "")))));
         try {
           m.write(out);
         } finally {
@@ -197,7 +199,8 @@ public class SequentialOutOfCoreSvd {
       Matrix y = aI.times(new RandomTrinaryMatrix(seed, aI.numCols(), dim, false));
       Matrix uI = r2.solveRight(y).times(svd.getU());
       m.set(uI);
-      final DataOutputStream out = new DataOutputStream(new FileOutputStream(new File(tmpDir, String.format("U-%s", file.getName().replaceAll(".*-", "")))));
+      DataOutputStream out = new DataOutputStream(new FileOutputStream(
+          new File(tmpDir, String.format("U-%s", file.getName().replaceAll(".*-", "")))));
       try {
         m.write(out);
       } finally {

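The edits above drop redundant final modifiers and wrap the long stream constructors, but the open/try/finally/close shape is unchanged throughout. That shape, shown once in isolation (helper name is illustrative):

    import java.io.DataInputStream;
    import java.io.File;
    import java.io.FileInputStream;
    import java.io.IOException;
    import org.apache.hadoop.io.Writable;

    public final class ReadWritableSketch {
      private ReadWritableSketch() {
      }

      /** Reads one Writable from a file, closing the stream even if readFields throws. */
      public static void read(Writable w, File file) throws IOException {
        DataInputStream in = new DataInputStream(new FileInputStream(file));
        try {
          w.readFields(in);
        } finally {
          in.close();
        }
      }
    }
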
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/Sampler.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/Sampler.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/Sampler.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/Sampler.java Wed Jun 20 12:07:50 2012
@@ -17,7 +17,7 @@
 
 package org.apache.mahout.math.stats;
 
-import org.apache.mahout.math.DenseVector;
+import com.google.common.base.Preconditions;
 import org.apache.mahout.math.Vector;
 
 import java.util.Arrays;
@@ -55,17 +55,11 @@ public class Sampler {
   }
 
   public int sample() {
-    if (sampler == null) {
-      throw new NullPointerException("Sampler must have been constructed with a distribution, or"
-        + " else sample(Vector) should be used to sample");
-    }
+    Preconditions.checkNotNull(sampler,
+      "Sampler must have been constructed with a distribution, or else sample(Vector) should be used to sample");
     return sample(sampler);
   }
 
-  private static double[] samplerFor(double[] distribution) {
-    return samplerFor(new DenseVector(distribution));
-  }
-
   private static double[] samplerFor(Vector vectorDistribution) {
     int size = vectorDistribution.size();
     double[] partition = new double[size];

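One Guava call now replaces the hand-rolled null check: checkNotNull throws NullPointerException with the given message when its first argument is null, and otherwise returns the argument. A tiny standalone illustration (values are made up):

    import com.google.common.base.Preconditions;

    public final class CheckNotNullDemo {
      private CheckNotNullDemo() {
      }

      public static void main(String[] args) {
        double[] sampler = {0.25, 0.75};
        // Would throw NullPointerException with this message if sampler were null.
        Preconditions.checkNotNull(sampler,
            "Sampler must have been constructed with a distribution");
        System.out.println(sampler.length);
      }
    }
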
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/CalculateEntropyReducer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/CalculateEntropyReducer.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/CalculateEntropyReducer.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/CalculateEntropyReducer.java Wed Jun 20 12:07:50 2012
@@ -42,7 +42,7 @@ public final class CalculateEntropyReduc
 
   @Override
   protected void reduce(NullWritable key, Iterable<DoubleWritable> values, Context context)
-      throws IOException, InterruptedException {
+    throws IOException, InterruptedException {
     double entropy = 0.0;
     for (DoubleWritable value : values) {
       entropy += value.get();

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/DoubleSumReducer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/DoubleSumReducer.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/DoubleSumReducer.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/DoubleSumReducer.java Wed Jun 20 12:07:50 2012
@@ -32,7 +32,7 @@ public final class DoubleSumReducer exte
 
   @Override
   protected void reduce(Writable key, Iterable<DoubleWritable> values, Context context)
-      throws IOException, InterruptedException {
+    throws IOException, InterruptedException {
     double sum = 0.0;
     for (DoubleWritable value : values) {
       sum += value.get();

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/Entropy.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/Entropy.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/Entropy.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/Entropy.java Wed Jun 20 12:07:50 2012
@@ -20,7 +20,6 @@ package org.apache.mahout.math.stats.ent
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.DoubleWritable;
 import org.apache.hadoop.io.NullWritable;
-import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapreduce.Job;
 import org.apache.hadoop.mapreduce.Mapper;
@@ -35,8 +34,8 @@ import java.util.List;
 import java.util.Map;
 
 /**
- * A Hadoop job to compute the entropy of keys or values in a {@link SequenceFile}. Format has to be {@link Text} for
- * key or value.
+ * A Hadoop job to compute the entropy of keys or values in a {@link org.apache.hadoop.io.SequenceFile}.
+ * Format has to be {@link Text} for key or value.
  * <p/>
  * <ul>
  * <li>-i The input sequence file</li>

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/InformationGain.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/InformationGain.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/InformationGain.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/InformationGain.java Wed Jun 20 12:07:50 2012
@@ -20,7 +20,6 @@ package org.apache.mahout.math.stats.ent
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.DoubleWritable;
-import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.util.ToolRunner;
 import org.apache.mahout.common.AbstractJob;
 import org.apache.mahout.common.iterator.sequencefile.PathFilters;
@@ -31,7 +30,7 @@ import java.io.IOException;
 import java.util.Iterator;
 
 /**
- * Calculates the information gain for a {@link SequenceFile}.
+ * Calculates the information gain for a {@link org.apache.hadoop.io.SequenceFile}.
 * Computes how 'useful' the keys are when predicting the values.
  * <ul>
  * <li>-i The input sequence file</li>
@@ -84,15 +83,22 @@ public final class InformationGain exten
   }
 
   private void calculateEntropy() throws Exception {
-    String[] args = { "-i", getInputPath().toString(), "-o", entropyPath.toString(), "-s", "value",
-        "--tempDir", getTempPath().toString() };
+    String[] args = {
+      "-i", getInputPath().toString(),
+      "-o", entropyPath.toString(),
+      "-s", "value",
+      "--tempDir", getTempPath().toString(),
+    };
     ToolRunner.run(new Entropy(), args);
     entropy = readDoubleFromPath(entropyPath);
   }
 
   private void calculateConditionalEntropy() throws Exception {
-    String[] args = { "-i", getInputPath().toString(), "-o", conditionalEntropyPath.toString(),
-        "--tempDir", getTempPath().toString() };
+    String[] args = {
+      "-i", getInputPath().toString(),
+      "-o", conditionalEntropyPath.toString(),
+      "--tempDir", getTempPath().toString(),
+    };
     ToolRunner.run(new ConditionalEntropy(), args);
     conditionalEntropy = readDoubleFromPath(conditionalEntropyPath);
   }

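InformationGain extends AbstractJob, so it is itself a Tool and can be driven exactly the way it drives Entropy above. A hedged usage sketch with placeholder paths:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.util.ToolRunner;
    import org.apache.mahout.math.stats.entropy.InformationGain;

    public final class RunInformationGain {
      private RunInformationGain() {
      }

      public static void main(String[] args) throws Exception {
        String[] igArgs = {
          "-i", "/data/sequence-file",      // placeholder input path
          "-o", "/out/information-gain",    // placeholder output path
          "--tempDir", "/tmp/ig",           // placeholder temp path
        };
        ToolRunner.run(new Configuration(), new InformationGain(), igArgs);
      }
    }
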
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/DictionaryVectorizer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/DictionaryVectorizer.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/DictionaryVectorizer.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/DictionaryVectorizer.java Wed Jun 20 12:07:50 2012
@@ -59,29 +59,20 @@ import org.apache.mahout.vectorizer.term
  * value containing the tokenized document. You may use {@link DocumentProcessor} to tokenize the document.
  * This is a dictionary based Vectorizer.
  */
-public final class DictionaryVectorizer implements Vectorizer{
+public final class DictionaryVectorizer implements Vectorizer {
   
   public static final String DOCUMENT_VECTOR_OUTPUT_FOLDER = "tf-vectors";
-  
   public static final String MIN_SUPPORT = "min.support";
-  
   public static final String MAX_NGRAMS = "max.ngrams";
-  
   public static final int DEFAULT_MIN_SUPPORT = 2;
   
   private static final String DICTIONARY_FILE = "dictionary.file-";
-  
   private static final int MAX_CHUNKSIZE = 10000;
-  
   private static final int MIN_CHUNKSIZE = 100;
-  
   private static final String OUTPUT_FILES_PATTERN = "part-*";
-  
   // 4 byte overhead for each entry in the OpenObjectIntHashMap
   private static final int DICTIONARY_BYTE_OVERHEAD = 4;
-  
   private static final String VECTOR_OUTPUT_FOLDER = "partial-vectors-";
-  
   private static final String DICTIONARY_JOB_FOLDER = "wordcount";
   
   /**
@@ -90,7 +81,8 @@ public final class DictionaryVectorizer 
   private DictionaryVectorizer() {
   }
 
-  //TODO: move more of SparseVectorsFromSequenceFile in here, and then fold SparseVectorsFrom with EncodedVectorsFrom to have one framework.
+  //TODO: move more of SparseVectorsFromSequenceFile in here, and then fold SparseVectorsFrom with
+  // EncodedVectorsFrom to have one framework.
 
   @Override
   public void createVectors(Path input, Path output, VectorizerConfig config)
@@ -325,8 +317,9 @@ public final class DictionaryVectorizer 
     HadoopUtil.delete(conf, output);
     
     boolean succeeded = job.waitForCompletion(true);
-    if (!succeeded) 
+    if (!succeeded) {
       throw new IllegalStateException("Job failed!");
+    }
   }
   
   /**
@@ -363,7 +356,8 @@ public final class DictionaryVectorizer 
     HadoopUtil.delete(conf, output);
     
     boolean succeeded = job.waitForCompletion(true);
-    if (!succeeded) 
+    if (!succeeded) {
       throw new IllegalStateException("Job failed!");
+    }
   }
 }

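The waitForCompletion guard braced above appears, and is braced the same way, in half a dozen files in this commit. Shown once in isolation (helper name is illustrative):

    import java.io.IOException;
    import org.apache.hadoop.mapreduce.Job;

    public final class JobGuard {
      private JobGuard() {
      }

      /** Runs the job and fails fast with an unchecked exception if it did not succeed. */
      public static void runOrFail(Job job)
        throws IOException, InterruptedException, ClassNotFoundException {
        boolean succeeded = job.waitForCompletion(true);
        if (!succeeded) {
          throw new IllegalStateException("Job failed!");
        }
      }
    }
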
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/DocumentProcessor.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/DocumentProcessor.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/DocumentProcessor.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/DocumentProcessor.java Wed Jun 20 12:07:50 2012
@@ -91,8 +91,9 @@ public final class DocumentProcessor {
     HadoopUtil.delete(conf, output);
 
     boolean succeeded = job.waitForCompletion(true);
-    if (!succeeded) 
+    if (!succeeded) {
       throw new IllegalStateException("Job failed!");
+    }
 
   }
 }

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/EncodedVectorsFromSequenceFiles.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/EncodedVectorsFromSequenceFiles.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/EncodedVectorsFromSequenceFiles.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/EncodedVectorsFromSequenceFiles.java Wed Jun 20 12:07:50 2012
@@ -27,14 +27,11 @@ import org.apache.mahout.common.HadoopUt
 import org.apache.mahout.common.commandline.DefaultOptionCreator;
 import org.apache.mahout.vectorizer.encoders.FeatureVectorEncoder;
 import org.apache.mahout.vectorizer.encoders.LuceneTextValueEncoder;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 
 /**
  * Converts a given set of sequence files into SparseVectors
  */
 public final class EncodedVectorsFromSequenceFiles extends AbstractJob {
-  private static final Logger log = LoggerFactory.getLogger(EncodedVectorsFromSequenceFiles.class);
 
   public static void main(String[] args) throws Exception {
     ToolRunner.run(new Configuration(), new EncodedVectorsFromSequenceFiles(), args);
@@ -45,11 +42,21 @@ public final class EncodedVectorsFromSeq
     addInputOption();
     addOutputOption();
     addOption(DefaultOptionCreator.analyzerOption().create());
-    addOption(buildOption("sequentialAccessVector", "seq", "(Optional) Whether output vectors should be SequentialAccessVectors. If set true else false", false, false, null));
-    addOption(buildOption("namedVector", "nv", "Create named vectors using the key.  False by default", false, false, null));
-    addOption("cardinality", "c", "The cardinality to use for creating the vectors.  Default is 5000", String.valueOf(5000));
-    addOption("encoderFieldName", "en", "The name of the encoder to be passed to the FeatureVectorEncoder constructor.  Default is text.  Note this is not the class name of a FeatureValueEncoder, but is instead the construction argument.", "text");
-    addOption("encoderClass", "ec", "The class name of the encoder to be used. Default is " + LuceneTextValueEncoder.class.getName(), LuceneTextValueEncoder.class.getName());
+    addOption(buildOption("sequentialAccessVector", "seq",
+                          "(Optional) Whether output vectors should be SequentialAccessVectors. " +
+                              "If set true else false",
+                          false, false, null));
+    addOption(buildOption("namedVector", "nv",
+                          "Create named vectors using the key.  False by default", false, false, null));
+    addOption("cardinality", "c",
+              "The cardinality to use for creating the vectors.  Default is 5000", "5000");
+    addOption("encoderFieldName", "en",
+              "The name of the encoder to be passed to the FeatureVectorEncoder constructor. Default is text. " +
+                  "Note this is not the class name of a FeatureValueEncoder, but is instead the construction argument.",
+              "text");
+    addOption("encoderClass", "ec",
+              "The class name of the encoder to be used. Default is " + LuceneTextValueEncoder.class.getName(),
+              LuceneTextValueEncoder.class.getName());
     addOption(DefaultOptionCreator.overwriteOption().create());
     if (parseArguments(args) == null) {
       return -1;
@@ -64,12 +71,10 @@ public final class EncodedVectorsFromSeq
 
     Class<? extends Analyzer> analyzerClass = getAnalyzerClassFromOption();
 
-
     Configuration conf = getConf();
 
     boolean sequentialAccessOutput = hasOption("sequentialAccessVector");
 
-
     boolean namedVectors = hasOption("namedVector");
     int cardinality = 5000;
     if (hasOption("cardinality")) {

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/HighDFWordsPruner.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/HighDFWordsPruner.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/HighDFWordsPruner.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/HighDFWordsPruner.java Wed Jun 20 12:07:50 2012
@@ -1,5 +1,4 @@
-package org.apache.mahout.vectorizer;
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -16,6 +15,8 @@ package org.apache.mahout.vectorizer;
  * limitations under the License.
  */
 
+package org.apache.mahout.vectorizer;
+
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.filecache.DistributedCache;
 import org.apache.hadoop.fs.Path;
@@ -38,7 +39,7 @@ import java.net.URI;
 import java.util.ArrayList;
 import java.util.List;
 
-public class HighDFWordsPruner {
+public final class HighDFWordsPruner {
 
   public static final String OUT_DIR_SUFFIX = "-pruned";
   public static final String STD_CALC_DIR = "stdcalc";
@@ -47,14 +48,12 @@ public class HighDFWordsPruner {
   private HighDFWordsPruner() {
   }
 
-
   public static void pruneVectors(Path tfDir, Path prunedTFDir, Path prunedPartialTFDir, long maxDF,
                                   Configuration baseConf,
                                   Pair<Long[], List<Path>> docFrequenciesFeatures,
                                   float normPower,
                                   boolean logNormalize,
-                                  int numReducers) throws IOException, InterruptedException,
-          ClassNotFoundException {
+                                  int numReducers) throws IOException, InterruptedException, ClassNotFoundException {
 
     int partialVectorIndex = 0;
     List<Path> partialVectorPaths = new ArrayList<Path>();
@@ -64,8 +63,7 @@ public class HighDFWordsPruner {
       pruneVectorsPartial(tfDir, partialVectorOutputPath, path, maxDF, baseConf);
     }
 
-    mergePartialVectors(partialVectorPaths, prunedTFDir, baseConf, normPower, logNormalize,
-            numReducers);
+    mergePartialVectors(partialVectorPaths, prunedTFDir, baseConf, normPower, logNormalize, numReducers);
     HadoopUtil.delete(new Configuration(baseConf), prunedPartialTFDir);
   }
 
@@ -93,8 +91,9 @@ public class HighDFWordsPruner {
     HadoopUtil.delete(conf, output);
 
     boolean succeeded = job.waitForCompletion(true);
-    if (!succeeded) 
+    if (!succeeded) {
       throw new IllegalStateException("Job failed!");
+    }
   }
 
   public static void mergePartialVectors(Iterable<Path> partialVectorPaths,
@@ -103,7 +102,7 @@ public class HighDFWordsPruner {
                                          float normPower,
                                          boolean logNormalize,
                                          int numReducers)
-          throws IOException, InterruptedException, ClassNotFoundException {
+    throws IOException, InterruptedException, ClassNotFoundException {
 
     Configuration conf = new Configuration(baseConf);
     // this conf parameter needs to be set enable serialisation of conf values
@@ -132,8 +131,9 @@ public class HighDFWordsPruner {
     HadoopUtil.delete(conf, output);
 
     boolean succeeded = job.waitForCompletion(true);
-    if (!succeeded) 
+    if (!succeeded) {
       throw new IllegalStateException("Job failed!");
+    }
   }
 
   private static String getCommaSeparatedPaths(Iterable<Path> paths) {

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SimpleTextEncodingVectorizer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SimpleTextEncodingVectorizer.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SimpleTextEncodingVectorizer.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SimpleTextEncodingVectorizer.java Wed Jun 20 12:07:50 2012
@@ -63,8 +63,9 @@ public class SimpleTextEncodingVectorize
     boolean finished = job.waitForCompletion(true);
 
     log.info("result of run: {}", finished);
-    if (!finished) 
+    if (!finished) {
       throw new IllegalStateException("Job failed!");
+    }
   }
 
 }

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFiles.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFiles.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFiles.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFiles.java Wed Jun 20 12:07:50 2012
@@ -86,16 +86,17 @@ public final class SparseVectorsFromSequ
       "The minimum document frequency.  Default is 1").withShortName("md").create();
 
     Option maxDFPercentOpt = obuilder.withLongName("maxDFPercent").withRequired(false).withArgument(
-      abuilder.withName("maxDFPercent").withMinimum(1).withMaximum(1).create()).withDescription(
-      "The max percentage of docs for the DF.  Can be used to remove really high frequency terms."
-          + " Expressed as an integer between 0 and 100. Default is 99.  If maxDFSigma is also set, it will override this value.").withShortName("x").create();
+        abuilder.withName("maxDFPercent").withMinimum(1).withMaximum(1).create()).withDescription(
+        "The max percentage of docs for the DF.  Can be used to remove really high frequency terms."
+            + " Expressed as an integer between 0 and 100. Default is 99.  If maxDFSigma is also set, "
+            + "it will override this value.").withShortName("x").create();
 
     Option maxDFSigmaOpt = obuilder.withLongName("maxDFSigma").withRequired(false).withArgument(
       abuilder.withName("maxDFSigma").withMinimum(1).withMaximum(1).create()).withDescription(
-      "What portion of the tf (tf-idf) vectors to be used, expressed in times the standard deviation (sigma) of the document frequencies of these vectors." 
-              + "  Can be used to remove really high frequency terms."
-          + " Expressed as a double value. Good value to be specified is 3.0. In case the value is less then 0 no vectors " 
-              + "will be filtered out. Default is -1.0.  Overrides maxDFPercent").withShortName("xs").create();
+      "What portion of the tf (tf-idf) vectors to be used, expressed in times the standard deviation (sigma) "
+          + "of the document frequencies of these vectors. Can be used to remove really high frequency terms."
+          + " Expressed as a double value. Good value to be specified is 3.0. In case the value is less than 0 "
+          + "no vectors will be filtered out. Default is -1.0.  Overrides maxDFPercent").withShortName("xs").create();
     
     Option minLLROpt = obuilder.withLongName("minLLR").withRequired(false).withArgument(
       abuilder.withName("minLLR").withMinimum(1).withMaximum(1).create()).withDescription(
@@ -139,8 +140,8 @@ public final class SparseVectorsFromSequ
     
     Group group = gbuilder.withName("Options").withOption(minSupportOpt).withOption(analyzerNameOpt)
         .withOption(chunkSizeOpt).withOption(outputDirOpt).withOption(inputDirOpt).withOption(minDFOpt)
-        .withOption(maxDFSigmaOpt).withOption(maxDFPercentOpt).withOption(weightOpt).withOption(powerOpt).withOption(minLLROpt)
-        .withOption(numReduceTasksOpt).withOption(maxNGramSizeOpt).withOption(overwriteOutput)
+        .withOption(maxDFSigmaOpt).withOption(maxDFPercentOpt).withOption(weightOpt).withOption(powerOpt)
+        .withOption(minLLROpt).withOption(numReduceTasksOpt).withOption(maxNGramSizeOpt).withOption(overwriteOutput)
         .withOption(helpOpt).withOption(sequentialAccessVectorOpt).withOption(namedVectorOpt)
         .withOption(logNormalizeOpt)
         .create();
@@ -229,7 +230,7 @@ public final class SparseVectorsFromSequ
       }
       double maxDFSigma = -1.0;
       if (cmdLine.hasOption(maxDFSigmaOpt)) {
-    	  maxDFSigma = Double.parseDouble(cmdLine.getValue(maxDFSigmaOpt).toString());
+        maxDFSigma = Double.parseDouble(cmdLine.getValue(maxDFSigmaOpt).toString());
       }
       
       float norm = PartialVectorMerger.NO_NORMALIZING;
@@ -249,7 +250,8 @@ public final class SparseVectorsFromSequ
 
       Configuration conf = getConf();
       Path tokenizedPath = new Path(outputDir, DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER);
-      //TODO: move this into DictionaryVectorizer , and then fold SparseVectorsFrom with EncodedVectorsFrom to have one framework for all of this.
+      //TODO: move this into DictionaryVectorizer , and then fold SparseVectorsFrom with EncodedVectorsFrom
+      // to have one framework for all of this.
       DocumentProcessor.tokenizeDocuments(inputDir, analyzerClass, tokenizedPath, conf);
 
       boolean sequentialAccessOutput = false;
@@ -262,65 +264,91 @@ public final class SparseVectorsFromSequ
         namedVectors = true;
       }
       boolean shouldPrune = maxDFSigma >=0.0;
-      String tfDirName = shouldPrune ? DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER+"-toprune" : DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER;
+      String tfDirName = shouldPrune
+          ? DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER+"-toprune"
+          : DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER;
 
-      if (!processIdf) {
-        DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir, tfDirName, conf, minSupport, maxNGramSize,
-          minLLRValue, norm, logNormalize, reduceTasks, chunkSize, sequentialAccessOutput, namedVectors);
+      if (processIdf) {
+        DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath,
+                                                        outputDir,
+                                                        tfDirName,
+                                                        conf,
+                                                        minSupport,
+                                                        maxNGramSize,
+                                                        minLLRValue,
+                                                        -1.0f,
+                                                        false,
+                                                        reduceTasks,
+                                                        chunkSize,
+                                                        sequentialAccessOutput,
+                                                        namedVectors);
       } else {
-        DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir, tfDirName, conf, minSupport, maxNGramSize,
-          minLLRValue, -1.0f, false, reduceTasks, chunkSize, sequentialAccessOutput, namedVectors);
+        DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath,
+                                                        outputDir,
+                                                        tfDirName,
+                                                        conf,
+                                                        minSupport,
+                                                        maxNGramSize,
+                                                        minLLRValue,
+                                                        norm,
+                                                        logNormalize,
+                                                        reduceTasks,
+                                                        chunkSize,
+                                                        sequentialAccessOutput,
+                                                        namedVectors);
       }
+
       Pair<Long[], List<Path>> docFrequenciesFeatures = null;
-       // Should document frequency features be processed
-       if (shouldPrune || processIdf) {
-         docFrequenciesFeatures = TFIDFConverter.calculateDF(new Path(outputDir, tfDirName),
-                 outputDir, conf, chunkSize);
-       }
-
-       long maxDF = maxDFPercent; //if we are pruning by std dev, then this will get changed
-       if (shouldPrune) {
-         Path dfDir = new Path(outputDir, TFIDFConverter.WORDCOUNT_OUTPUT_FOLDER);
-         Path stdCalcDir = new Path(outputDir, HighDFWordsPruner.STD_CALC_DIR);
-
-         // Calculate the standard deviation
-         double stdDev = BasicStats.stdDevForGivenMean(dfDir, stdCalcDir, 0.0, conf);
-         long vectorCount = docFrequenciesFeatures.getFirst()[1];
-         maxDF = (int) (100.0 * maxDFSigma * stdDev / vectorCount);
-
-         // Prune the term frequency vectors
-         Path tfDir = new Path(outputDir, tfDirName);
-         Path prunedTFDir = new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER);
-         Path prunedPartialTFDir = new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER
-                 + "-partial");
-         if (processIdf) {
-           HighDFWordsPruner.pruneVectors(tfDir,
-                                          prunedTFDir,
-                                          prunedPartialTFDir,
-                                          maxDF,
-                                          conf,
-                                          docFrequenciesFeatures,
-                                          -1.0f,
-                                          false,
-                                          reduceTasks);
-         } else {
-           HighDFWordsPruner.pruneVectors(tfDir,
-                                          prunedTFDir,
-                                          prunedPartialTFDir,
-                                          maxDF,
-                                          conf,
-                                          docFrequenciesFeatures,
-                                          norm,
-                                          logNormalize,
-                                          reduceTasks);
-         }
-         HadoopUtil.delete(new Configuration(conf), tfDir);
-       }
+      // Should document frequency features be processed
+      if (shouldPrune || processIdf) {
+        docFrequenciesFeatures =
+            TFIDFConverter.calculateDF(new Path(outputDir, tfDirName),outputDir, conf, chunkSize);
+      }
+
+      long maxDF = maxDFPercent; //if we are pruning by std dev, then this will get changed
+      if (shouldPrune) {
+        Path dfDir = new Path(outputDir, TFIDFConverter.WORDCOUNT_OUTPUT_FOLDER);
+        Path stdCalcDir = new Path(outputDir, HighDFWordsPruner.STD_CALC_DIR);
+
+        // Calculate the standard deviation
+        double stdDev = BasicStats.stdDevForGivenMean(dfDir, stdCalcDir, 0.0, conf);
+        long vectorCount = docFrequenciesFeatures.getFirst()[1];
+        maxDF = (int) (100.0 * maxDFSigma * stdDev / vectorCount);
+
+        // Prune the term frequency vectors
+        Path tfDir = new Path(outputDir, tfDirName);
+        Path prunedTFDir = new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER);
+        Path prunedPartialTFDir =
+            new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER + "-partial");
+
+        if (processIdf) {
+          HighDFWordsPruner.pruneVectors(tfDir,
+                                         prunedTFDir,
+                                         prunedPartialTFDir,
+                                         maxDF,
+                                         conf,
+                                         docFrequenciesFeatures,
+                                         -1.0f,
+                                         false,
+                                         reduceTasks);
+        } else {
+          HighDFWordsPruner.pruneVectors(tfDir,
+                                         prunedTFDir,
+                                         prunedPartialTFDir,
+                                         maxDF,
+                                         conf,
+                                         docFrequenciesFeatures,
+                                         norm,
+                                         logNormalize,
+                                         reduceTasks);
+        }
+        HadoopUtil.delete(new Configuration(conf), tfDir);
+      }
       if (processIdf) {
-          TFIDFConverter.processTfIdf(
-                 new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER),
-                 outputDir, conf, docFrequenciesFeatures, minDf, maxDF, norm, logNormalize,
-                 sequentialAccessOutput, namedVectors, reduceTasks);
+        TFIDFConverter.processTfIdf(
+               new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER),
+               outputDir, conf, docFrequenciesFeatures, minDf, maxDF, norm, logNormalize,
+               sequentialAccessOutput, namedVectors, reduceTasks);
       }
     } catch (OptionException e) {
       log.error("Exception", e);

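The sigma-based cutoff reindented above is worth a worked example: maxDF = (int) (100.0 * maxDFSigma * stdDev / vectorCount) turns a standard-deviation multiple into a document-frequency ceiling expressed as a percentage of the corpus. With made-up numbers:

    public final class MaxDfCutoffDemo {
      private MaxDfCutoffDemo() {
      }

      public static void main(String[] args) {
        double maxDFSigma = 3.0;    // the "good value" suggested by the --maxDFSigma help text
        double stdDev = 120.0;      // assumed std dev of the document frequencies
        long vectorCount = 10000;   // assumed number of documents
        long maxDF = (int) (100.0 * maxDFSigma * stdDev / vectorCount);
        System.out.println("maxDF = " + maxDF + "%");   // prints: maxDF = 3%
      }
    }
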
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/Vectorizer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/Vectorizer.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/Vectorizer.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/Vectorizer.java Wed Jun 20 12:07:50 2012
@@ -23,6 +23,7 @@ import java.io.IOException;
 
 public interface Vectorizer {
 
-  void createVectors(Path input, Path output, VectorizerConfig config) throws IOException, ClassNotFoundException, InterruptedException;
+  void createVectors(Path input, Path output, VectorizerConfig config)
+    throws IOException, ClassNotFoundException, InterruptedException;
 
 }

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/collocations/llr/CollocDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/collocations/llr/CollocDriver.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/collocations/llr/CollocDriver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/collocations/llr/CollocDriver.java Wed Jun 20 12:07:50 2012
@@ -235,8 +235,9 @@ public final class CollocDriver extends 
     job.setNumReduceTasks(reduceTasks);
     
     boolean succeeded = job.waitForCompletion(true);
-    if (!succeeded) 
+    if (!succeeded) {
       throw new IllegalStateException("Job failed!");
+    }
 
     return job.getCounters().findCounter(CollocMapper.Count.NGRAM_TOTAL).getValue();
   }
@@ -277,7 +278,8 @@ public final class CollocDriver extends 
     job.setNumReduceTasks(reduceTasks);
 
     boolean succeeded = job.waitForCompletion(true);
-    if (!succeeded) 
+    if (!succeeded) {
       throw new IllegalStateException("Job failed!");
+    }
   }
 }

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/collocations/llr/Gram.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/collocations/llr/Gram.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/collocations/llr/Gram.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/collocations/llr/Gram.java Wed Jun 20 12:07:50 2012
@@ -23,6 +23,7 @@ import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.nio.charset.CharacterCodingException;
 
+import com.google.common.base.Preconditions;
 import org.apache.hadoop.io.BinaryComparable;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.WritableComparable;
@@ -96,11 +97,7 @@ public class Gram extends BinaryComparab
    *          whether the gram is at the head of its text unit or tail or unigram
    */
   public Gram(String ngram, int frequency, Type type) {
-    
-    if (ngram == null) {
-      throw new NullPointerException();
-    }
-    
+    Preconditions.checkNotNull(ngram);
     try {  
       // extra character is used for storing type which is part 
       // of the sort key.
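
The hunk above swaps a hand-rolled null check for Guava's Preconditions, which
this commit imports at the top of Gram.java. A minimal sketch of the idiom,
assuming only Guava on the classpath (requireNgram is a made-up name):

    import com.google.common.base.Preconditions;

    public final class NullCheckSketch {

      // checkNotNull throws NullPointerException for null and returns its
      // argument otherwise, so it can be used inline in constructors.
      static String requireNgram(String ngram) {
        return Preconditions.checkNotNull(ngram, "ngram must not be null");
      }

      public static void main(String[] args) {
        System.out.println(requireNgram("foo bar")); // prints "foo bar"
        try {
          requireNgram(null);
        } catch (NullPointerException expected) {
          System.out.println("rejected null");
        }
      }
    }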

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/common/PartialVectorMerger.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/common/PartialVectorMerger.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/common/PartialVectorMerger.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/common/PartialVectorMerger.java Wed Jun 20 12:07:50 2012
@@ -127,8 +127,9 @@ public final class PartialVectorMerger {
     HadoopUtil.delete(conf, output);
 
     boolean succeeded = job.waitForCompletion(true);
-    if (!succeeded) 
+    if (!succeeded) {
       throw new IllegalStateException("Job failed!");
+    }
   }
 
   private static String getCommaSeparatedPaths(Iterable<Path> paths) {

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/AdaptiveWordValueEncoder.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/AdaptiveWordValueEncoder.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/AdaptiveWordValueEncoder.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/AdaptiveWordValueEncoder.java Wed Jun 20 12:07:50 2012
@@ -17,6 +17,7 @@
 
 package org.apache.mahout.vectorizer.encoders;
 
+import com.google.common.base.Charsets;
 import com.google.common.collect.HashMultiset;
 import com.google.common.collect.Multiset;
 import org.apache.mahout.math.Vector;
@@ -57,7 +58,7 @@ public class AdaptiveWordValueEncoder ex
     // the counts here are adjusted so that every observed value has an extra 0.5 count
     // as does a hypothetical unobserved value.  This smooths our estimates a bit and
     // allows the first word seen to have a non-zero weight of -log(1.5 / 2)
-    double thisWord = dictionary.count(new String(originalForm)) + 0.5;
+    double thisWord = dictionary.count(new String(originalForm, Charsets.UTF_8)) + 0.5;
     double allWords = dictionary.size() + dictionary.elementSet().size() * 0.5 + 0.5;
     return -Math.log(thisWord / allWords);
   }
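
Several hunks in this commit (here and in the caching encoders below) pin
String/byte[] conversions to UTF-8 instead of the platform default charset.
A minimal round-trip sketch, assuming Guava's Charsets:

    import com.google.common.base.Charsets;

    public final class CharsetSketch {
      public static void main(String[] args) {
        byte[] encoded = "na\u00efve".getBytes(Charsets.UTF_8); // 6 bytes: i-diaeresis takes 2
        // new String(encoded) would decode with the JVM's default charset,
        // which differs from machine to machine; naming UTF-8 makes the
        // round trip deterministic everywhere the jobs run.
        String decoded = new String(encoded, Charsets.UTF_8);
        System.out.println("na\u00efve".equals(decoded)); // true
      }
    }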

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/CachingContinuousValueEncoder.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/CachingContinuousValueEncoder.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/CachingContinuousValueEncoder.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/CachingContinuousValueEncoder.java Wed Jun 20 12:07:50 2012
@@ -17,6 +17,7 @@
 
 package org.apache.mahout.vectorizer.encoders;
 
+import com.google.common.base.Charsets;
 import org.apache.mahout.math.map.OpenIntIntHashMap;
 
 import com.google.common.base.Preconditions;
@@ -54,7 +55,7 @@ public class CachingContinuousValueEncod
     if (caches[probe].containsKey(originalForm.hashCode())) {
       return caches[probe].get(originalForm.hashCode());
     }
-    int hash = hashForProbe(originalForm.getBytes(), dataSize, name, probe);
+    int hash = hashForProbe(originalForm.getBytes(Charsets.UTF_8), dataSize, name, probe);
     caches[probe].put(originalForm.hashCode(), hash);
     return hash;
   }

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/CachingStaticWordValueEncoder.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/CachingStaticWordValueEncoder.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/CachingStaticWordValueEncoder.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/CachingStaticWordValueEncoder.java Wed Jun 20 12:07:50 2012
@@ -17,6 +17,7 @@
 
 package org.apache.mahout.vectorizer.encoders;
 
+import com.google.common.base.Charsets;
 import org.apache.mahout.math.map.OpenIntIntHashMap;
 
 import com.google.common.base.Preconditions;
@@ -55,7 +56,7 @@ public class CachingStaticWordValueEncod
     if (caches[probe].containsKey(originalForm.hashCode())) {
       return caches[probe].get(originalForm.hashCode());
     }
-    int hash = hashForProbe(originalForm.getBytes(), dataSize, name, probe);
+    int hash = hashForProbe(originalForm.getBytes(Charsets.UTF_8), dataSize, name, probe);
     caches[probe].put(originalForm.hashCode(), hash);
     return hash;
   }
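
Both caching encoders memoize the probe hash keyed by String.hashCode(). A
hypothetical stand-in for the OpenIntIntHashMap-backed cache, written against
plain java.util to stay self-contained; note the inherent caveat that two
distinct strings colliding on hashCode() would share one cached value:

    import com.google.common.base.Charsets;
    import java.util.HashMap;
    import java.util.Map;

    final class ProbeHashCacheSketch {

      private final Map<Integer, Integer> cache = new HashMap<Integer, Integer>();

      int hashFor(String originalForm, int probe) {
        Integer cached = cache.get(originalForm.hashCode());
        if (cached != null) {
          return cached; // skip re-hashing the UTF-8 bytes
        }
        int hash = expensiveHash(originalForm.getBytes(Charsets.UTF_8), probe);
        cache.put(originalForm.hashCode(), hash);
        return hash;
      }

      // Placeholder for hashForProbe: any deterministic function of the bytes.
      private static int expensiveHash(byte[] bytes, int probe) {
        int h = probe;
        for (byte b : bytes) {
          h = 31 * h + b;
        }
        return h;
      }
    }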

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/ContinuousValueEncoder.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/ContinuousValueEncoder.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/ContinuousValueEncoder.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/ContinuousValueEncoder.java Wed Jun 20 12:07:50 2012
@@ -17,12 +17,14 @@
 
 package org.apache.mahout.vectorizer.encoders;
 
+import com.google.common.base.Charsets;
 import org.apache.mahout.math.Vector;
 
 /**
  * Continuous values are stored in fixed randomized location in the feature vector.
  */
 public class ContinuousValueEncoder extends CachingValueEncoder {
+
   public ContinuousValueEncoder(String name) {
     super(name, CONTINUOUS_VALUE_HASH_SEED);
   }
@@ -48,11 +50,10 @@ public class ContinuousValueEncoder exte
 
   @Override
   protected double getWeight(byte[] originalForm, double w) {
-    if (originalForm != null) {
-      return w * Double.parseDouble(new String(originalForm));
-    } else {
+    if (originalForm == null) {
       return w;
     }
+    return w * Double.parseDouble(new String(originalForm, Charsets.UTF_8));
   }
 
   /**

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/term/TFPartialVectorReducer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/term/TFPartialVectorReducer.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/term/TFPartialVectorReducer.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/term/TFPartialVectorReducer.java Wed Jun 20 12:07:50 2012
@@ -62,7 +62,7 @@ public class TFPartialVectorReducer exte
 
   @Override
   protected void reduce(Text key, Iterable<StringTuple> values, Context context)
-          throws IOException, InterruptedException {
+    throws IOException, InterruptedException {
     Iterator<StringTuple> it = values.iterator();
     if (!it.hasNext()) {
       return;

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/tfidf/TFIDFConverter.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/tfidf/TFIDFConverter.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/tfidf/TFIDFConverter.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/tfidf/TFIDFConverter.java Wed Jun 20 12:07:50 2012
@@ -58,36 +58,24 @@ import org.apache.mahout.vectorizer.term
 public final class TFIDFConverter {
 
   public static final String VECTOR_COUNT = "vector.count";
-
   public static final String FEATURE_COUNT = "feature.count";
-
   public static final String MIN_DF = "min.df";
-
   public static final String MAX_DF = "max.df";
-
   //public static final String TFIDF_OUTPUT_FOLDER = "tfidf";
 
   private static final String DOCUMENT_VECTOR_OUTPUT_FOLDER = "tfidf-vectors";
-
   private static final String FREQUENCY_FILE = "frequency.file-";
-
   private static final int MAX_CHUNKSIZE = 10000;
-
   private static final int MIN_CHUNKSIZE = 100;
-
   private static final String OUTPUT_FILES_PATTERN = "part-*";
-
   private static final int SEQUENCEFILE_BYTE_OVERHEAD = 45;
-
   private static final String VECTOR_OUTPUT_FOLDER = "partial-vectors-";
-
   public static final String WORDCOUNT_OUTPUT_FOLDER = "df-count";
 
   /**
    * Cannot be initialized. Use the static functions
    */
   private TFIDFConverter() {
-
   }
 
   /**
@@ -182,10 +170,11 @@ public final class TFIDFConverter {
    *          recommend you use a split size of around 400-500MB so that two simultaneous reducers can create
    *          partial vectors without thrashing the system due to increased swapping
    */
-  public static Pair<Long[], List<Path>> calculateDF(Path input,
-                                  Path output,
-                                  Configuration baseConf,
-                                  int chunkSizeInMegabytes) throws IOException, InterruptedException, ClassNotFoundException {
+  public static Pair<Long[],List<Path>> calculateDF(Path input,
+                                                    Path output,
+                                                    Configuration baseConf,
+                                                    int chunkSizeInMegabytes)
+    throws IOException, InterruptedException, ClassNotFoundException {
 
     if (chunkSizeInMegabytes < MIN_CHUNKSIZE) {
       chunkSizeInMegabytes = MIN_CHUNKSIZE;
@@ -330,8 +319,9 @@ public final class TFIDFConverter {
     HadoopUtil.delete(conf, output);
 
     boolean succeeded = job.waitForCompletion(true);
-    if (!succeeded) 
+    if (!succeeded) {
       throw new IllegalStateException("Job failed!");
+    }
   }
 
   /**
@@ -366,7 +356,8 @@ public final class TFIDFConverter {
     HadoopUtil.delete(conf, output);
 
     boolean succeeded = job.waitForCompletion(true);
-    if (!succeeded) 
+    if (!succeeded) {
       throw new IllegalStateException("Job failed!");
+    }
   }
 }

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/recommender/svd/ALSWRFactorizerTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/recommender/svd/ALSWRFactorizerTest.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/recommender/svd/ALSWRFactorizerTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/recommender/svd/ALSWRFactorizerTest.java Wed Jun 20 12:07:50 2012
@@ -91,7 +91,7 @@ public class ALSWRFactorizerTest extends
   public void ratingVector() throws Exception {
     PreferenceArray prefs = dataModel.getPreferencesFromUser(1);
 
-    Vector ratingVector = factorizer.ratingVector(prefs);
+    Vector ratingVector = ALSWRFactorizer.ratingVector(prefs);
 
     assertEquals(prefs.length(), ratingVector.getNumNondefaultElements());
     assertEquals(prefs.get(0).getValue(), ratingVector.get(0), EPSILON);

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/Step1MapperTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/Step1MapperTest.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/Step1MapperTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/Step1MapperTest.java Wed Jun 20 12:07:50 2012
@@ -17,13 +17,7 @@
 
 package org.apache.mahout.classifier.df.mapreduce.partial;
 
-import static org.easymock.EasyMock.anyObject;
-import static org.easymock.EasyMock.capture;
-import static org.easymock.EasyMock.createMock;
-import static org.easymock.EasyMock.expectLastCall;
-import static org.easymock.EasyMock.replay;
-import static org.easymock.EasyMock.verify;
-
+import static org.easymock.EasyMock.*;
 import java.util.Random;
 
 import org.apache.hadoop.io.LongWritable;
@@ -35,7 +29,6 @@ import org.apache.mahout.classifier.df.d
 import org.apache.mahout.classifier.df.data.DataLoader;
 import org.apache.mahout.classifier.df.data.Dataset;
 import org.apache.mahout.classifier.df.data.Utils;
-import org.apache.mahout.classifier.df.mapreduce.MapredOutput;
 import org.apache.mahout.classifier.df.node.Leaf;
 import org.apache.mahout.classifier.df.node.Node;
 import org.apache.mahout.common.MahoutTestCase;
@@ -82,10 +75,11 @@ public final class Step1MapperTest exten
 
   private static class TreeIDCapture extends Capture<TreeID> {
 
-    public TreeIDCapture() {
+    private TreeIDCapture() {
       super(CaptureType.ALL);
     }
 
+    @Override
     public void setValue(final TreeID value) {
       super.setValue(value.clone());
     }
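
The private constructor and @Override above are cosmetic, but the class itself
shows a useful testing pattern: when a framework reuses the same mutable value
instance across calls (as Hadoop does with Writables), an EasyMock Capture
holding plain references ends up with every captured slot pointing at the
last-written object; cloning in setValue snapshots each value instead. A
sketch with a hypothetical value type standing in for TreeID:

    import org.easymock.Capture;
    import org.easymock.CaptureType;

    final class CloningCapture extends Capture<MutableId> {

      CloningCapture() {
        super(CaptureType.ALL); // keep every captured value, not just the last
      }

      @Override
      public void setValue(MutableId value) {
        super.setValue(value.clone()); // snapshot before the caller mutates it
      }
    }

    // Hypothetical mutable value with a public clone(), standing in for TreeID.
    final class MutableId implements Cloneable {
      long id;

      @Override
      public MutableId clone() {
        try {
          return (MutableId) super.clone();
        } catch (CloneNotSupportedException e) {
          throw new AssertionError(e);
        }
      }
    }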

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/ClusteringTestUtils.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/ClusteringTestUtils.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/ClusteringTestUtils.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/ClusteringTestUtils.java Wed Jun 20 12:07:50 2012
@@ -121,7 +121,7 @@ public final class ClusteringTestUtils {
       private final Random random;
       private final Sampler[] samplers;
 
-      public LDASampler(Matrix model, Random random) {
+      LDASampler(Matrix model, Random random) {
           this.random = random;
           samplers = new Sampler[model.numRows()];
           for (int i = 0; i < samplers.length; i++) {

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java Wed Jun 20 12:07:50 2012
@@ -17,6 +17,7 @@
 
 package org.apache.mahout.clustering.canopy;
 
+import java.util.Collection;
 import java.util.List;
 import java.util.Set;
 
@@ -352,8 +353,9 @@ public final class TestCanopyCreation ex
     }
   }
 
-  boolean findAndRemove(Pair<Double,Double> target, 
-                        List<Pair<Double,Double>> list, double epsilon) {
+  static boolean findAndRemove(Pair<Double, Double> target,
+                               Collection<Pair<Double, Double>> list,
+                               double epsilon) {
     for (Pair<Double,Double> curr : list) {
       if ( (Math.abs(target.getFirst() - curr.getFirst()) < epsilon) 
            && (Math.abs(target.getSecond() - curr.getSecond()) < epsilon) ) {

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/classify/ClusterClassificationDriverTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/classify/ClusterClassificationDriverTest.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/classify/ClusterClassificationDriverTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/classify/ClusterClassificationDriverTest.java Wed Jun 20 12:07:50 2012
@@ -24,8 +24,6 @@ import java.util.Set;
 
 import com.google.common.collect.Sets;
 
-import junit.framework.Assert;
-
 import org.apache.commons.lang.ArrayUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileStatus;
@@ -45,6 +43,7 @@ import org.apache.mahout.common.iterator
 import org.apache.mahout.math.RandomAccessSparseVector;
 import org.apache.mahout.math.Vector;
 import org.apache.mahout.math.VectorWritable;
+import org.junit.Assert;
 import org.junit.Before;
 import org.junit.Test;
 
@@ -56,19 +55,12 @@ public class ClusterClassificationDriver
       {5, 4}, {4, 5}, {5, 5}, {9, 9}, {8, 8}};
   
   private FileSystem fs;
-  
   private Path clusteringOutputPath;
-  
   private Configuration conf;
-  
   private Path pointsPath;
-  
   private Path classifiedOutputPath;
-  
   private List<Vector> firstCluster;
-  
   private List<Vector> secondCluster;
-  
   private List<Vector> thirdCluster;
   
   @Override
@@ -107,7 +99,7 @@ public class ClusterClassificationDriver
     ClusteringTestUtils.writePointsToFile(points, true, 
         new Path(pointsPath, "file1"), fs, conf);
     runClustering(pointsPath, conf, false);
-    runClassificationWithOutlierRemoval(conf, false);
+    runClassificationWithOutlierRemoval(false);
     collectVectorsForAssertion();
     assertVectorsWithOutlierRemoval();
   }
@@ -125,7 +117,7 @@ public class ClusterClassificationDriver
     ClusteringTestUtils.writePointsToFile(points,
         new Path(pointsPath, "file1"), fs, conf);
     runClustering(pointsPath, conf, true);
-    runClassificationWithoutOutlierRemoval(conf);
+    runClassificationWithoutOutlierRemoval();
     collectVectorsForAssertion();
     assertVectorsWithoutOutlierRemoval();
   }
@@ -143,7 +135,7 @@ public class ClusterClassificationDriver
     ClusteringTestUtils.writePointsToFile(points,
         new Path(pointsPath, "file1"), fs, conf);
     runClustering(pointsPath, conf, true);
-    runClassificationWithOutlierRemoval(conf, true);
+    runClassificationWithOutlierRemoval(true);
     collectVectorsForAssertion();
     assertVectorsWithOutlierRemoval();
   }
@@ -158,17 +150,14 @@ public class ClusterClassificationDriver
         finalClustersPath);
   }
   
-  private void runClassificationWithoutOutlierRemoval(Configuration conf)
-      throws IOException, InterruptedException, ClassNotFoundException {
-    ClusterClassificationDriver.run(pointsPath, clusteringOutputPath,
-        classifiedOutputPath, 0.0, true, true);
+  private void runClassificationWithoutOutlierRemoval()
+    throws IOException, InterruptedException, ClassNotFoundException {
+    ClusterClassificationDriver.run(pointsPath, clusteringOutputPath, classifiedOutputPath, 0.0, true, true);
   }
   
-  private void runClassificationWithOutlierRemoval(Configuration conf2,
-      boolean runSequential) throws IOException, InterruptedException,
-      ClassNotFoundException {
-    ClusterClassificationDriver.run(pointsPath, clusteringOutputPath,
-        classifiedOutputPath, 0.73, true, runSequential);
+  private void runClassificationWithOutlierRemoval(boolean runSequential)
+    throws IOException, InterruptedException, ClassNotFoundException {
+    ClusterClassificationDriver.run(pointsPath, clusteringOutputPath, classifiedOutputPath, 0.73, true, runSequential);
   }
   
   private void collectVectorsForAssertion() throws IOException {
@@ -188,11 +177,11 @@ public class ClusterClassificationDriver
   }
   
   private void collectVector(String clusterId, Vector vector) {
-    if (clusterId.equals("0")) {
+    if ("0".equals(clusterId)) {
       firstCluster.add(vector);
-    } else if (clusterId.equals("1")) {
+    } else if ("1".equals(clusterId)) {
       secondCluster.add(vector);
-    } else if (clusterId.equals("2")) {
+    } else if ("2".equals(clusterId)) {
       thirdCluster.add(vector);
     }
   }
@@ -233,23 +222,21 @@ public class ClusterClassificationDriver
   }
 
   private void checkClustersWithOutlierRemoval() {
-    Set<String> reference = Sets.newHashSet(new String[] {"{1:9.0,0:9.0}",
-                                                          "{1:1.0,0:1.0}"});
-    int singletonCnt = 0;
-    int emptyCnt = 0;
+    Set<String> reference = Sets.newHashSet("{1:9.0,0:9.0}", "{1:1.0,0:1.0}");
 
     List<List<Vector>> clusters = Lists.newArrayList();
     clusters.add(firstCluster);
     clusters.add(secondCluster);
     clusters.add(thirdCluster);
 
+    int singletonCnt = 0;
+    int emptyCnt = 0;
     for (List<Vector> vList : clusters) {
-      if (vList.size() == 0) {
+      if (vList.isEmpty()) {
         emptyCnt++;
       } else {
         singletonCnt++;
-        Assert.assertTrue("expecting only singleton clusters; got size=" + vList.size(),
-                          vList.size() == 1);
+        assertEquals("expecting only singleton clusters; got size=" + vList.size(), 1, vList.size());
         Assert.assertTrue("not expecting cluster:" + vList.get(0).asFormatString(),
                           reference.contains(vList.get(0).asFormatString()));
         reference.remove(vList.get(0).asFormatString());
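
The assertion rewrite above is more than style: on failure, assertEquals
reports both values, where assertTrue only reports that a condition was false.
A small JUnit 4 illustration (the test class and list are made up):

    import static org.junit.Assert.assertEquals;

    import java.util.Collections;
    import java.util.List;
    import org.junit.Test;

    public class SingletonClusterSketchTest {
      @Test
      public void singletonCluster() {
        List<Integer> vList = Collections.singletonList(42);
        assertEquals("expecting only singleton clusters", 1, vList.size()); // passes
        // Had vList held three elements, the failure message would read
        // "expecting only singleton clusters expected:<1> but was:<3>".
      }
    }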

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/dirichlet/TestMapReduce.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/dirichlet/TestMapReduce.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/dirichlet/TestMapReduce.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/dirichlet/TestMapReduce.java Wed Jun 20 12:07:50 2012
@@ -50,7 +50,7 @@ import com.google.common.io.Closeables;
 
 public final class TestMapReduce extends MahoutTestCase {
   
-  private Collection<VectorWritable> sampleData = Lists.newArrayList();
+  private final Collection<VectorWritable> sampleData = Lists.newArrayList();
   
   private FileSystem fs;
   
@@ -291,7 +291,7 @@ public final class TestMapReduce extends
     printModels(getClusters(outputPath, maxIterations));
   }
   
-  private void printModels(List<List<Cluster>> result) {
+  private static void printModels(Iterable<List<Cluster>> result) {
     int row = 0;
     StringBuilder models = new StringBuilder(100);
     for (List<Cluster> r : result) {
@@ -306,7 +306,7 @@ public final class TestMapReduce extends
     System.out.println(models.toString());
   }
   
-  private List<List<Cluster>> getClusters(Path output, int numIterations) throws IOException {
+  private Iterable<List<Cluster>> getClusters(Path output, int numIterations) throws IOException {
     List<List<Cluster>> result = new ArrayList<List<Cluster>>();
     for (int i = 1; i <= numIterations; i++) {
       ClusterClassifier posterior = new ClusterClassifier();

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java Wed Jun 20 12:07:50 2012
@@ -56,21 +56,6 @@ public final class TestFuzzyKmeansCluste
     fs = FileSystem.get(conf);
   }
 
-  private static double round(double val, int places) {
-    long factor = (long) Math.pow(10, places);
-
-    // Shift the decimal the correct number of places
-    // to the right.
-    val *= factor;
-
-    // Round to the nearest integer.
-    long tmp = Math.round(val);
-
-    // Shift the decimal the correct number of places
-    // back to the left.
-    return (double) tmp / factor;
-  }
-
   private static Vector tweakValue(Vector point) {
     return point.plus(0.1);
   }

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/iterator/TestClusterClassifier.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/iterator/TestClusterClassifier.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/iterator/TestClusterClassifier.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/iterator/TestClusterClassifier.java Wed Jun 20 12:07:50 2012
@@ -198,8 +198,7 @@ public final class TestClusterClassifier
   public void testClusterIteratorKMeans() {
     List<Vector> data = TestKmeansClustering.getPoints(TestKmeansClustering.REFERENCE);
     ClusterClassifier prior = newKlusterClassifier();
-    ClusterIterator iterator = new ClusterIterator();
-    ClusterClassifier posterior = iterator.iterate(data, prior, 5);
+    ClusterClassifier posterior = ClusterIterator.iterate(data, prior, 5);
     assertEquals(3, posterior.getModels().size());
     for (Cluster cluster : posterior.getModels()) {
       System.out.println(cluster.asFormatString(null));
@@ -210,8 +209,7 @@ public final class TestClusterClassifier
   public void testClusterIteratorDirichlet() {
     List<Vector> data = TestKmeansClustering.getPoints(TestKmeansClustering.REFERENCE);
     ClusterClassifier prior = newKlusterClassifier();
-    ClusterIterator iterator = new ClusterIterator();
-    ClusterClassifier posterior = iterator.iterate(data, prior, 5);
+    ClusterClassifier posterior = ClusterIterator.iterate(data, prior, 5);
     assertEquals(3, posterior.getModels().size());
     for (Cluster cluster : posterior.getModels()) {
       System.out.println(cluster.asFormatString(null));
@@ -235,7 +233,7 @@ public final class TestClusterClassifier
     for (Cluster cluster : prior.getModels()) {
       System.out.println(cluster.asFormatString(null));
     }
-    new ClusterIterator().iterateSeq(conf, pointsPath, path, outPath, 5);
+    ClusterIterator.iterateSeq(conf, pointsPath, path, outPath, 5);
     
     for (int i = 1; i <= 4; i++) {
       System.out.println("Classifier-" + i);
@@ -251,7 +249,7 @@ public final class TestClusterClassifier
   }
   
   @Test
-  public void testMRFileClusterIteratorKMeans() throws IOException, InterruptedException, ClassNotFoundException {
+  public void testMRFileClusterIteratorKMeans() throws Exception {
     Path pointsPath = getTestTempDirPath("points");
     Path priorPath = getTestTempDirPath("prior");
     Path outPath = getTestTempDirPath("output");
@@ -269,7 +267,7 @@ public final class TestClusterClassifier
     for (Cluster cluster : prior.getModels()) {
       System.out.println(cluster.asFormatString(null));
     }
-    new ClusterIterator().iterateMR(conf, pointsPath, path, outPath, 5);
+    ClusterIterator.iterateMR(conf, pointsPath, path, outPath, 5);
     
     for (int i = 1; i <= 4; i++) {
       System.out.println("Classifier-" + i);

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java Wed Jun 20 12:07:50 2012
@@ -281,7 +281,7 @@ public final class TestKmeansClustering 
                   && !got43) {
         got43 = true;
       } else {
-        assertTrue("got unexpected center: "+v+" ["+v.getClass().toString()+"]", false);
+        fail("got unexpected center: " + v + " [" + v.getClass().toString() + ']');
       }
     }
     assertEquals("got unexpected number of centers", 2, count);
@@ -301,11 +301,11 @@ public final class TestKmeansClustering 
       collector.collect(record.getFirst(), record.getSecond());
     }
     
-    boolean gotLowClust = false;  // clusters should be [1, *] and [2, *] 
-    boolean gotHighClust = false; // vs [3 , *],  [4 , *] and [5, *] 
+    //boolean gotLowClust = false;  // clusters should be [1, *] and [2, *]
+    //boolean gotHighClust = false; // vs [3 , *],  [4 , *] and [5, *]
     for (IntWritable k : collector.getKeys()) {
       List<WeightedVectorWritable> wvList = collector.getValue(k);
-      assertTrue("empty cluster!", wvList.size() != 0);
+      assertTrue("empty cluster!", !wvList.isEmpty());
       if (wvList.get(0).getVector().get(0) <= 2.0) {
         for (WeightedVectorWritable wv : wvList) {
           Vector v = wv.getVector();
@@ -313,7 +313,7 @@ public final class TestKmeansClustering 
           assertTrue("bad cluster!", v.get(idx) <= 2.0);
         }
         assertEquals("Wrong size cluster", 4, wvList.size());
-        gotLowClust= true;
+        //gotLowClust= true;
       } else {
         for (WeightedVectorWritable wv : wvList) {
           Vector v = wv.getVector();
@@ -321,7 +321,7 @@ public final class TestKmeansClustering 
           assertTrue("bad cluster!", v.get(idx) > 2.0);
         }
         assertEquals("Wrong size cluster", 5, wvList.size());
-        gotHighClust= true;
+        //gotHighClust= true;
       }
     }
   }

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/lda/cvb/TestCVBModelTrainer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/lda/cvb/TestCVBModelTrainer.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/lda/cvb/TestCVBModelTrainer.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/lda/cvb/TestCVBModelTrainer.java Wed Jun 20 12:07:50 2012
@@ -64,8 +64,7 @@ public class TestCVBModelTrainer extends
       double[] perps = new double[numTrials];
       for (int trial = 0; trial < numTrials; trial++) {
         InMemoryCollapsedVariationalBayes0 cvb =
-          new InMemoryCollapsedVariationalBayes0(sampledCorpus, terms, numTestTopics, ALPHA, ETA,
-                                                 2, 1, 0, (trial+1) * 123456L);
+          new InMemoryCollapsedVariationalBayes0(sampledCorpus, terms, numTestTopics, ALPHA, ETA, 2, 1, 0);
         cvb.setVerbose(true);
         perps[trial] = cvb.iterateUntilConvergence(0, 5, 0, 0.2);
         System.out.println(perps[trial]);

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java Wed Jun 20 12:07:50 2012
@@ -360,12 +360,13 @@ public final class TestMeanShift extends
     Configuration conf = new Configuration();
     FileSystem fs = FileSystem.get(input.toUri(), conf);
     Collection<VectorWritable> points = Lists.newArrayList();
+    // TODO fix test so it doesn't need this random seed!
     Random r = new Random(123);
     Vector[] permutedRaw = new Vector[raw.length];
-    for (int i = 0; i < raw.length; i++)
-      permutedRaw = raw;
-    for (int i = 0; i < permutedRaw.length; i++)
+    System.arraycopy(raw, 0, permutedRaw, 0, raw.length);
+    for (int i = 0; i < permutedRaw.length; i++) {
       permutedRaw[i] = permutedRaw[i + r.nextInt(raw.length - i)];
+    }
     for (Vector v : permutedRaw) {
       points.add(new VectorWritable(v));
     }
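
Note that the permutation loop kept above, permutedRaw[i] =
permutedRaw[i + r.nextInt(raw.length - i)], copies the chosen element forward
without swapping, so some vectors can appear twice in the result while others
vanish. A minimal Fisher-Yates sketch that yields a true permutation:

    import java.util.Random;

    public final class ShuffleSketch {
      static <T> void shuffle(T[] a, Random r) {
        for (int i = 0; i < a.length - 1; i++) {
          int j = i + r.nextInt(a.length - i); // j uniform in [i, a.length)
          T tmp = a[i];                        // swap instead of overwrite,
          a[i] = a[j];                         // so every element survives
          a[j] = tmp;
        }
      }
    }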