You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ra...@apache.org on 2018/06/27 14:51:54 UTC
[26/51] [partial] mahout git commit: MAHOUT-2042 and MAHOUT-2045 Delete directories which were moved/no longer in use

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java
deleted file mode 100644
index 6a8c659..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java
+++ /dev/null
@@ -1,99 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.lucene;
-
-import java.io.IOException;
-import java.util.Set;
-import java.util.TreeSet;
-
-import com.google.common.base.Preconditions;
-import org.apache.lucene.index.IndexReader;
-import org.apache.mahout.utils.vectors.TermInfo;
-import org.apache.mahout.vectorizer.Weight;
-
-/**
- * An {@link java.util.Iterator} over {@link org.apache.mahout.math.Vector}s that uses a Lucene index as the source
- * for creating the {@link org.apache.mahout.math.Vector}s. The field used to create the vectors currently must have
- * term vectors stored for it. Each vector is named by the {@code idField} value, or by the document index if null.
- */
-public class LuceneIterator extends AbstractLuceneIterator {
-
-  protected final Set<String> idFieldSelector; // holds only idField; null when idField is null
-  protected final String idField; // name of the field containing the id; may be null
-
-    /**
-   * Produce a LuceneIterator that can create the Vector plus normalize it; uses 0.0 for {@code maxPercentErrorDocs}.
-   *
-   * @param indexReader {@link IndexReader} to read the documents from.
-   * @param idField     field containing the id. May be null.
-   * @param field       field to use for the Vector
-   * @param termInfo    termInfo
-   * @param weight      weight
-   * @param normPower   the normalization value. Must be non-negative, or {@link LuceneIterable#NO_NORMALIZING}
-   */
-  public LuceneIterator(IndexReader indexReader, String idField, String field, TermInfo termInfo, Weight weight,
-                        double normPower) {
-    this(indexReader, idField, field, termInfo, weight, normPower, 0.0);
-  }
-
-  /** Produce a LuceneIterator that tolerates a bounded fraction of documents without a term freq vector.
-   * @param indexReader {@link IndexReader} to read the documents from.
-   * @param idField    field containing the id. May be null.
-   * @param field      field to use for the Vector
-   * @param termInfo   termInfo
-   * @param weight     weight
-   * @param normPower  the normalization value. Must be non-negative, or {@link LuceneIterable#NO_NORMALIZING}
-   * @param maxPercentErrorDocs largest fraction of documents tolerated without a term freq vector. In [0,1].
-   * @see #LuceneIterator(org.apache.lucene.index.IndexReader, String, String, org.apache.mahout.utils.vectors.TermInfo,
-   * org.apache.mahout.vectorizer.Weight, double)
-   */
-  public LuceneIterator(IndexReader indexReader,
-                        String idField,
-                        String field,
-                        TermInfo termInfo,
-                        Weight weight,
-                        double normPower,
-                        double maxPercentErrorDocs) {
-      super(termInfo, normPower, indexReader, weight, maxPercentErrorDocs, field); // runs before the checks below
-      // term docs(null) is a better way of iterating all the docs in Lucene
-    Preconditions.checkArgument(normPower == LuceneIterable.NO_NORMALIZING || normPower >= 0,
-        "normPower must be non-negative or -1, but normPower = " + normPower);
-    Preconditions.checkArgument(maxPercentErrorDocs >= 0.0 && maxPercentErrorDocs <= 1.0,
-        "Must be: 0.0 <= maxPercentErrorDocs <= 1.0");
-    this.idField = idField;
-    if (idField != null) {
-      idFieldSelector = new TreeSet<>();
-      idFieldSelector.add(idField);
-    } else {
-      /*No field containing the id was supplied, so the Lucene internal doc id is used as the vector name,
-      which is prone to error if the underlying index changes*/
-      idFieldSelector = null;
-    }
-  }
-
-  @Override
-  protected String getVectorName(int documentIndex) throws IOException { // name given to the vector of one document
-    String name;
-    if (idField != null) { // presumably idFieldSelector limits the fetch to the id field — confirm against Lucene API
-      name = indexReader.document(documentIndex, idFieldSelector).get(idField);
-    } else {
-      name = String.valueOf(documentIndex); // no id field: fall back to the document index itself
-    }
-    return name;
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java
deleted file mode 100644
index 5830ccc..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java
+++ /dev/null
@@ -1,64 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.lucene;
-
-import org.apache.lucene.util.BytesRef;
-import org.apache.mahout.math.RandomAccessSparseVector;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.utils.vectors.TermEntry;
-import org.apache.mahout.utils.vectors.TermInfo;
-import org.apache.mahout.vectorizer.Weight;
-
-
-/**
- * Builds one sparse term vector from (term, frequency) pairs, weighting each with a {@link Weight}. Not thread-safe
- */
-public class TFDFMapper  {
-
-  private Vector vector; // created by setExpectations; null until then
-  
-  private final Weight weight;
-  private long numTerms; // handed to weight.calculate (narrowed to int) for every mapped term
-  private final TermInfo termInfo;
-  private String field; // field whose terms are looked up in termInfo; set by setExpectations
-  private final int numDocs;
-  
-  public TFDFMapper(int numDocs, Weight weight, TermInfo termInfo) {
-    this.weight = weight;
-    this.termInfo = termInfo;
-    this.numDocs = numDocs;
-  }
-
-  public void setExpectations(String field, long numTerms) { // must be called before map(): it allocates the vector
-    this.field = field;
-    vector = new RandomAccessSparseVector(termInfo.totalTerms(field)); // fresh vector sized by the field's total terms
-    this.numTerms = numTerms;
-  }
-  
-  public void map(BytesRef term, int frequency) { // stores the weight of one term at that term's index
-    TermEntry entry = termInfo.getTermEntry(field, term.utf8ToString());
-    if (entry != null) { // terms for which termInfo has no entry are skipped silently
-      vector.setQuick(entry.getTermIdx(), weight.calculate(frequency, entry.getDocFreq(), (int)numTerms, numDocs));
-    }
-  }
-  
-  public Vector getVector() { // returns the live vector, not a copy
-    return this.vector;
-  }
-  
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/TermInfoClusterInOut.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/TermInfoClusterInOut.java b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/TermInfoClusterInOut.java
deleted file mode 100644
index b0311c7..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/TermInfoClusterInOut.java
+++ /dev/null
@@ -1,81 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.lucene;
-
-import org.apache.mahout.common.RandomUtils;
-
-class TermInfoClusterInOut implements Comparable<TermInfoClusterInOut> { // immutable; sorts by descending ratio
-
-  private final String term;
-  private final int inClusterDF;
-  private final int outClusterDF;
-  private final double logLikelihoodRatio;
-
-  TermInfoClusterInOut(String term, int inClusterDF, int outClusterDF, double logLikelihoodRatio) {
-    this.term = term;
-    this.inClusterDF = inClusterDF;
-    this.outClusterDF = outClusterDF;
-    this.logLikelihoodRatio = logLikelihoodRatio;
-  }
-
-  @Override
-  public int hashCode() { // XOR of all four fields, the same fields equals() compares
-    return term.hashCode() ^ inClusterDF ^ outClusterDF ^ RandomUtils.hashDouble(logLikelihoodRatio);
-  }
-
-  @Override
-  public boolean equals(Object o) {
-    if (!(o instanceof TermInfoClusterInOut)) { // also rejects null
-      return false;
-    }
-    TermInfoClusterInOut other = (TermInfoClusterInOut) o;
-    return term.equals(other.getTerm())
-        && inClusterDF == other.getInClusterDF()
-        && outClusterDF == other.getOutClusterDF()
-        && logLikelihoodRatio == other.getLogLikelihoodRatio(); // NOTE(review): == makes NaN unequal to itself
-  }
-
-  @Override
-  public int compareTo(TermInfoClusterInOut that) { // NOTE(review): ignores the DF fields that equals() checks
-    int res = Double.compare(that.logLikelihoodRatio, logLikelihoodRatio); // swapped operands: higher ratio first
-    if (res == 0) {
-      res = term.compareTo(that.term); // tie-break on the term, ascending
-    }
-    return res;
-  }
-
-  public int getInClusterDiff() { // in-cluster DF minus out-of-cluster DF
-    return this.inClusterDF - this.outClusterDF;
-  }
-
-  String getTerm() {
-    return term;
-  }
-
-  int getInClusterDF() {
-    return inClusterDF;
-  }
-
-  int getOutClusterDF() {
-    return outClusterDF;
-  }
-
-  double getLogLikelihoodRatio() {
-    return logLikelihoodRatio;
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/MySQLJDBCInMemoryItemSimilarityTest.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/MySQLJDBCInMemoryItemSimilarityTest.java b/integration/src/test/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/MySQLJDBCInMemoryItemSimilarityTest.java
deleted file mode 100644
index 463a45f..0000000
--- a/integration/src/test/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/MySQLJDBCInMemoryItemSimilarityTest.java
+++ /dev/null
@@ -1,79 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.impl.similarity.jdbc;
-
-import org.apache.mahout.cf.taste.impl.TasteTestCase;
-import org.apache.mahout.cf.taste.similarity.ItemSimilarity;
-import org.easymock.EasyMock;
-import org.junit.Test;
-
-import javax.sql.DataSource;
-import java.sql.Connection;
-import java.sql.PreparedStatement;
-import java.sql.ResultSet;
-
-public class MySQLJDBCInMemoryItemSimilarityTest extends TasteTestCase {
-
-  @Test
-  public void testMemoryLoad() throws Exception { // feeds three mocked rows and checks the similarities read back
-
-    DataSource dataSource = EasyMock.createMock(DataSource.class); // every JDBC object is a mock; no database
-    Connection connection = EasyMock.createMock(Connection.class);
-    PreparedStatement statement = EasyMock.createMock(PreparedStatement.class);
-    ResultSet resultSet = EasyMock.createMock(ResultSet.class);
-
-    EasyMock.expect(dataSource.getConnection()).andReturn(connection); // expectations are recorded from here on
-    EasyMock.expect(connection.prepareStatement(MySQLJDBCInMemoryItemSimilarity.DEFAULT_GET_ALL_ITEMSIMILARITIES_SQL,
-        ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY)).andReturn(statement);
-    statement.setFetchDirection(ResultSet.FETCH_FORWARD); // void call, recorded as an expectation
-    EasyMock.expect(statement.executeQuery()).andReturn(resultSet);
-
-    EasyMock.expect(resultSet.next()).andReturn(true); // row 1: (1, 2, 0.5)
-
-    EasyMock.expect(resultSet.getLong(1)).andReturn(1L);
-    EasyMock.expect(resultSet.getLong(2)).andReturn(2L);
-    EasyMock.expect(resultSet.getDouble(3)).andReturn(0.5);
-    EasyMock.expect(resultSet.next()).andReturn(true); // row 2: (1, 3, 0.4)
-
-    EasyMock.expect(resultSet.getLong(1)).andReturn(1L);
-    EasyMock.expect(resultSet.getLong(2)).andReturn(3L);
-    EasyMock.expect(resultSet.getDouble(3)).andReturn(0.4);
-    EasyMock.expect(resultSet.next()).andReturn(true); // row 3: (3, 4, 0.1)
-
-    EasyMock.expect(resultSet.getLong(1)).andReturn(3L);
-    EasyMock.expect(resultSet.getLong(2)).andReturn(4L);
-    EasyMock.expect(resultSet.getDouble(3)).andReturn(0.1);
-
-    EasyMock.expect(resultSet.next()).andReturn(false); // end of the result set
-
-    resultSet.close(); // the three close() calls are recorded as expectations too
-    statement.close();
-    connection.close();
-
-    EasyMock.replay(dataSource, connection, statement, resultSet); // switch the mocks from recording to replaying
-
-    ItemSimilarity similarity = new MySQLJDBCInMemoryItemSimilarity(dataSource); // presumably loads all rows here
-
-    assertEquals(0.5, similarity.itemSimilarity(1L, 2L), EPSILON);
-    assertEquals(0.4, similarity.itemSimilarity(1L, 3L), EPSILON);
-    assertEquals(0.1, similarity.itemSimilarity(3L, 4L), EPSILON);
-    assertTrue(Double.isNaN(similarity.itemSimilarity(1L, 4L))); // pair (1, 4) is not among the mocked rows
-
-    EasyMock.verify(dataSource, connection, statement, resultSet); // fails if a recorded call never happened
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java b/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
deleted file mode 100644
index 01d46fc..0000000
--- a/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
+++ /dev/null
@@ -1,236 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.FieldType;
-import org.apache.lucene.document.StringField;
-import org.apache.lucene.index.DirectoryReader;
-import org.apache.lucene.index.IndexOptions;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.IndexWriterConfig;
-import org.apache.lucene.store.RAMDirectory;
-import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver;
-import org.apache.mahout.clustering.kmeans.KMeansDriver;
-import org.apache.mahout.clustering.kmeans.RandomSeedGenerator;
-import org.apache.mahout.common.MahoutTestCase;
-import org.apache.mahout.common.distance.DistanceMeasure;
-import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
-import org.apache.mahout.math.NamedVector;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.VectorWritable;
-import org.apache.mahout.utils.clustering.ClusterDumper;
-import org.apache.mahout.utils.vectors.TermEntry;
-import org.apache.mahout.utils.vectors.TermInfo;
-import org.apache.mahout.utils.vectors.lucene.CachedTermInfo;
-import org.apache.mahout.utils.vectors.lucene.LuceneIterable;
-import org.apache.mahout.vectorizer.TFIDF;
-import org.apache.mahout.vectorizer.Weight;
-import org.junit.Before;
-import org.junit.Test;
-
-public final class TestClusterDumper extends MahoutTestCase { // runs clustering jobs, then prints them via ClusterDumper
-
-  private static final String[] DOCS = {
-      "The quick red fox jumped over the lazy brown dogs.",
-      "The quick brown fox jumped over the lazy red dogs.",
-      "The quick red cat jumped over the lazy brown dogs.",
-      "The quick brown cat jumped over the lazy red dogs.",
-      "Mary had a little lamb whose fleece was white as snow.",
-      "Mary had a little goat whose fleece was white as snow.",
-      "Mary had a little lamb whose fleece was black as tar.",
-      "Dick had a little goat whose fleece was white as snow.",
-      "Moby Dick is a story of a whale and a man obsessed.",
-      "Moby Bob is a story of a walrus and a man obsessed.",
-      "Moby Dick is a story of a whale and a crazy man.",
-      "The robber wore a black fleece jacket and a baseball cap.",
-      "The robber wore a red fleece jacket and a baseball cap.",
-      "The robber wore a white fleece jacket and a baseball cap.",
-      "The English Springer Spaniel is the best of all dogs."};
-
-  private List<VectorWritable> sampleData; // vectors built from DOCS by getSampleData
-
-  private String[] termDictionary; // terms in termInfo.getAllEntries() iteration order
-
-  @Override
-  @Before
-  public void setUp() throws Exception {
-    super.setUp();
-    Configuration conf = getConfiguration();
-    FileSystem fs = FileSystem.get(conf);
-    // Create test data and write it to testdata/file1 under the test temp area
-    getSampleData(DOCS);
-    ClusteringTestUtils.writePointsToFile(sampleData, true,
-        getTestTempFilePath("testdata/file1"), fs, conf);
-  }
-
-  private void getSampleData(String[] docs2) throws IOException { // indexes docs2 in RAM, then vectorizes with TFIDF
-    sampleData = new ArrayList<>();
-    RAMDirectory directory = new RAMDirectory();
-    try (IndexWriter writer = new IndexWriter(directory,
-        new IndexWriterConfig(new StandardAnalyzer()))){
-      for (int i = 0; i < docs2.length; i++) {
-        Document doc = new Document();
-        Field id = new StringField("id", "doc_" + i, Field.Store.YES);
-        doc.add(id);
-        // Term vectors with position and offset information; the text itself is not stored
-        FieldType fieldType = new FieldType();
-        fieldType.setStored(false);
-        fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
-        fieldType.setTokenized(true);
-        fieldType.setStoreTermVectors(true);
-        fieldType.setStoreTermVectorPositions(true);
-        fieldType.setStoreTermVectorOffsets(true);
-        fieldType.freeze();
-        Field text = new Field("content", docs2[i], fieldType);
-        doc.add(text);
-        writer.addDocument(doc);
-      }
-    }
-
-    IndexReader reader = DirectoryReader.open(directory); // NOTE(review): reader is never closed in this method
-
-    Weight weight = new TFIDF();
-    TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 100); // presumably minDf=1, maxDfPercent=100 — confirm
-
-    int numTerms = 0; // first pass: count the entries to size termDictionary
-    for (Iterator<TermEntry> it = termInfo.getAllEntries(); it.hasNext();) {
-      it.next();
-      numTerms++;
-    }
-    termDictionary = new String[numTerms];
-    int i = 0; // second pass: copy the terms in iteration order
-    for (Iterator<TermEntry> it = termInfo.getAllEntries(); it.hasNext();) {
-      String term = it.next().getTerm();
-      termDictionary[i] = term;
-      System.out.println(i + " " + term);
-      i++;
-    }
-    Iterable<Vector> iterable = new LuceneIterable(reader, "id", "content",
-        termInfo,weight);
-
-    i = 0; // reused as the running vector number
-    for (Vector vector : iterable) {
-      assertNotNull(vector);
-      NamedVector namedVector;
-      if (vector instanceof NamedVector) {
-        // rename it for testing purposes
-        namedVector = new NamedVector(((NamedVector) vector).getDelegate(),
-            "P(" + i + ')');
-
-      } else {
-        namedVector = new NamedVector(vector, "P(" + i + ')');
-      }
-      System.out.println(AbstractCluster.formatVector(namedVector,
-          termDictionary));
-      sampleData.add(new VectorWritable(namedVector));
-      i++;
-    }
-  }
-
-  /**
-   * Return the existing "clusters-i-final" path under output with the largest i <= maxIterations, or null if none
-   */
-  private static Path finalClusterPath(Configuration conf, Path output,
-      int maxIterations) throws IOException {
-    FileSystem fs = FileSystem.get(conf);
-    for (int i = maxIterations; i >= 0; i--) { // search downwards so the latest iteration wins
-      Path clusters = new Path(output, "clusters-" + i + "-final");
-      if (fs.exists(clusters)) {
-        return clusters;
-      }
-    }
-    return null;
-  }
-
-  @Test
-  public void testKmeans() throws Exception {
-    DistanceMeasure measure = new EuclideanDistanceMeasure();
-    Path input = getTestTempFilePath("input");
-    Path output = getTestTempDirPath("output");
-    Path initialPoints = new Path(output, Cluster.CLUSTERS_DIR + '0' + Cluster.FINAL_ITERATION_SUFFIX);
-    Configuration conf = getConfiguration();
-    FileSystem fs = FileSystem.get(conf);
-    // Write test data to file
-    ClusteringTestUtils.writePointsToFile(sampleData, input, fs, conf);
-    // Select initial centroids
-    RandomSeedGenerator.buildRandom(conf, input, initialPoints, 8, measure, 1L);
-    // Run k-means
-    Path kMeansOutput = new Path(output, "kmeans");
-    KMeansDriver.run(conf, getTestTempDirPath("testdata"), initialPoints, kMeansOutput, 0.001, 10, true, 0.0, false);
-    // Print out clusters
-    ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf, // NOTE(review): searches output, which
-            output, 10), new Path(kMeansOutput, "clusteredPoints")); // holds initialPoints, not kMeansOutput — confirm
-    clusterDumper.printClusters(termDictionary);
-  }
-
-  @Test
-  public void testJsonClusterDumper() throws Exception { // same flow as testKmeans, but with JSON output format
-    DistanceMeasure measure = new EuclideanDistanceMeasure();
-    Path input = getTestTempFilePath("input");
-    Path output = getTestTempDirPath("output");
-    Path initialPoints = new Path(output, Cluster.CLUSTERS_DIR + '0' + Cluster.FINAL_ITERATION_SUFFIX);
-    Configuration conf = getConfiguration();
-    FileSystem fs = FileSystem.get(conf);
-    // Write test data to file
-    ClusteringTestUtils.writePointsToFile(sampleData, input, fs, conf);
-    // Select initial centroids
-    RandomSeedGenerator.buildRandom(conf, input, initialPoints, 8, measure, 1L);
-    // Run k-means
-    Path kmeansOutput = new Path(output, "kmeans");
-    KMeansDriver.run(conf, getTestTempDirPath("testdata"), initialPoints, kmeansOutput, 0.001, 10, true, 0.0, false);
-    // Print out clusters
-    ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf, // NOTE(review): searches output, which
-        output, 10), new Path(kmeansOutput, "clusteredPoints")); // holds initialPoints, not kmeansOutput — confirm
-    clusterDumper.setOutputFormat(ClusterDumper.OUTPUT_FORMAT.JSON);
-    clusterDumper.printClusters(termDictionary);
-  }
-
-  @Test
-  public void testFuzzyKmeans() throws Exception {
-    DistanceMeasure measure = new EuclideanDistanceMeasure();
-    Path input = getTestTempFilePath("input");
-    Path output = getTestTempDirPath("output");
-    Path initialPoints = new Path(output, Cluster.CLUSTERS_DIR + '0' + Cluster.FINAL_ITERATION_SUFFIX);
-    Configuration conf = getConfiguration();
-    FileSystem fs = FileSystem.get(conf);
-    // Write test data to file
-    ClusteringTestUtils.writePointsToFile(sampleData, input, fs, conf);
-    // Select initial centroids
-    RandomSeedGenerator.buildRandom(conf, input, initialPoints, 8, measure, 1L);
-    // Run fuzzy k-means (the output directory is still named "kmeans")
-    Path kMeansOutput = new Path(output, "kmeans");
-    FuzzyKMeansDriver.run(conf, getTestTempDirPath("testdata"), initialPoints, kMeansOutput, 0.001, 10, 1.1f, true,
-        true, 0, true);
-    // run ClusterDumper
-    ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf, // NOTE(review): searches output, which
-        output, 10), new Path(kMeansOutput, "clusteredPoints")); // holds initialPoints, not kMeansOutput — confirm
-    clusterDumper.printClusters(termDictionary);
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java b/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java
deleted file mode 100644
index 8a226a0..0000000
--- a/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java
+++ /dev/null
@@ -1,321 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering;
-
-import java.io.IOException;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.mahout.clustering.canopy.Canopy;
-import org.apache.mahout.clustering.canopy.CanopyDriver;
-import org.apache.mahout.clustering.evaluation.ClusterEvaluator;
-import org.apache.mahout.clustering.evaluation.RepresentativePointsDriver;
-import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver;
-import org.apache.mahout.clustering.kmeans.KMeansDriver;
-import org.apache.mahout.clustering.kmeans.TestKmeansClustering;
-import org.apache.mahout.common.HadoopUtil;
-import org.apache.mahout.common.MahoutTestCase;
-import org.apache.mahout.common.distance.DistanceMeasure;
-import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
-import org.apache.mahout.math.DenseVector;
-import org.apache.mahout.math.VectorWritable;
-import org.junit.Before;
-import org.junit.Test;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import com.google.common.collect.Lists;
-import com.google.common.collect.Maps;
-
-public final class TestClusterEvaluator extends MahoutTestCase {
-  
-  private static final double[][] REFERENCE = { {1, 1}, {2, 1}, {1, 2}, {2, 2}, {3, 3}, {4, 4}, {5, 4}, {4, 5}, {5, 5}};
-  
-  private List<VectorWritable> referenceData = Lists.newArrayList();
-  
-  private final List<VectorWritable> sampleData = Lists.newArrayList();
-  
-  private Map<Integer,List<VectorWritable>> representativePoints;
-  
-  private List<Cluster> clusters;
-  
-  private static final Logger log = LoggerFactory.getLogger(TestClusterEvaluator.class);
-  
-  private Configuration conf;
-  
-  private FileSystem fs;
-  
-  private Path testdata;
-  
-  private Path output;
-  
-  @Override
-  @Before
-  public void setUp() throws Exception {
-    super.setUp();
-    conf = getConfiguration();
-    fs = FileSystem.get(conf);
-    testdata = getTestTempDirPath("testdata");
-    output = getTestTempDirPath("output");
-    // Create small reference data set
-    referenceData = TestKmeansClustering.getPointsWritable(REFERENCE);
-    // generate larger test data set for the clustering tests to chew on
-    generateSamples();
-  }
-  
-  /**
-   * Generate random samples and add them to the sampleData
-   * 
-   * @param num
-   *          int number of samples to generate
-   * @param mx
-   *          double x-value of the sample mean
-   * @param my
-   *          double y-value of the sample mean
-   * @param sd
-   *          double standard deviation of the samples
-   */
-  private void generateSamples(int num, double mx, double my, double sd) {
-    log.info("Generating {} samples m=[{}, {}] sd={}", num, mx, my, sd);
-    for (int i = 0; i < num; i++) {
-      sampleData.add(new VectorWritable(new DenseVector(new double[] {UncommonDistributions.rNorm(mx, sd),
-          UncommonDistributions.rNorm(my, sd)})));
-    }
-  }
-  
-  private void generateSamples() {
-    generateSamples(500, 1, 1, 3);
-    generateSamples(300, 1, 0, 0.5);
-    generateSamples(300, 0, 2, 0.1);
-  }
-  
-  private void printRepPoints(int numIterations) {
-    RepresentativePointsDriver.printRepresentativePoints(output, numIterations);
-  }
-  
-  /**
-   * Initialize synthetic data using 4 clusters dC units from origin having 4 representative points dP from each center
-   * 
-   * @param dC
-   *          a double cluster center offset
-   * @param dP
-   *          a double representative point offset
-   * @param measure
-   *          the DistanceMeasure
-   */
-  private void initData(double dC, double dP, DistanceMeasure measure) {
-    clusters = Lists.newArrayList();
-    clusters.add(new Canopy(new DenseVector(new double[] {-dC, -dC}), 1, measure));
-    clusters.add(new Canopy(new DenseVector(new double[] {-dC, dC}), 3, measure));
-    clusters.add(new Canopy(new DenseVector(new double[] {dC, dC}), 5, measure));
-    clusters.add(new Canopy(new DenseVector(new double[] {dC, -dC}), 7, measure));
-    representativePoints = Maps.newHashMap();
-    for (Cluster cluster : clusters) {
-      List<VectorWritable> points = Lists.newArrayList();
-      representativePoints.put(cluster.getId(), points);
-      points.add(new VectorWritable(cluster.getCenter().clone()));
-      points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {dP, dP}))));
-      points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {dP, -dP}))));
-      points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {-dP, -dP}))));
-      points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {-dP, dP}))));
-    }
-  }
-
-  @Test
-  public void testRepresentativePoints() throws Exception {
-    ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata, "file1"), fs, conf);
-    DistanceMeasure measure = new EuclideanDistanceMeasure();
-    Configuration conf = getConfiguration();
-    // run using MR reference point calculation
-    CanopyDriver.run(conf, testdata, output, measure, 3.1, 1.1, true, 0.0, true);
-    int numIterations = 2;
-    Path clustersIn = new Path(output, "clusters-0-final");
-    RepresentativePointsDriver.run(conf, clustersIn, new Path(output, "clusteredPoints"), output, measure,
-        numIterations, false);
-    printRepPoints(numIterations);
-    ClusterEvaluator evaluatorMR = new ClusterEvaluator(conf, clustersIn);
-    // now run again using sequential reference point calculation
-    HadoopUtil.delete(conf, output);
-    CanopyDriver.run(conf, testdata, output, measure, 3.1, 1.1, true, 0.0, true);
-    RepresentativePointsDriver.run(conf, clustersIn, new Path(output, "clusteredPoints"), output, measure,
-        numIterations, true);
-    printRepPoints(numIterations);
-    ClusterEvaluator evaluatorSeq = new ClusterEvaluator(conf, clustersIn);
-    // compare results
-    assertEquals("InterCluster Density", evaluatorMR.interClusterDensity(), evaluatorSeq.interClusterDensity(), EPSILON);
-    assertEquals("IntraCluster Density", evaluatorMR.intraClusterDensity(), evaluatorSeq.intraClusterDensity(), EPSILON);
-  }
-  
-  @Test
-  public void testCluster0() throws IOException {
-    ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata, "file1"), fs, conf);
-    DistanceMeasure measure = new EuclideanDistanceMeasure();
-    initData(1, 0.25, measure);
-    ClusterEvaluator evaluator = new ClusterEvaluator(representativePoints, clusters, measure);
-    assertEquals("inter cluster density", 0.33333333333333315, evaluator.interClusterDensity(), EPSILON);
-    assertEquals("intra cluster density", 0.3656854249492381, evaluator.intraClusterDensity(), EPSILON);
-  }
-  
-  @Test
-  public void testCluster1() throws IOException {
-    ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata, "file1"), fs, conf);
-    DistanceMeasure measure = new EuclideanDistanceMeasure();
-    initData(1, 0.5, measure);
-    ClusterEvaluator evaluator = new ClusterEvaluator(representativePoints, clusters, measure);
-    assertEquals("inter cluster density", 0.33333333333333315, evaluator.interClusterDensity(), EPSILON);
-    assertEquals("intra cluster density", 0.3656854249492381, evaluator.intraClusterDensity(), EPSILON);
-  }
-  
-  @Test
-  public void testCluster2() throws IOException {
-    ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata, "file1"), fs, conf);
-    DistanceMeasure measure = new EuclideanDistanceMeasure();
-    initData(1, 0.75, measure);
-    ClusterEvaluator evaluator = new ClusterEvaluator(representativePoints, clusters, measure);
-    assertEquals("inter cluster density", 0.33333333333333315, evaluator.interClusterDensity(), EPSILON);
-    assertEquals("intra cluster density", 0.3656854249492381, evaluator.intraClusterDensity(), EPSILON);
-  }
-  
-  /**
-   * adding an empty cluster should modify the inter cluster density but not change the intra-cluster density as that
-   * cluster would have NaN as its intra-cluster density and NaN values are ignored by the evaluator
-   * 
-   * @throws IOException
-   */
-  @Test
-  public void testEmptyCluster() throws IOException {
-    ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata, "file1"), fs, conf);
-    DistanceMeasure measure = new EuclideanDistanceMeasure();
-    initData(1, 0.25, measure);
-    Canopy cluster = new Canopy(new DenseVector(new double[] {10, 10}), 19, measure);
-    clusters.add(cluster);
-    List<VectorWritable> points = Lists.newArrayList();
-    representativePoints.put(cluster.getId(), points);
-    ClusterEvaluator evaluator = new ClusterEvaluator(representativePoints, clusters, measure);
-    assertEquals("inter cluster density", 0.371534146934532, evaluator.interClusterDensity(), EPSILON);
-    assertEquals("intra cluster density", 0.3656854249492381, evaluator.intraClusterDensity(), EPSILON);
-  }
-  
-  /**
-   * adding an single-valued cluster should modify the inter cluster density but not change the intra-cluster density as
-   * that cluster would have NaN as its intra-cluster density and NaN values are ignored by the evaluator
-   * 
-   * @throws IOException
-   */
-  @Test
-  public void testSingleValueCluster() throws IOException {
-    ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata, "file1"), fs, conf);
-    DistanceMeasure measure = new EuclideanDistanceMeasure();
-    initData(1, 0.25, measure);
-    Canopy cluster = new Canopy(new DenseVector(new double[] {0, 0}), 19, measure);
-    clusters.add(cluster);
-    List<VectorWritable> points = Lists.newArrayList();
-    points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {1, 1}))));
-    representativePoints.put(cluster.getId(), points);
-    ClusterEvaluator evaluator = new ClusterEvaluator(representativePoints, clusters, measure);
-    assertEquals("inter cluster density", 0.3656854249492381, evaluator.interClusterDensity(), EPSILON);
-    assertEquals("intra cluster density", 0.3656854249492381, evaluator.intraClusterDensity(), EPSILON);
-  }
-  
-  /**
-   * Representative points extraction will duplicate the cluster center if the cluster has no assigned points. These
-   * clusters are included in the inter-cluster density but their NaN intra-density values are ignored by the evaluator.
-   * 
-   * @throws IOException
-   */
-  @Test
-  public void testAllSameValueCluster() throws IOException {
-    ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata, "file1"), fs, conf);
-    DistanceMeasure measure = new EuclideanDistanceMeasure();
-    initData(1, 0.25, measure);
-    Canopy cluster = new Canopy(new DenseVector(new double[] {0, 0}), 19, measure);
-    clusters.add(cluster);
-    List<VectorWritable> points = Lists.newArrayList();
-    points.add(new VectorWritable(cluster.getCenter()));
-    points.add(new VectorWritable(cluster.getCenter()));
-    points.add(new VectorWritable(cluster.getCenter()));
-    representativePoints.put(cluster.getId(), points);
-    ClusterEvaluator evaluator = new ClusterEvaluator(representativePoints, clusters, measure);
-    assertEquals("inter cluster density", 0.3656854249492381, evaluator.interClusterDensity(), EPSILON);
-    assertEquals("intra cluster density", 0.3656854249492381, evaluator.intraClusterDensity(), EPSILON);
-  }
-  
-  @Test
-  public void testCanopy() throws Exception {
-    ClusteringTestUtils.writePointsToFile(sampleData, new Path(testdata, "file1"), fs, conf);
-    DistanceMeasure measure = new EuclideanDistanceMeasure();
-    Configuration conf = getConfiguration();
-    CanopyDriver.run(conf, testdata, output, measure, 3.1, 1.1, true, 0.0, true);
-    int numIterations = 10;
-    Path clustersIn = new Path(output, "clusters-0-final");
-    RepresentativePointsDriver.run(conf, clustersIn, new Path(output, "clusteredPoints"), output, measure,
-        numIterations, true);
-    //printRepPoints(numIterations);
-    ClusterEvaluator evaluator = new ClusterEvaluator(conf, clustersIn);
-    // now print out the Results
-    System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
-    System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
-  }
-  
-  @Test
-  public void testKmeans() throws Exception {
-    ClusteringTestUtils.writePointsToFile(sampleData, new Path(testdata, "file1"), fs, conf);
-    DistanceMeasure measure = new EuclideanDistanceMeasure();
-    // now run the Canopy job to prime kMeans canopies
-    Configuration conf = getConfiguration();
-    CanopyDriver.run(conf, testdata, output, measure, 3.1, 1.1, false, 0.0, true);
-    // now run the KMeans job
-    Path kmeansOutput = new Path(output, "kmeans");
-    KMeansDriver.run(testdata, new Path(output, "clusters-0-final"), kmeansOutput, 0.001, 10, true, 0.0, true);
-    int numIterations = 10;
-    Path clustersIn = new Path(kmeansOutput, "clusters-2");
-    RepresentativePointsDriver.run(conf, clustersIn, new Path(kmeansOutput, "clusteredPoints"), kmeansOutput, measure,
-        numIterations, true);
-    RepresentativePointsDriver.printRepresentativePoints(kmeansOutput, numIterations);
-    ClusterEvaluator evaluator = new ClusterEvaluator(conf, clustersIn);
-    // now print out the Results
-    System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
-    System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
-  }
-  
-  @Test
-  public void testFuzzyKmeans() throws Exception {
-    ClusteringTestUtils.writePointsToFile(sampleData, new Path(testdata, "file1"), fs, conf);
-    DistanceMeasure measure = new EuclideanDistanceMeasure();
-    // now run the Canopy job to prime kMeans canopies
-    Configuration conf = getConfiguration();
-    CanopyDriver.run(conf, testdata, output, measure, 3.1, 1.1, false, 0.0, true);
-    Path fuzzyKMeansOutput = new Path(output, "fuzzyk");
-    // now run the KMeans job
-    FuzzyKMeansDriver.run(testdata, new Path(output, "clusters-0-final"), fuzzyKMeansOutput, 0.001, 10, 2,
-        true, true, 0, true);
-    int numIterations = 10;
-    Path clustersIn = new Path(fuzzyKMeansOutput, "clusters-4");
-    RepresentativePointsDriver.run(conf, clustersIn, new Path(fuzzyKMeansOutput, "clusteredPoints"), fuzzyKMeansOutput,
-        measure, numIterations, true);
-    RepresentativePointsDriver.printRepresentativePoints(fuzzyKMeansOutput, numIterations);
-    ClusterEvaluator evaluator = new ClusterEvaluator(conf, clustersIn);
-    // now print out the Results
-    System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
-    System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
-  }
-  
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java b/integration/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java
deleted file mode 100644
index 597ed01..0000000
--- a/integration/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java
+++ /dev/null
@@ -1,326 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering.cdbw;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.mahout.clustering.Cluster;
-import org.apache.mahout.clustering.ClusteringTestUtils;
-import org.apache.mahout.clustering.TestClusterEvaluator;
-import org.apache.mahout.clustering.UncommonDistributions;
-import org.apache.mahout.clustering.canopy.Canopy;
-import org.apache.mahout.clustering.canopy.CanopyDriver;
-import org.apache.mahout.clustering.evaluation.RepresentativePointsDriver;
-import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver;
-import org.apache.mahout.clustering.kmeans.KMeansDriver;
-import org.apache.mahout.clustering.kmeans.TestKmeansClustering;
-import org.apache.mahout.common.MahoutTestCase;
-import org.apache.mahout.common.distance.DistanceMeasure;
-import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
-import org.apache.mahout.math.DenseVector;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.VectorWritable;
-import org.junit.Before;
-import org.junit.Test;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-public final class TestCDbwEvaluator extends MahoutTestCase {
-  
-  private static final double[][] REFERENCE = { {1, 1}, {2, 1}, {1, 2}, {2, 2}, {3, 3}, {4, 4}, {5, 4}, {4, 5}, {5, 5}};
-  
-  private static final Logger log = LoggerFactory.getLogger(TestClusterEvaluator.class);
-  
-  private Map<Integer,List<VectorWritable>> representativePoints;
-  
-  private List<Cluster> clusters;
-  
-  private Configuration conf;
-  
-  private FileSystem fs;
-  
-  private final Collection<VectorWritable> sampleData = new ArrayList<>();
-  
-  private List<VectorWritable> referenceData = new ArrayList<>();
-  
-  private Path testdata;
-  
-  private Path output;
-  
-  @Override
-  @Before
-  public void setUp() throws Exception {
-    super.setUp();
-    conf = getConfiguration();
-    fs = FileSystem.get(conf);
-    testdata = getTestTempDirPath("testdata");
-    output = getTestTempDirPath("output");
-    // Create small reference data set
-    referenceData = TestKmeansClustering.getPointsWritable(REFERENCE);
-    // generate larger test data set for the clustering tests to chew on
-    generateSamples();
-  }
-  
-  /**
-   * Initialize synthetic data using 4 clusters dC units from origin having 4 representative points dP from each center
-   * 
-   * @param dC
-   *          a double cluster center offset
-   * @param dP
-   *          a double representative point offset
-   * @param measure
-   *          the DistanceMeasure
-   */
-  private void initData(double dC, double dP, DistanceMeasure measure) {
-    clusters = new ArrayList<>();
-    clusters.add(new Canopy(new DenseVector(new double[] {-dC, -dC}), 1, measure));
-    clusters.add(new Canopy(new DenseVector(new double[] {-dC, dC}), 3, measure));
-    clusters.add(new Canopy(new DenseVector(new double[] {dC, dC}), 5, measure));
-    clusters.add(new Canopy(new DenseVector(new double[] {dC, -dC}), 7, measure));
-    representativePoints = new HashMap<>();
-    for (Cluster cluster : clusters) {
-      List<VectorWritable> points = new ArrayList<>();
-      representativePoints.put(cluster.getId(), points);
-      points.add(new VectorWritable(cluster.getCenter().clone()));
-      points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {dP, dP}))));
-      points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {dP, -dP}))));
-      points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {-dP, -dP}))));
-      points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {-dP, dP}))));
-    }
-  }
-  
-  /**
-   * Generate random samples and add them to the sampleData
-   * 
-   * @param num
-   *          int number of samples to generate
-   * @param mx
-   *          double x-value of the sample mean
-   * @param my
-   *          double y-value of the sample mean
-   * @param sd
-   *          double standard deviation of the samples
-   */
-  private void generateSamples(int num, double mx, double my, double sd) {
-    log.info("Generating {} samples m=[{}, {}] sd={}", num, mx, my, sd);
-    for (int i = 0; i < num; i++) {
-      sampleData.add(new VectorWritable(new DenseVector(new double[] {UncommonDistributions.rNorm(mx, sd),
-          UncommonDistributions.rNorm(my, sd)})));
-    }
-  }
-  
-  private void generateSamples() {
-    generateSamples(500, 1, 1, 3);
-    generateSamples(300, 1, 0, 0.5);
-    generateSamples(300, 0, 2, 0.1);
-  }
-  
-  @Test
-  public void testCDbw0() throws IOException {
-    ClusteringTestUtils.writePointsToFile(referenceData, getTestTempFilePath("testdata/file1"), fs, conf);
-    DistanceMeasure measure = new EuclideanDistanceMeasure();
-    initData(1, 0.25, measure);
-    CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
-    System.out.println("CDbw = " + evaluator.getCDbw());
-    System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
-    System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
-    System.out.println("Separation = " + evaluator.separation());
-  }
-  
-  @Test
-  public void testCDbw1() throws IOException {
-    ClusteringTestUtils.writePointsToFile(referenceData, getTestTempFilePath("testdata/file1"), fs, conf);
-    DistanceMeasure measure = new EuclideanDistanceMeasure();
-    initData(1, 0.5, measure);
-    CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
-    System.out.println("CDbw = " + evaluator.getCDbw());
-    System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
-    System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
-    System.out.println("Separation = " + evaluator.separation());
-  }
-  
-  @Test
-  public void testCDbw2() throws IOException {
-    ClusteringTestUtils.writePointsToFile(referenceData, getTestTempFilePath("testdata/file1"), fs, conf);
-    DistanceMeasure measure = new EuclideanDistanceMeasure();
-    initData(1, 0.75, measure);
-    CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
-    System.out.println("CDbw = " + evaluator.getCDbw());
-    System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
-    System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
-    System.out.println("Separation = " + evaluator.separation());
-  }
-  
-  @Test
-  public void testEmptyCluster() throws IOException {
-    ClusteringTestUtils.writePointsToFile(referenceData, getTestTempFilePath("testdata/file1"), fs, conf);
-    DistanceMeasure measure = new EuclideanDistanceMeasure();
-    initData(1, 0.25, measure);
-    Canopy cluster = new Canopy(new DenseVector(new double[] {10, 10}), 19, measure);
-    clusters.add(cluster);
-    List<VectorWritable> points = new ArrayList<>();
-    representativePoints.put(cluster.getId(), points);
-    CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
-    System.out.println("CDbw = " + evaluator.getCDbw());
-    System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
-    System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
-    System.out.println("Separation = " + evaluator.separation());
-  }
-  
-  @Test
-  public void testSingleValueCluster() throws IOException {
-    ClusteringTestUtils.writePointsToFile(referenceData, getTestTempFilePath("testdata/file1"), fs, conf);
-    DistanceMeasure measure = new EuclideanDistanceMeasure();
-    initData(1, 0.25, measure);
-    Canopy cluster = new Canopy(new DenseVector(new double[] {0, 0}), 19, measure);
-    clusters.add(cluster);
-    List<VectorWritable> points = new ArrayList<>();
-    points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {1, 1}))));
-    representativePoints.put(cluster.getId(), points);
-    CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
-    System.out.println("CDbw = " + evaluator.getCDbw());
-    System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
-    System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
-    System.out.println("Separation = " + evaluator.separation());
-  }
-  
-  /**
-   * Representative points extraction will duplicate the cluster center if the cluster has no assigned points. These
-   * clusters should be ignored like empty clusters above
-   * 
-   * @throws IOException
-   */
-  @Test
-  public void testAllSameValueCluster() throws IOException {
-    ClusteringTestUtils.writePointsToFile(referenceData, getTestTempFilePath("testdata/file1"), fs, conf);
-    DistanceMeasure measure = new EuclideanDistanceMeasure();
-    initData(1, 0.25, measure);
-    Canopy cluster = new Canopy(new DenseVector(new double[] {0, 0}), 19, measure);
-    clusters.add(cluster);
-    List<VectorWritable> points = new ArrayList<>();
-    points.add(new VectorWritable(cluster.getCenter()));
-    points.add(new VectorWritable(cluster.getCenter()));
-    points.add(new VectorWritable(cluster.getCenter()));
-    representativePoints.put(cluster.getId(), points);
-    CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
-    System.out.println("CDbw = " + evaluator.getCDbw());
-    System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
-    System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
-    System.out.println("Separation = " + evaluator.separation());
-  }
-  
-  /**
-   * Clustering can produce very, very tight clusters that can cause the std calculation to fail. These clusters should
-   * be processed correctly.
-   * 
-   * @throws IOException
-   */
-  @Test
-  public void testAlmostSameValueCluster() throws IOException {
-    ClusteringTestUtils.writePointsToFile(referenceData, getTestTempFilePath("testdata/file1"), fs, conf);
-    DistanceMeasure measure = new EuclideanDistanceMeasure();
-    initData(1, 0.25, measure);
-    Canopy cluster = new Canopy(new DenseVector(new double[] {0, 0}), 19, measure);
-    clusters.add(cluster);
-    List<VectorWritable> points = new ArrayList<>();
-    Vector delta = new DenseVector(new double[] {0, Double.MIN_NORMAL});
-    points.add(new VectorWritable(delta.clone()));
-    points.add(new VectorWritable(delta.clone()));
-    points.add(new VectorWritable(delta.clone()));
-    points.add(new VectorWritable(delta.clone()));
-    points.add(new VectorWritable(delta.clone()));
-    representativePoints.put(cluster.getId(), points);
-    CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
-    System.out.println("CDbw = " + evaluator.getCDbw());
-    System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
-    System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
-    System.out.println("Separation = " + evaluator.separation());
-  }
-  
-  @Test
-  public void testCanopy() throws Exception {
-    ClusteringTestUtils.writePointsToFile(sampleData, getTestTempFilePath("testdata/file1"), fs, conf);
-    DistanceMeasure measure = new EuclideanDistanceMeasure();
-    CanopyDriver.run(getConfiguration(), testdata, output, measure, 3.1, 2.1, true, 0.0, true);
-    int numIterations = 10;
-    Path clustersIn = new Path(output, "clusters-0-final");
-    RepresentativePointsDriver.run(conf, clustersIn, new Path(output, "clusteredPoints"), output, measure,
-        numIterations, true);
-    CDbwEvaluator evaluator = new CDbwEvaluator(conf, clustersIn);
-    // printRepPoints(numIterations);
-    // now print out the Results
-    System.out.println("Canopy CDbw = " + evaluator.getCDbw());
-    System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
-    System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
-    System.out.println("Separation = " + evaluator.separation());
-  }
-  
-  @Test
-  public void testKmeans() throws Exception {
-    ClusteringTestUtils.writePointsToFile(sampleData, getTestTempFilePath("testdata/file1"), fs, conf);
-    DistanceMeasure measure = new EuclideanDistanceMeasure();
-    // now run the Canopy job to prime kMeans canopies
-    CanopyDriver.run(getConfiguration(), testdata, output, measure, 3.1, 2.1, false, 0.0, true);
-    // now run the KMeans job
-    Path kmeansOutput = new Path(output, "kmeans");
-    KMeansDriver.run(testdata, new Path(output, "clusters-0-final"), kmeansOutput, 0.001, 10, true, 0.0, true);
-    int numIterations = 10;
-    Path clustersIn = new Path(kmeansOutput, "clusters-10-final");
-    RepresentativePointsDriver.run(conf, clustersIn, new Path(kmeansOutput, "clusteredPoints"), kmeansOutput, measure,
-        numIterations, true);
-    CDbwEvaluator evaluator = new CDbwEvaluator(conf, clustersIn);
-    RepresentativePointsDriver.printRepresentativePoints(kmeansOutput, numIterations);
-    // now print out the Results
-    System.out.println("K-Means CDbw = " + evaluator.getCDbw());
-    System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
-    System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
-    System.out.println("Separation = " + evaluator.separation());
-  }
-  
-  @Test
-  public void testFuzzyKmeans() throws Exception {
-    ClusteringTestUtils.writePointsToFile(sampleData, getTestTempFilePath("testdata/file1"), fs, conf);
-    DistanceMeasure measure = new EuclideanDistanceMeasure();
-    // now run the Canopy job to prime kMeans canopies
-    CanopyDriver.run(getConfiguration(), testdata, output, measure, 3.1, 2.1, false, 0.0, true);
-    Path fuzzyKMeansOutput = new Path(output, "fuzzyk");
-    // now run the KMeans job
-    FuzzyKMeansDriver.run(testdata, new Path(output, "clusters-0-final"), fuzzyKMeansOutput, 0.001, 10, 2,
-        true, true, 0, true);
-    int numIterations = 10;
-    Path clustersIn = new Path(fuzzyKMeansOutput, "clusters-4");
-    RepresentativePointsDriver.run(conf, clustersIn, new Path(fuzzyKMeansOutput, "clusteredPoints"), fuzzyKMeansOutput,
-        measure, numIterations, true);
-    CDbwEvaluator evaluator = new CDbwEvaluator(conf, clustersIn);
-    RepresentativePointsDriver.printRepresentativePoints(fuzzyKMeansOutput, numIterations);
-    // now print out the Results
-    System.out.println("Fuzzy K-Means CDbw = " + evaluator.getCDbw());
-    System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
-    System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
-    System.out.println("Separation = " + evaluator.separation());
-  }
-  
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/text/MailArchivesClusteringAnalyzerTest.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/text/MailArchivesClusteringAnalyzerTest.java b/integration/src/test/java/org/apache/mahout/text/MailArchivesClusteringAnalyzerTest.java
deleted file mode 100644
index ba73c82..0000000
--- a/integration/src/test/java/org/apache/mahout/text/MailArchivesClusteringAnalyzerTest.java
+++ /dev/null
@@ -1,66 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.text;
-
-import java.io.Reader;
-import java.io.StringReader;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.mahout.common.MahoutTestCase;
-import org.junit.Test;
-
-/**
- * Unit tests for the MailArchivesClusteringAnalyzer text analyzer.
- */
-public class MailArchivesClusteringAnalyzerTest extends MahoutTestCase {
-  
-  @Test
-  public void testAnalysis() throws Exception {
-    Analyzer analyzer = new MailArchivesClusteringAnalyzer();
-    
-    String text = "A test message\n"
-                  + "atokenthatistoolongtobeusefulforclustertextanalysis\n"
-                  + "Mahout is a scalable, machine-learning LIBRARY\n"
-                  + "we've added some additional stopwords such as html, mailto, regards\t"
-                  + "apache_hadoop provides the foundation for scalability\n"
-                  + "www.nabble.com general-help@incubator.apache.org\n"
-                  + "public void int protected package";
-    Reader reader = new StringReader(text);
-    
-    // if you change the text above, then you may need to change this as well
-    // order matters too
-    String[] expectedTokens = {
-        "test", "mahout", "scalabl", "machin", "learn", "librari", "weve", "ad",
-        "stopword", "apache_hadoop","provid", "foundat", "scalabl"
-    };
-        
-    TokenStream tokenStream = analyzer.tokenStream("test", reader);
-    assertNotNull(tokenStream);
-    tokenStream.reset();
-    CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
-    int e = 0;
-    while (tokenStream.incrementToken() && e < expectedTokens.length) {
-      assertEquals(expectedTokens[e++], termAtt.toString());
-    }
-    assertEquals(e, expectedTokens.length);
-    tokenStream.end();
-    tokenStream.close();
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromMailArchivesTest.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromMailArchivesTest.java b/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromMailArchivesTest.java
deleted file mode 100644
index ef2b8a6..0000000
--- a/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromMailArchivesTest.java
+++ /dev/null
@@ -1,240 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.mahout.text;
-
-import java.io.File;
-import java.io.FileOutputStream;
-import java.util.zip.GZIPOutputStream;
-
-import org.apache.commons.lang3.SystemUtils;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.Text;
-import org.apache.mahout.common.MahoutTestCase;
-import org.apache.mahout.common.Pair;
-import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterator;
-import org.junit.Assert;
-import org.junit.Before;
-import org.junit.Test;
-
-/**
- * Test case for the SequenceFilesFromMailArchives command-line application.
- */
-public final class SequenceFilesFromMailArchivesTest extends MahoutTestCase {
-
-  private File inputDir;
-
-  /**
-   * Create the input and output directories needed for testing
-   * the SequenceFilesFromMailArchives application.
-   */
-  @Override
-  @Before
-  public void setUp() throws Exception {
-    super.setUp();
-    inputDir = getTestTempDir("mail-archives-in");
-
-    // write test mail messages to a gzipped file in a nested directory
-    File subDir = new File(inputDir, "subdir");
-    subDir.mkdir();
-    File gzFile = new File(subDir, "mail-messages.gz");
-    try (GZIPOutputStream gzOut = new GZIPOutputStream(new FileOutputStream(gzFile))) {
-      gzOut.write(testMailMessages.getBytes("UTF-8"));
-      gzOut.finish();
-    }
-    
-    File subDir2 = new File(subDir, "subsubdir");
-    subDir2.mkdir();
-    File gzFile2 = new File(subDir2, "mail-messages-2.gz");
-    try (GZIPOutputStream gzOut = new GZIPOutputStream(new FileOutputStream(gzFile2))) {
-      gzOut.write(testMailMessages.getBytes("UTF-8"));
-      gzOut.finish();
-    }
-  }
-
-  @Test
-  public void testSequential() throws Exception {
-
-    File outputDir = this.getTestTempDir("mail-archives-out");
-
-    String[] args = {
-      "--input", inputDir.getAbsolutePath(),
-      "--output", outputDir.getAbsolutePath(),
-      "--charset", "UTF-8",
-      "--keyPrefix", "TEST",
-      "--method", "sequential",
-      "--body", "--subject", "--separator", ""
-    };
-
-    // run the application's main method
-    SequenceFilesFromMailArchives.main(args);
-
-    // app should create a single SequenceFile named "chunk-0" in the output dir
-    File expectedChunkFile = new File(outputDir, "chunk-0");
-    String expectedChunkPath = expectedChunkFile.getAbsolutePath();
-    Assert.assertTrue("Expected chunk file " + expectedChunkPath + " not found!", expectedChunkFile.isFile());
-
-    Configuration conf = getConfiguration();
-    SequenceFileIterator<Text, Text> iterator = new SequenceFileIterator<>(new Path(expectedChunkPath), true, conf);
-    Assert.assertTrue("First key/value pair not found!", iterator.hasNext());
-    Pair<Text, Text> record = iterator.next();
-
-    File parentFile = new File(new File(new File("TEST"), "subdir"), "mail-messages.gz");
-    Assert.assertEquals(new File(parentFile, testVars[0][0]).toString(), record.getFirst().toString());
-    Assert.assertEquals(testVars[0][1] + testVars[0][2], record.getSecond().toString());
-
-    Assert.assertTrue("Second key/value pair not found!", iterator.hasNext());
-
-    record = iterator.next();
-    Assert.assertEquals(new File(parentFile, testVars[1][0]).toString(), record.getFirst().toString());
-    Assert.assertEquals(testVars[1][1] + testVars[1][2], record.getSecond().toString());
-
-    record = iterator.next();
-    File parentFileSubSubDir = new File(new File(new File(new File("TEST"), "subdir"), "subsubdir"), "mail-messages-2.gz");
-    Assert.assertEquals(new File(parentFileSubSubDir, testVars[0][0]).toString(), record.getFirst().toString());
-    Assert.assertEquals(testVars[0][1] + testVars[0][2], record.getSecond().toString());
-
-    Assert.assertTrue("Second key/value pair not found!", iterator.hasNext());
-    record = iterator.next();
-    Assert.assertEquals(new File(parentFileSubSubDir, testVars[1][0]).toString(), record.getFirst().toString());
-    Assert.assertEquals(testVars[1][1] + testVars[1][2], record.getSecond().toString());
-
-    Assert.assertFalse("Only two key/value pairs expected!", iterator.hasNext());
-  }
-
-  @Test
-  public void testMapReduce() throws Exception {
-
-    Path tmpDir = getTestTempDirPath();
-    Path mrOutputDir = new Path(tmpDir, "mail-archives-out-mr");
-    Configuration configuration = getConfiguration();
-    FileSystem fs = FileSystem.get(configuration);
-
-    File expectedInputFile = new File(inputDir.toString());
-
-    String[] args = {
-      "-Dhadoop.tmp.dir=" + configuration.get("hadoop.tmp.dir"),
-      "--input", expectedInputFile.getAbsolutePath(),
-      "--output", mrOutputDir.toString(),
-      "--charset", "UTF-8",
-      "--keyPrefix", "TEST",
-      "--method", "mapreduce",
-      "--body", "--subject", "--separator", ""
-    };
-
-    // run the application's main method
-    SequenceFilesFromMailArchives.main(args);
-
-    // app should create a single SequenceFile named "chunk-0" in the output dir
-    FileStatus[] fileStatuses = fs.listStatus(mrOutputDir.suffix("/part-m-00000"));
-    assertEquals(1, fileStatuses.length); // only one
-    assertEquals("part-m-00000", fileStatuses[0].getPath().getName());
-    SequenceFileIterator<Text, Text> iterator =
-      new SequenceFileIterator<>(mrOutputDir.suffix("/part-m-00000"), true, configuration);
-
-    Assert.assertTrue("First key/value pair not found!", iterator.hasNext());
-    Pair<Text, Text> record = iterator.next();
-
-    File parentFileSubSubDir = new File(new File(new File(new File("TEST"), "subdir"), "subsubdir"), "mail-messages-2.gz");
-
-    String expected = record.getFirst().toString();
-    if (SystemUtils.IS_OS_WINDOWS) {
-      expected = expected.replace("/", "\\");
-    }
-    Assert.assertEquals(new File(parentFileSubSubDir, testVars[0][0]).toString(), expected);
-    Assert.assertEquals(testVars[0][1] + testVars[0][2], record.getSecond().toString());
-    Assert.assertTrue("Second key/value pair not found!", iterator.hasNext());
-
-    record = iterator.next();
-    expected = record.getFirst().toString();
-    if (SystemUtils.IS_OS_WINDOWS) {
-      expected = expected.replace("/", "\\");
-    }
-    Assert.assertEquals(new File(parentFileSubSubDir, testVars[1][0]).toString(), expected);
-    Assert.assertEquals(testVars[1][1] + testVars[1][2], record.getSecond().toString());
-
-    // test other file
-    File parentFile = new File(new File(new File("TEST"), "subdir"), "mail-messages.gz");
-    record = iterator.next();
-    expected = record.getFirst().toString();
-    if (SystemUtils.IS_OS_WINDOWS) {
-      expected = expected.replace("/", "\\");
-    }
-    Assert.assertEquals(new File(parentFile, testVars[0][0]).toString(), expected);
-    Assert.assertEquals(testVars[0][1] + testVars[0][2], record.getSecond().toString());
-    Assert.assertTrue("Second key/value pair not found!", iterator.hasNext());
-
-    record = iterator.next();
-    expected = record.getFirst().toString();
-    if (SystemUtils.IS_OS_WINDOWS) {
-      expected = expected.replace("/", "\\");
-    }
-    Assert.assertEquals(new File(parentFile, testVars[1][0]).toString(), expected);
-    Assert.assertEquals(testVars[1][1] + testVars[1][2], record.getSecond().toString());
-    Assert.assertFalse("Only four key/value pairs expected!", iterator.hasNext());
-  }
-
-  // Messages extracted and made anonymous from the ASF mail archives
-  private static final String[][] testVars = {
-    new String[] {
-      "user@example.com",
-      "Ant task for JDK1.1 collections build option",
-      "\nThis is just a test message\n--\nTesty McTester\n"
-    },
-    new String[] {
-      "somebody@example.com",
-      "Problem with build files in several directories",
-      "\nHi all,\nThis is another test message.\nRegards,\nAnother Test\n"
-    }
-  };
-
-  private static final String testMailMessages =
-    "From user@example.com  Mon Jul 24 19:13:53 2000\n"
-      + "Return-Path: <user@example.com>\n"
-      + "Mailing-List: contact ant-user-help@jakarta.apache.org; run by ezmlm\n"
-      + "Delivered-To: mailing list ant-user@jakarta.apache.org\n"
-      + "Received: (qmail 49267 invoked from network); 24 Jul 2000 19:13:53 -0000\n"
-      + "Message-ID: <" + testVars[0][0] + ">\n"
-      + "From: \"Testy McTester\" <user@example.com>\n"
-      + "To: <ant-user@jakarta.apache.org>\n"
-      + "Subject: " + testVars[0][1] + '\n'
-      + "Date: Mon, 24 Jul 2000 12:24:56 -0700\n"
-      + "MIME-Version: 1.0\n"
-      + "Content-Type: text/plain;\n"
-      + "  charset=\"Windows-1252\"\n"
-      + "Content-Transfer-Encoding: 7bit\n"
-      + "X-Spam-Rating: locus.apache.org 1.6.2 0/1000/N\n"
-      + testVars[0][2] + '\n'
-      + "From somebody@example.com  Wed Jul 26 11:32:16 2000\n"
-      + "Return-Path: <somebody@example.com>\n"
-      + "Mailing-List: contact ant-user-help@jakarta.apache.org; run by ezmlm\n"
-      + "Delivered-To: mailing list ant-user@jakarta.apache.org\n"
-      + "Received: (qmail 73966 invoked from network); 26 Jul 2000 11:32:16 -0000\n"
-      + "User-Agent: Microsoft-Outlook-Express-Macintosh-Edition/5.02.2022\n"
-      + "Date: Wed, 26 Jul 2000 13:32:08 +0200\n"
-      + "Subject: " + testVars[1][1] + '\n'
-      + "From: Another Test <somebody@example.com>\n"
-      + "To: <ant-user@jakarta.apache.org>\n"
-      + "Message-Id: <" + testVars[1][0] + ">\n"
-      + "Mime-Version: 1.0\n"
-      + "Content-Type: text/plain; charset=\"US-ASCII\"\n"
-      + "Content-Transfer-Encoding: 7bit\n"
-      + "X-Spam-Rating: locus.apache.org 1.6.2 0/1000/N\n"
-      + testVars[1][2];
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/text/TestPathFilter.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/text/TestPathFilter.java b/integration/src/test/java/org/apache/mahout/text/TestPathFilter.java
deleted file mode 100644
index 227521a..0000000
--- a/integration/src/test/java/org/apache/mahout/text/TestPathFilter.java
+++ /dev/null
@@ -1,32 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.mahout.text;
-
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.fs.PathFilter;
-
-/**
- * Dummy Path Filter for testing the MapReduce version of
- * SequenceFilesFromDirectory
- */
-public class TestPathFilter implements PathFilter {
-
-  @Override
-  public boolean accept(Path path) {
-    return path.getName().startsWith("t") || path.getName().startsWith("r") || path.getName().startsWith("f");
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/text/TestSequenceFilesFromDirectory.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/text/TestSequenceFilesFromDirectory.java b/integration/src/test/java/org/apache/mahout/text/TestSequenceFilesFromDirectory.java
deleted file mode 100644
index 040c8e4..0000000
--- a/integration/src/test/java/org/apache/mahout/text/TestSequenceFilesFromDirectory.java
+++ /dev/null
@@ -1,313 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.text;
-
-import java.io.File;
-import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.util.HashMap;
-import java.util.Map;
-
-import org.apache.commons.io.Charsets;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.Text;
-import org.apache.mahout.common.HadoopUtil;
-import org.apache.mahout.common.MahoutTestCase;
-import org.apache.mahout.common.Pair;
-import org.apache.mahout.common.iterator.sequencefile.PathFilters;
-import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterator;
-import org.junit.Test;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-public final class TestSequenceFilesFromDirectory extends MahoutTestCase {
-
-  private static final Logger logger = LoggerFactory.getLogger(TestSequenceFilesFromDirectory.class);
-
-  private static final String[][] DATA1 = {
-    {"test1", "This is the first text."},
-    {"test2", "This is the second text."},
-    {"test3", "This is the third text."}
-  };
-
-  private static final String[][] DATA2 = {
-    {"recursive_test1", "This is the first text."},
-    {"recursive_test2", "This is the second text."},
-    {"recursive_test3", "This is the third text."}
-  };
-
-  @Test
-  public void testSequenceFileFromDirectoryBasic() throws Exception {
-    // parameters
-    Configuration configuration = getConfiguration();
-
-    FileSystem fs = FileSystem.get(configuration);
-
-    // create
-    Path tmpDir = this.getTestTempDirPath();
-    Path inputDir = new Path(tmpDir, "inputDir");
-    fs.mkdirs(inputDir);
-
-    Path outputDir = new Path(tmpDir, "outputDir");
-    Path outputDirRecursive = new Path(tmpDir, "outputDirRecursive");
-
-    Path inputDirRecursive = new Path(tmpDir, "inputDirRecur");
-    fs.mkdirs(inputDirRecursive);
-
-    // prepare input files
-    createFilesFromArrays(configuration, inputDir, DATA1);
-
-    SequenceFilesFromDirectory.main(new String[]{
-      "--input", inputDir.toString(),
-      "--output", outputDir.toString(),
-      "--chunkSize", "64",
-      "--charset", Charsets.UTF_8.name(),
-      "--keyPrefix", "UID",
-      "--method", "sequential"});
-
-    // check output chunk files
-    checkChunkFiles(configuration, outputDir, DATA1, "UID");
-
-    createRecursiveDirFilesFromArrays(configuration, inputDirRecursive, DATA2);
-
-    FileStatus fstInputPath = fs.getFileStatus(inputDirRecursive);
-    String dirs = HadoopUtil.buildDirList(fs, fstInputPath);
-
-    System.out.println("\n\n ----- recursive dirs: " + dirs);
-    SequenceFilesFromDirectory.main(new String[]{
-      "--input", inputDirRecursive.toString(),
-      "--output", outputDirRecursive.toString(),
-      "--chunkSize", "64",
-      "--charset", Charsets.UTF_8.name(),
-      "--keyPrefix", "UID",
-      "--method", "sequential"});
-
-    checkRecursiveChunkFiles(configuration, outputDirRecursive, DATA2, "UID");
-  }
-
-  @Test
-  public void testSequenceFileFromDirectoryMapReduce() throws Exception {
-
-    Configuration conf = getConfiguration();
-
-    FileSystem fs = FileSystem.get(conf);
-
-    // create
-    Path tmpDir = this.getTestTempDirPath();
-    Path inputDir = new Path(tmpDir, "inputDir");
-    fs.mkdirs(inputDir);
-
-    Path inputDirRecur = new Path(tmpDir, "inputDirRecur");
-    fs.mkdirs(inputDirRecur);
-
-    Path mrOutputDir = new Path(tmpDir, "mrOutputDir");
-    Path mrOutputDirRecur = new Path(tmpDir, "mrOutputDirRecur");
-
-    createFilesFromArrays(conf, inputDir, DATA1);
-
-    SequenceFilesFromDirectory.main(new String[]{
-      "-Dhadoop.tmp.dir=" + conf.get("hadoop.tmp.dir"),
-      "--input", inputDir.toString(),
-      "--output", mrOutputDir.toString(),
-      "--chunkSize", "64",
-      "--charset", Charsets.UTF_8.name(),
-      "--method", "mapreduce",
-      "--keyPrefix", "UID",
-      "--fileFilterClass", "org.apache.mahout.text.TestPathFilter"
-    });
-
-    checkMRResultFiles(conf, mrOutputDir, DATA1, "UID");
-
-    createRecursiveDirFilesFromArrays(conf, inputDirRecur, DATA2);
-
-    FileStatus fst_input_path = fs.getFileStatus(inputDirRecur);
-    String dirs = HadoopUtil.buildDirList(fs, fst_input_path);
-
-    logger.info("\n\n ---- recursive dirs: {}", dirs);
-
-    SequenceFilesFromDirectory.main(new String[]{
-      "-Dhadoop.tmp.dir=" + conf.get("hadoop.tmp.dir"),
-      "--input", inputDirRecur.toString(),
-      "--output", mrOutputDirRecur.toString(),
-      "--chunkSize", "64",
-      "--charset", Charsets.UTF_8.name(),
-      "--method", "mapreduce",
-      "--keyPrefix", "UID",
-      "--fileFilterClass", "org.apache.mahout.text.TestPathFilter"
-    });
-
-    checkMRResultFilesRecursive(conf, mrOutputDirRecur, DATA2, "UID");
-  }
-
-
-  private static void createFilesFromArrays(Configuration conf, Path inputDir, String[][] data) throws IOException {
-    FileSystem fs = FileSystem.get(conf);
-    for (String[] aData : data) {
-      try (OutputStreamWriter writer =
-               new OutputStreamWriter(fs.create(new Path(inputDir, aData[0])), Charsets.UTF_8)){
-        writer.write(aData[1]);
-      }
-    }
-  }
-
-  private static void createRecursiveDirFilesFromArrays(Configuration configuration, Path inputDir,
-                                                        String[][] data) throws IOException {
-    FileSystem fs = FileSystem.get(configuration);
-
-    logger.info("creativeRecursiveDirFilesFromArrays > based on: {}", inputDir.toString());
-    Path curPath;
-    String currentRecursiveDir = inputDir.toString();
-
-    for (String[] aData : data) {
-      currentRecursiveDir += "/" + aData[0];
-      File subDir = new File(currentRecursiveDir);
-      subDir.mkdir();
-
-      curPath = new Path(subDir.toString(), "file.txt");
-      logger.info("Created file: {}", curPath.toString());
-
-      try (OutputStreamWriter writer = new OutputStreamWriter(fs.create(curPath), Charsets.UTF_8)){
-        writer.write(aData[1]);
-      }
-    }
-  }
-
-  private static void checkChunkFiles(Configuration configuration,
-                                      Path outputDir,
-                                      String[][] data,
-                                      String prefix) throws IOException {
-    FileSystem fs = FileSystem.get(configuration);
-
-    // output exists?
-    FileStatus[] fileStatuses = fs.listStatus(outputDir, PathFilters.logsCRCFilter());
-    assertEquals(1, fileStatuses.length); // only one
-    assertEquals("chunk-0", fileStatuses[0].getPath().getName());
-
-    Map<String, String> fileToData = new HashMap<>();
-    for (String[] aData : data) {
-      fileToData.put(prefix + Path.SEPARATOR + aData[0], aData[1]);
-    }
-
-    // read a chunk to check content
-    try (SequenceFileIterator<Text, Text> iterator =
-             new SequenceFileIterator<>(fileStatuses[0].getPath(), true, configuration)){
-      while (iterator.hasNext()) {
-        Pair<Text, Text> record = iterator.next();
-        String retrievedData = fileToData.get(record.getFirst().toString().trim());
-        assertNotNull(retrievedData);
-        assertEquals(retrievedData, record.getSecond().toString().trim());
-      }
-    }
-  }
-
-  private static void checkRecursiveChunkFiles(Configuration configuration,
-                                               Path outputDir,
-                                               String[][] data,
-                                               String prefix) throws IOException {
-    FileSystem fs = FileSystem.get(configuration);
-
-    System.out.println(" ----------- check_Recursive_ChunkFiles ------------");
-
-    // output exists?
-    FileStatus[] fileStatuses = fs.listStatus(outputDir, PathFilters.logsCRCFilter());
-    assertEquals(1, fileStatuses.length); // only one
-    assertEquals("chunk-0", fileStatuses[0].getPath().getName());
-
-
-    Map<String, String> fileToData = new HashMap<>();
-    String currentPath = prefix;
-    for (String[] aData : data) {
-      currentPath += Path.SEPARATOR + aData[0];
-      fileToData.put(currentPath + Path.SEPARATOR + "file.txt", aData[1]);
-    }
-
-    // read a chunk to check content
-    try (SequenceFileIterator<Text, Text> iterator =
-             new SequenceFileIterator<>(fileStatuses[0].getPath(), true, configuration)) {
-      while (iterator.hasNext()) {
-        Pair<Text, Text> record = iterator.next();
-        String retrievedData = fileToData.get(record.getFirst().toString().trim());
-        System.out.printf("%s >> %s\n", record.getFirst().toString().trim(), record.getSecond().toString().trim());
-
-        assertNotNull(retrievedData);
-        assertEquals(retrievedData, record.getSecond().toString().trim());
-        System.out.printf(">>> k: %s, v: %s\n", record.getFirst().toString(), record.getSecond().toString());
-      }
-    }
-  }
-
-  private static void checkMRResultFiles(Configuration conf, Path outputDir,
-                                         String[][] data, String prefix) throws IOException {
-    FileSystem fs = FileSystem.get(conf);
-
-    // output exists?
-    FileStatus[] fileStatuses = fs.listStatus(outputDir.suffix("/part-m-00000"), PathFilters.logsCRCFilter());
-    assertEquals(1, fileStatuses.length); // only one
-    assertEquals("part-m-00000", fileStatuses[0].getPath().getName());
-    Map<String, String> fileToData = new HashMap<>();
-    for (String[] aData : data) {
-      System.out.printf("map.put: %s %s\n", prefix + Path.SEPARATOR + aData[0], aData[1]);
-      fileToData.put(prefix + Path.SEPARATOR + aData[0], aData[1]);
-    }
-
-    // read a chunk to check content
-    try (SequenceFileIterator<Text, Text> iterator = new SequenceFileIterator<>(
-        fileStatuses[0].getPath(), true, conf)) {
-      while (iterator.hasNext()) {
-        Pair<Text, Text> record = iterator.next();
-        String retrievedData = fileToData.get(record.getFirst().toString().trim());
-
-        System.out.printf("MR> %s >> %s\n", record.getFirst().toString().trim(), record.getSecond().toString().trim());
-        assertNotNull(retrievedData);
-        assertEquals(retrievedData, record.getSecond().toString().trim());
-      }
-    }
-  }
-
-  private static void checkMRResultFilesRecursive(Configuration configuration, Path outputDir,
-                                                  String[][] data, String prefix) throws IOException {
-    FileSystem fs = FileSystem.get(configuration);
-
-    // output exists?
-    FileStatus[] fileStatuses = fs.listStatus(outputDir.suffix("/part-m-00000"), PathFilters.logsCRCFilter());
-    assertEquals(1, fileStatuses.length); // only one
-    assertEquals("part-m-00000", fileStatuses[0].getPath().getName());
-    Map<String, String> fileToData = new HashMap<>();
-    String currentPath = prefix;
-
-    for (String[] aData : data) {
-      currentPath += Path.SEPARATOR + aData[0];
-      fileToData.put(currentPath + Path.SEPARATOR + "file.txt", aData[1]);
-    }
-
-    // read a chunk to check content
-    try (SequenceFileIterator<Text, Text> iterator = new SequenceFileIterator<>(
-        fileStatuses[0].getPath(), true, configuration)){
-      while (iterator.hasNext()) {
-        Pair<Text, Text> record = iterator.next();
-        System.out.printf("MR-Recur > Trying to check: %s\n", record.getFirst().toString().trim());
-        String retrievedData = fileToData.get(record.getFirst().toString().trim());
-        assertNotNull(retrievedData);
-        assertEquals(retrievedData, record.getSecond().toString().trim());
-      }
-    }
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/text/doc/MultipleFieldsDocument.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/text/doc/MultipleFieldsDocument.java b/integration/src/test/java/org/apache/mahout/text/doc/MultipleFieldsDocument.java
deleted file mode 100644
index 7483b2d..0000000
--- a/integration/src/test/java/org/apache/mahout/text/doc/MultipleFieldsDocument.java
+++ /dev/null
@@ -1,58 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.mahout.text.doc;
-
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.TextField;
-
-/**
- * Used for testing lucene2seq
- */
-@Deprecated
-public class MultipleFieldsDocument extends SingleFieldDocument {
-
-  public static final String FIELD1 = "field1";
-  public static final String FIELD2 = "field2";
-
-  private String field1;
-  private String field2;
-
-  public MultipleFieldsDocument(String id, String field, String field1, String field2) {
-    super(id, field);
-    this.field1 = field1;
-    this.field2 = field2;
-  }
-
-  public String getField1() {
-    return field1;
-  }
-
-  public String getField2() {
-    return field2;
-  }
-
-  @Override
-  public Document asLuceneDocument() {
-    Document document = super.asLuceneDocument();
-
-    document.add(new TextField(FIELD1, this.field1, Field.Store.YES));
-    document.add(new TextField(FIELD2, this.field2, Field.Store.YES));
-
-    return document;
-  }
-}