You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ra...@apache.org on 2018/06/27 14:51:54 UTC
[26/51] [partial] mahout git commit: MAHOUT-2042 and MAHOUT-2045
Delete directories which were moved/no longer in use
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java
deleted file mode 100644
index 6a8c659..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java
+++ /dev/null
@@ -1,99 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.lucene;
-
-import java.io.IOException;
-import java.util.Set;
-import java.util.TreeSet;
-
-import com.google.common.base.Preconditions;
-import org.apache.lucene.index.IndexReader;
-import org.apache.mahout.utils.vectors.TermInfo;
-import org.apache.mahout.vectorizer.Weight;
-
-/**
- * An {@link java.util.Iterator} over {@link org.apache.mahout.math.Vector}s that uses a Lucene index as the source
- * for creating the {@link org.apache.mahout.math.Vector}s. The field used to create the vectors currently must have
- * term vectors stored for it.
- */
-public class LuceneIterator extends AbstractLuceneIterator {
-
- protected final Set<String> idFieldSelector;
- protected final String idField;
-
- /**
- * Produce a LuceneIterable that can create the Vector plus normalize it.
- *
- * @param indexReader {@link IndexReader} to read the documents from.
- * @param idField field containing the id. May be null.
- * @param field field to use for the Vector
- * @param termInfo termInfo
- * @param weight weight
- * @param normPower the normalization value. Must be non-negative, or {@link LuceneIterable#NO_NORMALIZING}
- */
- public LuceneIterator(IndexReader indexReader, String idField, String field, TermInfo termInfo, Weight weight,
- double normPower) {
- this(indexReader, idField, field, termInfo, weight, normPower, 0.0);
- }
-
- /**
- * @param indexReader {@link IndexReader} to read the documents from.
- * @param idField field containing the id. May be null.
- * @param field field to use for the Vector
- * @param termInfo termInfo
- * @param weight weight
- * @param normPower the normalization value. Must be non-negative, or {@link LuceneIterable#NO_NORMALIZING}
- * @param maxPercentErrorDocs most documents that will be tolerated without a term freq vector. In [0,1].
- * @see #LuceneIterator(org.apache.lucene.index.IndexReader, String, String, org.apache.mahout.utils.vectors.TermInfo,
- * org.apache.mahout.vectorizer.Weight, double)
- */
- public LuceneIterator(IndexReader indexReader,
- String idField,
- String field,
- TermInfo termInfo,
- Weight weight,
- double normPower,
- double maxPercentErrorDocs) {
- super(termInfo, normPower, indexReader, weight, maxPercentErrorDocs, field);
- // term docs(null) is a better way of iterating all the docs in Lucene
- Preconditions.checkArgument(normPower == LuceneIterable.NO_NORMALIZING || normPower >= 0,
- "normPower must be non-negative or -1, but normPower = " + normPower);
- Preconditions.checkArgument(maxPercentErrorDocs >= 0.0 && maxPercentErrorDocs <= 1.0,
- "Must be: 0.0 <= maxPercentErrorDocs <= 1.0");
- this.idField = idField;
- if (idField != null) {
- idFieldSelector = new TreeSet<>();
- idFieldSelector.add(idField);
- } else {
- /*The field in the index containing the index. If null, then the Lucene internal doc id is used
- which is prone to error if the underlying index changes*/
- idFieldSelector = null;
- }
- }
-
- @Override
- protected String getVectorName(int documentIndex) throws IOException {
- String name;
- if (idField != null) {
- name = indexReader.document(documentIndex, idFieldSelector).get(idField);
- } else {
- name = String.valueOf(documentIndex);
- }
- return name;
- }
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java
deleted file mode 100644
index 5830ccc..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java
+++ /dev/null
@@ -1,64 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.lucene;
-
-import org.apache.lucene.util.BytesRef;
-import org.apache.mahout.math.RandomAccessSparseVector;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.utils.vectors.TermEntry;
-import org.apache.mahout.utils.vectors.TermInfo;
-import org.apache.mahout.vectorizer.Weight;
-
-
-/**
- * Not thread-safe
- */
-public class TFDFMapper {
-
- private Vector vector;
-
- private final Weight weight;
- private long numTerms;
- private final TermInfo termInfo;
- private String field;
- private final int numDocs;
-
- public TFDFMapper(int numDocs, Weight weight, TermInfo termInfo) {
- this.weight = weight;
- this.termInfo = termInfo;
- this.numDocs = numDocs;
- }
-
- public void setExpectations(String field, long numTerms) {
- this.field = field;
- vector = new RandomAccessSparseVector(termInfo.totalTerms(field));
- this.numTerms = numTerms;
- }
-
- public void map(BytesRef term, int frequency) {
- TermEntry entry = termInfo.getTermEntry(field, term.utf8ToString());
- if (entry != null) {
- vector.setQuick(entry.getTermIdx(), weight.calculate(frequency, entry.getDocFreq(), (int)numTerms, numDocs));
- }
- }
-
- public Vector getVector() {
- return this.vector;
- }
-
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/TermInfoClusterInOut.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/TermInfoClusterInOut.java b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/TermInfoClusterInOut.java
deleted file mode 100644
index b0311c7..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/TermInfoClusterInOut.java
+++ /dev/null
@@ -1,81 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils.vectors.lucene;
-
-import org.apache.mahout.common.RandomUtils;
-
-class TermInfoClusterInOut implements Comparable<TermInfoClusterInOut> {
-
- private final String term;
- private final int inClusterDF;
- private final int outClusterDF;
- private final double logLikelihoodRatio;
-
- TermInfoClusterInOut(String term, int inClusterDF, int outClusterDF, double logLikelihoodRatio) {
- this.term = term;
- this.inClusterDF = inClusterDF;
- this.outClusterDF = outClusterDF;
- this.logLikelihoodRatio = logLikelihoodRatio;
- }
-
- @Override
- public int hashCode() {
- return term.hashCode() ^ inClusterDF ^ outClusterDF ^ RandomUtils.hashDouble(logLikelihoodRatio);
- }
-
- @Override
- public boolean equals(Object o) {
- if (!(o instanceof TermInfoClusterInOut)) {
- return false;
- }
- TermInfoClusterInOut other = (TermInfoClusterInOut) o;
- return term.equals(other.getTerm())
- && inClusterDF == other.getInClusterDF()
- && outClusterDF == other.getOutClusterDF()
- && logLikelihoodRatio == other.getLogLikelihoodRatio();
- }
-
- @Override
- public int compareTo(TermInfoClusterInOut that) {
- int res = Double.compare(that.logLikelihoodRatio, logLikelihoodRatio);
- if (res == 0) {
- res = term.compareTo(that.term);
- }
- return res;
- }
-
- public int getInClusterDiff() {
- return this.inClusterDF - this.outClusterDF;
- }
-
- String getTerm() {
- return term;
- }
-
- int getInClusterDF() {
- return inClusterDF;
- }
-
- int getOutClusterDF() {
- return outClusterDF;
- }
-
- double getLogLikelihoodRatio() {
- return logLikelihoodRatio;
- }
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/MySQLJDBCInMemoryItemSimilarityTest.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/MySQLJDBCInMemoryItemSimilarityTest.java b/integration/src/test/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/MySQLJDBCInMemoryItemSimilarityTest.java
deleted file mode 100644
index 463a45f..0000000
--- a/integration/src/test/java/org/apache/mahout/cf/taste/impl/similarity/jdbc/MySQLJDBCInMemoryItemSimilarityTest.java
+++ /dev/null
@@ -1,79 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.impl.similarity.jdbc;
-
-import org.apache.mahout.cf.taste.impl.TasteTestCase;
-import org.apache.mahout.cf.taste.similarity.ItemSimilarity;
-import org.easymock.EasyMock;
-import org.junit.Test;
-
-import javax.sql.DataSource;
-import java.sql.Connection;
-import java.sql.PreparedStatement;
-import java.sql.ResultSet;
-
-public class MySQLJDBCInMemoryItemSimilarityTest extends TasteTestCase {
-
- @Test
- public void testMemoryLoad() throws Exception {
-
- DataSource dataSource = EasyMock.createMock(DataSource.class);
- Connection connection = EasyMock.createMock(Connection.class);
- PreparedStatement statement = EasyMock.createMock(PreparedStatement.class);
- ResultSet resultSet = EasyMock.createMock(ResultSet.class);
-
- EasyMock.expect(dataSource.getConnection()).andReturn(connection);
- EasyMock.expect(connection.prepareStatement(MySQLJDBCInMemoryItemSimilarity.DEFAULT_GET_ALL_ITEMSIMILARITIES_SQL,
- ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY)).andReturn(statement);
- statement.setFetchDirection(ResultSet.FETCH_FORWARD);
- EasyMock.expect(statement.executeQuery()).andReturn(resultSet);
-
- EasyMock.expect(resultSet.next()).andReturn(true);
-
- EasyMock.expect(resultSet.getLong(1)).andReturn(1L);
- EasyMock.expect(resultSet.getLong(2)).andReturn(2L);
- EasyMock.expect(resultSet.getDouble(3)).andReturn(0.5);
- EasyMock.expect(resultSet.next()).andReturn(true);
-
- EasyMock.expect(resultSet.getLong(1)).andReturn(1L);
- EasyMock.expect(resultSet.getLong(2)).andReturn(3L);
- EasyMock.expect(resultSet.getDouble(3)).andReturn(0.4);
- EasyMock.expect(resultSet.next()).andReturn(true);
-
- EasyMock.expect(resultSet.getLong(1)).andReturn(3L);
- EasyMock.expect(resultSet.getLong(2)).andReturn(4L);
- EasyMock.expect(resultSet.getDouble(3)).andReturn(0.1);
-
- EasyMock.expect(resultSet.next()).andReturn(false);
-
- resultSet.close();
- statement.close();
- connection.close();
-
- EasyMock.replay(dataSource, connection, statement, resultSet);
-
- ItemSimilarity similarity = new MySQLJDBCInMemoryItemSimilarity(dataSource);
-
- assertEquals(0.5, similarity.itemSimilarity(1L, 2L), EPSILON);
- assertEquals(0.4, similarity.itemSimilarity(1L, 3L), EPSILON);
- assertEquals(0.1, similarity.itemSimilarity(3L, 4L), EPSILON);
- assertTrue(Double.isNaN(similarity.itemSimilarity(1L, 4L)));
-
- EasyMock.verify(dataSource, connection, statement, resultSet);
- }
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java b/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
deleted file mode 100644
index 01d46fc..0000000
--- a/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
+++ /dev/null
@@ -1,236 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.FieldType;
-import org.apache.lucene.document.StringField;
-import org.apache.lucene.index.DirectoryReader;
-import org.apache.lucene.index.IndexOptions;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.IndexWriterConfig;
-import org.apache.lucene.store.RAMDirectory;
-import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver;
-import org.apache.mahout.clustering.kmeans.KMeansDriver;
-import org.apache.mahout.clustering.kmeans.RandomSeedGenerator;
-import org.apache.mahout.common.MahoutTestCase;
-import org.apache.mahout.common.distance.DistanceMeasure;
-import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
-import org.apache.mahout.math.NamedVector;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.VectorWritable;
-import org.apache.mahout.utils.clustering.ClusterDumper;
-import org.apache.mahout.utils.vectors.TermEntry;
-import org.apache.mahout.utils.vectors.TermInfo;
-import org.apache.mahout.utils.vectors.lucene.CachedTermInfo;
-import org.apache.mahout.utils.vectors.lucene.LuceneIterable;
-import org.apache.mahout.vectorizer.TFIDF;
-import org.apache.mahout.vectorizer.Weight;
-import org.junit.Before;
-import org.junit.Test;
-
-public final class TestClusterDumper extends MahoutTestCase {
-
- private static final String[] DOCS = {
- "The quick red fox jumped over the lazy brown dogs.",
- "The quick brown fox jumped over the lazy red dogs.",
- "The quick red cat jumped over the lazy brown dogs.",
- "The quick brown cat jumped over the lazy red dogs.",
- "Mary had a little lamb whose fleece was white as snow.",
- "Mary had a little goat whose fleece was white as snow.",
- "Mary had a little lamb whose fleece was black as tar.",
- "Dick had a little goat whose fleece was white as snow.",
- "Moby Dick is a story of a whale and a man obsessed.",
- "Moby Bob is a story of a walrus and a man obsessed.",
- "Moby Dick is a story of a whale and a crazy man.",
- "The robber wore a black fleece jacket and a baseball cap.",
- "The robber wore a red fleece jacket and a baseball cap.",
- "The robber wore a white fleece jacket and a baseball cap.",
- "The English Springer Spaniel is the best of all dogs."};
-
- private List<VectorWritable> sampleData;
-
- private String[] termDictionary;
-
- @Override
- @Before
- public void setUp() throws Exception {
- super.setUp();
- Configuration conf = getConfiguration();
- FileSystem fs = FileSystem.get(conf);
- // Create test data
- getSampleData(DOCS);
- ClusteringTestUtils.writePointsToFile(sampleData, true,
- getTestTempFilePath("testdata/file1"), fs, conf);
- }
-
- private void getSampleData(String[] docs2) throws IOException {
- sampleData = new ArrayList<>();
- RAMDirectory directory = new RAMDirectory();
- try (IndexWriter writer = new IndexWriter(directory,
- new IndexWriterConfig(new StandardAnalyzer()))){
- for (int i = 0; i < docs2.length; i++) {
- Document doc = new Document();
- Field id = new StringField("id", "doc_" + i, Field.Store.YES);
- doc.add(id);
- // Store both position and offset information
- FieldType fieldType = new FieldType();
- fieldType.setStored(false);
- fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
- fieldType.setTokenized(true);
- fieldType.setStoreTermVectors(true);
- fieldType.setStoreTermVectorPositions(true);
- fieldType.setStoreTermVectorOffsets(true);
- fieldType.freeze();
- Field text = new Field("content", docs2[i], fieldType);
- doc.add(text);
- writer.addDocument(doc);
- }
- }
-
- IndexReader reader = DirectoryReader.open(directory);
-
- Weight weight = new TFIDF();
- TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 100);
-
- int numTerms = 0;
- for (Iterator<TermEntry> it = termInfo.getAllEntries(); it.hasNext();) {
- it.next();
- numTerms++;
- }
- termDictionary = new String[numTerms];
- int i = 0;
- for (Iterator<TermEntry> it = termInfo.getAllEntries(); it.hasNext();) {
- String term = it.next().getTerm();
- termDictionary[i] = term;
- System.out.println(i + " " + term);
- i++;
- }
- Iterable<Vector> iterable = new LuceneIterable(reader, "id", "content",
- termInfo,weight);
-
- i = 0;
- for (Vector vector : iterable) {
- assertNotNull(vector);
- NamedVector namedVector;
- if (vector instanceof NamedVector) {
- // rename it for testing purposes
- namedVector = new NamedVector(((NamedVector) vector).getDelegate(),
- "P(" + i + ')');
-
- } else {
- namedVector = new NamedVector(vector, "P(" + i + ')');
- }
- System.out.println(AbstractCluster.formatVector(namedVector,
- termDictionary));
- sampleData.add(new VectorWritable(namedVector));
- i++;
- }
- }
-
- /**
- * Return the path to the final iteration's clusters
- */
- private static Path finalClusterPath(Configuration conf, Path output,
- int maxIterations) throws IOException {
- FileSystem fs = FileSystem.get(conf);
- for (int i = maxIterations; i >= 0; i--) {
- Path clusters = new Path(output, "clusters-" + i + "-final");
- if (fs.exists(clusters)) {
- return clusters;
- }
- }
- return null;
- }
-
- @Test
- public void testKmeans() throws Exception {
- DistanceMeasure measure = new EuclideanDistanceMeasure();
- Path input = getTestTempFilePath("input");
- Path output = getTestTempDirPath("output");
- Path initialPoints = new Path(output, Cluster.CLUSTERS_DIR + '0' + Cluster.FINAL_ITERATION_SUFFIX);
- Configuration conf = getConfiguration();
- FileSystem fs = FileSystem.get(conf);
- // Write test data to file
- ClusteringTestUtils.writePointsToFile(sampleData, input, fs, conf);
- // Select initial centroids
- RandomSeedGenerator.buildRandom(conf, input, initialPoints, 8, measure, 1L);
- // Run k-means
- Path kMeansOutput = new Path(output, "kmeans");
- KMeansDriver.run(conf, getTestTempDirPath("testdata"), initialPoints, kMeansOutput, 0.001, 10, true, 0.0, false);
- // Print out clusters
- ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf,
- output, 10), new Path(kMeansOutput, "clusteredPoints"));
- clusterDumper.printClusters(termDictionary);
- }
-
- @Test
- public void testJsonClusterDumper() throws Exception {
- DistanceMeasure measure = new EuclideanDistanceMeasure();
- Path input = getTestTempFilePath("input");
- Path output = getTestTempDirPath("output");
- Path initialPoints = new Path(output, Cluster.CLUSTERS_DIR + '0' + Cluster.FINAL_ITERATION_SUFFIX);
- Configuration conf = getConfiguration();
- FileSystem fs = FileSystem.get(conf);
- // Write test data to file
- ClusteringTestUtils.writePointsToFile(sampleData, input, fs, conf);
- // Select initial centroids
- RandomSeedGenerator.buildRandom(conf, input, initialPoints, 8, measure, 1L);
- // Run k-means
- Path kmeansOutput = new Path(output, "kmeans");
- KMeansDriver.run(conf, getTestTempDirPath("testdata"), initialPoints, kmeansOutput, 0.001, 10, true, 0.0, false);
- // Print out clusters
- ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf,
- output, 10), new Path(kmeansOutput, "clusteredPoints"));
- clusterDumper.setOutputFormat(ClusterDumper.OUTPUT_FORMAT.JSON);
- clusterDumper.printClusters(termDictionary);
- }
-
- @Test
- public void testFuzzyKmeans() throws Exception {
- DistanceMeasure measure = new EuclideanDistanceMeasure();
- Path input = getTestTempFilePath("input");
- Path output = getTestTempDirPath("output");
- Path initialPoints = new Path(output, Cluster.CLUSTERS_DIR + '0' + Cluster.FINAL_ITERATION_SUFFIX);
- Configuration conf = getConfiguration();
- FileSystem fs = FileSystem.get(conf);
- // Write test data to file
- ClusteringTestUtils.writePointsToFile(sampleData, input, fs, conf);
- // Select initial centroids
- RandomSeedGenerator.buildRandom(conf, input, initialPoints, 8, measure, 1L);
- // Run k-means
- Path kMeansOutput = new Path(output, "kmeans");
- FuzzyKMeansDriver.run(conf, getTestTempDirPath("testdata"), initialPoints, kMeansOutput, 0.001, 10, 1.1f, true,
- true, 0, true);
- // run ClusterDumper
- ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf,
- output, 10), new Path(kMeansOutput, "clusteredPoints"));
- clusterDumper.printClusters(termDictionary);
- }
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java b/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java
deleted file mode 100644
index 8a226a0..0000000
--- a/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java
+++ /dev/null
@@ -1,321 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering;
-
-import java.io.IOException;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.mahout.clustering.canopy.Canopy;
-import org.apache.mahout.clustering.canopy.CanopyDriver;
-import org.apache.mahout.clustering.evaluation.ClusterEvaluator;
-import org.apache.mahout.clustering.evaluation.RepresentativePointsDriver;
-import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver;
-import org.apache.mahout.clustering.kmeans.KMeansDriver;
-import org.apache.mahout.clustering.kmeans.TestKmeansClustering;
-import org.apache.mahout.common.HadoopUtil;
-import org.apache.mahout.common.MahoutTestCase;
-import org.apache.mahout.common.distance.DistanceMeasure;
-import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
-import org.apache.mahout.math.DenseVector;
-import org.apache.mahout.math.VectorWritable;
-import org.junit.Before;
-import org.junit.Test;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import com.google.common.collect.Lists;
-import com.google.common.collect.Maps;
-
-public final class TestClusterEvaluator extends MahoutTestCase {
-
- private static final double[][] REFERENCE = { {1, 1}, {2, 1}, {1, 2}, {2, 2}, {3, 3}, {4, 4}, {5, 4}, {4, 5}, {5, 5}};
-
- private List<VectorWritable> referenceData = Lists.newArrayList();
-
- private final List<VectorWritable> sampleData = Lists.newArrayList();
-
- private Map<Integer,List<VectorWritable>> representativePoints;
-
- private List<Cluster> clusters;
-
- private static final Logger log = LoggerFactory.getLogger(TestClusterEvaluator.class);
-
- private Configuration conf;
-
- private FileSystem fs;
-
- private Path testdata;
-
- private Path output;
-
- @Override
- @Before
- public void setUp() throws Exception {
- super.setUp();
- conf = getConfiguration();
- fs = FileSystem.get(conf);
- testdata = getTestTempDirPath("testdata");
- output = getTestTempDirPath("output");
- // Create small reference data set
- referenceData = TestKmeansClustering.getPointsWritable(REFERENCE);
- // generate larger test data set for the clustering tests to chew on
- generateSamples();
- }
-
- /**
- * Generate random samples and add them to the sampleData
- *
- * @param num
- * int number of samples to generate
- * @param mx
- * double x-value of the sample mean
- * @param my
- * double y-value of the sample mean
- * @param sd
- * double standard deviation of the samples
- */
- private void generateSamples(int num, double mx, double my, double sd) {
- log.info("Generating {} samples m=[{}, {}] sd={}", num, mx, my, sd);
- for (int i = 0; i < num; i++) {
- sampleData.add(new VectorWritable(new DenseVector(new double[] {UncommonDistributions.rNorm(mx, sd),
- UncommonDistributions.rNorm(my, sd)})));
- }
- }
-
- private void generateSamples() {
- generateSamples(500, 1, 1, 3);
- generateSamples(300, 1, 0, 0.5);
- generateSamples(300, 0, 2, 0.1);
- }
-
- private void printRepPoints(int numIterations) {
- RepresentativePointsDriver.printRepresentativePoints(output, numIterations);
- }
-
- /**
- * Initialize synthetic data using 4 clusters dC units from origin having 4 representative points dP from each center
- *
- * @param dC
- * a double cluster center offset
- * @param dP
- * a double representative point offset
- * @param measure
- * the DistanceMeasure
- */
- private void initData(double dC, double dP, DistanceMeasure measure) {
- clusters = Lists.newArrayList();
- clusters.add(new Canopy(new DenseVector(new double[] {-dC, -dC}), 1, measure));
- clusters.add(new Canopy(new DenseVector(new double[] {-dC, dC}), 3, measure));
- clusters.add(new Canopy(new DenseVector(new double[] {dC, dC}), 5, measure));
- clusters.add(new Canopy(new DenseVector(new double[] {dC, -dC}), 7, measure));
- representativePoints = Maps.newHashMap();
- for (Cluster cluster : clusters) {
- List<VectorWritable> points = Lists.newArrayList();
- representativePoints.put(cluster.getId(), points);
- points.add(new VectorWritable(cluster.getCenter().clone()));
- points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {dP, dP}))));
- points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {dP, -dP}))));
- points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {-dP, -dP}))));
- points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {-dP, dP}))));
- }
- }
-
- @Test
- public void testRepresentativePoints() throws Exception {
- ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata, "file1"), fs, conf);
- DistanceMeasure measure = new EuclideanDistanceMeasure();
- Configuration conf = getConfiguration();
- // run using MR reference point calculation
- CanopyDriver.run(conf, testdata, output, measure, 3.1, 1.1, true, 0.0, true);
- int numIterations = 2;
- Path clustersIn = new Path(output, "clusters-0-final");
- RepresentativePointsDriver.run(conf, clustersIn, new Path(output, "clusteredPoints"), output, measure,
- numIterations, false);
- printRepPoints(numIterations);
- ClusterEvaluator evaluatorMR = new ClusterEvaluator(conf, clustersIn);
- // now run again using sequential reference point calculation
- HadoopUtil.delete(conf, output);
- CanopyDriver.run(conf, testdata, output, measure, 3.1, 1.1, true, 0.0, true);
- RepresentativePointsDriver.run(conf, clustersIn, new Path(output, "clusteredPoints"), output, measure,
- numIterations, true);
- printRepPoints(numIterations);
- ClusterEvaluator evaluatorSeq = new ClusterEvaluator(conf, clustersIn);
- // compare results
- assertEquals("InterCluster Density", evaluatorMR.interClusterDensity(), evaluatorSeq.interClusterDensity(), EPSILON);
- assertEquals("IntraCluster Density", evaluatorMR.intraClusterDensity(), evaluatorSeq.intraClusterDensity(), EPSILON);
- }
-
- @Test
- public void testCluster0() throws IOException {
- ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata, "file1"), fs, conf);
- DistanceMeasure measure = new EuclideanDistanceMeasure();
- initData(1, 0.25, measure);
- ClusterEvaluator evaluator = new ClusterEvaluator(representativePoints, clusters, measure);
- assertEquals("inter cluster density", 0.33333333333333315, evaluator.interClusterDensity(), EPSILON);
- assertEquals("intra cluster density", 0.3656854249492381, evaluator.intraClusterDensity(), EPSILON);
- }
-
- @Test
- public void testCluster1() throws IOException {
- ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata, "file1"), fs, conf);
- DistanceMeasure measure = new EuclideanDistanceMeasure();
- initData(1, 0.5, measure);
- ClusterEvaluator evaluator = new ClusterEvaluator(representativePoints, clusters, measure);
- assertEquals("inter cluster density", 0.33333333333333315, evaluator.interClusterDensity(), EPSILON);
- assertEquals("intra cluster density", 0.3656854249492381, evaluator.intraClusterDensity(), EPSILON);
- }
-
- @Test
- public void testCluster2() throws IOException {
- ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata, "file1"), fs, conf);
- DistanceMeasure measure = new EuclideanDistanceMeasure();
- initData(1, 0.75, measure);
- ClusterEvaluator evaluator = new ClusterEvaluator(representativePoints, clusters, measure);
- assertEquals("inter cluster density", 0.33333333333333315, evaluator.interClusterDensity(), EPSILON);
- assertEquals("intra cluster density", 0.3656854249492381, evaluator.intraClusterDensity(), EPSILON);
- }
-
- /**
- * adding an empty cluster should modify the inter cluster density but not change the intra-cluster density as that
- * cluster would have NaN as its intra-cluster density and NaN values are ignored by the evaluator
- *
- * @throws IOException
- */
- @Test
- public void testEmptyCluster() throws IOException {
- ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata, "file1"), fs, conf);
- DistanceMeasure measure = new EuclideanDistanceMeasure();
- initData(1, 0.25, measure);
- Canopy cluster = new Canopy(new DenseVector(new double[] {10, 10}), 19, measure);
- clusters.add(cluster);
- List<VectorWritable> points = Lists.newArrayList();
- representativePoints.put(cluster.getId(), points);
- ClusterEvaluator evaluator = new ClusterEvaluator(representativePoints, clusters, measure);
- assertEquals("inter cluster density", 0.371534146934532, evaluator.interClusterDensity(), EPSILON);
- assertEquals("intra cluster density", 0.3656854249492381, evaluator.intraClusterDensity(), EPSILON);
- }
-
- /**
- * adding an single-valued cluster should modify the inter cluster density but not change the intra-cluster density as
- * that cluster would have NaN as its intra-cluster density and NaN values are ignored by the evaluator
- *
- * @throws IOException
- */
- @Test
- public void testSingleValueCluster() throws IOException {
- ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata, "file1"), fs, conf);
- DistanceMeasure measure = new EuclideanDistanceMeasure();
- initData(1, 0.25, measure);
- Canopy cluster = new Canopy(new DenseVector(new double[] {0, 0}), 19, measure);
- clusters.add(cluster);
- List<VectorWritable> points = Lists.newArrayList();
- points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {1, 1}))));
- representativePoints.put(cluster.getId(), points);
- ClusterEvaluator evaluator = new ClusterEvaluator(representativePoints, clusters, measure);
- assertEquals("inter cluster density", 0.3656854249492381, evaluator.interClusterDensity(), EPSILON);
- assertEquals("intra cluster density", 0.3656854249492381, evaluator.intraClusterDensity(), EPSILON);
- }
-
- /**
- * Representative points extraction will duplicate the cluster center if the cluster has no assigned points. These
- * clusters are included in the inter-cluster density but their NaN intra-density values are ignored by the evaluator.
- *
- * @throws IOException
- */
- @Test
- public void testAllSameValueCluster() throws IOException {
- ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata, "file1"), fs, conf);
- DistanceMeasure measure = new EuclideanDistanceMeasure();
- initData(1, 0.25, measure);
- Canopy cluster = new Canopy(new DenseVector(new double[] {0, 0}), 19, measure);
- clusters.add(cluster);
- List<VectorWritable> points = Lists.newArrayList();
- points.add(new VectorWritable(cluster.getCenter()));
- points.add(new VectorWritable(cluster.getCenter()));
- points.add(new VectorWritable(cluster.getCenter()));
- representativePoints.put(cluster.getId(), points);
- ClusterEvaluator evaluator = new ClusterEvaluator(representativePoints, clusters, measure);
- assertEquals("inter cluster density", 0.3656854249492381, evaluator.interClusterDensity(), EPSILON);
- assertEquals("intra cluster density", 0.3656854249492381, evaluator.intraClusterDensity(), EPSILON);
- }
-
- @Test
- public void testCanopy() throws Exception {
- ClusteringTestUtils.writePointsToFile(sampleData, new Path(testdata, "file1"), fs, conf);
- DistanceMeasure measure = new EuclideanDistanceMeasure();
- Configuration conf = getConfiguration();
- CanopyDriver.run(conf, testdata, output, measure, 3.1, 1.1, true, 0.0, true);
- int numIterations = 10;
- Path clustersIn = new Path(output, "clusters-0-final");
- RepresentativePointsDriver.run(conf, clustersIn, new Path(output, "clusteredPoints"), output, measure,
- numIterations, true);
- //printRepPoints(numIterations);
- ClusterEvaluator evaluator = new ClusterEvaluator(conf, clustersIn);
- // now print out the Results
- System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
- System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
- }
-
- @Test
- public void testKmeans() throws Exception {
- ClusteringTestUtils.writePointsToFile(sampleData, new Path(testdata, "file1"), fs, conf);
- DistanceMeasure measure = new EuclideanDistanceMeasure();
- // now run the Canopy job to prime kMeans canopies
- Configuration conf = getConfiguration();
- CanopyDriver.run(conf, testdata, output, measure, 3.1, 1.1, false, 0.0, true);
- // now run the KMeans job
- Path kmeansOutput = new Path(output, "kmeans");
- KMeansDriver.run(testdata, new Path(output, "clusters-0-final"), kmeansOutput, 0.001, 10, true, 0.0, true);
- int numIterations = 10;
- Path clustersIn = new Path(kmeansOutput, "clusters-2");
- RepresentativePointsDriver.run(conf, clustersIn, new Path(kmeansOutput, "clusteredPoints"), kmeansOutput, measure,
- numIterations, true);
- RepresentativePointsDriver.printRepresentativePoints(kmeansOutput, numIterations);
- ClusterEvaluator evaluator = new ClusterEvaluator(conf, clustersIn);
- // now print out the Results
- System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
- System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
- }
-
- @Test
- public void testFuzzyKmeans() throws Exception {
- ClusteringTestUtils.writePointsToFile(sampleData, new Path(testdata, "file1"), fs, conf);
- DistanceMeasure measure = new EuclideanDistanceMeasure();
- // now run the Canopy job to prime kMeans canopies
- Configuration conf = getConfiguration();
- CanopyDriver.run(conf, testdata, output, measure, 3.1, 1.1, false, 0.0, true);
- Path fuzzyKMeansOutput = new Path(output, "fuzzyk");
- // now run the KMeans job
- FuzzyKMeansDriver.run(testdata, new Path(output, "clusters-0-final"), fuzzyKMeansOutput, 0.001, 10, 2,
- true, true, 0, true);
- int numIterations = 10;
- Path clustersIn = new Path(fuzzyKMeansOutput, "clusters-4");
- RepresentativePointsDriver.run(conf, clustersIn, new Path(fuzzyKMeansOutput, "clusteredPoints"), fuzzyKMeansOutput,
- measure, numIterations, true);
- RepresentativePointsDriver.printRepresentativePoints(fuzzyKMeansOutput, numIterations);
- ClusterEvaluator evaluator = new ClusterEvaluator(conf, clustersIn);
- // now print out the Results
- System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
- System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
- }
-
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java b/integration/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java
deleted file mode 100644
index 597ed01..0000000
--- a/integration/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java
+++ /dev/null
@@ -1,326 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering.cdbw;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.mahout.clustering.Cluster;
-import org.apache.mahout.clustering.ClusteringTestUtils;
-import org.apache.mahout.clustering.TestClusterEvaluator;
-import org.apache.mahout.clustering.UncommonDistributions;
-import org.apache.mahout.clustering.canopy.Canopy;
-import org.apache.mahout.clustering.canopy.CanopyDriver;
-import org.apache.mahout.clustering.evaluation.RepresentativePointsDriver;
-import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver;
-import org.apache.mahout.clustering.kmeans.KMeansDriver;
-import org.apache.mahout.clustering.kmeans.TestKmeansClustering;
-import org.apache.mahout.common.MahoutTestCase;
-import org.apache.mahout.common.distance.DistanceMeasure;
-import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
-import org.apache.mahout.math.DenseVector;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.VectorWritable;
-import org.junit.Before;
-import org.junit.Test;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-public final class TestCDbwEvaluator extends MahoutTestCase {
-
- private static final double[][] REFERENCE = { {1, 1}, {2, 1}, {1, 2}, {2, 2}, {3, 3}, {4, 4}, {5, 4}, {4, 5}, {5, 5}};
-
- private static final Logger log = LoggerFactory.getLogger(TestClusterEvaluator.class);
-
- private Map<Integer,List<VectorWritable>> representativePoints;
-
- private List<Cluster> clusters;
-
- private Configuration conf;
-
- private FileSystem fs;
-
- private final Collection<VectorWritable> sampleData = new ArrayList<>();
-
- private List<VectorWritable> referenceData = new ArrayList<>();
-
- private Path testdata;
-
- private Path output;
-
- @Override
- @Before
- public void setUp() throws Exception {
- super.setUp();
- conf = getConfiguration();
- fs = FileSystem.get(conf);
- testdata = getTestTempDirPath("testdata");
- output = getTestTempDirPath("output");
- // Create small reference data set
- referenceData = TestKmeansClustering.getPointsWritable(REFERENCE);
- // generate larger test data set for the clustering tests to chew on
- generateSamples();
- }
-
- /**
- * Initialize synthetic data using 4 clusters dC units from origin having 4 representative points dP from each center
- *
- * @param dC
- * a double cluster center offset
- * @param dP
- * a double representative point offset
- * @param measure
- * the DistanceMeasure
- */
- private void initData(double dC, double dP, DistanceMeasure measure) {
- clusters = new ArrayList<>();
- clusters.add(new Canopy(new DenseVector(new double[] {-dC, -dC}), 1, measure));
- clusters.add(new Canopy(new DenseVector(new double[] {-dC, dC}), 3, measure));
- clusters.add(new Canopy(new DenseVector(new double[] {dC, dC}), 5, measure));
- clusters.add(new Canopy(new DenseVector(new double[] {dC, -dC}), 7, measure));
- representativePoints = new HashMap<>();
- for (Cluster cluster : clusters) {
- List<VectorWritable> points = new ArrayList<>();
- representativePoints.put(cluster.getId(), points);
- points.add(new VectorWritable(cluster.getCenter().clone()));
- points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {dP, dP}))));
- points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {dP, -dP}))));
- points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {-dP, -dP}))));
- points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {-dP, dP}))));
- }
- }
-
- /**
- * Generate random samples and add them to the sampleData
- *
- * @param num
- * int number of samples to generate
- * @param mx
- * double x-value of the sample mean
- * @param my
- * double y-value of the sample mean
- * @param sd
- * double standard deviation of the samples
- */
- private void generateSamples(int num, double mx, double my, double sd) {
- log.info("Generating {} samples m=[{}, {}] sd={}", num, mx, my, sd);
- for (int i = 0; i < num; i++) {
- sampleData.add(new VectorWritable(new DenseVector(new double[] {UncommonDistributions.rNorm(mx, sd),
- UncommonDistributions.rNorm(my, sd)})));
- }
- }
-
- private void generateSamples() {
- generateSamples(500, 1, 1, 3);
- generateSamples(300, 1, 0, 0.5);
- generateSamples(300, 0, 2, 0.1);
- }
-
- @Test
- public void testCDbw0() throws IOException {
- ClusteringTestUtils.writePointsToFile(referenceData, getTestTempFilePath("testdata/file1"), fs, conf);
- DistanceMeasure measure = new EuclideanDistanceMeasure();
- initData(1, 0.25, measure);
- CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
- System.out.println("CDbw = " + evaluator.getCDbw());
- System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
- System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
- System.out.println("Separation = " + evaluator.separation());
- }
-
- @Test
- public void testCDbw1() throws IOException {
- ClusteringTestUtils.writePointsToFile(referenceData, getTestTempFilePath("testdata/file1"), fs, conf);
- DistanceMeasure measure = new EuclideanDistanceMeasure();
- initData(1, 0.5, measure);
- CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
- System.out.println("CDbw = " + evaluator.getCDbw());
- System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
- System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
- System.out.println("Separation = " + evaluator.separation());
- }
-
- @Test
- public void testCDbw2() throws IOException {
- ClusteringTestUtils.writePointsToFile(referenceData, getTestTempFilePath("testdata/file1"), fs, conf);
- DistanceMeasure measure = new EuclideanDistanceMeasure();
- initData(1, 0.75, measure);
- CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
- System.out.println("CDbw = " + evaluator.getCDbw());
- System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
- System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
- System.out.println("Separation = " + evaluator.separation());
- }
-
- @Test
- public void testEmptyCluster() throws IOException {
- ClusteringTestUtils.writePointsToFile(referenceData, getTestTempFilePath("testdata/file1"), fs, conf);
- DistanceMeasure measure = new EuclideanDistanceMeasure();
- initData(1, 0.25, measure);
- Canopy cluster = new Canopy(new DenseVector(new double[] {10, 10}), 19, measure);
- clusters.add(cluster);
- List<VectorWritable> points = new ArrayList<>();
- representativePoints.put(cluster.getId(), points);
- CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
- System.out.println("CDbw = " + evaluator.getCDbw());
- System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
- System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
- System.out.println("Separation = " + evaluator.separation());
- }
-
- @Test
- public void testSingleValueCluster() throws IOException {
- ClusteringTestUtils.writePointsToFile(referenceData, getTestTempFilePath("testdata/file1"), fs, conf);
- DistanceMeasure measure = new EuclideanDistanceMeasure();
- initData(1, 0.25, measure);
- Canopy cluster = new Canopy(new DenseVector(new double[] {0, 0}), 19, measure);
- clusters.add(cluster);
- List<VectorWritable> points = new ArrayList<>();
- points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {1, 1}))));
- representativePoints.put(cluster.getId(), points);
- CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
- System.out.println("CDbw = " + evaluator.getCDbw());
- System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
- System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
- System.out.println("Separation = " + evaluator.separation());
- }
-
- /**
- * Representative points extraction will duplicate the cluster center if the cluster has no assigned points. These
- * clusters should be ignored like empty clusters above
- *
- * @throws IOException
- */
- @Test
- public void testAllSameValueCluster() throws IOException {
- ClusteringTestUtils.writePointsToFile(referenceData, getTestTempFilePath("testdata/file1"), fs, conf);
- DistanceMeasure measure = new EuclideanDistanceMeasure();
- initData(1, 0.25, measure);
- Canopy cluster = new Canopy(new DenseVector(new double[] {0, 0}), 19, measure);
- clusters.add(cluster);
- List<VectorWritable> points = new ArrayList<>();
- points.add(new VectorWritable(cluster.getCenter()));
- points.add(new VectorWritable(cluster.getCenter()));
- points.add(new VectorWritable(cluster.getCenter()));
- representativePoints.put(cluster.getId(), points);
- CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
- System.out.println("CDbw = " + evaluator.getCDbw());
- System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
- System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
- System.out.println("Separation = " + evaluator.separation());
- }
-
- /**
- * Clustering can produce very, very tight clusters that can cause the std calculation to fail. These clusters should
- * be processed correctly.
- *
- * @throws IOException
- */
- @Test
- public void testAlmostSameValueCluster() throws IOException {
- ClusteringTestUtils.writePointsToFile(referenceData, getTestTempFilePath("testdata/file1"), fs, conf);
- DistanceMeasure measure = new EuclideanDistanceMeasure();
- initData(1, 0.25, measure);
- Canopy cluster = new Canopy(new DenseVector(new double[] {0, 0}), 19, measure);
- clusters.add(cluster);
- List<VectorWritable> points = new ArrayList<>();
- Vector delta = new DenseVector(new double[] {0, Double.MIN_NORMAL});
- points.add(new VectorWritable(delta.clone()));
- points.add(new VectorWritable(delta.clone()));
- points.add(new VectorWritable(delta.clone()));
- points.add(new VectorWritable(delta.clone()));
- points.add(new VectorWritable(delta.clone()));
- representativePoints.put(cluster.getId(), points);
- CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
- System.out.println("CDbw = " + evaluator.getCDbw());
- System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
- System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
- System.out.println("Separation = " + evaluator.separation());
- }
-
- @Test
- public void testCanopy() throws Exception {
- ClusteringTestUtils.writePointsToFile(sampleData, getTestTempFilePath("testdata/file1"), fs, conf);
- DistanceMeasure measure = new EuclideanDistanceMeasure();
- CanopyDriver.run(getConfiguration(), testdata, output, measure, 3.1, 2.1, true, 0.0, true);
- int numIterations = 10;
- Path clustersIn = new Path(output, "clusters-0-final");
- RepresentativePointsDriver.run(conf, clustersIn, new Path(output, "clusteredPoints"), output, measure,
- numIterations, true);
- CDbwEvaluator evaluator = new CDbwEvaluator(conf, clustersIn);
- // printRepPoints(numIterations);
- // now print out the Results
- System.out.println("Canopy CDbw = " + evaluator.getCDbw());
- System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
- System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
- System.out.println("Separation = " + evaluator.separation());
- }
-
- @Test
- public void testKmeans() throws Exception {
- ClusteringTestUtils.writePointsToFile(sampleData, getTestTempFilePath("testdata/file1"), fs, conf);
- DistanceMeasure measure = new EuclideanDistanceMeasure();
- // now run the Canopy job to prime kMeans canopies
- CanopyDriver.run(getConfiguration(), testdata, output, measure, 3.1, 2.1, false, 0.0, true);
- // now run the KMeans job
- Path kmeansOutput = new Path(output, "kmeans");
- KMeansDriver.run(testdata, new Path(output, "clusters-0-final"), kmeansOutput, 0.001, 10, true, 0.0, true);
- int numIterations = 10;
- Path clustersIn = new Path(kmeansOutput, "clusters-10-final");
- RepresentativePointsDriver.run(conf, clustersIn, new Path(kmeansOutput, "clusteredPoints"), kmeansOutput, measure,
- numIterations, true);
- CDbwEvaluator evaluator = new CDbwEvaluator(conf, clustersIn);
- RepresentativePointsDriver.printRepresentativePoints(kmeansOutput, numIterations);
- // now print out the Results
- System.out.println("K-Means CDbw = " + evaluator.getCDbw());
- System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
- System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
- System.out.println("Separation = " + evaluator.separation());
- }
-
- @Test
- public void testFuzzyKmeans() throws Exception {
- ClusteringTestUtils.writePointsToFile(sampleData, getTestTempFilePath("testdata/file1"), fs, conf);
- DistanceMeasure measure = new EuclideanDistanceMeasure();
- // now run the Canopy job to prime kMeans canopies
- CanopyDriver.run(getConfiguration(), testdata, output, measure, 3.1, 2.1, false, 0.0, true);
- Path fuzzyKMeansOutput = new Path(output, "fuzzyk");
- // now run the KMeans job
- FuzzyKMeansDriver.run(testdata, new Path(output, "clusters-0-final"), fuzzyKMeansOutput, 0.001, 10, 2,
- true, true, 0, true);
- int numIterations = 10;
- Path clustersIn = new Path(fuzzyKMeansOutput, "clusters-4");
- RepresentativePointsDriver.run(conf, clustersIn, new Path(fuzzyKMeansOutput, "clusteredPoints"), fuzzyKMeansOutput,
- measure, numIterations, true);
- CDbwEvaluator evaluator = new CDbwEvaluator(conf, clustersIn);
- RepresentativePointsDriver.printRepresentativePoints(fuzzyKMeansOutput, numIterations);
- // now print out the Results
- System.out.println("Fuzzy K-Means CDbw = " + evaluator.getCDbw());
- System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
- System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
- System.out.println("Separation = " + evaluator.separation());
- }
-
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/text/MailArchivesClusteringAnalyzerTest.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/text/MailArchivesClusteringAnalyzerTest.java b/integration/src/test/java/org/apache/mahout/text/MailArchivesClusteringAnalyzerTest.java
deleted file mode 100644
index ba73c82..0000000
--- a/integration/src/test/java/org/apache/mahout/text/MailArchivesClusteringAnalyzerTest.java
+++ /dev/null
@@ -1,66 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.text;
-
-import java.io.Reader;
-import java.io.StringReader;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.mahout.common.MahoutTestCase;
-import org.junit.Test;
-
-/**
- * Unit tests for the MailArchivesClusteringAnalyzer text analyzer.
- */
-public class MailArchivesClusteringAnalyzerTest extends MahoutTestCase {
-
- @Test
- public void testAnalysis() throws Exception {
- Analyzer analyzer = new MailArchivesClusteringAnalyzer();
-
- String text = "A test message\n"
- + "atokenthatistoolongtobeusefulforclustertextanalysis\n"
- + "Mahout is a scalable, machine-learning LIBRARY\n"
- + "we've added some additional stopwords such as html, mailto, regards\t"
- + "apache_hadoop provides the foundation for scalability\n"
- + "www.nabble.com general-help@incubator.apache.org\n"
- + "public void int protected package";
- Reader reader = new StringReader(text);
-
- // if you change the text above, then you may need to change this as well
- // order matters too
- String[] expectedTokens = {
- "test", "mahout", "scalabl", "machin", "learn", "librari", "weve", "ad",
- "stopword", "apache_hadoop","provid", "foundat", "scalabl"
- };
-
- TokenStream tokenStream = analyzer.tokenStream("test", reader);
- assertNotNull(tokenStream);
- tokenStream.reset();
- CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
- int e = 0;
- while (tokenStream.incrementToken() && e < expectedTokens.length) {
- assertEquals(expectedTokens[e++], termAtt.toString());
- }
- assertEquals(e, expectedTokens.length);
- tokenStream.end();
- tokenStream.close();
- }
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromMailArchivesTest.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromMailArchivesTest.java b/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromMailArchivesTest.java
deleted file mode 100644
index ef2b8a6..0000000
--- a/integration/src/test/java/org/apache/mahout/text/SequenceFilesFromMailArchivesTest.java
+++ /dev/null
@@ -1,240 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.mahout.text;
-
-import java.io.File;
-import java.io.FileOutputStream;
-import java.util.zip.GZIPOutputStream;
-
-import org.apache.commons.lang3.SystemUtils;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.Text;
-import org.apache.mahout.common.MahoutTestCase;
-import org.apache.mahout.common.Pair;
-import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterator;
-import org.junit.Assert;
-import org.junit.Before;
-import org.junit.Test;
-
-/**
- * Test case for the SequenceFilesFromMailArchives command-line application.
- */
-public final class SequenceFilesFromMailArchivesTest extends MahoutTestCase {
-
- private File inputDir;
-
- /**
- * Create the input and output directories needed for testing
- * the SequenceFilesFromMailArchives application.
- */
- @Override
- @Before
- public void setUp() throws Exception {
- super.setUp();
- inputDir = getTestTempDir("mail-archives-in");
-
- // write test mail messages to a gzipped file in a nested directory
- File subDir = new File(inputDir, "subdir");
- subDir.mkdir();
- File gzFile = new File(subDir, "mail-messages.gz");
- try (GZIPOutputStream gzOut = new GZIPOutputStream(new FileOutputStream(gzFile))) {
- gzOut.write(testMailMessages.getBytes("UTF-8"));
- gzOut.finish();
- }
-
- File subDir2 = new File(subDir, "subsubdir");
- subDir2.mkdir();
- File gzFile2 = new File(subDir2, "mail-messages-2.gz");
- try (GZIPOutputStream gzOut = new GZIPOutputStream(new FileOutputStream(gzFile2))) {
- gzOut.write(testMailMessages.getBytes("UTF-8"));
- gzOut.finish();
- }
- }
-
- @Test
- public void testSequential() throws Exception {
-
- File outputDir = this.getTestTempDir("mail-archives-out");
-
- String[] args = {
- "--input", inputDir.getAbsolutePath(),
- "--output", outputDir.getAbsolutePath(),
- "--charset", "UTF-8",
- "--keyPrefix", "TEST",
- "--method", "sequential",
- "--body", "--subject", "--separator", ""
- };
-
- // run the application's main method
- SequenceFilesFromMailArchives.main(args);
-
- // app should create a single SequenceFile named "chunk-0" in the output dir
- File expectedChunkFile = new File(outputDir, "chunk-0");
- String expectedChunkPath = expectedChunkFile.getAbsolutePath();
- Assert.assertTrue("Expected chunk file " + expectedChunkPath + " not found!", expectedChunkFile.isFile());
-
- Configuration conf = getConfiguration();
- SequenceFileIterator<Text, Text> iterator = new SequenceFileIterator<>(new Path(expectedChunkPath), true, conf);
- Assert.assertTrue("First key/value pair not found!", iterator.hasNext());
- Pair<Text, Text> record = iterator.next();
-
- File parentFile = new File(new File(new File("TEST"), "subdir"), "mail-messages.gz");
- Assert.assertEquals(new File(parentFile, testVars[0][0]).toString(), record.getFirst().toString());
- Assert.assertEquals(testVars[0][1] + testVars[0][2], record.getSecond().toString());
-
- Assert.assertTrue("Second key/value pair not found!", iterator.hasNext());
-
- record = iterator.next();
- Assert.assertEquals(new File(parentFile, testVars[1][0]).toString(), record.getFirst().toString());
- Assert.assertEquals(testVars[1][1] + testVars[1][2], record.getSecond().toString());
-
- record = iterator.next();
- File parentFileSubSubDir = new File(new File(new File(new File("TEST"), "subdir"), "subsubdir"), "mail-messages-2.gz");
- Assert.assertEquals(new File(parentFileSubSubDir, testVars[0][0]).toString(), record.getFirst().toString());
- Assert.assertEquals(testVars[0][1] + testVars[0][2], record.getSecond().toString());
-
- Assert.assertTrue("Second key/value pair not found!", iterator.hasNext());
- record = iterator.next();
- Assert.assertEquals(new File(parentFileSubSubDir, testVars[1][0]).toString(), record.getFirst().toString());
- Assert.assertEquals(testVars[1][1] + testVars[1][2], record.getSecond().toString());
-
- Assert.assertFalse("Only two key/value pairs expected!", iterator.hasNext());
- }
-
- @Test
- public void testMapReduce() throws Exception {
-
- Path tmpDir = getTestTempDirPath();
- Path mrOutputDir = new Path(tmpDir, "mail-archives-out-mr");
- Configuration configuration = getConfiguration();
- FileSystem fs = FileSystem.get(configuration);
-
- File expectedInputFile = new File(inputDir.toString());
-
- String[] args = {
- "-Dhadoop.tmp.dir=" + configuration.get("hadoop.tmp.dir"),
- "--input", expectedInputFile.getAbsolutePath(),
- "--output", mrOutputDir.toString(),
- "--charset", "UTF-8",
- "--keyPrefix", "TEST",
- "--method", "mapreduce",
- "--body", "--subject", "--separator", ""
- };
-
- // run the application's main method
- SequenceFilesFromMailArchives.main(args);
-
- // app should create a single SequenceFile named "chunk-0" in the output dir
- FileStatus[] fileStatuses = fs.listStatus(mrOutputDir.suffix("/part-m-00000"));
- assertEquals(1, fileStatuses.length); // only one
- assertEquals("part-m-00000", fileStatuses[0].getPath().getName());
- SequenceFileIterator<Text, Text> iterator =
- new SequenceFileIterator<>(mrOutputDir.suffix("/part-m-00000"), true, configuration);
-
- Assert.assertTrue("First key/value pair not found!", iterator.hasNext());
- Pair<Text, Text> record = iterator.next();
-
- File parentFileSubSubDir = new File(new File(new File(new File("TEST"), "subdir"), "subsubdir"), "mail-messages-2.gz");
-
- String expected = record.getFirst().toString();
- if (SystemUtils.IS_OS_WINDOWS) {
- expected = expected.replace("/", "\\");
- }
- Assert.assertEquals(new File(parentFileSubSubDir, testVars[0][0]).toString(), expected);
- Assert.assertEquals(testVars[0][1] + testVars[0][2], record.getSecond().toString());
- Assert.assertTrue("Second key/value pair not found!", iterator.hasNext());
-
- record = iterator.next();
- expected = record.getFirst().toString();
- if (SystemUtils.IS_OS_WINDOWS) {
- expected = expected.replace("/", "\\");
- }
- Assert.assertEquals(new File(parentFileSubSubDir, testVars[1][0]).toString(), expected);
- Assert.assertEquals(testVars[1][1] + testVars[1][2], record.getSecond().toString());
-
- // test other file
- File parentFile = new File(new File(new File("TEST"), "subdir"), "mail-messages.gz");
- record = iterator.next();
- expected = record.getFirst().toString();
- if (SystemUtils.IS_OS_WINDOWS) {
- expected = expected.replace("/", "\\");
- }
- Assert.assertEquals(new File(parentFile, testVars[0][0]).toString(), expected);
- Assert.assertEquals(testVars[0][1] + testVars[0][2], record.getSecond().toString());
- Assert.assertTrue("Second key/value pair not found!", iterator.hasNext());
-
- record = iterator.next();
- expected = record.getFirst().toString();
- if (SystemUtils.IS_OS_WINDOWS) {
- expected = expected.replace("/", "\\");
- }
- Assert.assertEquals(new File(parentFile, testVars[1][0]).toString(), expected);
- Assert.assertEquals(testVars[1][1] + testVars[1][2], record.getSecond().toString());
- Assert.assertFalse("Only four key/value pairs expected!", iterator.hasNext());
- }
-
- // Messages extracted and made anonymous from the ASF mail archives
- private static final String[][] testVars = {
- new String[] {
- "user@example.com",
- "Ant task for JDK1.1 collections build option",
- "\nThis is just a test message\n--\nTesty McTester\n"
- },
- new String[] {
- "somebody@example.com",
- "Problem with build files in several directories",
- "\nHi all,\nThis is another test message.\nRegards,\nAnother Test\n"
- }
- };
-
- private static final String testMailMessages =
- "From user@example.com Mon Jul 24 19:13:53 2000\n"
- + "Return-Path: <us...@example.com>\n"
- + "Mailing-List: contact ant-user-help@jakarta.apache.org; run by ezmlm\n"
- + "Delivered-To: mailing list ant-user@jakarta.apache.org\n"
- + "Received: (qmail 49267 invoked from network); 24 Jul 2000 19:13:53 -0000\n"
- + "Message-ID: <" + testVars[0][0] + ">\n"
- + "From: \"Testy McTester\" <us...@example.com>\n"
- + "To: <an...@jakarta.apache.org>\n"
- + "Subject: " + testVars[0][1] + '\n'
- + "Date: Mon, 24 Jul 2000 12:24:56 -0700\n"
- + "MIME-Version: 1.0\n"
- + "Content-Type: text/plain;\n"
- + " charset=\"Windows-1252\"\n"
- + "Content-Transfer-Encoding: 7bit\n"
- + "X-Spam-Rating: locus.apache.org 1.6.2 0/1000/N\n"
- + testVars[0][2] + '\n'
- + "From somebody@example.com Wed Jul 26 11:32:16 2000\n"
- + "Return-Path: <so...@example.com>\n"
- + "Mailing-List: contact ant-user-help@jakarta.apache.org; run by ezmlm\n"
- + "Delivered-To: mailing list ant-user@jakarta.apache.org\n"
- + "Received: (qmail 73966 invoked from network); 26 Jul 2000 11:32:16 -0000\n"
- + "User-Agent: Microsoft-Outlook-Express-Macintosh-Edition/5.02.2022\n"
- + "Date: Wed, 26 Jul 2000 13:32:08 +0200\n"
- + "Subject: " + testVars[1][1] + '\n'
- + "From: Another Test <so...@example.com>\n"
- + "To: <an...@jakarta.apache.org>\n"
- + "Message-Id: <" + testVars[1][0] + ">\n"
- + "Mime-Version: 1.0\n"
- + "Content-Type: text/plain; charset=\"US-ASCII\"\n"
- + "Content-Transfer-Encoding: 7bit\n"
- + "X-Spam-Rating: locus.apache.org 1.6.2 0/1000/N\n"
- + testVars[1][2];
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/text/TestPathFilter.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/text/TestPathFilter.java b/integration/src/test/java/org/apache/mahout/text/TestPathFilter.java
deleted file mode 100644
index 227521a..0000000
--- a/integration/src/test/java/org/apache/mahout/text/TestPathFilter.java
+++ /dev/null
@@ -1,32 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.mahout.text;
-
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.fs.PathFilter;
-
-/**
- * Dummy Path Filter for testing the MapReduce version of
- * SequenceFilesFromDirectory
- */
-public class TestPathFilter implements PathFilter {
-
- @Override
- public boolean accept(Path path) {
- return path.getName().startsWith("t") || path.getName().startsWith("r") || path.getName().startsWith("f");
- }
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/text/TestSequenceFilesFromDirectory.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/text/TestSequenceFilesFromDirectory.java b/integration/src/test/java/org/apache/mahout/text/TestSequenceFilesFromDirectory.java
deleted file mode 100644
index 040c8e4..0000000
--- a/integration/src/test/java/org/apache/mahout/text/TestSequenceFilesFromDirectory.java
+++ /dev/null
@@ -1,313 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.text;
-
-import java.io.File;
-import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.util.HashMap;
-import java.util.Map;
-
-import org.apache.commons.io.Charsets;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.Text;
-import org.apache.mahout.common.HadoopUtil;
-import org.apache.mahout.common.MahoutTestCase;
-import org.apache.mahout.common.Pair;
-import org.apache.mahout.common.iterator.sequencefile.PathFilters;
-import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterator;
-import org.junit.Test;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-public final class TestSequenceFilesFromDirectory extends MahoutTestCase {
-
- private static final Logger logger = LoggerFactory.getLogger(TestSequenceFilesFromDirectory.class);
-
- private static final String[][] DATA1 = {
- {"test1", "This is the first text."},
- {"test2", "This is the second text."},
- {"test3", "This is the third text."}
- };
-
- private static final String[][] DATA2 = {
- {"recursive_test1", "This is the first text."},
- {"recursive_test2", "This is the second text."},
- {"recursive_test3", "This is the third text."}
- };
-
- @Test
- public void testSequenceFileFromDirectoryBasic() throws Exception {
- // parameters
- Configuration configuration = getConfiguration();
-
- FileSystem fs = FileSystem.get(configuration);
-
- // create
- Path tmpDir = this.getTestTempDirPath();
- Path inputDir = new Path(tmpDir, "inputDir");
- fs.mkdirs(inputDir);
-
- Path outputDir = new Path(tmpDir, "outputDir");
- Path outputDirRecursive = new Path(tmpDir, "outputDirRecursive");
-
- Path inputDirRecursive = new Path(tmpDir, "inputDirRecur");
- fs.mkdirs(inputDirRecursive);
-
- // prepare input files
- createFilesFromArrays(configuration, inputDir, DATA1);
-
- SequenceFilesFromDirectory.main(new String[]{
- "--input", inputDir.toString(),
- "--output", outputDir.toString(),
- "--chunkSize", "64",
- "--charset", Charsets.UTF_8.name(),
- "--keyPrefix", "UID",
- "--method", "sequential"});
-
- // check output chunk files
- checkChunkFiles(configuration, outputDir, DATA1, "UID");
-
- createRecursiveDirFilesFromArrays(configuration, inputDirRecursive, DATA2);
-
- FileStatus fstInputPath = fs.getFileStatus(inputDirRecursive);
- String dirs = HadoopUtil.buildDirList(fs, fstInputPath);
-
- System.out.println("\n\n ----- recursive dirs: " + dirs);
- SequenceFilesFromDirectory.main(new String[]{
- "--input", inputDirRecursive.toString(),
- "--output", outputDirRecursive.toString(),
- "--chunkSize", "64",
- "--charset", Charsets.UTF_8.name(),
- "--keyPrefix", "UID",
- "--method", "sequential"});
-
- checkRecursiveChunkFiles(configuration, outputDirRecursive, DATA2, "UID");
- }
-
- @Test
- public void testSequenceFileFromDirectoryMapReduce() throws Exception {
-
- Configuration conf = getConfiguration();
-
- FileSystem fs = FileSystem.get(conf);
-
- // create
- Path tmpDir = this.getTestTempDirPath();
- Path inputDir = new Path(tmpDir, "inputDir");
- fs.mkdirs(inputDir);
-
- Path inputDirRecur = new Path(tmpDir, "inputDirRecur");
- fs.mkdirs(inputDirRecur);
-
- Path mrOutputDir = new Path(tmpDir, "mrOutputDir");
- Path mrOutputDirRecur = new Path(tmpDir, "mrOutputDirRecur");
-
- createFilesFromArrays(conf, inputDir, DATA1);
-
- SequenceFilesFromDirectory.main(new String[]{
- "-Dhadoop.tmp.dir=" + conf.get("hadoop.tmp.dir"),
- "--input", inputDir.toString(),
- "--output", mrOutputDir.toString(),
- "--chunkSize", "64",
- "--charset", Charsets.UTF_8.name(),
- "--method", "mapreduce",
- "--keyPrefix", "UID",
- "--fileFilterClass", "org.apache.mahout.text.TestPathFilter"
- });
-
- checkMRResultFiles(conf, mrOutputDir, DATA1, "UID");
-
- createRecursiveDirFilesFromArrays(conf, inputDirRecur, DATA2);
-
- FileStatus fst_input_path = fs.getFileStatus(inputDirRecur);
- String dirs = HadoopUtil.buildDirList(fs, fst_input_path);
-
- logger.info("\n\n ---- recursive dirs: {}", dirs);
-
- SequenceFilesFromDirectory.main(new String[]{
- "-Dhadoop.tmp.dir=" + conf.get("hadoop.tmp.dir"),
- "--input", inputDirRecur.toString(),
- "--output", mrOutputDirRecur.toString(),
- "--chunkSize", "64",
- "--charset", Charsets.UTF_8.name(),
- "--method", "mapreduce",
- "--keyPrefix", "UID",
- "--fileFilterClass", "org.apache.mahout.text.TestPathFilter"
- });
-
- checkMRResultFilesRecursive(conf, mrOutputDirRecur, DATA2, "UID");
- }
-
-
- private static void createFilesFromArrays(Configuration conf, Path inputDir, String[][] data) throws IOException {
- FileSystem fs = FileSystem.get(conf);
- for (String[] aData : data) {
- try (OutputStreamWriter writer =
- new OutputStreamWriter(fs.create(new Path(inputDir, aData[0])), Charsets.UTF_8)){
- writer.write(aData[1]);
- }
- }
- }
-
- private static void createRecursiveDirFilesFromArrays(Configuration configuration, Path inputDir,
- String[][] data) throws IOException {
- FileSystem fs = FileSystem.get(configuration);
-
- logger.info("creativeRecursiveDirFilesFromArrays > based on: {}", inputDir.toString());
- Path curPath;
- String currentRecursiveDir = inputDir.toString();
-
- for (String[] aData : data) {
- currentRecursiveDir += "/" + aData[0];
- File subDir = new File(currentRecursiveDir);
- subDir.mkdir();
-
- curPath = new Path(subDir.toString(), "file.txt");
- logger.info("Created file: {}", curPath.toString());
-
- try (OutputStreamWriter writer = new OutputStreamWriter(fs.create(curPath), Charsets.UTF_8)){
- writer.write(aData[1]);
- }
- }
- }
-
- private static void checkChunkFiles(Configuration configuration,
- Path outputDir,
- String[][] data,
- String prefix) throws IOException {
- FileSystem fs = FileSystem.get(configuration);
-
- // output exists?
- FileStatus[] fileStatuses = fs.listStatus(outputDir, PathFilters.logsCRCFilter());
- assertEquals(1, fileStatuses.length); // only one
- assertEquals("chunk-0", fileStatuses[0].getPath().getName());
-
- Map<String, String> fileToData = new HashMap<>();
- for (String[] aData : data) {
- fileToData.put(prefix + Path.SEPARATOR + aData[0], aData[1]);
- }
-
- // read a chunk to check content
- try (SequenceFileIterator<Text, Text> iterator =
- new SequenceFileIterator<>(fileStatuses[0].getPath(), true, configuration)){
- while (iterator.hasNext()) {
- Pair<Text, Text> record = iterator.next();
- String retrievedData = fileToData.get(record.getFirst().toString().trim());
- assertNotNull(retrievedData);
- assertEquals(retrievedData, record.getSecond().toString().trim());
- }
- }
- }
-
- private static void checkRecursiveChunkFiles(Configuration configuration,
- Path outputDir,
- String[][] data,
- String prefix) throws IOException {
- FileSystem fs = FileSystem.get(configuration);
-
- System.out.println(" ----------- check_Recursive_ChunkFiles ------------");
-
- // output exists?
- FileStatus[] fileStatuses = fs.listStatus(outputDir, PathFilters.logsCRCFilter());
- assertEquals(1, fileStatuses.length); // only one
- assertEquals("chunk-0", fileStatuses[0].getPath().getName());
-
-
- Map<String, String> fileToData = new HashMap<>();
- String currentPath = prefix;
- for (String[] aData : data) {
- currentPath += Path.SEPARATOR + aData[0];
- fileToData.put(currentPath + Path.SEPARATOR + "file.txt", aData[1]);
- }
-
- // read a chunk to check content
- try (SequenceFileIterator<Text, Text> iterator =
- new SequenceFileIterator<>(fileStatuses[0].getPath(), true, configuration)) {
- while (iterator.hasNext()) {
- Pair<Text, Text> record = iterator.next();
- String retrievedData = fileToData.get(record.getFirst().toString().trim());
- System.out.printf("%s >> %s\n", record.getFirst().toString().trim(), record.getSecond().toString().trim());
-
- assertNotNull(retrievedData);
- assertEquals(retrievedData, record.getSecond().toString().trim());
- System.out.printf(">>> k: %s, v: %s\n", record.getFirst().toString(), record.getSecond().toString());
- }
- }
- }
-
- private static void checkMRResultFiles(Configuration conf, Path outputDir,
- String[][] data, String prefix) throws IOException {
- FileSystem fs = FileSystem.get(conf);
-
- // output exists?
- FileStatus[] fileStatuses = fs.listStatus(outputDir.suffix("/part-m-00000"), PathFilters.logsCRCFilter());
- assertEquals(1, fileStatuses.length); // only one
- assertEquals("part-m-00000", fileStatuses[0].getPath().getName());
- Map<String, String> fileToData = new HashMap<>();
- for (String[] aData : data) {
- System.out.printf("map.put: %s %s\n", prefix + Path.SEPARATOR + aData[0], aData[1]);
- fileToData.put(prefix + Path.SEPARATOR + aData[0], aData[1]);
- }
-
- // read a chunk to check content
- try (SequenceFileIterator<Text, Text> iterator = new SequenceFileIterator<>(
- fileStatuses[0].getPath(), true, conf)) {
- while (iterator.hasNext()) {
- Pair<Text, Text> record = iterator.next();
- String retrievedData = fileToData.get(record.getFirst().toString().trim());
-
- System.out.printf("MR> %s >> %s\n", record.getFirst().toString().trim(), record.getSecond().toString().trim());
- assertNotNull(retrievedData);
- assertEquals(retrievedData, record.getSecond().toString().trim());
- }
- }
- }
-
- private static void checkMRResultFilesRecursive(Configuration configuration, Path outputDir,
- String[][] data, String prefix) throws IOException {
- FileSystem fs = FileSystem.get(configuration);
-
- // output exists?
- FileStatus[] fileStatuses = fs.listStatus(outputDir.suffix("/part-m-00000"), PathFilters.logsCRCFilter());
- assertEquals(1, fileStatuses.length); // only one
- assertEquals("part-m-00000", fileStatuses[0].getPath().getName());
- Map<String, String> fileToData = new HashMap<>();
- String currentPath = prefix;
-
- for (String[] aData : data) {
- currentPath += Path.SEPARATOR + aData[0];
- fileToData.put(currentPath + Path.SEPARATOR + "file.txt", aData[1]);
- }
-
- // read a chunk to check content
- try (SequenceFileIterator<Text, Text> iterator = new SequenceFileIterator<>(
- fileStatuses[0].getPath(), true, configuration)){
- while (iterator.hasNext()) {
- Pair<Text, Text> record = iterator.next();
- System.out.printf("MR-Recur > Trying to check: %s\n", record.getFirst().toString().trim());
- String retrievedData = fileToData.get(record.getFirst().toString().trim());
- assertNotNull(retrievedData);
- assertEquals(retrievedData, record.getSecond().toString().trim());
- }
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/test/java/org/apache/mahout/text/doc/MultipleFieldsDocument.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/text/doc/MultipleFieldsDocument.java b/integration/src/test/java/org/apache/mahout/text/doc/MultipleFieldsDocument.java
deleted file mode 100644
index 7483b2d..0000000
--- a/integration/src/test/java/org/apache/mahout/text/doc/MultipleFieldsDocument.java
+++ /dev/null
@@ -1,58 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.mahout.text.doc;
-
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.TextField;
-
-/**
- * Used for testing lucene2seq
- */
-@Deprecated
-public class MultipleFieldsDocument extends SingleFieldDocument {
-
- public static final String FIELD1 = "field1";
- public static final String FIELD2 = "field2";
-
- private String field1;
- private String field2;
-
- public MultipleFieldsDocument(String id, String field, String field1, String field2) {
- super(id, field);
- this.field1 = field1;
- this.field2 = field2;
- }
-
- public String getField1() {
- return field1;
- }
-
- public String getField2() {
- return field2;
- }
-
- @Override
- public Document asLuceneDocument() {
- Document document = super.asLuceneDocument();
-
- document.add(new TextField(FIELD1, this.field1, Field.Store.YES));
- document.add(new TextField(FIELD2, this.field2, Field.Store.YES));
-
- return document;
- }
-}