You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ss...@apache.org on 2013/09/17 18:21:30 UTC
svn commit: r1524116 - in
/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/streaming:
cluster/BallKMeansTest.java cluster/DataUtils.java
cluster/StreamingKMeansTest.java mapreduce/StreamingKMeansTestMR.java
Author: sslavic
Date: Tue Sep 17 16:21:30 2013
New Revision: 1524116
URL: http://svn.apache.org/r1524116
Log:
MAHOUT-1320: Made ball and streaming k-means clustering tests deterministic by using test seed (tdunning via sslavic)
Modified:
mahout/trunk/core/src/test/java/org/apache/mahout/clustering/streaming/cluster/BallKMeansTest.java
mahout/trunk/core/src/test/java/org/apache/mahout/clustering/streaming/cluster/DataUtils.java
mahout/trunk/core/src/test/java/org/apache/mahout/clustering/streaming/cluster/StreamingKMeansTest.java
mahout/trunk/core/src/test/java/org/apache/mahout/clustering/streaming/mapreduce/StreamingKMeansTestMR.java
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/streaming/cluster/BallKMeansTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/streaming/cluster/BallKMeansTest.java?rev=1524116&r1=1524115&r2=1524116&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/streaming/cluster/BallKMeansTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/streaming/cluster/BallKMeansTest.java Tue Sep 17 16:21:30 2013
@@ -22,6 +22,7 @@ import java.util.List;
import com.google.common.collect.Lists;
import org.apache.mahout.clustering.ClusteringUtils;
import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.RandomUtils;
import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
import org.apache.mahout.math.Centroid;
@@ -40,6 +41,8 @@ import org.apache.mahout.math.neighborho
import org.apache.mahout.math.random.MultiNormal;
import org.apache.mahout.math.random.WeightedThing;
import org.apache.mahout.math.stats.OnlineSummarizer;
+import org.junit.Before;
+import org.junit.BeforeClass;
import org.junit.Test;
import static org.apache.mahout.clustering.ClusteringUtils.totalWeight;
@@ -52,10 +55,17 @@ public class BallKMeansTest {
private static final int NUM_ITERATIONS = 20;
private static final double DISTRIBUTION_RADIUS = 0.01;
- private static Pair<List<Centroid>, List<Centroid>> syntheticData =
- DataUtils.sampleMultiNormalHypercube(NUM_DIMENSIONS, NUM_DATA_POINTS, DISTRIBUTION_RADIUS);
+ @BeforeClass
+ public static void setUp() {
+ RandomUtils.useTestSeed();
+ syntheticData = DataUtils.sampleMultiNormalHypercube(NUM_DIMENSIONS, NUM_DATA_POINTS, DISTRIBUTION_RADIUS);
+
+ }
+
+ private static Pair<List<Centroid>, List<Centroid>> syntheticData;
private static final int K1 = 100;
+
@Test
public void testClusteringMultipleRuns() {
for (int i = 1; i <= 10; ++i) {
@@ -80,9 +90,18 @@ public class BallKMeansTest {
BallKMeans clusterer = new BallKMeans(searcher, 1 << NUM_DIMENSIONS, NUM_ITERATIONS);
long startTime = System.currentTimeMillis();
- clusterer.cluster(syntheticData.getFirst());
+ Pair<List<Centroid>, List<Centroid>> data = syntheticData;
+ clusterer.cluster(data.getFirst());
long endTime = System.currentTimeMillis();
+ long hash = 0;
+ for (Centroid centroid : data.getFirst()) {
+ for (Vector.Element element : centroid.all()) {
+ hash = 31 * hash + 17 * element.index() + Double.toHexString(element.get()).hashCode();
+ }
+ }
+ System.out.printf("Hash = %08x\n", hash);
+
assertEquals("Total weight not preserved", totalWeight(syntheticData.getFirst()), totalWeight(clusterer), 1.0e-9);
// Verify that each corner of the cube has a centroid very nearby.
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/streaming/cluster/DataUtils.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/streaming/cluster/DataUtils.java?rev=1524116&r1=1524115&r2=1524116&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/streaming/cluster/DataUtils.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/streaming/cluster/DataUtils.java Tue Sep 17 16:21:30 2013
@@ -41,6 +41,11 @@ public final class DataUtils {
* A hypercube of numDimensions has 2^numDimensions vertices. Keep this in mind when clustering
* the data.
*
+ * Note that you almost always want to call RandomUtils.useTestSeed() before generating test
+ * data. This means that you can't generate data in the initializer of a static variable,
+ * because static initializers run when the class is initialized, before any @BeforeClass or
+ * @Before setup methods are called.
+ *
*
* @param numDimensions number of dimensions of the vectors to be generated.
* @param numDatapoints number of data points to be generated.
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/streaming/cluster/StreamingKMeansTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/streaming/cluster/StreamingKMeansTest.java?rev=1524116&r1=1524115&r2=1524116&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/streaming/cluster/StreamingKMeansTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/streaming/cluster/StreamingKMeansTest.java Tue Sep 17 16:21:30 2013
@@ -23,6 +23,7 @@ import java.util.List;
import org.apache.mahout.clustering.ClusteringUtils;
import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.RandomUtils;
import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
import org.apache.mahout.math.Centroid;
@@ -33,6 +34,7 @@ import org.apache.mahout.math.neighborho
import org.apache.mahout.math.neighborhood.Searcher;
import org.apache.mahout.math.neighborhood.UpdatableSearcher;
import org.apache.mahout.math.random.WeightedThing;
+import org.junit.Before;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;
@@ -49,8 +51,14 @@ public class StreamingKMeansTest {
private static final int NUM_PROJECTIONS = 2;
private static final int SEARCH_SIZE = 10;
- private static final Pair<List<Centroid>, List<Centroid>> syntheticData =
+ private static Pair<List<Centroid>, List<Centroid>> syntheticData ;
+
+ @Before
+ public void setUp() {
+ RandomUtils.useTestSeed();
+ syntheticData =
DataUtils.sampleMultiNormalHypercube(NUM_DIMENSIONS, NUM_DATA_POINTS);
+ }
private UpdatableSearcher searcher;
private boolean allAtOnce;
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/streaming/mapreduce/StreamingKMeansTestMR.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/streaming/mapreduce/StreamingKMeansTestMR.java?rev=1524116&r1=1524115&r2=1524116&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/streaming/mapreduce/StreamingKMeansTestMR.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/streaming/mapreduce/StreamingKMeansTestMR.java Tue Sep 17 16:21:30 2013
@@ -35,6 +35,7 @@ import org.apache.mahout.clustering.Clus
import org.apache.mahout.clustering.streaming.cluster.DataUtils;
import org.apache.mahout.clustering.streaming.cluster.StreamingKMeans;
import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.RandomUtils;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
@@ -46,6 +47,7 @@ import org.apache.mahout.math.neighborho
import org.apache.mahout.math.neighborhood.LocalitySensitiveHashSearch;
import org.apache.mahout.math.neighborhood.ProjectionSearch;
import org.apache.mahout.math.random.WeightedThing;
+import org.junit.Before;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;
@@ -62,8 +64,14 @@ public class StreamingKMeansTestMR {
private static final int MAX_NUM_ITERATIONS = 10;
private static final double DISTANCE_CUTOFF = 1.0e-6;
- private static final Pair<List<Centroid>, List<Centroid>> syntheticData =
+ private static Pair<List<Centroid>, List<Centroid>> syntheticData;
+
+ @Before
+ public void setUp() {
+ RandomUtils.useTestSeed();
+ syntheticData =
DataUtils.sampleMultiNormalHypercube(NUM_DIMENSIONS, NUM_DATA_POINTS, 1.0e-4);
+ }
private final String searcherClassName;
private final String distanceMeasureClassName;