You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ss...@apache.org on 2013/09/17 18:21:30 UTC

svn commit: r1524116 - in /mahout/trunk/core/src/test/java/org/apache/mahout/clustering/streaming: cluster/BallKMeansTest.java cluster/DataUtils.java cluster/StreamingKMeansTest.java mapreduce/StreamingKMeansTestMR.java

Author: sslavic
Date: Tue Sep 17 16:21:30 2013
New Revision: 1524116

URL: http://svn.apache.org/r1524116
Log:
MAHOUT-1320: Made ball and streaming k-means clustering tests deterministic by using test seed (tdunning via sslavic)

Modified:
    mahout/trunk/core/src/test/java/org/apache/mahout/clustering/streaming/cluster/BallKMeansTest.java
    mahout/trunk/core/src/test/java/org/apache/mahout/clustering/streaming/cluster/DataUtils.java
    mahout/trunk/core/src/test/java/org/apache/mahout/clustering/streaming/cluster/StreamingKMeansTest.java
    mahout/trunk/core/src/test/java/org/apache/mahout/clustering/streaming/mapreduce/StreamingKMeansTestMR.java

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/streaming/cluster/BallKMeansTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/streaming/cluster/BallKMeansTest.java?rev=1524116&r1=1524115&r2=1524116&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/streaming/cluster/BallKMeansTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/streaming/cluster/BallKMeansTest.java Tue Sep 17 16:21:30 2013
@@ -22,6 +22,7 @@ import java.util.List;
 import com.google.common.collect.Lists;
 import org.apache.mahout.clustering.ClusteringUtils;
 import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.RandomUtils;
 import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
 import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
 import org.apache.mahout.math.Centroid;
@@ -40,6 +41,8 @@ import org.apache.mahout.math.neighborho
 import org.apache.mahout.math.random.MultiNormal;
 import org.apache.mahout.math.random.WeightedThing;
 import org.apache.mahout.math.stats.OnlineSummarizer;
+import org.junit.Before;
+import org.junit.BeforeClass;
 import org.junit.Test;
 
 import static org.apache.mahout.clustering.ClusteringUtils.totalWeight;
@@ -52,10 +55,17 @@ public class BallKMeansTest {
   private static final int NUM_ITERATIONS = 20;
   private static final double DISTRIBUTION_RADIUS = 0.01;
 
-  private static Pair<List<Centroid>, List<Centroid>> syntheticData =
-      DataUtils.sampleMultiNormalHypercube(NUM_DIMENSIONS, NUM_DATA_POINTS, DISTRIBUTION_RADIUS);
+  @BeforeClass
+  public static void setUp() {
+    RandomUtils.useTestSeed();
+    syntheticData = DataUtils.sampleMultiNormalHypercube(NUM_DIMENSIONS, NUM_DATA_POINTS, DISTRIBUTION_RADIUS);
+
+  }
+
+  private static Pair<List<Centroid>, List<Centroid>> syntheticData;
   private static final int K1 = 100;
 
+
   @Test
   public void testClusteringMultipleRuns() {
     for (int i = 1; i <= 10; ++i) {
@@ -80,9 +90,18 @@ public class BallKMeansTest {
     BallKMeans clusterer = new BallKMeans(searcher, 1 << NUM_DIMENSIONS, NUM_ITERATIONS);
 
     long startTime = System.currentTimeMillis();
-    clusterer.cluster(syntheticData.getFirst());
+    Pair<List<Centroid>, List<Centroid>> data = syntheticData;
+    clusterer.cluster(data.getFirst());
     long endTime = System.currentTimeMillis();
 
+    long hash = 0;
+    for (Centroid centroid : data.getFirst()) {
+      for (Vector.Element element : centroid.all()) {
+        hash = 31 * hash + 17 * element.index() + Double.toHexString(element.get()).hashCode();
+      }
+    }
+    System.out.printf("Hash = %08x\n", hash);
+
     assertEquals("Total weight not preserved", totalWeight(syntheticData.getFirst()), totalWeight(clusterer), 1.0e-9);
 
     // Verify that each corner of the cube has a centroid very nearby.

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/streaming/cluster/DataUtils.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/streaming/cluster/DataUtils.java?rev=1524116&r1=1524115&r2=1524116&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/streaming/cluster/DataUtils.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/streaming/cluster/DataUtils.java Tue Sep 17 16:21:30 2013
@@ -41,6 +41,11 @@ public final class DataUtils {
    * A hypercube of numDimensions has 2^numDimensions vertices. Keep this in mind when clustering
    * the data.
    *
+   * Note that you almost always want to call {@code RandomUtils.useTestSeed()} before generating
+   * test data.  This means that you can't generate data in the initializer of a static field,
+   * because static initializers run before any {@code @BeforeClass} or {@code @Before} setup
+   * methods are called.
+   *
    *
    * @param numDimensions number of dimensions of the vectors to be generated.
    * @param numDatapoints number of data points to be generated.

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/streaming/cluster/StreamingKMeansTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/streaming/cluster/StreamingKMeansTest.java?rev=1524116&r1=1524115&r2=1524116&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/streaming/cluster/StreamingKMeansTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/streaming/cluster/StreamingKMeansTest.java Tue Sep 17 16:21:30 2013
@@ -23,6 +23,7 @@ import java.util.List;
 
 import org.apache.mahout.clustering.ClusteringUtils;
 import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.RandomUtils;
 import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
 import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
 import org.apache.mahout.math.Centroid;
@@ -33,6 +34,7 @@ import org.apache.mahout.math.neighborho
 import org.apache.mahout.math.neighborhood.Searcher;
 import org.apache.mahout.math.neighborhood.UpdatableSearcher;
 import org.apache.mahout.math.random.WeightedThing;
+import org.junit.Before;
 import org.junit.Test;
 import org.junit.runner.RunWith;
 import org.junit.runners.Parameterized;
@@ -49,8 +51,14 @@ public class StreamingKMeansTest {
   private static final int NUM_PROJECTIONS = 2;
   private static final int SEARCH_SIZE = 10;
 
-  private static final Pair<List<Centroid>, List<Centroid>> syntheticData =
+  private static Pair<List<Centroid>, List<Centroid>> syntheticData ;
+
+  @Before
+  public void setUp() {
+    RandomUtils.useTestSeed();
+    syntheticData =
       DataUtils.sampleMultiNormalHypercube(NUM_DIMENSIONS, NUM_DATA_POINTS);
+  }
 
   private UpdatableSearcher searcher;
   private boolean allAtOnce;

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/streaming/mapreduce/StreamingKMeansTestMR.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/streaming/mapreduce/StreamingKMeansTestMR.java?rev=1524116&r1=1524115&r2=1524116&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/streaming/mapreduce/StreamingKMeansTestMR.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/streaming/mapreduce/StreamingKMeansTestMR.java Tue Sep 17 16:21:30 2013
@@ -35,6 +35,7 @@ import org.apache.mahout.clustering.Clus
 import org.apache.mahout.clustering.streaming.cluster.DataUtils;
 import org.apache.mahout.clustering.streaming.cluster.StreamingKMeans;
 import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.RandomUtils;
 import org.apache.mahout.common.commandline.DefaultOptionCreator;
 import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
 import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
@@ -46,6 +47,7 @@ import org.apache.mahout.math.neighborho
 import org.apache.mahout.math.neighborhood.LocalitySensitiveHashSearch;
 import org.apache.mahout.math.neighborhood.ProjectionSearch;
 import org.apache.mahout.math.random.WeightedThing;
+import org.junit.Before;
 import org.junit.Test;
 import org.junit.runner.RunWith;
 import org.junit.runners.Parameterized;
@@ -62,8 +64,14 @@ public class StreamingKMeansTestMR {
   private static final int MAX_NUM_ITERATIONS = 10;
   private static final double DISTANCE_CUTOFF = 1.0e-6;
 
-  private static final Pair<List<Centroid>, List<Centroid>> syntheticData =
+  private static Pair<List<Centroid>, List<Centroid>> syntheticData;
+
+  @Before
+  public void setUp() {
+    RandomUtils.useTestSeed();
+    syntheticData =
       DataUtils.sampleMultiNormalHypercube(NUM_DIMENSIONS, NUM_DATA_POINTS, 1.0e-4);
+  }
 
   private final String searcherClassName;
   private final String distanceMeasureClassName;