You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by je...@apache.org on 2010/07/15 21:03:21 UTC

svn commit: r964542 - in /mahout/trunk: core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/ core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/ examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/ utils/src/test/java/org/apache/mahout/clustering/

Author: jeastman
Date: Thu Jul 15 19:03:20 2010
New Revision: 964542

URL: http://svn.apache.org/viewvc?rev=964542&view=rev
Log:
MAHOUT-294:
- fixed serialization bug in FuzzyKMeansInfo: the combinerPass field was not being serialized, which led to inflated cluster centroid values
- removed the combiner TODO, which indicates the problem has been in trunk for some time
- added a unit test testFuzzyKMeansInfoSerialization()
- TestClusterDumper and SyntheticControl now produce reasonable-looking fuzzyK results
- all unit tests run

Modified:
    mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansCombiner.java
    mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansInfo.java
    mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java
    mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java
    mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansCombiner.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansCombiner.java?rev=964542&r1=964541&r2=964542&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansCombiner.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansCombiner.java Thu Jul 15 19:03:20 2010
@@ -37,7 +37,6 @@ public class FuzzyKMeansCombiner extends
       }
       value.setCombinerPass(value.getCombinerPass() + 1);
     }
-    // TODO: how do we pass along the combinerPass? Or do we not need to?
     context.write(key, new FuzzyKMeansInfo(cluster.getPointProbSum(), cluster.getWeightedPointTotal(), 1));
   }
 

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansInfo.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansInfo.java?rev=964542&r1=964541&r2=964542&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansInfo.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansInfo.java Thu Jul 15 19:03:20 2010
@@ -64,12 +64,14 @@ public class FuzzyKMeansInfo implements 
   @Override
   public void write(DataOutput out) throws IOException {
     out.writeDouble(probability);
+    out.writeInt(combinerPass);
     VectorWritable.writeVector(out, pointTotal);
   }
   
   @Override
   public void readFields(DataInput in) throws IOException {
     this.probability = in.readDouble();
+    this.combinerPass = in.readInt();
     VectorWritable temp = new VectorWritable();
     temp.readFields(in);
     this.pointTotal = temp.get();

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java?rev=964542&r1=964541&r2=964542&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java Thu Jul 15 19:03:20 2010
@@ -18,6 +18,7 @@
 package org.apache.mahout.clustering.fuzzykmeans;
 
 import java.io.File;
+import java.io.IOException;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
@@ -26,6 +27,8 @@ import java.util.Map;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.DataInputBuffer;
+import org.apache.hadoop.io.DataOutputBuffer;
 import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.Text;
@@ -40,6 +43,7 @@ import org.apache.mahout.common.MahoutTe
 import org.apache.mahout.common.commandline.DefaultOptionCreator;
 import org.apache.mahout.common.distance.DistanceMeasure;
 import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
+import org.apache.mahout.math.DenseVector;
 import org.apache.mahout.math.Vector;
 import org.apache.mahout.math.VectorWritable;
 
@@ -211,28 +215,24 @@ public class TestFuzzyKmeansClustering e
 
       // now run the Job using the run() command line options.
       Path output = getTestTempDirPath("output");
-/*      FuzzyKMeansDriver.runJob(pointsPath,
-                               clustersPath,
-                               output,
-                               EuclideanDistanceMeasure.class.getName(),
-                               0.001,
-                               2,
-                               k + 1,
-                               2,
-                               false,
-                               true,
-                               0);
-*/
-      String[] args = { DefaultOptionCreator.INPUT_OPTION_KEY, pointsPath.toString(), 
-          DefaultOptionCreator.CLUSTERS_IN_OPTION_KEY, clustersPath.toString(), 
-          DefaultOptionCreator.OUTPUT_OPTION_KEY, output.toString(),
+      /*      FuzzyKMeansDriver.runJob(pointsPath,
+                                     clustersPath,
+                                     output,
+                                     EuclideanDistanceMeasure.class.getName(),
+                                     0.001,
+                                     2,
+                                     k + 1,
+                                     2,
+                                     false,
+                                     true,
+                                     0);
+      */
+      String[] args = { DefaultOptionCreator.INPUT_OPTION_KEY, pointsPath.toString(), DefaultOptionCreator.CLUSTERS_IN_OPTION_KEY,
+          clustersPath.toString(), DefaultOptionCreator.OUTPUT_OPTION_KEY, output.toString(),
           DefaultOptionCreator.DISTANCE_MEASURE_OPTION_KEY, EuclideanDistanceMeasure.class.getName(),
-          DefaultOptionCreator.CONVERGENCE_DELTA_OPTION_KEY, "0.001", 
-          DefaultOptionCreator.MAX_ITERATIONS_OPTION_KEY, "2",
-          FuzzyKMeansDriver.M_OPTION_KEY, "2.0", 
-          DefaultOptionCreator.CLUSTERING_OPTION_KEY,
-          DefaultOptionCreator.EMIT_MOST_LIKELY_OPTION_KEY,
-          DefaultOptionCreator.OVERWRITE_OPTION_KEY };
+          DefaultOptionCreator.CONVERGENCE_DELTA_OPTION_KEY, "0.001", DefaultOptionCreator.MAX_ITERATIONS_OPTION_KEY, "2",
+          FuzzyKMeansDriver.M_OPTION_KEY, "2.0", DefaultOptionCreator.CLUSTERING_OPTION_KEY,
+          DefaultOptionCreator.EMIT_MOST_LIKELY_OPTION_KEY, DefaultOptionCreator.OVERWRITE_OPTION_KEY };
       new FuzzyKMeansDriver().run(args);
       SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path(output, "clusteredPoints/part-m-00000"), conf);
       IntWritable key = new IntWritable();
@@ -583,4 +583,19 @@ public class TestFuzzyKmeansClustering e
     }
   }
 
+  public void testFuzzyKMeansInfoSerialization() throws IOException {
+    double[] data = { 1.1, 2.2, 3.3 };
+    Vector vector = new DenseVector(data);
+    FuzzyKMeansInfo reference = new FuzzyKMeansInfo(2.0, vector, 1);
+    DataOutputBuffer out = new DataOutputBuffer();
+    reference.write(out);
+    FuzzyKMeansInfo info = new FuzzyKMeansInfo();
+    DataInputBuffer in = new DataInputBuffer();
+    in.reset(out.getData(), out.getLength());
+    info.readFields(in);
+    assertEquals("probability", reference.getProbability(), info.getProbability());
+    assertTrue("point total", reference.getVector().equals(info.getVector()));
+    assertEquals("combiner", reference.getCombinerPass(), info.getCombinerPass());
+  }
+
 }

Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java?rev=964542&r1=964541&r2=964542&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java Thu Jul 15 19:03:20 2010
@@ -181,7 +181,7 @@ public final class Job extends FuzzyKMea
                              numReducerTasks,
                              fuzziness,
                              true,
-                             false,
+                             true,
                              0.0);
     // run ClusterDumper
     ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-3"), new Path(output, "clusteredPoints"));

Modified: mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java?rev=964542&r1=964541&r2=964542&view=diff
==============================================================================
--- mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java (original)
+++ mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java Thu Jul 15 19:03:20 2010
@@ -162,7 +162,7 @@ public class TestClusterDumper extends M
     Path output = getTestTempDirPath("output");
     CanopyDriver.runJob(getTestTempDirPath("testdata"), output,
                         EuclideanDistanceMeasure.class.getName(), 8, 4, false);
-    // now run the KMeans job
+    // now run the Fuzzy KMeans job
     FuzzyKMeansDriver.runJob(getTestTempDirPath("testdata"), new Path(output, "clusters-0"), output,
                              EuclideanDistanceMeasure.class.getName(), 0.001, 10,
         1, (float) 1.1, true, true, 0);