You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by je...@apache.org on 2010/07/15 21:03:21 UTC
svn commit: r964542 - in /mahout/trunk:
core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/
core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/
examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/
utils/sr...
Author: jeastman
Date: Thu Jul 15 19:03:20 2010
New Revision: 964542
URL: http://svn.apache.org/viewvc?rev=964542&view=rev
Log:
MAHOUT-294:
- fixed serialization bug in FuzzyKMeansInfo: combinerPass was not being serialized, which led to inflated cluster centroid values
- removed the combiner TODO, which indicates the problem has been in trunk for some time
- added a unit test testFuzzyKMeansInfoSerialization()
- TestClusterDumper and SyntheticControl now produce reasonable-looking fuzzyK results
- all unit tests run
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansCombiner.java
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansInfo.java
mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java
mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansCombiner.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansCombiner.java?rev=964542&r1=964541&r2=964542&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansCombiner.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansCombiner.java Thu Jul 15 19:03:20 2010
@@ -37,7 +37,6 @@ public class FuzzyKMeansCombiner extends
}
value.setCombinerPass(value.getCombinerPass() + 1);
}
- // TODO: how do we pass along the combinerPass? Or do we not need to?
context.write(key, new FuzzyKMeansInfo(cluster.getPointProbSum(), cluster.getWeightedPointTotal(), 1));
}
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansInfo.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansInfo.java?rev=964542&r1=964541&r2=964542&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansInfo.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansInfo.java Thu Jul 15 19:03:20 2010
@@ -64,12 +64,14 @@ public class FuzzyKMeansInfo implements
@Override
public void write(DataOutput out) throws IOException {
out.writeDouble(probability);
+ out.writeInt(combinerPass);
VectorWritable.writeVector(out, pointTotal);
}
@Override
public void readFields(DataInput in) throws IOException {
this.probability = in.readDouble();
+ this.combinerPass = in.readInt();
VectorWritable temp = new VectorWritable();
temp.readFields(in);
this.pointTotal = temp.get();
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java?rev=964542&r1=964541&r2=964542&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java Thu Jul 15 19:03:20 2010
@@ -18,6 +18,7 @@
package org.apache.mahout.clustering.fuzzykmeans;
import java.io.File;
+import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
@@ -26,6 +27,8 @@ import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.DataInputBuffer;
+import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
@@ -40,6 +43,7 @@ import org.apache.mahout.common.MahoutTe
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.common.distance.DistanceMeasure;
import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
+import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
@@ -211,28 +215,24 @@ public class TestFuzzyKmeansClustering e
// now run the Job using the run() command line options.
Path output = getTestTempDirPath("output");
-/* FuzzyKMeansDriver.runJob(pointsPath,
- clustersPath,
- output,
- EuclideanDistanceMeasure.class.getName(),
- 0.001,
- 2,
- k + 1,
- 2,
- false,
- true,
- 0);
-*/
- String[] args = { DefaultOptionCreator.INPUT_OPTION_KEY, pointsPath.toString(),
- DefaultOptionCreator.CLUSTERS_IN_OPTION_KEY, clustersPath.toString(),
- DefaultOptionCreator.OUTPUT_OPTION_KEY, output.toString(),
+ /* FuzzyKMeansDriver.runJob(pointsPath,
+ clustersPath,
+ output,
+ EuclideanDistanceMeasure.class.getName(),
+ 0.001,
+ 2,
+ k + 1,
+ 2,
+ false,
+ true,
+ 0);
+ */
+ String[] args = { DefaultOptionCreator.INPUT_OPTION_KEY, pointsPath.toString(), DefaultOptionCreator.CLUSTERS_IN_OPTION_KEY,
+ clustersPath.toString(), DefaultOptionCreator.OUTPUT_OPTION_KEY, output.toString(),
DefaultOptionCreator.DISTANCE_MEASURE_OPTION_KEY, EuclideanDistanceMeasure.class.getName(),
- DefaultOptionCreator.CONVERGENCE_DELTA_OPTION_KEY, "0.001",
- DefaultOptionCreator.MAX_ITERATIONS_OPTION_KEY, "2",
- FuzzyKMeansDriver.M_OPTION_KEY, "2.0",
- DefaultOptionCreator.CLUSTERING_OPTION_KEY,
- DefaultOptionCreator.EMIT_MOST_LIKELY_OPTION_KEY,
- DefaultOptionCreator.OVERWRITE_OPTION_KEY };
+ DefaultOptionCreator.CONVERGENCE_DELTA_OPTION_KEY, "0.001", DefaultOptionCreator.MAX_ITERATIONS_OPTION_KEY, "2",
+ FuzzyKMeansDriver.M_OPTION_KEY, "2.0", DefaultOptionCreator.CLUSTERING_OPTION_KEY,
+ DefaultOptionCreator.EMIT_MOST_LIKELY_OPTION_KEY, DefaultOptionCreator.OVERWRITE_OPTION_KEY };
new FuzzyKMeansDriver().run(args);
SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path(output, "clusteredPoints/part-m-00000"), conf);
IntWritable key = new IntWritable();
@@ -583,4 +583,19 @@ public class TestFuzzyKmeansClustering e
}
}
+ public void testFuzzyKMeansInfoSerialization() throws IOException {
+ double[] data = { 1.1, 2.2, 3.3 };
+ Vector vector = new DenseVector(data);
+ FuzzyKMeansInfo reference = new FuzzyKMeansInfo(2.0, vector, 1);
+ DataOutputBuffer out = new DataOutputBuffer();
+ reference.write(out);
+ FuzzyKMeansInfo info = new FuzzyKMeansInfo();
+ DataInputBuffer in = new DataInputBuffer();
+ in.reset(out.getData(), out.getLength());
+ info.readFields(in);
+ assertEquals("probability", reference.getProbability(), info.getProbability());
+ assertTrue("point total", reference.getVector().equals(info.getVector()));
+ assertEquals("combiner", reference.getCombinerPass(), info.getCombinerPass());
+ }
+
}
Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java?rev=964542&r1=964541&r2=964542&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java Thu Jul 15 19:03:20 2010
@@ -181,7 +181,7 @@ public final class Job extends FuzzyKMea
numReducerTasks,
fuzziness,
true,
- false,
+ true,
0.0);
// run ClusterDumper
ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-3"), new Path(output, "clusteredPoints"));
Modified: mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java?rev=964542&r1=964541&r2=964542&view=diff
==============================================================================
--- mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java (original)
+++ mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java Thu Jul 15 19:03:20 2010
@@ -162,7 +162,7 @@ public class TestClusterDumper extends M
Path output = getTestTempDirPath("output");
CanopyDriver.runJob(getTestTempDirPath("testdata"), output,
EuclideanDistanceMeasure.class.getName(), 8, 4, false);
- // now run the KMeans job
+ // now run the Fuzzy KMeans job
FuzzyKMeansDriver.runJob(getTestTempDirPath("testdata"), new Path(output, "clusters-0"), output,
EuclideanDistanceMeasure.class.getName(), 0.001, 10,
1, (float) 1.1, true, true, 0);