You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by je...@apache.org on 2010/09/29 18:27:05 UTC
svn commit: r1002718 - in /mahout/trunk:
core/src/main/java/org/apache/mahout/clustering/
core/src/main/java/org/apache/mahout/clustering/canopy/
core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/
core/src/main/java/org/apache/mahout/clusteri...
Author: jeastman
Date: Wed Sep 29 16:27:04 2010
New Revision: 1002718
URL: http://svn.apache.org/viewvc?rev=1002718&view=rev
Log:
MAHOUT-513
- Created interface GaussianAccumulator and two concrete implementations:
- RunningSumsGaussianAccumulator uses running sums approach
- OnlineGaussianAccumulator uses Knuth (Welford) approach
- Added unit test thereof which produces significant std deviations and drastically-odd
variances. I'm committing this so it can get more eyeballs. It is not used anywhere yet.
- Refactored CDbwClusterEvaluator to use RunningSumsGaussianAccumulator and
existing tests continue to run
- Cleaned up logging in various clustering algorithms to increase use of debug vs. info
to reduce log clutter
All tests run.
Added:
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/GaussianAccumulator.java
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/OnlineGaussianAccumulator.java
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/RunningSumsGaussianAccumulator.java
mahout/trunk/core/src/test/java/org/apache/mahout/clustering/TestGaussianAccumulators.java
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/Model.java
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyClusterer.java
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyDriver.java
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansUtil.java
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyClusterMapper.java
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java
mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java
mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsDriver.java
mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java
Added: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/GaussianAccumulator.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/GaussianAccumulator.java?rev=1002718&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/GaussianAccumulator.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/GaussianAccumulator.java Wed Sep 29 16:27:04 2010
@@ -0,0 +1,45 @@
+package org.apache.mahout.clustering;
+
+import org.apache.mahout.math.Vector;
+
+public interface GaussianAccumulator {
+
+ /**
+ * @return the number of observations
+ */
+ public abstract double getN();
+
+ /**
+ * @return the mean of the observations
+ */
+ public abstract Vector getMean();
+
+ /**
+ * @return the std of the observations
+ */
+ public abstract Vector getStd();
+
+ /**
+ * @return the average of the vector std elements
+ */
+ public abstract double getAverageStd();
+
+ /**
+ * @return the variance of the observations
+ */
+ public abstract Vector getVariance();
+
+ /**
+ * Observe the vector with the given weight
+ *
+ * @param x a Vector
+ * @param weight a double
+ */
+ public abstract void observe(Vector x, double weight);
+
+ /**
+ * Compute the mean and standard deviation
+ */
+ public abstract void compute();
+
+}
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/Model.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/Model.java?rev=1002718&r1=1002717&r2=1002718&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/Model.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/Model.java Wed Sep 29 16:27:04 2010
@@ -18,7 +18,6 @@
package org.apache.mahout.clustering;
import org.apache.hadoop.io.Writable;
-import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
/**
Added: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/OnlineGaussianAccumulator.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/OnlineGaussianAccumulator.java?rev=1002718&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/OnlineGaussianAccumulator.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/OnlineGaussianAccumulator.java Wed Sep 29 16:27:04 2010
@@ -0,0 +1,113 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.clustering;
+
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.function.SquareRootFunction;
+
+/**
+ * An online Gaussian statistics accumulator based upon Knuth (who cites Wellford) which is declared to be
+ * numerically-stable. See http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
+ * The cited algorithm has been modified to accumulate weighted Vectors
+ */
+public class OnlineGaussianAccumulator implements GaussianAccumulator {
+ private double n = 0;
+
+ private Vector mean;
+
+ private Vector M2;
+
+ private Vector variance;
+
+ /* (non-Javadoc)
+ * @see org.apache.mahout.clustering.OnlineGaussianAccumulator#getN()
+ */
+ /* (non-Javadoc)
+ * @see org.apache.mahout.clustering.GaussianAccumulator#getN()
+ */
+ public double getN() {
+ return n;
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.mahout.clustering.OnlineGaussianAccumulator#getMean()
+ */
+ /* (non-Javadoc)
+ * @see org.apache.mahout.clustering.GaussianAccumulator#getMean()
+ */
+ public Vector getMean() {
+ return mean;
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.mahout.clustering.OnlineGaussianAccumulator#getVariance()
+ */
+ /* (non-Javadoc)
+ * @see org.apache.mahout.clustering.GaussianAccumulator#getStd()
+ */
+ public Vector getStd() {
+ return variance.assign(new SquareRootFunction());
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.mahout.clustering.OnlineGaussianAccumulator#observe(org.apache.mahout.math.Vector, double)
+ */
+ /* (non-Javadoc)
+ * @see org.apache.mahout.clustering.GaussianAccumulator#observe(org.apache.mahout.math.Vector, double)
+ */
+ public void observe(Vector x, double weight) {
+ n = n + weight;
+ Vector delta;
+ if (mean != null) {
+ delta = x.minus(mean);
+ } else {
+ mean = x.like();
+ delta = x.clone();
+ }
+ mean = mean.plus(delta.divide(n));
+
+ if (M2 != null) {
+ M2 = M2.plus(delta.times(x.minus(mean)));
+ } else {
+ M2 = delta.times(x.minus(mean));
+ }
+ variance = M2.divide(n - 1);
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.mahout.clustering.GaussianAccumulator#compute()
+ */
+ public void compute() {
+ // nothing to do here!
+ }
+
+ @Override
+ public double getAverageStd() {
+ if (n == 0) {
+ return 0;
+ } else {
+ Vector std = getStd();
+ return std.zSum() / std.size();
+ }
+ }
+
+ @Override
+ public Vector getVariance() {
+ return variance;
+ }
+
+}
Added: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/RunningSumsGaussianAccumulator.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/RunningSumsGaussianAccumulator.java?rev=1002718&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/RunningSumsGaussianAccumulator.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/RunningSumsGaussianAccumulator.java Wed Sep 29 16:27:04 2010
@@ -0,0 +1,93 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.clustering;
+
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.function.SquareRootFunction;
+
+/**
+ * An online Gaussian accumulator that uses a running power sums approach as reported
+ * on http://en.wikipedia.org/wiki/Standard_deviation
+ * Suffers from overflow, underflow and roundoff error but has minimal observe-time overhead
+ */
+public class RunningSumsGaussianAccumulator implements GaussianAccumulator {
+ private double s0 = 0;
+
+ private Vector s1;
+
+ private Vector s2;
+
+ private Vector mean;
+
+ private Vector std;
+
+ @Override
+ public double getN() {
+ return s0;
+ }
+
+ @Override
+ public Vector getMean() {
+ return mean;
+ }
+
+ @Override
+ public Vector getStd() {
+ return std;
+ }
+
+ @Override
+ public double getAverageStd() {
+ if (s0 == 0) {
+ return 0;
+ } else {
+ return std.zSum() / std.size();
+ }
+ }
+
+ @Override
+ public Vector getVariance() {
+ return std.times(std);
+ }
+
+ @Override
+ public void observe(Vector x, double weight) {
+ s0 += weight;
+ Vector weightedX = x.times(weight);
+ if (s1 == null) {
+ s1 = weightedX;
+ } else {
+ weightedX.addTo(s1);
+ }
+ Vector x2 = x.times(x).times(weight);
+ if (s2 == null) {
+ s2 = x2;
+ } else {
+ x2.addTo(s2);
+ }
+ }
+
+ @Override
+ public void compute() {
+ if (s0 == 0) {
+ return;
+ }
+ mean = s1.divide(s0);
+ std = s2.times(s0).minus(s1.times(s1)).assign(new SquareRootFunction()).divide(s0);
+ }
+
+}
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyClusterer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyClusterer.java?rev=1002718&r1=1002717&r2=1002718&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyClusterer.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyClusterer.java Wed Sep 29 16:27:04 2010
@@ -108,13 +108,13 @@ public class CanopyClusterer {
for (Canopy canopy : canopies) {
double dist = measure.distance(canopy.getCenter().getLengthSquared(), canopy.getCenter(), point);
if (dist < t1) {
- log.info("Added point: " + AbstractCluster.formatVector(point, null) + " to canopy: " + canopy.getIdentifier());
+ log.debug("Added point: " + AbstractCluster.formatVector(point, null) + " to canopy: " + canopy.getIdentifier());
canopy.observe(point);
}
pointStronglyBound = pointStronglyBound || (dist < t2);
}
if (!pointStronglyBound) {
- log.info("Created new Canopy:" + nextCanopyId + " at center:" + AbstractCluster.formatVector(point, null));
+ log.debug("Created new Canopy:" + nextCanopyId + " at center:" + AbstractCluster.formatVector(point, null));
canopies.add(new Canopy(point, nextCanopyId++, measure));
}
}
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyDriver.java?rev=1002718&r1=1002717&r2=1002718&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyDriver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyDriver.java Wed Sep 29 16:27:04 2010
@@ -161,7 +161,7 @@ public class CanopyDriver extends Abstra
double t2,
boolean runSequential) throws InstantiationException, IllegalAccessException, IOException,
InterruptedException, ClassNotFoundException {
- log.info("Input: {} Out: {} " + "Measure: {} t1: {} t2: {}", new Object[] { input, output, measure, t1, t2 });
+ log.info("Build Clusters Input: {} Out: {} " + "Measure: {} t1: {} t2: {}", new Object[] { input, output, measure, t1, t2 });
if (runSequential) {
return buildClustersSeq(input, output, measure, t1, t2);
} else {
@@ -206,7 +206,7 @@ public class CanopyDriver extends Abstra
try {
for (Canopy canopy : canopies) {
canopy.computeParameters();
- log.info("Writing Canopy:" + canopy.getIdentifier() + " center:" + AbstractCluster.formatVector(canopy.getCenter(), null)
+ log.debug("Writing Canopy:" + canopy.getIdentifier() + " center:" + AbstractCluster.formatVector(canopy.getCenter(), null)
+ " numPoints:" + canopy.getNumPoints() + " radius:" + AbstractCluster.formatVector(canopy.getRadius(), null));
writer.append(new Text(canopy.getIdentifier()), canopy);
}
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java?rev=1002718&r1=1002717&r2=1002718&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java Wed Sep 29 16:27:04 2010
@@ -393,7 +393,7 @@ public class FuzzyKMeansDriver extends A
SoftCluster.class);
try {
for (SoftCluster cluster : clusters) {
- log.info("Writing Cluster:{} center:{} numPoints:{} radius:{} to: {}", new Object[] { cluster.getId(),
+ log.debug("Writing Cluster:{} center:{} numPoints:{} radius:{} to: {}", new Object[] { cluster.getId(),
AbstractCluster.formatVector(cluster.getCenter(), null), cluster.getNumPoints(),
AbstractCluster.formatVector(cluster.getRadius(), null), clustersOut.getName() });
writer.append(new Text(cluster.getIdentifier()), cluster);
@@ -420,7 +420,7 @@ public class FuzzyKMeansDriver extends A
// iterate until the clusters converge
while (!converged && (iteration <= maxIterations)) {
- log.info("Iteration {}", iteration);
+ log.info("Fuzzy K-Means Iteration {}", iteration);
// point the output to a new directory per iteration
Path clustersOut = new Path(output, Cluster.CLUSTERS_DIR + iteration);
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansUtil.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansUtil.java?rev=1002718&r1=1002717&r2=1002718&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansUtil.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansUtil.java Wed Sep 29 16:27:04 2010
@@ -46,7 +46,6 @@ final class FuzzyKMeansUtil {
Configuration job = new Configuration();
Path clusterPath = new Path(clusterPathStr, "*");
List<Path> result = new ArrayList<Path>();
- //log.info("I am here");
// filter out the files
PathFilter clusterFileFilter = new PathFilter() {
@Override
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java?rev=1002718&r1=1002717&r2=1002718&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java Wed Sep 29 16:27:04 2010
@@ -258,7 +258,7 @@ public class KMeansDriver extends Abstra
Cluster.class);
try {
for (Cluster cluster : clusters) {
- log.info("Writing Cluster:{} center:{} numPoints:{} radius:{} to: {}", new Object[] { cluster.getId(),
+ log.debug("Writing Cluster:{} center:{} numPoints:{} radius:{} to: {}", new Object[] { cluster.getId(),
AbstractCluster.formatVector(cluster.getCenter(), null), cluster.getNumPoints(),
AbstractCluster.formatVector(cluster.getRadius(), null), clustersOut.getName() });
writer.append(new Text(cluster.getIdentifier()), cluster);
@@ -283,7 +283,7 @@ public class KMeansDriver extends Abstra
boolean converged = false;
int iteration = 1;
while (!converged && (iteration <= maxIterations)) {
- log.info("Iteration {}", iteration);
+ log.info("K-Means Iteration {}", iteration);
// point the output to a new directory per iteration
Path clustersOut = new Path(output, AbstractCluster.CLUSTERS_DIR + iteration);
converged = runIteration(conf, input, clustersIn, clustersOut, measure.getClass().getName(), delta);
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java?rev=1002718&r1=1002717&r2=1002718&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java Wed Sep 29 16:27:04 2010
@@ -183,7 +183,7 @@ public final class LDADriver extends Abs
boolean converged = false;
for (int iteration = 1; ((maxIterations < 1) || (iteration <= maxIterations)) && !converged; iteration++) {
- log.info("Iteration {}", iteration);
+ log.info("LDA Iteration {}", iteration);
// point the output to a new directory per iteration
Path stateOut = new Path(output, "state-" + iteration);
double ll = runIteration(conf, input, stateIn, stateOut, numTopics, numWords, topicSmoothing);
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyClusterMapper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyClusterMapper.java?rev=1002718&r1=1002717&r2=1002718&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyClusterMapper.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyClusterMapper.java Wed Sep 29 16:27:04 2010
@@ -32,7 +32,6 @@ import org.apache.hadoop.io.WritableComp
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.mahout.clustering.WeightedVectorWritable;
import org.apache.mahout.clustering.kmeans.OutputLogFilter;
-import org.apache.mahout.math.VectorWritable;
public class MeanShiftCanopyClusterMapper
extends Mapper<WritableComparable<?>, MeanShiftCanopy, IntWritable, WeightedVectorWritable> {
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java?rev=1002718&r1=1002717&r2=1002718&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java Wed Sep 29 16:27:04 2010
@@ -115,59 +115,10 @@ public class MeanShiftCanopyDriver exten
}
/**
- * Run an iteration
- * @param input
- * the input pathname String
- * @param output
- * the output pathname String
- * @param control
- * the control path
- * @param measureClassName
- * the DistanceMeasure class name
- * @param t1
- * the T1 distance threshold
- * @param t2
- * the T2 distance threshold
- * @param convergenceDelta
- * the double convergence criteria
- */
- private static void runIteration(Configuration conf,
- Path input,
- Path output,
- Path control,
- String measureClassName,
- double t1,
- double t2,
- double convergenceDelta)
- throws IOException, InterruptedException, ClassNotFoundException {
-
- conf.set(MeanShiftCanopyConfigKeys.DISTANCE_MEASURE_KEY, measureClassName);
- conf.set(MeanShiftCanopyConfigKeys.CLUSTER_CONVERGENCE_KEY, String.valueOf(convergenceDelta));
- conf.set(MeanShiftCanopyConfigKeys.T1_KEY, String.valueOf(t1));
- conf.set(MeanShiftCanopyConfigKeys.T2_KEY, String.valueOf(t2));
- conf.set(MeanShiftCanopyConfigKeys.CONTROL_PATH_KEY, control.toString());
-
- Job job = new Job(conf);
-
- job.setOutputKeyClass(Text.class);
- job.setOutputValueClass(MeanShiftCanopy.class);
-
- FileInputFormat.setInputPaths(job, input);
- FileOutputFormat.setOutputPath(job, output);
-
- job.setMapperClass(MeanShiftCanopyMapper.class);
- job.setReducerClass(MeanShiftCanopyReducer.class);
- job.setNumReduceTasks(1);
- job.setInputFormatClass(SequenceFileInputFormat.class);
- job.setOutputFormatClass(SequenceFileOutputFormat.class);
- job.setJarByClass(MeanShiftCanopyDriver.class);
- job.waitForCompletion(true);
- }
-
- /**
* Run the job where the input format can be either Vectors or Canopies.
* If requested, cluster the input data using the computed Canopies
- * @param conf the Configuration to use
+ * @param conf
+ * the Configuration to use
* @param input
* the input pathname String
* @param output
@@ -218,6 +169,20 @@ public class MeanShiftCanopyDriver exten
}
}
+ /**
+ * Convert input vectors to MeanShiftCanopies for further processing
+ *
+ * @param conf
+ * @param input
+ * @param output
+ * @param measure
+ * @param runSequential
+ * @throws IOException
+ * @throws InterruptedException
+ * @throws ClassNotFoundException
+ * @throws InstantiationException
+ * @throws IllegalAccessException
+ */
public static void createCanopyFromVectors(Configuration conf,
Path input,
Path output,
@@ -232,6 +197,8 @@ public class MeanShiftCanopyDriver exten
}
/**
+ * Convert vectors to MeanShiftCanopies sequentially
+ *
* @param input the Path to the input VectorWritable data
* @param output the Path to the initial clusters directory
* @param measure the DistanceMeasure
@@ -264,6 +231,17 @@ public class MeanShiftCanopyDriver exten
}
}
+ /**
+ * Convert vectors to MeanShiftCanopies using Hadoop
+ *
+ * @param conf
+ * @param input
+ * @param output
+ * @param measure
+ * @throws IOException
+ * @throws InterruptedException
+ * @throws ClassNotFoundException
+ */
private static void createCanopyFromVectorsMR(Configuration conf, Path input, Path output, DistanceMeasure measure)
throws IOException, InterruptedException, ClassNotFoundException {
conf.set(KMeansConfigKeys.DISTANCE_MEASURE_KEY, measure.getClass().getName());
@@ -284,7 +262,8 @@ public class MeanShiftCanopyDriver exten
/**
* Iterate over the input clusters to produce the next cluster directories for each iteration
- * @param conf TODO
+ * @param conf
+ * the Configuration to use
* @param clustersIn
* the input directory Path
* @param output
@@ -318,6 +297,21 @@ public class MeanShiftCanopyDriver exten
}
}
+ /**
+ * Build new clusters sequentially
+ *
+ * @param clustersIn
+ * @param output
+ * @param measure
+ * @param t1
+ * @param t2
+ * @param convergenceDelta
+ * @param maxIterations
+ * @return
+ * @throws IOException
+ * @throws InstantiationException
+ * @throws IllegalAccessException
+ */
private static Path buildClustersSeq(Path clustersIn,
Path output,
DistanceMeasure measure,
@@ -347,7 +341,7 @@ public class MeanShiftCanopyDriver exten
boolean[] converged = { false };
int iteration = 1;
while (!converged[0] && iteration <= maxIterations) {
- log.info("Iteration: {}", iteration);
+ log.info("Mean Shift Iteration: {}", iteration);
clusters = clusterer.iterate(clusters, converged);
Path clustersOut = new Path(output, Cluster.CLUSTERS_DIR + iteration);
SequenceFile.Writer writer = new SequenceFile.Writer(fs,
@@ -357,7 +351,7 @@ public class MeanShiftCanopyDriver exten
MeanShiftCanopy.class);
try {
for (MeanShiftCanopy cluster : clusters) {
- log.info("Writing Cluster:{} center:{} numPoints:{} radius:{} to: {}",
+ log.debug("Writing Cluster:{} center:{} numPoints:{} radius:{} to: {}",
new Object[] { cluster.getId(),
AbstractCluster.formatVector(cluster.getCenter(), null),
cluster.getNumPoints(),
@@ -375,6 +369,22 @@ public class MeanShiftCanopyDriver exten
return clustersIn;
}
+ /**
+ * Build new clusters using Hadoop
+ *
+ * @param conf
+ * @param clustersIn
+ * @param output
+ * @param measure
+ * @param t1
+ * @param t2
+ * @param convergenceDelta
+ * @param maxIterations
+ * @return
+ * @throws IOException
+ * @throws InterruptedException
+ * @throws ClassNotFoundException
+ */
private static Path buildClustersMR(Configuration conf,
Path clustersIn,
Path output,
@@ -388,11 +398,11 @@ public class MeanShiftCanopyDriver exten
boolean converged = false;
int iteration = 1;
while (!converged && (iteration <= maxIterations)) {
- log.info("Iteration {}", iteration);
+ log.info("Mean Shift Iteration {}", iteration);
// point the output to a new directory per iteration
Path clustersOut = new Path(output, Cluster.CLUSTERS_DIR + iteration);
Path controlOut = new Path(output, CONTROL_CONVERGED);
- runIteration(conf, clustersIn, clustersOut, controlOut, measure.getClass().getName(), t1, t2, convergenceDelta);
+ runIterationMR(conf, clustersIn, clustersOut, controlOut, measure.getClass().getName(), t1, t2, convergenceDelta);
converged = FileSystem.get(new Configuration()).exists(controlOut);
// now point the input to the old output directory
clustersIn = clustersOut;
@@ -402,8 +412,62 @@ public class MeanShiftCanopyDriver exten
}
/**
+ * Run an iteration using Hadoop
+ *
+ * @param conf
+ * the Configuration to use
+ * @param input
+ * the input pathname String
+ * @param output
+ * the output pathname String
+ * @param control
+ * the control path
+ * @param measureClassName
+ * the DistanceMeasure class name
+ * @param t1
+ * the T1 distance threshold
+ * @param t2
+ * the T2 distance threshold
+ * @param convergenceDelta
+ * the double convergence criteria
+ */
+ private static void runIterationMR(Configuration conf,
+ Path input,
+ Path output,
+ Path control,
+ String measureClassName,
+ double t1,
+ double t2,
+ double convergenceDelta)
+ throws IOException, InterruptedException, ClassNotFoundException {
+
+ conf.set(MeanShiftCanopyConfigKeys.DISTANCE_MEASURE_KEY, measureClassName);
+ conf.set(MeanShiftCanopyConfigKeys.CLUSTER_CONVERGENCE_KEY, String.valueOf(convergenceDelta));
+ conf.set(MeanShiftCanopyConfigKeys.T1_KEY, String.valueOf(t1));
+ conf.set(MeanShiftCanopyConfigKeys.T2_KEY, String.valueOf(t2));
+ conf.set(MeanShiftCanopyConfigKeys.CONTROL_PATH_KEY, control.toString());
+
+ Job job = new Job(conf);
+
+ job.setOutputKeyClass(Text.class);
+ job.setOutputValueClass(MeanShiftCanopy.class);
+
+ FileInputFormat.setInputPaths(job, input);
+ FileOutputFormat.setOutputPath(job, output);
+
+ job.setMapperClass(MeanShiftCanopyMapper.class);
+ job.setReducerClass(MeanShiftCanopyReducer.class);
+ job.setNumReduceTasks(1);
+ job.setInputFormatClass(SequenceFileInputFormat.class);
+ job.setOutputFormatClass(SequenceFileOutputFormat.class);
+ job.setJarByClass(MeanShiftCanopyDriver.class);
+ job.waitForCompletion(true);
+ }
+
+ /**
* Run the job using supplied arguments
- * @param conf TODO
+ * @param conf
+ * the Configuration to use
* @param input
* the directory pathname for input points
* @param clustersIn
@@ -421,6 +485,16 @@ public class MeanShiftCanopyDriver exten
}
}
+ /**
+ * Cluster the data sequentially
+ *
+ * @param input
+ * @param clustersIn
+ * @param output
+ * @throws IOException
+ * @throws InstantiationException
+ * @throws IllegalAccessException
+ */
private static void clusterDataSeq(Path input, Path clustersIn, Path output)
throws IOException, InstantiationException, IllegalAccessException {
Collection<MeanShiftCanopy> clusters = new ArrayList<MeanShiftCanopy>();
@@ -466,6 +540,16 @@ public class MeanShiftCanopyDriver exten
}
}
+ /**
+ * Cluster the data using Hadoop
+ *
+ * @param input
+ * @param clustersIn
+ * @param output
+ * @throws IOException
+ * @throws InterruptedException
+ * @throws ClassNotFoundException
+ */
private static void clusterDataMR(Path input, Path clustersIn, Path output)
throws IOException, InterruptedException, ClassNotFoundException {
Configuration conf = new Configuration();
Added: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/TestGaussianAccumulators.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/TestGaussianAccumulators.java?rev=1002718&view=auto
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/TestGaussianAccumulators.java (added)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/TestGaussianAccumulators.java Wed Sep 29 16:27:04 2010
@@ -0,0 +1,116 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.clustering;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.mahout.clustering.dirichlet.UncommonDistributions;
+import org.apache.mahout.common.MahoutTestCase;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.VectorWritable;
+import org.junit.Before;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class TestGaussianAccumulators extends MahoutTestCase {
+
+ private static List<VectorWritable> sampleData = new ArrayList<VectorWritable>();
+
+ private static final Logger log = LoggerFactory.getLogger(TestGaussianAccumulators.class);
+
+ @Before
+ public void setUp() throws Exception {
+ super.setUp();
+ generateSamples();
+ }
+
+ /**
+ * Generate random samples and add them to the sampleData
+ *
+ * @param num
+ * int number of samples to generate
+ * @param mx
+ * double x-value of the sample mean
+ * @param my
+ * double y-value of the sample mean
+ * @param sd
+ * double standard deviation of the samples
+ * @throws Exception
+ */
+ public static void generateSamples(int num, double mx, double my, double sd) throws Exception {
+ log.info("Generating {} samples m=[{}, {}] sd={}", new Object[] { num, mx, my, sd });
+ for (int i = 0; i < num; i++) {
+ sampleData.add(new VectorWritable(new DenseVector(new double[] { UncommonDistributions.rNorm(mx, sd),
+ UncommonDistributions.rNorm(my, sd) })));
+ }
+ }
+
+ /**
+ * Generate random samples and add them to the sampleData
+ *
+ * @param num
+ * int number of samples to generate
+ * @param mx
+ * double x-value of the sample mean
+ * @param my
+ * double y-value of the sample mean
+ * @param sdx
+ * double x-value standard deviation of the samples
+ * @param sdy
+ * double y-value standard deviation of the samples
+ */
+ public static void generate2dSamples(int num, double mx, double my, double sdx, double sdy) {
+ log.info("Generating {} samples m=[{}, {}] sd=[{}, {}]", new Object[] { num, mx, my, sdx, sdy });
+ for (int i = 0; i < num; i++) {
+ sampleData.add(new VectorWritable(new DenseVector(new double[] { UncommonDistributions.rNorm(mx, sdx),
+ UncommonDistributions.rNorm(my, sdy) })));
+ }
+ }
+
+ private void generateSamples() throws Exception {
+ generate2dSamples(500, 1, 2, 3, 4);
+ }
+
+ @Test
+ public void testAccumulatorNoSamples() {
+ GaussianAccumulator accumulator0 = new RunningSumsGaussianAccumulator();
+ GaussianAccumulator accumulator1 = new OnlineGaussianAccumulator();
+ accumulator0.compute();
+ accumulator1.compute();
+ assertEquals("N", accumulator0.getN(), accumulator1.getN(), EPSILON);
+ assertEquals("Means", accumulator0.getMean(), accumulator1.getMean());
+ assertEquals("Avg Stds", accumulator0.getAverageStd(), accumulator1.getAverageStd(), EPSILON);
+ }
+
+ @Test
+ public void testAccumulatorResults() {
+ GaussianAccumulator accumulator0 = new RunningSumsGaussianAccumulator();
+ GaussianAccumulator accumulator1 = new OnlineGaussianAccumulator();
+ for (VectorWritable vw : sampleData) {
+ accumulator0.observe(vw.get(), 1);
+ accumulator1.observe(vw.get(), 1);
+ }
+ accumulator0.compute();
+ accumulator1.compute();
+ assertEquals("N", accumulator0.getN(), accumulator1.getN(), EPSILON);
+ assertEquals("Means", accumulator0.getMean().zSum(), accumulator1.getMean().zSum(), EPSILON);
+ assertEquals("Stds", accumulator0.getStd().zSum(), accumulator1.getStd().zSum(), 0.01);
+ //assertEquals("Variance", accumulator0.getVariance().zSum(), accumulator1.getVariance().zSum(), 1.6);
+ }
+}
Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java?rev=1002718&r1=1002717&r2=1002718&view=diff
==============================================================================
--- mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java (original)
+++ mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java Wed Sep 29 16:27:04 2010
@@ -31,12 +31,13 @@ import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.GaussianAccumulator;
+import org.apache.mahout.clustering.RunningSumsGaussianAccumulator;
import org.apache.mahout.clustering.evaluation.RepresentativePointsDriver;
import org.apache.mahout.clustering.evaluation.RepresentativePointsMapper;
import org.apache.mahout.common.distance.DistanceMeasure;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
-import org.apache.mahout.math.function.SquareRootFunction;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -132,23 +133,14 @@ public class CDbwEvaluator {
* @param cI a int clusterId.
*/
private void computeStd(int cI) {
- // TODO: verify this approach
List<VectorWritable> repPts = representativePoints.get(cI);
- int s0 = 0;
- Vector s1 = null;
- Vector s2 = null;
+ GaussianAccumulator accumulator = new RunningSumsGaussianAccumulator();
for (VectorWritable vw : repPts) {
- s0++;
- Vector v = vw.get();
- s1 = s1 == null ? v.clone() : s1.plus(v);
- s2 = s2 == null ? v.times(v) : s2.plus(v.times(v));
- }
- if (s0 > 1) {
- Vector std = s2.times(s0).minus(s1.times(s1)).assign(new SquareRootFunction()).divide(s0);
- double d = std.zSum() / std.size();
- log.debug("stDev[" + cI + "]=" + d);
- stDevs.put(cI, d);
+ accumulator.observe(vw.get(), 1);
}
+ accumulator.compute();
+ double d = accumulator.getAverageStd();
+ stDevs.put(cI, d);
}
/**
@@ -221,7 +213,8 @@ public class CDbwEvaluator {
}
/**
- * Compute the validity index (eqn 8)
+ * Compute the CDbw validity metric (eqn 8). The goal of this metric is to reward clusterings which
+ * have a high intraClusterDensity and also a high cluster separation.
*
* @return a double
*/
@@ -231,9 +224,9 @@ public class CDbwEvaluator {
}
/**
- * The average density within clusters is defined as the percentage of points that belong
- * to the neighborhood of representative points of the considered clusters. The goal is
- * the density within clusters to be significant high. (eqn 5)
+ * The average density within clusters is defined as the percentage of representative points that reside
+ * in the neighborhood of the clusters' centers. The goal is the density within clusters to be
+ * significantly high. (eqn 5)
*
* @return a double
*/
@@ -268,8 +261,43 @@ public class CDbwEvaluator {
}
/**
- * This function evaluates the average density in the region among clusters (eqn 1).
- * The goal is the density in the area among clusters to be significant low.
+ * Calculate the separation of clusters (eqn 4) taking into account both the distances between the
+ * clusters' closest points and the Inter-cluster density. The goal is the distances between clusters
+ * to be high while the representative point density in the areas between them is low.
+ *
+ * @return a double
+ */
+ public double separation() {
+ pruneInvalidClusters();
+ double minDistanceSum = 0;
+ for (int i = 0; i < clusters.size(); i++) {
+ Integer cI = clusters.get(i).getId();
+ List<VectorWritable> closRepI = representativePoints.get(cI);
+ for (int j = 0; j < clusters.size(); j++) {
+ if (i == j) {
+ continue;
+ }
+ // find min{d(closRepI, closRepJ)}
+ Integer cJ = clusters.get(j).getId();
+ List<VectorWritable> closRepJ = representativePoints.get(cJ);
+ double minDistance = Double.MAX_VALUE;
+ for (VectorWritable aRepI : closRepI) {
+ for (VectorWritable aRepJ : closRepJ) {
+ double distance = measure.distance(aRepI.get(), aRepJ.get());
+ if (distance < minDistance) {
+ minDistance = distance;
+ }
+ }
+ }
+ minDistanceSum += minDistance;
+ }
+ }
+ return minDistanceSum / (1.0 + interClusterDensity());
+ }
+
+ /**
+ * This function evaluates the average density of points in the regions between clusters (eqn 1).
+ * The goal is the density in the area between clusters to be significantly low.
*
* @return a double
*/
@@ -319,42 +347,7 @@ public class CDbwEvaluator {
sum += density;
}
}
- log.info("interClusterDensity=" + sum);
+ log.debug("interClusterDensity=" + sum);
return sum;
}
-
- /**
- * Calculate the separation of clusters (eqn 4) taking into account both the distances between the closest
- * clusters and the Inter-cluster density. The goal is the distances among clusters to be high while
- * the density in the area among them to be low.
- *
- * @return a double
- */
- public double separation() {
- pruneInvalidClusters();
- double minDistanceSum = 0;
- for (int i = 0; i < clusters.size(); i++) {
- Integer cI = clusters.get(i).getId();
- List<VectorWritable> closRepI = representativePoints.get(cI);
- for (int j = 0; j < clusters.size(); j++) {
- if (i == j) {
- continue;
- }
- // find min{d(closRepI, closRepJ)}
- Integer cJ = clusters.get(j).getId();
- List<VectorWritable> closRepJ = representativePoints.get(cJ);
- double minDistance = Double.MAX_VALUE;
- for (VectorWritable aRepI : closRepI) {
- for (VectorWritable aRepJ : closRepJ) {
- double distance = measure.distance(aRepI.get(), aRepJ.get());
- if (distance < minDistance) {
- minDistance = distance;
- }
- }
- }
- minDistanceSum += minDistance;
- }
- }
- return minDistanceSum / (1.0 + interClusterDensity());
- }
}
Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsDriver.java?rev=1002718&r1=1002717&r2=1002718&view=diff
==============================================================================
--- mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsDriver.java (original)
+++ mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsDriver.java Wed Sep 29 16:27:04 2010
@@ -98,7 +98,7 @@ public final class RepresentativePointsD
writeInitialState(stateIn, clustersIn);
for (int iteration = 0; iteration < numIterations; iteration++) {
- log.info("Iteration {}", iteration);
+ log.info("Representative Points Iteration {}", iteration);
// point the output to a new directory per iteration
Path stateOut = new Path(output, "representativePoints-" + (iteration + 1));
runIteration(conf, clusteredPointsIn, stateIn, stateOut, measure, runSequential);
@@ -124,7 +124,7 @@ public final class RepresentativePointsD
SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path, IntWritable.class, VectorWritable.class);
while (reader.next(key, value)) {
Cluster cluster = (Cluster) value;
- log.info("C-" + cluster.getId() + ": " + AbstractCluster.formatVector(cluster.getCenter(), null));
+ log.debug("C-" + cluster.getId() + ": " + AbstractCluster.formatVector(cluster.getCenter(), null));
writer.append(new IntWritable(cluster.getId()), new VectorWritable(cluster.getCenter()));
}
writer.close();
Modified: mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java?rev=1002718&r1=1002717&r2=1002718&view=diff
==============================================================================
--- mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java (original)
+++ mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java Wed Sep 29 16:27:04 2010
@@ -34,9 +34,11 @@ import org.apache.mahout.clustering.Abst
import org.apache.mahout.clustering.Cluster;
import org.apache.mahout.clustering.ClusteringTestUtils;
import org.apache.mahout.clustering.ModelDistribution;
+import org.apache.mahout.clustering.TestClusterEvaluator;
import org.apache.mahout.clustering.canopy.Canopy;
import org.apache.mahout.clustering.canopy.CanopyDriver;
import org.apache.mahout.clustering.dirichlet.DirichletDriver;
+import org.apache.mahout.clustering.dirichlet.UncommonDistributions;
import org.apache.mahout.clustering.dirichlet.models.GaussianClusterDistribution;
import org.apache.mahout.clustering.evaluation.RepresentativePointsDriver;
import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver;
@@ -47,31 +49,51 @@ import org.apache.mahout.common.MahoutTe
import org.apache.mahout.common.distance.DistanceMeasure;
import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.junit.Before;
import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
public final class TestCDbwEvaluator extends MahoutTestCase {
private static final double[][] REFERENCE = { { 1, 1 }, { 2, 1 }, { 1, 2 }, { 2, 2 }, { 3, 3 }, { 4, 4 }, { 5, 4 }, { 4, 5 },
{ 5, 5 } };
+ private static final Logger log = LoggerFactory.getLogger(TestClusterEvaluator.class);
+
private Map<Integer, List<VectorWritable>> representativePoints;
private List<Cluster> clusters;
+ private Configuration conf;
+
+ private FileSystem fs;
+
+ private List<VectorWritable> sampleData = new ArrayList<VectorWritable>();
+
+ private List<VectorWritable> referenceData = new ArrayList<VectorWritable>();
+
+ private Path testdata;
+
+ private Path output;
+
@Override
@Before
public void setUp() throws Exception {
super.setUp();
- Configuration conf = new Configuration();
- FileSystem fs = FileSystem.get(conf);
- // Create test data
- List<VectorWritable> sampleData = TestKmeansClustering.getPointsWritable(REFERENCE);
- ClusteringTestUtils.writePointsToFile(sampleData, getTestTempFilePath("testdata/file1"), fs, conf);
+ conf = new Configuration();
+ fs = FileSystem.get(conf);
+ testdata = getTestTempDirPath("testdata");
+ output = getTestTempDirPath("output");
+ // Create small reference data set
+ referenceData = TestKmeansClustering.getPointsWritable(REFERENCE);
+ // generate larger test data set for the clustering tests to chew on
+ generateSamples();
}
- private void printRepPoints(int numIterations) throws IOException {
+ void printRepPoints(int numIterations) throws IOException {
for (int i = 0; i <= numIterations; i++) {
Path out = new Path(getTestTempDirPath("output"), "representativePoints-" + i);
System.out.println("Representative Points for iteration " + i);
@@ -118,8 +140,36 @@ public final class TestCDbwEvaluator ext
}
}
+ /**
+ * Generate random samples and add them to the sampleData
+ *
+ * @param num
+ * int number of samples to generate
+ * @param mx
+ * double x-value of the sample mean
+ * @param my
+ * double y-value of the sample mean
+ * @param sd
+ * double standard deviation of the samples
+ * @throws Exception
+ */
+ private void generateSamples(int num, double mx, double my, double sd) throws Exception {
+ log.info("Generating {} samples m=[{}, {}] sd={}", new Object[] { num, mx, my, sd });
+ for (int i = 0; i < num; i++) {
+ sampleData.add(new VectorWritable(new DenseVector(new double[] { UncommonDistributions.rNorm(mx, sd),
+ UncommonDistributions.rNorm(my, sd) })));
+ }
+ }
+
+ private void generateSamples() throws Exception {
+ generateSamples(500, 1, 1, 3);
+ generateSamples(300, 1, 0, 0.5);
+ generateSamples(300, 0, 2, 0.1);
+ }
+
@Test
- public void testCDbw0() {
+ public void testCDbw0() throws IOException {
+ ClusteringTestUtils.writePointsToFile(referenceData, getTestTempFilePath("testdata/file1"), fs, conf);
DistanceMeasure measure = new EuclideanDistanceMeasure();
initData(1, 0.25, measure);
CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
@@ -130,7 +180,8 @@ public final class TestCDbwEvaluator ext
}
@Test
- public void testCDbw1() {
+ public void testCDbw1() throws IOException {
+ ClusteringTestUtils.writePointsToFile(referenceData, getTestTempFilePath("testdata/file1"), fs, conf);
DistanceMeasure measure = new EuclideanDistanceMeasure();
initData(1, 0.5, measure);
CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
@@ -141,7 +192,8 @@ public final class TestCDbwEvaluator ext
}
@Test
- public void testCDbw2() {
+ public void testCDbw2() throws IOException {
+ ClusteringTestUtils.writePointsToFile(referenceData, getTestTempFilePath("testdata/file1"), fs, conf);
DistanceMeasure measure = new EuclideanDistanceMeasure();
initData(1, 0.75, measure);
CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
@@ -152,7 +204,8 @@ public final class TestCDbwEvaluator ext
}
@Test
- public void testEmptyCluster() {
+ public void testEmptyCluster() throws IOException {
+ ClusteringTestUtils.writePointsToFile(referenceData, getTestTempFilePath("testdata/file1"), fs, conf);
DistanceMeasure measure = new EuclideanDistanceMeasure();
initData(1, 0.25, measure);
Canopy cluster = new Canopy(new DenseVector(new double[] { 10, 10 }), 19, measure);
@@ -167,7 +220,8 @@ public final class TestCDbwEvaluator ext
}
@Test
- public void testSingleValueCluster() {
+ public void testSingleValueCluster() throws IOException {
+ ClusteringTestUtils.writePointsToFile(referenceData, getTestTempFilePath("testdata/file1"), fs, conf);
DistanceMeasure measure = new EuclideanDistanceMeasure();
initData(1, 0.25, measure);
Canopy cluster = new Canopy(new DenseVector(new double[] { 0, 0 }), 19, measure);
@@ -185,9 +239,11 @@ public final class TestCDbwEvaluator ext
/**
* Representative points extraction will duplicate the cluster center if the cluster has no
* assigned points. These clusters should be ignored like empty clusters above
+ * @throws IOException
*/
@Test
- public void testAllSameValueCluster() {
+ public void testAllSameValueCluster() throws IOException {
+ ClusteringTestUtils.writePointsToFile(referenceData, getTestTempFilePath("testdata/file1"), fs, conf);
DistanceMeasure measure = new EuclideanDistanceMeasure();
initData(1, 0.25, measure);
Canopy cluster = new Canopy(new DenseVector(new double[] { 0, 0 }), 19, measure);
@@ -204,26 +260,45 @@ public final class TestCDbwEvaluator ext
assertEquals("CDbw", 18.322592676403097, evaluator.getCDbw(), EPSILON);
}
+ /**
+ * Clustering can produce very, very tight clusters that can cause the std calculation to fail.
+ * These clusters should be processed correctly.
+ * @throws IOException
+ */
+ @Test
+ public void testAlmostSameValueCluster() throws IOException {
+ ClusteringTestUtils.writePointsToFile(referenceData, getTestTempFilePath("testdata/file1"), fs, conf);
+ DistanceMeasure measure = new EuclideanDistanceMeasure();
+ initData(1, 0.25, measure);
+ Canopy cluster = new Canopy(new DenseVector(new double[] { 0, 0 }), 19, measure);
+ clusters.add(cluster);
+ List<VectorWritable> points = new ArrayList<VectorWritable>();
+ Vector delta = new DenseVector(new double[] { 0, Double.MIN_NORMAL });
+ points.add(new VectorWritable(delta));
+ points.add(new VectorWritable(cluster.getCenter()));
+ points.add(new VectorWritable(cluster.getCenter()));
+ points.add(new VectorWritable(cluster.getCenter()));
+ points.add(new VectorWritable(cluster.getCenter()));
+ representativePoints.put(cluster.getId(), points);
+ CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
+ assertEquals("inter cluster density", 0.0, evaluator.interClusterDensity(), EPSILON);
+ assertEquals("separation", 28.970562748477143, evaluator.separation(), EPSILON);
+ assertEquals("intra cluster density", 2.0124611797498106, evaluator.intraClusterDensity(), EPSILON);
+ assertEquals("CDbw", 58.30213288681623, evaluator.getCDbw(), EPSILON);
+ }
+
@Test
public void testCanopy() throws Exception {
+ ClusteringTestUtils.writePointsToFile(sampleData, getTestTempFilePath("testdata/file1"), fs, conf);
DistanceMeasure measure = new EuclideanDistanceMeasure();
- CanopyDriver.run(new Configuration(),
- getTestTempDirPath("testdata"),
- getTestTempDirPath("output"),
- measure,
- 3.1,
- 2.1,
- true,
- true);
- int numIterations = 2;
- Path output = getTestTempDirPath("output");
- Configuration conf = new Configuration();
+ CanopyDriver.run(new Configuration(), testdata, output, measure, 3.1, 2.1, true, true);
+ int numIterations = 10;
Path clustersIn = new Path(output, "clusters-0");
RepresentativePointsDriver.run(conf, clustersIn, new Path(output, "clusteredPoints"), output, measure, numIterations, true);
CDbwEvaluator evaluator = new CDbwEvaluator(conf, clustersIn);
- printRepPoints(numIterations);
+ //printRepPoints(numIterations);
// now print out the Results
- System.out.println("CDbw = " + evaluator.getCDbw());
+ System.out.println("Canopy CDbw = " + evaluator.getCDbw());
System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
System.out.println("Separation = " + evaluator.separation());
@@ -231,27 +306,19 @@ public final class TestCDbwEvaluator ext
@Test
public void testKmeans() throws Exception {
+ ClusteringTestUtils.writePointsToFile(sampleData, getTestTempFilePath("testdata/file1"), fs, conf);
DistanceMeasure measure = new EuclideanDistanceMeasure();
// now run the Canopy job to prime kMeans canopies
- CanopyDriver.run(new Configuration(),
- getTestTempDirPath("testdata"),
- getTestTempDirPath("output"),
- measure,
- 3.1,
- 2.1,
- false,
- true);
+ CanopyDriver.run(new Configuration(), testdata, output, measure, 3.1, 2.1, false, true);
// now run the KMeans job
- Path output = getTestTempDirPath("output");
- KMeansDriver.run(getTestTempDirPath("testdata"), new Path(output, "clusters-0"), output, measure, 0.001, 10, true, true);
- int numIterations = 2;
- Configuration conf = new Configuration();
+ KMeansDriver.run(testdata, new Path(output, "clusters-0"), output, measure, 0.001, 10, true, true);
+ int numIterations = 10;
Path clustersIn = new Path(output, "clusters-2");
RepresentativePointsDriver.run(conf, clustersIn, new Path(output, "clusteredPoints"), output, measure, numIterations, true);
CDbwEvaluator evaluator = new CDbwEvaluator(conf, clustersIn);
- printRepPoints(numIterations);
+ //printRepPoints(numIterations);
// now print out the Results
- System.out.println("CDbw = " + evaluator.getCDbw());
+ System.out.println("K-Means CDbw = " + evaluator.getCDbw());
System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
System.out.println("Separation = " + evaluator.separation());
@@ -259,37 +326,19 @@ public final class TestCDbwEvaluator ext
@Test
public void testFuzzyKmeans() throws Exception {
+ ClusteringTestUtils.writePointsToFile(sampleData, getTestTempFilePath("testdata/file1"), fs, conf);
DistanceMeasure measure = new EuclideanDistanceMeasure();
// now run the Canopy job to prime kMeans canopies
- CanopyDriver.run(new Configuration(),
- getTestTempDirPath("testdata"),
- getTestTempDirPath("output"),
- measure,
- 3.1,
- 2.1,
- false,
- true);
+ CanopyDriver.run(new Configuration(), testdata, output, measure, 3.1, 2.1, false, true);
// now run the KMeans job
- Path output = getTestTempDirPath("output");
- FuzzyKMeansDriver.run(getTestTempDirPath("testdata"),
- new Path(output, "clusters-0"),
- output,
- measure,
- 0.001,
- 10,
- 2,
- true,
- true,
- 0,
- true);
- int numIterations = 2;
- Configuration conf = new Configuration();
+ FuzzyKMeansDriver.run(testdata, new Path(output, "clusters-0"), output, measure, 0.001, 10, 2, true, true, 0, true);
+ int numIterations = 10;
Path clustersIn = new Path(output, "clusters-4");
RepresentativePointsDriver.run(conf, clustersIn, new Path(output, "clusteredPoints"), output, measure, numIterations, true);
CDbwEvaluator evaluator = new CDbwEvaluator(conf, clustersIn);
- printRepPoints(numIterations);
+ //printRepPoints(numIterations);
// now print out the Results
- System.out.println("CDbw = " + evaluator.getCDbw());
+ System.out.println("Fuzzy K-Means CDbw = " + evaluator.getCDbw());
System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
System.out.println("Separation = " + evaluator.separation());
@@ -297,27 +346,16 @@ public final class TestCDbwEvaluator ext
@Test
public void testMeanShift() throws Exception {
+ ClusteringTestUtils.writePointsToFile(sampleData, getTestTempFilePath("testdata/file1"), fs, conf);
DistanceMeasure measure = new EuclideanDistanceMeasure();
- Configuration conf = new Configuration();
- new MeanShiftCanopyDriver().run(conf,
- getTestTempDirPath("testdata"),
- getTestTempDirPath("output"),
- measure,
- 2.1,
- 1.0,
- 0.001,
- 10,
- false,
- true,
- true);
- int numIterations = 2;
- Path output = getTestTempDirPath("output");
+ new MeanShiftCanopyDriver().run(conf, testdata, output, measure, 2.1, 1.0, 0.001, 10, false, true, true);
+ int numIterations = 10;
Path clustersIn = new Path(output, "clusters-2");
RepresentativePointsDriver.run(conf, clustersIn, new Path(output, "clusteredPoints"), output, measure, numIterations, true);
CDbwEvaluator evaluator = new CDbwEvaluator(conf, clustersIn);
- printRepPoints(numIterations);
+ //printRepPoints(numIterations);
// now print out the Results
- System.out.println("CDbw = " + evaluator.getCDbw());
+ System.out.println("Mean Shift CDbw = " + evaluator.getCDbw());
System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
System.out.println("Separation = " + evaluator.separation());
@@ -325,20 +363,10 @@ public final class TestCDbwEvaluator ext
@Test
public void testDirichlet() throws Exception {
+ ClusteringTestUtils.writePointsToFile(sampleData, getTestTempFilePath("testdata/file1"), fs, conf);
ModelDistribution<VectorWritable> modelDistribution = new GaussianClusterDistribution(new VectorWritable(new DenseVector(2)));
- DirichletDriver.run(getTestTempDirPath("testdata"),
- getTestTempDirPath("output"),
- modelDistribution,
- 15,
- 5,
- 1.0,
- true,
- true,
- 0,
- true);
- int numIterations = 2;
- Path output = getTestTempDirPath("output");
- Configuration conf = new Configuration();
+ DirichletDriver.run(testdata, output, modelDistribution, 15, 5, 1.0, true, true, 0, true);
+ int numIterations = 10;
Path clustersIn = new Path(output, "clusters-0");
RepresentativePointsDriver.run(conf,
clustersIn,
@@ -348,9 +376,9 @@ public final class TestCDbwEvaluator ext
numIterations,
true);
CDbwEvaluator evaluator = new CDbwEvaluator(conf, clustersIn);
- printRepPoints(numIterations);
+ //printRepPoints(numIterations);
// now print out the Results
- System.out.println("CDbw = " + evaluator.getCDbw());
+ System.out.println("Dirichlet CDbw = " + evaluator.getCDbw());
System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
System.out.println("Separation = " + evaluator.separation());