You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by tn...@apache.org on 2013/11/16 19:48:48 UTC
svn commit: r1542545 - in /commons/proper/math/trunk/src: changes/
main/java/org/apache/commons/math3/ml/clustering/
main/java/org/apache/commons/math3/ml/clustering/evaluation/
test/java/org/apache/commons/math3/ml/clustering/evaluation/
Author: tn
Date: Sat Nov 16 18:48:48 2013
New Revision: 1542545
URL: http://svn.apache.org/r1542545
Log:
[MATH-1031] Added new ClusterEvaluator base class and refactored code in MultiKMeansPlusPlusClusterer.
Added:
commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/evaluation/
commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/evaluation/ClusterEvaluator.java (with props)
commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/evaluation/SumOfClusterVariances.java (with props)
commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/evaluation/package-info.java (with props)
commons/proper/math/trunk/src/test/java/org/apache/commons/math3/ml/clustering/evaluation/
commons/proper/math/trunk/src/test/java/org/apache/commons/math3/ml/clustering/evaluation/SumOfClusterVariancesTest.java (with props)
Modified:
commons/proper/math/trunk/src/changes/changes.xml
commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/MultiKMeansPlusPlusClusterer.java
Modified: commons/proper/math/trunk/src/changes/changes.xml
URL: http://svn.apache.org/viewvc/commons/proper/math/trunk/src/changes/changes.xml?rev=1542545&r1=1542544&r2=1542545&view=diff
==============================================================================
--- commons/proper/math/trunk/src/changes/changes.xml (original)
+++ commons/proper/math/trunk/src/changes/changes.xml Sat Nov 16 18:48:48 2013
@@ -51,6 +51,11 @@ If the output is not quite correct, chec
</properties>
<body>
<release version="3.3" date="TBD" description="TBD">
+ <action dev="tn" type="update" issue="MATH-1031" due-to="Thorsten Schäfer">
+ Added new class "ClusterEvaluator" to evaluate the result of a clustering algorithm
+ and refactored existing evaluation code in "MultiKMeansPlusPlusClusterer"
+ into separate class "SumOfClusterVariances".
+ </action>
<action dev="psteitz" type="add" issue="MATH-1061">
Added InsufficientDataException.
</action>
@@ -96,7 +101,7 @@ If the output is not quite correct, chec
Added logDensity methods to AbstractReal/IntegerDistribution with naive default
implementations and improved implementations for some current distributions.
</action>
- <action dev="psteitz" type="add" issue="MATH-1038" due-to="Thorsten Schaefer">
+ <action dev="psteitz" type="add" issue="MATH-1038" due-to="Thorsten Schäfer">
Added ConfidenceInterval class and BinomialConfidenceInterval providing several
estimators for confidence intervals for binomial probabilities.
</action>
@@ -127,7 +132,7 @@ If the output is not quite correct, chec
Fix a typo in the test class of "GeometricDistribution" and ensure that a meaningful
tolerance value is used when comparing test results with expected values.
</action>
- <action dev="psteitz" type="add" issue="MATH-1034" due-to="Thorsten Schaefer">
+ <action dev="psteitz" type="add" issue="MATH-1034" due-to="Thorsten Schäfer">
Added exact binomial test implementation.
</action>
<action dev="tn" type="add" issue="MATH-1018" due-to="Ajo Fod">
Modified: commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/MultiKMeansPlusPlusClusterer.java
URL: http://svn.apache.org/viewvc/commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/MultiKMeansPlusPlusClusterer.java?rev=1542545&r1=1542544&r2=1542545&view=diff
==============================================================================
--- commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/MultiKMeansPlusPlusClusterer.java (original)
+++ commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/MultiKMeansPlusPlusClusterer.java Sat Nov 16 18:48:48 2013
@@ -22,7 +22,8 @@ import java.util.List;
import org.apache.commons.math3.exception.ConvergenceException;
import org.apache.commons.math3.exception.MathIllegalArgumentException;
-import org.apache.commons.math3.stat.descriptive.moment.Variance;
+import org.apache.commons.math3.ml.clustering.evaluation.ClusterEvaluator;
+import org.apache.commons.math3.ml.clustering.evaluation.SumOfClusterVariances;
/**
* A wrapper around a k-means++ clustering algorithm which performs multiple trials
@@ -39,15 +40,31 @@ public class MultiKMeansPlusPlusClustere
/** The number of trial runs. */
private final int numTrials;
+ /** The cluster evaluator to use. */
+ private final ClusterEvaluator<T> evaluator;
+
/** Build a clusterer.
* @param clusterer the k-means clusterer to use
* @param numTrials number of trial runs
*/
public MultiKMeansPlusPlusClusterer(final KMeansPlusPlusClusterer<T> clusterer,
final int numTrials) {
+ this(clusterer, numTrials, new SumOfClusterVariances<T>(clusterer.getDistanceMeasure()));
+ }
+
+ /** Build a clusterer.
+ * @param clusterer the k-means clusterer to use
+ * @param numTrials number of trial runs
+ * @param evaluator the cluster evaluator to use
+ * @since 3.3
+ */
+ public MultiKMeansPlusPlusClusterer(final KMeansPlusPlusClusterer<T> clusterer,
+ final int numTrials,
+ final ClusterEvaluator<T> evaluator) {
super(clusterer.getDistanceMeasure());
this.clusterer = clusterer;
this.numTrials = numTrials;
+ this.evaluator = evaluator;
}
/**
@@ -67,6 +84,15 @@ public class MultiKMeansPlusPlusClustere
}
/**
+ * Returns the {@link ClusterEvaluator} used to determine the "best" clustering.
+ * @return the used {@link ClusterEvaluator}
+ * @since 3.3
+ */
+ public ClusterEvaluator<T> getClusterEvaluator() {
+ return evaluator;
+ }
+
+ /**
* Runs the K-means++ clustering algorithm.
*
* @param points the points to cluster
@@ -92,22 +118,9 @@ public class MultiKMeansPlusPlusClustere
List<CentroidCluster<T>> clusters = clusterer.cluster(points);
// compute the variance of the current list
- double varianceSum = 0.0;
- for (final CentroidCluster<T> cluster : clusters) {
- if (!cluster.getPoints().isEmpty()) {
-
- // compute the distance variance of the current cluster
- final Clusterable center = cluster.getCenter();
- final Variance stat = new Variance();
- for (final T point : cluster.getPoints()) {
- stat.increment(distance(point, center));
- }
- varianceSum += stat.getResult();
-
- }
- }
+ final double varianceSum = evaluator.score(clusters);
- if (varianceSum <= bestVarianceSum) {
+ if (evaluator.isBetterScore(varianceSum, bestVarianceSum)) {
// this one is the best we have found so far, remember it
best = clusters;
bestVarianceSum = varianceSum;
Added: commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/evaluation/ClusterEvaluator.java
URL: http://svn.apache.org/viewvc/commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/evaluation/ClusterEvaluator.java?rev=1542545&view=auto
==============================================================================
--- commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/evaluation/ClusterEvaluator.java (added)
+++ commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/evaluation/ClusterEvaluator.java Sat Nov 16 18:48:48 2013
@@ -0,0 +1,123 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.commons.math3.ml.clustering.evaluation;
+
+import java.util.List;
+
+import org.apache.commons.math3.ml.clustering.CentroidCluster;
+import org.apache.commons.math3.ml.clustering.Cluster;
+import org.apache.commons.math3.ml.clustering.Clusterable;
+import org.apache.commons.math3.ml.clustering.DoublePoint;
+import org.apache.commons.math3.ml.distance.DistanceMeasure;
+import org.apache.commons.math3.ml.distance.EuclideanDistance;
+
+/**
+ * Base class for cluster evaluation methods.
+ *
+ * @param <T> type of the clustered points
+ * @version $Id$
+ * @since 3.3
+ */
+public abstract class ClusterEvaluator<T extends Clusterable> {
+
+ /** The distance measure to use when evaluating the cluster. */
+ private final DistanceMeasure measure;
+
+ /**
+ * Creates a new cluster evaluator with an {@link EuclideanDistance}
+ * as distance measure.
+ */
+ public ClusterEvaluator() {
+ this(new EuclideanDistance());
+ }
+
+ /**
+ * Creates a new cluster evaluator with the given distance measure.
+ * @param measure the distance measure to use
+ */
+ public ClusterEvaluator(final DistanceMeasure measure) {
+ this.measure = measure;
+ }
+
+ /**
+ * Computes the evaluation score for the given list of clusters.
+ * @param clusters the clusters to evaluate
+ * @return the computed score
+ */
+ public abstract double score(List<? extends Cluster<T>> clusters);
+
+ /**
+ * Returns whether the first evaluation score is considered to be better
+ * than the second one by this evaluator.
+ * <p>
+ * By default a smaller score is considered better. Specific implementations
+ * shall override this method if their scores follow a different ordering.
+ *
+ * @param score1 the first score
+ * @param score2 the second score
+ * @return {@code true} if the first score is considered to be better, {@code false} otherwise
+ */
+ public boolean isBetterScore(double score1, double score2) {
+ return score1 < score2;
+ }
+
+ /**
+ * Calculates the distance between two {@link Clusterable} instances
+ * with the configured {@link DistanceMeasure}.
+ *
+ * @param p1 the first clusterable
+ * @param p2 the second clusterable
+ * @return the distance between the two clusterables
+ */
+ protected double distance(final Clusterable p1, final Clusterable p2) {
+ return measure.compute(p1.getPoint(), p2.getPoint());
+ }
+
+ /**
+ * Computes the centroid for a cluster.
+ *
+ * @param cluster the cluster
+ * @return the computed centroid for the cluster,
+ * or {@code null} if the cluster does not contain any points
+ */
+ protected Clusterable centroidOf(final Cluster<T> cluster) {
+ final List<T> points = cluster.getPoints();
+ if (points.isEmpty()) {
+ return null;
+ }
+
+ // in case the cluster is of type CentroidCluster, no need to compute the centroid
+ if (cluster instanceof CentroidCluster) {
+ return ((CentroidCluster<T>) cluster).getCenter();
+ }
+
+ final int dimension = points.get(0).getPoint().length;
+ final double[] centroid = new double[dimension];
+ for (final T p : points) {
+ final double[] point = p.getPoint();
+ for (int i = 0; i < centroid.length; i++) {
+ centroid[i] += point[i];
+ }
+ }
+ for (int i = 0; i < centroid.length; i++) {
+ centroid[i] /= points.size();
+ }
+ return new DoublePoint(centroid);
+ }
+
+}
Propchange: commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/evaluation/ClusterEvaluator.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/evaluation/ClusterEvaluator.java
------------------------------------------------------------------------------
svn:keywords = Id Revision HeadURL
Propchange: commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/evaluation/ClusterEvaluator.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/evaluation/SumOfClusterVariances.java
URL: http://svn.apache.org/viewvc/commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/evaluation/SumOfClusterVariances.java?rev=1542545&view=auto
==============================================================================
--- commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/evaluation/SumOfClusterVariances.java (added)
+++ commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/evaluation/SumOfClusterVariances.java Sat Nov 16 18:48:48 2013
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.commons.math3.ml.clustering.evaluation;
+
+import java.util.List;
+
+import org.apache.commons.math3.ml.clustering.Cluster;
+import org.apache.commons.math3.ml.clustering.Clusterable;
+import org.apache.commons.math3.ml.distance.DistanceMeasure;
+import org.apache.commons.math3.stat.descriptive.moment.Variance;
+
+/**
+ * Computes the sum of intra-cluster distance variances according to the formula:
+ * <pre>
+ * \( score = \sum\limits_{i=1}^n \sigma_i^2 \)
+ * </pre>
+ * where n is the number of clusters and \( \sigma_i^2 \) is the variance of
+ * intra-cluster distances of cluster \( c_i \).
+ *
+ * @param <T> the type of the clustered points
+ * @version $Id$
+ * @since 3.3
+ */
+public class SumOfClusterVariances<T extends Clusterable> extends ClusterEvaluator<T> {
+
+ /**
+ * Creates a new evaluator with the given distance measure.
+ * @param measure the distance measure to use
+ */
+ public SumOfClusterVariances(final DistanceMeasure measure) {
+ super(measure);
+ }
+
+ @Override
+ public double score(final List<? extends Cluster<T>> clusters) {
+ double varianceSum = 0.0;
+ for (final Cluster<T> cluster : clusters) {
+ if (!cluster.getPoints().isEmpty()) {
+
+ final Clusterable center = centroidOf(cluster);
+
+ // compute the distance variance of the current cluster
+ final Variance stat = new Variance();
+ for (final T point : cluster.getPoints()) {
+ stat.increment(distance(point, center));
+ }
+ varianceSum += stat.getResult();
+
+ }
+ }
+ return varianceSum;
+ }
+
+}
Propchange: commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/evaluation/SumOfClusterVariances.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/evaluation/SumOfClusterVariances.java
------------------------------------------------------------------------------
svn:keywords = Id Revision HeadURL
Propchange: commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/evaluation/SumOfClusterVariances.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/evaluation/package-info.java
URL: http://svn.apache.org/viewvc/commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/evaluation/package-info.java?rev=1542545&view=auto
==============================================================================
--- commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/evaluation/package-info.java (added)
+++ commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/evaluation/package-info.java Sat Nov 16 18:48:48 2013
@@ -0,0 +1,20 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * Cluster evaluation methods.
+ */
+package org.apache.commons.math3.ml.clustering.evaluation;
Propchange: commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/evaluation/package-info.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/evaluation/package-info.java
------------------------------------------------------------------------------
svn:keywords = Id Revision HeadURL
Propchange: commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/evaluation/package-info.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: commons/proper/math/trunk/src/test/java/org/apache/commons/math3/ml/clustering/evaluation/SumOfClusterVariancesTest.java
URL: http://svn.apache.org/viewvc/commons/proper/math/trunk/src/test/java/org/apache/commons/math3/ml/clustering/evaluation/SumOfClusterVariancesTest.java?rev=1542545&view=auto
==============================================================================
--- commons/proper/math/trunk/src/test/java/org/apache/commons/math3/ml/clustering/evaluation/SumOfClusterVariancesTest.java (added)
+++ commons/proper/math/trunk/src/test/java/org/apache/commons/math3/ml/clustering/evaluation/SumOfClusterVariancesTest.java Sat Nov 16 18:48:48 2013
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.commons.math3.ml.clustering.evaluation;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.commons.math3.ml.clustering.Cluster;
+import org.apache.commons.math3.ml.clustering.DoublePoint;
+import org.apache.commons.math3.ml.distance.EuclideanDistance;
+import org.junit.Before;
+import org.junit.Test;
+
+public class SumOfClusterVariancesTest {
+
+ private ClusterEvaluator<DoublePoint> evaluator;
+
+ @Before
+ public void setUp() {
+ evaluator = new SumOfClusterVariances<DoublePoint>(new EuclideanDistance());
+ }
+
+ @Test
+ public void testScore() {
+ final DoublePoint[] points1 = new DoublePoint[] {
+ new DoublePoint(new double[] { 1 }),
+ new DoublePoint(new double[] { 2 }),
+ new DoublePoint(new double[] { 3 })
+ };
+
+ final DoublePoint[] points2 = new DoublePoint[] {
+ new DoublePoint(new double[] { 1 }),
+ new DoublePoint(new double[] { 5 }),
+ new DoublePoint(new double[] { 10 })
+ };
+
+ final List<Cluster<DoublePoint>> clusters = new ArrayList<Cluster<DoublePoint>>();
+
+ final Cluster<DoublePoint> cluster1 = new Cluster<DoublePoint>();
+ for (DoublePoint p : points1) {
+ cluster1.addPoint(p);
+ }
+ clusters.add(cluster1);
+
+ assertEquals(1.0/3.0, evaluator.score(clusters), 1e-6);
+
+ final Cluster<DoublePoint> cluster2 = new Cluster<DoublePoint>();
+ for (DoublePoint p : points2) {
+ cluster2.addPoint(p);
+ }
+ clusters.add(cluster2);
+
+ assertEquals(6.148148148, evaluator.score(clusters), 1e-6);
+ }
+
+ @Test
+ public void testOrdering() {
+ assertTrue(evaluator.isBetterScore(10, 20));
+ assertFalse(evaluator.isBetterScore(20, 1));
+ }
+}
Propchange: commons/proper/math/trunk/src/test/java/org/apache/commons/math3/ml/clustering/evaluation/SumOfClusterVariancesTest.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: commons/proper/math/trunk/src/test/java/org/apache/commons/math3/ml/clustering/evaluation/SumOfClusterVariancesTest.java
------------------------------------------------------------------------------
svn:keywords = Id Revision HeadURL
Propchange: commons/proper/math/trunk/src/test/java/org/apache/commons/math3/ml/clustering/evaluation/SumOfClusterVariancesTest.java
------------------------------------------------------------------------------
svn:mime-type = text/plain