You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by tn...@apache.org on 2013/11/16 19:48:48 UTC

svn commit: r1542545 - in /commons/proper/math/trunk/src: changes/ main/java/org/apache/commons/math3/ml/clustering/ main/java/org/apache/commons/math3/ml/clustering/evaluation/ test/java/org/apache/commons/math3/ml/clustering/evaluation/

Author: tn
Date: Sat Nov 16 18:48:48 2013
New Revision: 1542545

URL: http://svn.apache.org/r1542545
Log:
[MATH-1031] Added new ClusterEvaluator base class and refactored code in MultiKMeansPlusPlusClusterer.

Added:
    commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/evaluation/
    commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/evaluation/ClusterEvaluator.java   (with props)
    commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/evaluation/SumOfClusterVariances.java   (with props)
    commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/evaluation/package-info.java   (with props)
    commons/proper/math/trunk/src/test/java/org/apache/commons/math3/ml/clustering/evaluation/
    commons/proper/math/trunk/src/test/java/org/apache/commons/math3/ml/clustering/evaluation/SumOfClusterVariancesTest.java   (with props)
Modified:
    commons/proper/math/trunk/src/changes/changes.xml
    commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/MultiKMeansPlusPlusClusterer.java

Modified: commons/proper/math/trunk/src/changes/changes.xml
URL: http://svn.apache.org/viewvc/commons/proper/math/trunk/src/changes/changes.xml?rev=1542545&r1=1542544&r2=1542545&view=diff
==============================================================================
--- commons/proper/math/trunk/src/changes/changes.xml (original)
+++ commons/proper/math/trunk/src/changes/changes.xml Sat Nov 16 18:48:48 2013
@@ -51,6 +51,11 @@ If the output is not quite correct, chec
   </properties>
   <body>
     <release version="3.3" date="TBD" description="TBD">
+      <action dev="tn" type="update" issue="MATH-1031" due-to="Thorsten Schäfer">
+        Added new class "ClusterEvaluator" to evaluate the result of a clustering algorithm
+        and refactored existing evaluation code in "MultiKMeansPlusPlusClusterer"
+        into separate class "SumOfClusterVariances".
+      </action>
       <action dev="psteitz" type="add" issue="MATH-1061">
         Added InsufficientDataException.
       </action>
@@ -96,7 +101,7 @@ If the output is not quite correct, chec
         Added logDensity methods to AbstractReal/IntegerDistribution with naive default
         implementations and improved implementations for some current distributions.
       </action>
-      <action dev="psteitz" type="add" issue="MATH-1038" due-to="Thorsten Schaefer">
+      <action dev="psteitz" type="add" issue="MATH-1038" due-to="Thorsten Schäfer">
         Added ConfidenceInterval class and BinomialConfidenceInterval providing several
         estimators for confidence intervals for binomial probabilities.
       </action>
@@ -127,7 +132,7 @@ If the output is not quite correct, chec
         Fix a typo in the test class of "GeometricDistribution" and ensure that a meaningful
         tolerance value is used when comparing test results with expected values.
       </action>
-      <action dev="psteitz" type="add" issue="MATH-1034" due-to="Thorsten Schaefer">
+      <action dev="psteitz" type="add" issue="MATH-1034" due-to="Thorsten Schäfer">
         Added exact binomial test implementation.
       </action>
       <action dev="tn" type="add" issue="MATH-1018" due-to="Ajo Fod">

Modified: commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/MultiKMeansPlusPlusClusterer.java
URL: http://svn.apache.org/viewvc/commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/MultiKMeansPlusPlusClusterer.java?rev=1542545&r1=1542544&r2=1542545&view=diff
==============================================================================
--- commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/MultiKMeansPlusPlusClusterer.java (original)
+++ commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/MultiKMeansPlusPlusClusterer.java Sat Nov 16 18:48:48 2013
@@ -22,7 +22,8 @@ import java.util.List;
 
 import org.apache.commons.math3.exception.ConvergenceException;
 import org.apache.commons.math3.exception.MathIllegalArgumentException;
-import org.apache.commons.math3.stat.descriptive.moment.Variance;
+import org.apache.commons.math3.ml.clustering.evaluation.ClusterEvaluator;
+import org.apache.commons.math3.ml.clustering.evaluation.SumOfClusterVariances;
 
 /**
  * A wrapper around a k-means++ clustering algorithm which performs multiple trials
@@ -39,15 +40,31 @@ public class MultiKMeansPlusPlusClustere
     /** The number of trial runs. */
     private final int numTrials;
 
+    /** The cluster evaluator to use. */
+    private final ClusterEvaluator<T> evaluator;
+
     /** Build a clusterer.
      * @param clusterer the k-means clusterer to use
      * @param numTrials number of trial runs
      */
     public MultiKMeansPlusPlusClusterer(final KMeansPlusPlusClusterer<T> clusterer,
                                         final int numTrials) {
+        this(clusterer, numTrials, new SumOfClusterVariances<T>(clusterer.getDistanceMeasure()));
+    }
+
+    /** Build a clusterer.
+     * @param clusterer the k-means clusterer to use
+     * @param numTrials number of trial runs
+     * @param evaluator the cluster evaluator to use
+     * @since 3.3
+     */
+    public MultiKMeansPlusPlusClusterer(final KMeansPlusPlusClusterer<T> clusterer,
+                                        final int numTrials,
+                                        final ClusterEvaluator<T> evaluator) {
         super(clusterer.getDistanceMeasure());
         this.clusterer = clusterer;
         this.numTrials = numTrials;
+        this.evaluator = evaluator;
     }
 
     /**
@@ -67,6 +84,15 @@ public class MultiKMeansPlusPlusClustere
     }
 
     /**
+     * Returns the {@link ClusterEvaluator} used to determine the "best" clustering.
+     * @return the used {@link ClusterEvaluator}
+     * @since 3.3
+     */
+    public ClusterEvaluator<T> getClusterEvaluator() {
+       return evaluator;
+    }
+
+    /**
      * Runs the K-means++ clustering algorithm.
      *
      * @param points the points to cluster
@@ -92,22 +118,9 @@ public class MultiKMeansPlusPlusClustere
             List<CentroidCluster<T>> clusters = clusterer.cluster(points);
 
             // compute the variance of the current list
-            double varianceSum = 0.0;
-            for (final CentroidCluster<T> cluster : clusters) {
-                if (!cluster.getPoints().isEmpty()) {
-
-                    // compute the distance variance of the current cluster
-                    final Clusterable center = cluster.getCenter();
-                    final Variance stat = new Variance();
-                    for (final T point : cluster.getPoints()) {
-                        stat.increment(distance(point, center));
-                    }
-                    varianceSum += stat.getResult();
-
-                }
-            }
+            final double varianceSum = evaluator.score(clusters);
 
-            if (varianceSum <= bestVarianceSum) {
+            if (evaluator.isBetterScore(varianceSum, bestVarianceSum)) {
                 // this one is the best we have found so far, remember it
                 best            = clusters;
                 bestVarianceSum = varianceSum;

Added: commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/evaluation/ClusterEvaluator.java
URL: http://svn.apache.org/viewvc/commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/evaluation/ClusterEvaluator.java?rev=1542545&view=auto
==============================================================================
--- commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/evaluation/ClusterEvaluator.java (added)
+++ commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/evaluation/ClusterEvaluator.java Sat Nov 16 18:48:48 2013
@@ -0,0 +1,123 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.commons.math3.ml.clustering.evaluation;
+
+import java.util.List;
+
+import org.apache.commons.math3.ml.clustering.CentroidCluster;
+import org.apache.commons.math3.ml.clustering.Cluster;
+import org.apache.commons.math3.ml.clustering.Clusterable;
+import org.apache.commons.math3.ml.clustering.DoublePoint;
+import org.apache.commons.math3.ml.distance.DistanceMeasure;
+import org.apache.commons.math3.ml.distance.EuclideanDistance;
+
+/**
+ * Base class for cluster evaluation methods.
+ *
+ * @param <T> type of the clustered points
+ * @version $Id$
+ * @since 3.3
+ */
+public abstract class ClusterEvaluator<T extends Clusterable> {
+
+    /** The distance measure to use when evaluating the cluster. */
+    private final DistanceMeasure measure;
+
+    /**
+     * Creates a new cluster evaluator with an {@link EuclideanDistance}
+     * as distance measure.
+     */
+    public ClusterEvaluator() {
+        this(new EuclideanDistance());
+    }
+
+    /**
+     * Creates a new cluster evaluator with the given distance measure.
+     * @param measure the distance measure to use
+     */
+    public ClusterEvaluator(final DistanceMeasure measure) {
+        this.measure = measure;
+    }
+
+    /**
+     * Computes the evaluation score for the given list of clusters.
+     * @param clusters the clusters to evaluate
+     * @return the computed score
+     */
+    public abstract double score(List<? extends Cluster<T>> clusters);
+
+    /**
+     * Returns whether the first evaluation score is considered to be better
+     * than the second one by this evaluator.
+     * <p>
+     * This default implementation treats a strictly smaller score as better;
+     * implementations whose scores follow another ordering shall override it.
+     *
+     * @param score1 the first score
+     * @param score2 the second score
+     * @return {@code true} if the first score is considered to be better, {@code false} otherwise
+     */
+    public boolean isBetterScore(double score1, double score2) {
+        return score1 < score2;
+    }
+
+    /**
+     * Calculates the distance between two {@link Clusterable} instances
+     * with the configured {@link DistanceMeasure}.
+     *
+     * @param p1 the first clusterable
+     * @param p2 the second clusterable
+     * @return the distance between the two clusterables
+     */
+    protected double distance(final Clusterable p1, final Clusterable p2) {
+        return measure.compute(p1.getPoint(), p2.getPoint());
+    }
+
+    /**
+     * Computes the centroid for a cluster: the stored center of a
+     * {@link CentroidCluster}, otherwise the mean of the cluster's points.
+     * @param cluster the cluster
+     * @return the computed centroid for the cluster,
+     * or {@code null} if the cluster does not contain any points
+     */
+    protected Clusterable centroidOf(final Cluster<T> cluster) {
+        final List<T> points = cluster.getPoints();
+        if (points.isEmpty()) {
+            return null;
+        }
+
+        // in case the cluster is of type CentroidCluster, no need to compute the centroid
+        if (cluster instanceof CentroidCluster) {
+            return ((CentroidCluster<T>) cluster).getCenter();
+        }
+        // otherwise average the points coordinate-wise (dimension taken from the first point)
+        final int dimension = points.get(0).getPoint().length;
+        final double[] centroid = new double[dimension];
+        for (final T p : points) {
+            final double[] point = p.getPoint();
+            for (int i = 0; i < centroid.length; i++) {
+                centroid[i] += point[i];
+            }
+        }
+        for (int i = 0; i < centroid.length; i++) {
+            centroid[i] /= points.size();
+        }
+        return new DoublePoint(centroid);
+    }
+
+}

Propchange: commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/evaluation/ClusterEvaluator.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/evaluation/ClusterEvaluator.java
------------------------------------------------------------------------------
    svn:keywords = Id Revision HeadURL

Propchange: commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/evaluation/ClusterEvaluator.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/evaluation/SumOfClusterVariances.java
URL: http://svn.apache.org/viewvc/commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/evaluation/SumOfClusterVariances.java?rev=1542545&view=auto
==============================================================================
--- commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/evaluation/SumOfClusterVariances.java (added)
+++ commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/evaluation/SumOfClusterVariances.java Sat Nov 16 18:48:48 2013
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.commons.math3.ml.clustering.evaluation;
+
+import java.util.List;
+
+import org.apache.commons.math3.ml.clustering.Cluster;
+import org.apache.commons.math3.ml.clustering.Clusterable;
+import org.apache.commons.math3.ml.distance.DistanceMeasure;
+import org.apache.commons.math3.stat.descriptive.moment.Variance;
+
+/**
+ * Computes the sum of intra-cluster distance variances according to the formula:
+ * <pre>
+ * \( score = \sum\limits_{i=1}^n \sigma_i^2 \)
+ * </pre>
+ * where n is the number of clusters and \( \sigma_i^2 \) is the variance of
+ * intra-cluster distances of cluster \( c_i \).
+ *
+ * @param <T> the type of the clustered points
+ * @version $Id$
+ * @since 3.3
+ */
+public class SumOfClusterVariances<T extends Clusterable> extends ClusterEvaluator<T> {
+
+    /**
+     * Creates a new evaluator that uses the given distance measure.
+     * @param measure the distance measure to use
+     */
+    public SumOfClusterVariances(final DistanceMeasure measure) {
+        super(measure);
+    }
+
+    @Override
+    public double score(final List<? extends Cluster<T>> clusters) {
+        double varianceSum = 0.0;
+        for (final Cluster<T> cluster : clusters) {
+            if (!cluster.getPoints().isEmpty()) {
+                // distances are measured from each point to the cluster's centroid
+                final Clusterable center = centroidOf(cluster);
+
+                // compute the distance variance of the current cluster
+                final Variance stat = new Variance();
+                for (final T point : cluster.getPoints()) {
+                    stat.increment(distance(point, center));
+                }
+                varianceSum += stat.getResult();
+
+            }
+        }
+        return varianceSum;
+    }
+
+}

Propchange: commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/evaluation/SumOfClusterVariances.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/evaluation/SumOfClusterVariances.java
------------------------------------------------------------------------------
    svn:keywords = Id Revision HeadURL

Propchange: commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/evaluation/SumOfClusterVariances.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/evaluation/package-info.java
URL: http://svn.apache.org/viewvc/commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/evaluation/package-info.java?rev=1542545&view=auto
==============================================================================
--- commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/evaluation/package-info.java (added)
+++ commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/evaluation/package-info.java Sat Nov 16 18:48:48 2013
@@ -0,0 +1,20 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * Cluster evaluation methods.
+ */
+package org.apache.commons.math3.ml.clustering.evaluation;

Propchange: commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/evaluation/package-info.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/evaluation/package-info.java
------------------------------------------------------------------------------
    svn:keywords = Id Revision HeadURL

Propchange: commons/proper/math/trunk/src/main/java/org/apache/commons/math3/ml/clustering/evaluation/package-info.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: commons/proper/math/trunk/src/test/java/org/apache/commons/math3/ml/clustering/evaluation/SumOfClusterVariancesTest.java
URL: http://svn.apache.org/viewvc/commons/proper/math/trunk/src/test/java/org/apache/commons/math3/ml/clustering/evaluation/SumOfClusterVariancesTest.java?rev=1542545&view=auto
==============================================================================
--- commons/proper/math/trunk/src/test/java/org/apache/commons/math3/ml/clustering/evaluation/SumOfClusterVariancesTest.java (added)
+++ commons/proper/math/trunk/src/test/java/org/apache/commons/math3/ml/clustering/evaluation/SumOfClusterVariancesTest.java Sat Nov 16 18:48:48 2013
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.commons.math3.ml.clustering.evaluation;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.commons.math3.ml.clustering.Cluster;
+import org.apache.commons.math3.ml.clustering.DoublePoint;
+import org.apache.commons.math3.ml.distance.EuclideanDistance;
+import org.junit.Before;
+import org.junit.Test;
+
+public class SumOfClusterVariancesTest {
+    /** Evaluator under test, recreated before each test with a Euclidean distance measure. */
+    private ClusterEvaluator<DoublePoint> evaluator;
+
+    @Before
+    public void setUp() {
+        evaluator = new SumOfClusterVariances<DoublePoint>(new EuclideanDistance());
+    }
+
+    @Test
+    public void testScore() {
+        final DoublePoint[] points1 = new DoublePoint[] {
+                new DoublePoint(new double[] { 1 }),
+                new DoublePoint(new double[] { 2 }),
+                new DoublePoint(new double[] { 3 })
+        };
+
+        final DoublePoint[] points2 = new DoublePoint[] {
+                new DoublePoint(new double[] { 1 }),
+                new DoublePoint(new double[] { 5 }),
+                new DoublePoint(new double[] { 10 })
+        };
+
+        final List<Cluster<DoublePoint>> clusters = new ArrayList<Cluster<DoublePoint>>();
+        // first cluster {1, 2, 3}: centroid is 2, so the distances to it are 1, 0, 1
+        final Cluster<DoublePoint> cluster1 = new Cluster<DoublePoint>();
+        for (DoublePoint p : points1) {
+            cluster1.addPoint(p);
+        }
+        clusters.add(cluster1);
+
+        assertEquals(1.0/3.0, evaluator.score(clusters), 1e-6);
+        // second cluster {1, 5, 10}: its distance variance is added to the first cluster's
+        final Cluster<DoublePoint> cluster2 = new Cluster<DoublePoint>();
+        for (DoublePoint p : points2) {
+            cluster2.addPoint(p);
+        }
+        clusters.add(cluster2);
+
+        assertEquals(6.148148148, evaluator.score(clusters), 1e-6);
+    }
+    
+    @Test
+    public void testOrdering() {
+        assertTrue(evaluator.isBetterScore(10, 20));
+        assertFalse(evaluator.isBetterScore(20, 1));
+    }
+}

Propchange: commons/proper/math/trunk/src/test/java/org/apache/commons/math3/ml/clustering/evaluation/SumOfClusterVariancesTest.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: commons/proper/math/trunk/src/test/java/org/apache/commons/math3/ml/clustering/evaluation/SumOfClusterVariancesTest.java
------------------------------------------------------------------------------
    svn:keywords = Id Revision HeadURL

Propchange: commons/proper/math/trunk/src/test/java/org/apache/commons/math3/ml/clustering/evaluation/SumOfClusterVariancesTest.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain