You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by lu...@apache.org on 2011/06/20 21:48:44 UTC

svn commit: r1137759 - in /commons/proper/math/trunk/src: main/java/org/apache/commons/math/stat/clustering/KMeansPlusPlusClusterer.java site/xdoc/changes.xml test/java/org/apache/commons/math/stat/clustering/KMeansPlusPlusClustererTest.java

Author: luc
Date: Mon Jun 20 19:48:44 2011
New Revision: 1137759

URL: http://svn.apache.org/viewvc?rev=1137759&view=rev
Log:
added multiple trials runs to K-means++ clustering algorithm.

JIRA: MATH-548

Modified:
    commons/proper/math/trunk/src/main/java/org/apache/commons/math/stat/clustering/KMeansPlusPlusClusterer.java
    commons/proper/math/trunk/src/site/xdoc/changes.xml
    commons/proper/math/trunk/src/test/java/org/apache/commons/math/stat/clustering/KMeansPlusPlusClustererTest.java

Modified: commons/proper/math/trunk/src/main/java/org/apache/commons/math/stat/clustering/KMeansPlusPlusClusterer.java
URL: http://svn.apache.org/viewvc/commons/proper/math/trunk/src/main/java/org/apache/commons/math/stat/clustering/KMeansPlusPlusClusterer.java?rev=1137759&r1=1137758&r2=1137759&view=diff
==============================================================================
--- commons/proper/math/trunk/src/main/java/org/apache/commons/math/stat/clustering/KMeansPlusPlusClusterer.java (original)
+++ commons/proper/math/trunk/src/main/java/org/apache/commons/math/stat/clustering/KMeansPlusPlusClusterer.java Mon Jun 20 19:48:44 2011
@@ -96,6 +96,60 @@ public class KMeansPlusPlusClusterer<T e
      *     of clusters is larger than the number of data points
      */
     public List<Cluster<T>> cluster(final Collection<T> points, final int k,
+                                    int numTrials, int maxIterationsPerTrial)
+        throws MathIllegalArgumentException {
+
+        // at first, we have not found any clusters list yet
+        List<Cluster<T>> best = null;
+        double bestVarianceSum = Double.POSITIVE_INFINITY;
+
+        // do several clustering trials
+        for (int i = 0; i < numTrials; ++i) {
+
+            // compute a clusters list
+            List<Cluster<T>> clusters = cluster(points, k, maxIterationsPerTrial);
+
+            // compute the variance of the current list
+            double varianceSum = 0.0;
+            for (final Cluster<T> cluster : clusters) {
+                if (!cluster.getPoints().isEmpty()) {
+
+                    // compute the distance variance of the current cluster
+                    final T center = cluster.getCenter();
+                    final Variance stat = new Variance();
+                    for (final T point : cluster.getPoints()) {
+                        stat.increment(point.distanceFrom(center));
+                    }
+                    varianceSum += stat.getResult();
+
+                }
+            }
+
+            if (varianceSum <= bestVarianceSum) {
+                // this one is the best we have found so far, remember it
+                best            = clusters;
+                bestVarianceSum = varianceSum;
+            }
+
+        }
+
+        // return the best clusters list found
+        return best;
+
+    }
+
+    /**
+     * Runs the K-means++ clustering algorithm.
+     *
+     * @param points the points to cluster
+     * @param k the number of clusters to split the data into
+     * @param maxIterations the maximum number of iterations to run the algorithm
+     *     for.  If negative, no maximum will be used
+     * @return a list of clusters containing the points
+     * @throws MathIllegalArgumentException if the data points are null or the number
+     *     of clusters is larger than the number of data points
+     */
+    public List<Cluster<T>> cluster(final Collection<T> points, final int k,
                                     final int maxIterations)
         throws MathIllegalArgumentException {
 

Modified: commons/proper/math/trunk/src/site/xdoc/changes.xml
URL: http://svn.apache.org/viewvc/commons/proper/math/trunk/src/site/xdoc/changes.xml?rev=1137759&r1=1137758&r2=1137759&view=diff
==============================================================================
--- commons/proper/math/trunk/src/site/xdoc/changes.xml (original)
+++ commons/proper/math/trunk/src/site/xdoc/changes.xml Mon Jun 20 19:48:44 2011
@@ -52,6 +52,9 @@ The <action> type attribute can be add,u
     If the output is not quite correct, check for invisible trailing spaces!
      -->
     <release version="3.0" date="TBD" description="TBD">
+      <action dev="luc" type="add" issue="MATH-548">
+        K-means++ clustering can now run multiple trials
+      </action>
       <action dev="luc" type="add" issue="MATH-591">
         Added a way to compute sub-lines intersections, considering sub-lines either
         as open sets or closed sets

Modified: commons/proper/math/trunk/src/test/java/org/apache/commons/math/stat/clustering/KMeansPlusPlusClustererTest.java
URL: http://svn.apache.org/viewvc/commons/proper/math/trunk/src/test/java/org/apache/commons/math/stat/clustering/KMeansPlusPlusClustererTest.java?rev=1137759&r1=1137758&r2=1137759&view=diff
==============================================================================
--- commons/proper/math/trunk/src/test/java/org/apache/commons/math/stat/clustering/KMeansPlusPlusClustererTest.java (original)
+++ commons/proper/math/trunk/src/test/java/org/apache/commons/math/stat/clustering/KMeansPlusPlusClustererTest.java Mon Jun 20 19:48:44 2011
@@ -65,7 +65,7 @@ public class KMeansPlusPlusClustererTest
 
         };
         List<Cluster<EuclideanIntegerPoint>> clusters =
-            transformer.cluster(Arrays.asList(points), 3, 10);
+            transformer.cluster(Arrays.asList(points), 3, 5, 10);
 
         Assert.assertEquals(3, clusters.size());
         boolean cluster1Found = false;