You are viewing a plain-text version of this content; the hyperlink to the canonical version was not preserved in this copy.
Posted to commits@mahout.apache.org by df...@apache.org on 2013/06/16 19:04:09 UTC

svn commit: r1493533 - in /mahout/trunk: CHANGELOG core/src/main/java/org/apache/mahout/clustering/streaming/cluster/BallKMeans.java

Author: dfilimon
Date: Sun Jun 16 17:04:08 2013
New Revision: 1493533

URL: http://svn.apache.org/r1493533
Log:
MAHOUT-1255: Fix for weights in Multinomial sometimes overflowing in BallKMeans


Modified:
    mahout/trunk/CHANGELOG
    mahout/trunk/core/src/main/java/org/apache/mahout/clustering/streaming/cluster/BallKMeans.java

Modified: mahout/trunk/CHANGELOG
URL: http://svn.apache.org/viewvc/mahout/trunk/CHANGELOG?rev=1493533&r1=1493532&r2=1493533&view=diff
==============================================================================
--- mahout/trunk/CHANGELOG (original)
+++ mahout/trunk/CHANGELOG Sun Jun 16 17:04:08 2013
@@ -2,6 +2,8 @@ Mahout Change Log
 
 Release 0.8 - unreleased
 
+  MAHOUT-1255: Fix for weights in Multinomial sometimes overflowing in BallKMeans (dfilimon)
+
   MAHOUT-1254: Final round of cleanup for StreamingKMeans (dfilimon)
 
   MAHOUT-1263: Serialise/Deserialise Lambda value for OnlineLogisticRegression (Mike Davy via smarthi)  

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/streaming/cluster/BallKMeans.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/streaming/cluster/BallKMeans.java?rev=1493533&r1=1493532&r2=1493533&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/streaming/cluster/BallKMeans.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/streaming/cluster/BallKMeans.java Sun Jun 16 17:04:08 2013
@@ -281,11 +281,11 @@ public class BallKMeans implements Itera
    * @param datapoints The datapoints to select from.  These datapoints should be WeightedVectors of some kind.
    */
   private void initializeSeedsKMeansPlusPlus(List<? extends WeightedVector> datapoints) {
-    Preconditions.checkArgument(datapoints.size() > 1, "Must have at least two datapoints points to cluster "
-        + "sensibly");
+    Preconditions.checkArgument(datapoints.size() > 1, "Must have at least two datapoints points to cluster " +
+        "sensibly");
     Preconditions.checkArgument(datapoints.size() >= numClusters,
         String.format("Must have more datapoints [%d] than clusters [%d]", datapoints.size(), numClusters));
-    // Compute the centroid of all of the datapoints. This is then used to compute the squared radius of the datapoints.
+    // Compute the centroid of all of the datapoints.  This is then used to compute the squared radius of the datapoints.
     Centroid center = new Centroid(datapoints.iterator().next());
     for (WeightedVector row : Iterables.skip(datapoints, 1)) {
       center.update(row);
@@ -330,7 +330,7 @@ public class BallKMeans implements Itera
     // set to the squared distance from c_1
     for (int i = 0; i < datapoints.size(); ++i) {
       WeightedVector row = datapoints.get(i);
-      final double w = distanceMeasure.distance(c_1, row) * row.getWeight();
+      final double w = distanceMeasure.distance(c_1, row) * 2 * Math.log(1 + row.getWeight());
       seedSelector.set(i, w);
     }
 
@@ -446,8 +446,8 @@ public class BallKMeans implements Itera
     return Iterators.transform(centroids.iterator(), new Function<Vector, Centroid>() {
       @Override
       public Centroid apply(Vector input) {
-        Preconditions.checkArgument(input instanceof Centroid, "Non-centroid in centroids "
-            + "searcher");
+        Preconditions.checkArgument(input instanceof Centroid, "Non-centroid in centroids " +
+            "searcher");
         //noinspection ConstantConditions
         return (Centroid)input;
       }