You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by is...@apache.org on 2009/12/10 12:02:01 UTC
svn commit: r889184 - in /lucene/mahout/trunk/core/src:
main/java/org/apache/mahout/clustering/
main/java/org/apache/mahout/clustering/canopy/
main/java/org/apache/mahout/clustering/fuzzykmeans/
main/java/org/apache/mahout/clustering/kmeans/ main/java/...
Author: isabel
Date: Thu Dec 10 11:01:28 2009
New Revision: 889184
URL: http://svn.apache.org/viewvc?rev=889184&view=rev
Log:
MAHOUT-11 - missing files :/
Added:
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyClusterer.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyConfigKeys.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansClusterer.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansConfigKeys.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterer.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansConfigKeys.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/package.html
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyClusterer.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyConfigKeys.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/package.html
lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestRandomSeedGenerator.java
Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyClusterer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyClusterer.java?rev=889184&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyClusterer.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyClusterer.java Thu Dec 10 11:01:28 2009
@@ -0,0 +1,181 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.canopy;
+
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.matrix.Vector;
+
+public class CanopyClusterer {
+
+ private int nextCanopyId;
+
+ // the T1 distance threshold
+ private double t1;
+
+ // the T2 distance threshold
+ private double t2;
+
+ // the distance measure
+ private DistanceMeasure measure;
+
+ private int nextClusterId = 0;
+
+ public CanopyClusterer(final DistanceMeasure measure, final double t1, final double t2) {
+ this.t1 = t1;
+ this.t2 = t2;
+ this.measure = measure;
+ }
+
+ public CanopyClusterer(JobConf job) {
+ this.configure(job);
+ }
+
+ /**
+ * Configure the Canopy and its distance measure
+ *
+ * @param job the JobConf for this job
+ */
+ public void configure(JobConf job) {
+ try {
+ ClassLoader ccl = Thread.currentThread().getContextClassLoader();
+ Class<?> cl = ccl.loadClass(job
+ .get(CanopyConfigKeys.DISTANCE_MEASURE_KEY));
+ measure = (DistanceMeasure) cl.newInstance();
+ measure.configure(job);
+ } catch (ClassNotFoundException e) {
+ throw new IllegalStateException(e);
+ } catch (IllegalAccessException e) {
+ throw new IllegalStateException(e);
+ } catch (InstantiationException e) {
+ throw new IllegalStateException(e);
+ }
+ t1 = Double.parseDouble(job.get(CanopyConfigKeys.T1_KEY));
+ t2 = Double.parseDouble(job.get(CanopyConfigKeys.T2_KEY));
+ nextCanopyId = 0;
+ }
+
+ /** Configure the Canopy for unit tests */
+ public void config(DistanceMeasure aMeasure, double aT1, double aT2) {
+ measure = aMeasure;
+ t1 = aT1;
+ t2 = aT2;
+ }
+
+ /**
+ * This is the same algorithm as the reference but inverted to iterate over
+ * existing canopies instead of the points. Because of this it does not need
+ * to actually store the points, instead storing a total points vector and the
+ * number of points. From this a centroid can be computed.
+ * <p/>
+ * This method is used by the CanopyReducer.
+ *
+ * @param point the point to be added
+ * @param canopies the List<Canopy> to be appended
+ */
+ public void addPointToCanopies(Vector point, List<Canopy> canopies) {
+ boolean pointStronglyBound = false;
+ for (Canopy canopy : canopies) {
+ double dist = measure.distance(canopy.getCenter().getLengthSquared(),
+ canopy.getCenter(), point);
+ if (dist < t1) {
+ canopy.addPoint(point);
+ }
+ pointStronglyBound = pointStronglyBound || (dist < t2);
+ }
+ if (!pointStronglyBound) {
+ canopies.add(new Canopy(point, nextCanopyId++));
+ }
+ }
+
+ /**
+ * This method is used by the CanopyMapper to perform canopy inclusion tests
+ * and to emit the point and its covering canopies to the output. The
+ * CanopyCombiner will then sum the canopy points and produce the centroids.
+ *
+ * @param point the point to be added
+ * @param canopies the List<Canopy> to be appended
+ * @param collector an OutputCollector in which to emit the point
+ */
+ public void emitPointToNewCanopies(Vector point, List<Canopy> canopies,
+ OutputCollector<Text, Vector> collector) throws IOException {
+ boolean pointStronglyBound = false;
+ for (Canopy canopy : canopies) {
+ double dist = measure.distance(canopy.getCenter().getLengthSquared(),
+ canopy.getCenter(), point);
+ if (dist < t1) {
+ canopy.emitPoint(point, collector);
+ }
+ pointStronglyBound = pointStronglyBound || (dist < t2);
+ }
+ if (!pointStronglyBound) {
+ Canopy canopy = new Canopy(point, nextCanopyId++);
+ canopies.add(canopy);
+ canopy.emitPoint(point, collector);
+ }
+ }
+
+ /**
+ * This method is used by the CanopyMapper to perform canopy inclusion tests
+ * and to emit the point keyed by its covering canopies to the output. if the
+ * point is not covered by any canopies (due to canopy centroid clustering),
+ * emit the point to the closest covering canopy.
+ *
+ * @param point the point to be added
+ * @param canopies the List<Canopy> to be appended
+ * @param collector an OutputCollector in which to emit the point
+ */
+ public void emitPointToExistingCanopies(Vector point, List<Canopy> canopies,
+ OutputCollector<Text, Vector> collector) throws IOException {
+ double minDist = Double.MAX_VALUE;
+ Canopy closest = null;
+ boolean isCovered = false;
+ for (Canopy canopy : canopies) {
+ double dist = measure.distance(canopy.getCenter().getLengthSquared(),
+ canopy.getCenter(), point);
+ if (dist < t1) {
+ isCovered = true;
+ collector.collect(new Text(canopy.getIdentifier()), point);
+ } else if (dist < minDist) {
+ minDist = dist;
+ closest = canopy;
+ }
+ }
+ // if the point is not contained in any canopies (due to canopy centroid
+ // clustering), emit the point to the closest covering canopy.
+ if (!isCovered) {
+ collector.collect(new Text(closest.getIdentifier()), point);
+ }
+ }
+
+ /**
+ * Return if the point is covered by the canopy
+ *
+ * @param point a point
+ * @return if the point is covered
+ */
+ public boolean canopyCovers(Canopy canopy, Vector point) {
+ return measure.distance(canopy.getCenter().getLengthSquared(),
+ canopy.getCenter(), point) < t1;
+ }
+}
Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyConfigKeys.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyConfigKeys.java?rev=889184&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyConfigKeys.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyConfigKeys.java Thu Dec 10 11:01:28 2009
@@ -0,0 +1,11 @@
+package org.apache.mahout.clustering.canopy;
+
+public class CanopyConfigKeys {
+
+ public static final String T1_KEY = "org.apache.mahout.clustering.canopy.t1";
+ public static final String CANOPY_PATH_KEY = "org.apache.mahout.clustering.canopy.path";
+ public static final String T2_KEY = "org.apache.mahout.clustering.canopy.t2";
+ // keys used by Driver, Mapper, Combiner & Reducer
+ public static final String DISTANCE_MEASURE_KEY = "org.apache.mahout.clustering.canopy.measure";
+
+}
Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansClusterer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansClusterer.java?rev=889184&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansClusterer.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansClusterer.java Thu Dec 10 11:01:28 2009
@@ -0,0 +1,183 @@
+package org.apache.mahout.clustering.fuzzykmeans;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.matrix.Vector;
+
+public class FuzzyKMeansClusterer {
+
+
+ private static final double MINIMAL_VALUE = 0.0000000001; // using it for
+ // adding
+ // exception
+ // this value to any
+ // zero valued
+ // variable to avoid
+ // divide by Zero
+
+ private int nextClusterId = 0;
+
+ private DistanceMeasure measure;
+
+ private double convergenceDelta = 0;
+
+ private double m = 2.0; // default value
+
+ /**
+ * Init the fuzzy k-means clusterer with the distance measure to use for comparison.
+ *
+ * @param measure
+ * The distance measure to use for comparing clusters against points.
+ * @param convergenceDelta
+ * When do we define a cluster to have converged?
+ *
+ * */
+ public FuzzyKMeansClusterer(final DistanceMeasure measure, double convergenceDelta, double m) {
+ this.measure = measure;
+ this.convergenceDelta = convergenceDelta;
+ this.m = m;
+ }
+
+ public FuzzyKMeansClusterer(JobConf job) {
+ this.configure(job);
+ }
+
+ /**
+ * Configure the distance measure directly. Used by unit tests.
+ *
+ * @param aMeasure the DistanceMeasure
+ * @param aConvergenceDelta the delta value used to define convergence
+ */
+ private void config(DistanceMeasure aMeasure, double aConvergenceDelta) {
+ measure = aMeasure;
+ convergenceDelta = aConvergenceDelta;
+ nextClusterId = 0;
+ }
+
+ /**
+ * Configure the distance measure from the job
+ *
+ * @param job the JobConf for the job
+ */
+ private void configure(JobConf job) {
+ try {
+ ClassLoader ccl = Thread.currentThread().getContextClassLoader();
+ Class<?> cl = ccl.loadClass(job.get(FuzzyKMeansConfigKeys.DISTANCE_MEASURE_KEY));
+ measure = (DistanceMeasure) cl.newInstance();
+ measure.configure(job);
+ convergenceDelta = Double.parseDouble(job.get(FuzzyKMeansConfigKeys.CLUSTER_CONVERGENCE_KEY));
+ nextClusterId = 0;
+ m = Double.parseDouble(job.get(FuzzyKMeansConfigKeys.M_KEY));
+ } catch (ClassNotFoundException e) {
+ throw new IllegalStateException(e);
+ } catch (IllegalAccessException e) {
+ throw new IllegalStateException(e);
+ } catch (InstantiationException e) {
+ throw new IllegalStateException(e);
+ }
+ }
+
+ /**
+ * Emit the point and its probability of belongingness to each cluster
+ *
+ * @param point a point
+ * @param clusters a List<SoftCluster>
+ * @param output the OutputCollector to emit into
+ */
+ public void emitPointProbToCluster(Vector point,
+ List<SoftCluster> clusters,
+ OutputCollector<Text, FuzzyKMeansInfo> output) throws IOException {
+
+ List<Double> clusterDistanceList = new ArrayList<Double>();
+ for (SoftCluster cluster : clusters) {
+ clusterDistanceList.add(measure.distance(cluster.getCenter(), point));
+ }
+
+ for (int i = 0; i < clusters.size(); i++) {
+ double probWeight = computeProbWeight(clusterDistanceList.get(i), clusterDistanceList);
+ Text key = new Text(clusters.get(i).getIdentifier()); // just output the
+ // identifier,avoids
+ // too much data
+ // traffic
+ /*Text value = new Text(Double.toString(probWeight)
+ + FuzzyKMeansDriver.MAPPER_VALUE_SEPARATOR + values.toString());*/
+ FuzzyKMeansInfo value = new FuzzyKMeansInfo(probWeight, point);
+ output.collect(key, value);
+ }
+ }
+
+ /**
+ * Output point with cluster info (Cluster and probability)
+ *
+ * @param point a point
+ * @param clusters a List<SoftCluster> to test
+ * @param output the OutputCollector to emit into
+ */
+ public void outputPointWithClusterProbabilities(String key,
+ Vector point, List<SoftCluster> clusters,
+ OutputCollector<Text, FuzzyKMeansOutput> output) throws IOException {
+
+ List<Double> clusterDistanceList = new ArrayList<Double>();
+
+ for (SoftCluster cluster : clusters) {
+ clusterDistanceList.add(measure.distance(point, cluster.getCenter()));
+ }
+ FuzzyKMeansOutput fOutput = new FuzzyKMeansOutput(clusters.size());
+ for (int i = 0; i < clusters.size(); i++) {
+ // System.out.print("cluster:" + i + "\t" + clusterDistanceList.get(i));
+
+ double probWeight = computeProbWeight(clusterDistanceList.get(i),
+ clusterDistanceList);
+ /*outputValue.append(clusters.get(i).clusterId).append(':').append(
+ probWeight).append(' ');*/
+ fOutput.add(i, clusters.get(i), probWeight);
+ }
+ String name = point.getName();
+ output.collect(new Text(name != null && name.length() != 0 ? name
+ : point.asFormatString()),
+ fOutput);
+ }
+
+ /** Computes the probability of a point belonging to a cluster */
+ public double computeProbWeight(double clusterDistance,
+ List<Double> clusterDistanceList) {
+ if (clusterDistance == 0) {
+ clusterDistance = MINIMAL_VALUE;
+ }
+ double denom = 0.0;
+ for (double eachCDist : clusterDistanceList) {
+ if (eachCDist == 0.0) {
+ eachCDist = MINIMAL_VALUE;
+ }
+
+ denom += Math.pow(clusterDistance / eachCDist, 2.0 / (m - 1));
+
+ }
+ return 1.0 / denom;
+ }
+
+ /**
+ * Return if the cluster is converged by comparing its center and centroid.
+ *
+ * @return if the cluster is converged
+ */
+ public boolean computeConvergence(SoftCluster cluster) {
+ Vector centroid = cluster.computeCentroid();
+ cluster.setConverged(measure.distance(cluster.getCenter(), centroid) <= convergenceDelta);
+ return cluster.isConverged();
+ }
+
+ public double getM() {
+ return m;
+ }
+
+ public DistanceMeasure getMeasure() {
+ return this.measure;
+ }
+}
Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansConfigKeys.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansConfigKeys.java?rev=889184&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansConfigKeys.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansConfigKeys.java Thu Dec 10 11:01:28 2009
@@ -0,0 +1,10 @@
+package org.apache.mahout.clustering.fuzzykmeans;
+
+public class FuzzyKMeansConfigKeys {
+
+ public static final String DISTANCE_MEASURE_KEY = "org.apache.mahout.clustering.kmeans.measure";
+ public static final String CLUSTER_PATH_KEY = "org.apache.mahout.clustering.kmeans.path";
+ public static final String CLUSTER_CONVERGENCE_KEY = "org.apache.mahout.clustering.kmeans.convergence";
+ public static final String M_KEY = "org.apache.mahout.clustering.fuzzykmeans.m";
+
+}
Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterer.java?rev=889184&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterer.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterer.java Thu Dec 10 11:01:28 2009
@@ -0,0 +1,97 @@
+/* Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.clustering.kmeans;
+
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.matrix.Vector;
+
+/**
+ * This class implements the k-means clustering algorithm. It uses
+ * {@link Cluster} as a cluster representation. The class can be used as part of
+ * a clustering job to be started as map/reduce job.
+ * */
+public class KMeansClusterer {
+
+ /** Distance to use for point to cluster comparison. */
+ private final DistanceMeasure measure;
+
+ /**
+ * Init the k-means clusterer with the distance measure to use for comparison.
+ *
+ * @param measure
+ * The distance measure to use for comparing clusters against points.
+ * @param convergenceDelta
+ * When do we define a cluster to have converged?
+ *
+ * */
+ public KMeansClusterer(final DistanceMeasure measure) {
+ this.measure = measure;
+ }
+
+ /**
+ * Iterates over all clusters and identifies the one closes to the given
+ * point. Distance measure used is configured at creation time of
+ * {@link KMeansClusterer}.
+ *
+ * @param point
+ * a point to find a cluster for.
+ * @param clusters
+ * a List<Cluster> to test.
+ */
+ public void emitPointToNearestCluster(Vector point,
+ List<Cluster> clusters, OutputCollector<Text, KMeansInfo> output) throws IOException {
+ Cluster nearestCluster = null;
+ double nearestDistance = Double.MAX_VALUE;
+ for (Cluster cluster : clusters) {
+ Vector clusterCenter = cluster.getCenter();
+ double distance = this.measure.distance(clusterCenter.getLengthSquared(),
+ clusterCenter, point);
+ System.out.println(distance + " Cluster: " + cluster.getId());
+ if (distance < nearestDistance || nearestCluster == null) {
+ nearestCluster = cluster;
+ nearestDistance = distance;
+ }
+ }
+ // emit only clusterID
+ output.collect(new Text(nearestCluster.getIdentifier()), new KMeansInfo(1, point));
+ }
+
+ public void outputPointWithClusterInfo(Vector point,
+ List<Cluster> clusters, OutputCollector<Text, Text> output) throws IOException {
+ Cluster nearestCluster = null;
+ double nearestDistance = Double.MAX_VALUE;
+ for (Cluster cluster : clusters) {
+ Vector clusterCenter = cluster.getCenter();
+ double distance = measure.distance(clusterCenter.getLengthSquared(),
+ clusterCenter, point);
+ if (distance < nearestDistance || nearestCluster == null) {
+ nearestCluster = cluster;
+ nearestDistance = distance;
+ }
+ }
+
+ String name = point.getName();
+ String key = new String(name != null && name.length() != 0 ? name : point
+ .asFormatString());
+ output.collect(new Text(key), new Text(String.valueOf(nearestCluster.getId())));
+ }
+}
Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansConfigKeys.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansConfigKeys.java?rev=889184&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansConfigKeys.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansConfigKeys.java Thu Dec 10 11:01:28 2009
@@ -0,0 +1,16 @@
+package org.apache.mahout.clustering.kmeans;
+
+/**
+ * This class holds all config keys that are relevant to be used in the KMeans MapReduce JobConf.
+ * */
+public class KMeansConfigKeys {
+ /** Configuration key for distance measure to use. */
+ public static final String DISTANCE_MEASURE_KEY = "org.apache.mahout.clustering.kmeans.measure";
+ /** Configuration key for convergence threshold. */
+ public static final String CLUSTER_CONVERGENCE_KEY = "org.apache.mahout.clustering.kmeans.convergence";
+ /** Configuration key for ?? */
+ public static final String CLUSTER_PATH_KEY = "org.apache.mahout.clustering.kmeans.path";
+ /** The number of iterations that have taken place */
+ public static final String ITERATION_NUMBER = "org.apache.mahout.clustering.kmeans.iteration";
+
+}
Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/package.html
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/package.html?rev=889184&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/package.html (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/package.html Thu Dec 10 11:01:28 2009
@@ -0,0 +1,29 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
+<html>
+<head>
+<!--
+
+ @(#)package.html
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+-->
+</head>
+<body bgcolor="white">
+This package provides an implementation of the <a href="http://en.wikipedia.org/wiki/Kmeans">k-means</a> clustering
+algorithm.
+</body>
+</html>
\ No newline at end of file
Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyClusterer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyClusterer.java?rev=889184&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyClusterer.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyClusterer.java Thu Dec 10 11:01:28 2009
@@ -0,0 +1,151 @@
+package org.apache.mahout.clustering.meanshift;
+
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
+import org.apache.mahout.matrix.Vector;
+
+public class MeanShiftCanopyClusterer {
+
+ private double convergenceDelta = 0;
+ // the next canopyId to be allocated
+ private int nextCanopyId = 0;
+ // the T1 distance threshold
+ private double t1;
+ // the T2 distance threshold
+ private double t2;
+ // the distance measure
+ private DistanceMeasure measure;
+
+ public double getT1() {
+ return t1;
+ }
+ public double getT2() {
+ return t2;
+ }
+
+ public MeanShiftCanopyClusterer(DistanceMeasure aMeasure, double aT1, double aT2, double aDelta) {
+ config(aMeasure, aT1, aT2, aDelta);
+ }
+
+ public MeanShiftCanopyClusterer(JobConf job) {
+ configure(job);
+ }
+ /**
+ * Configure the Canopy and its distance measure
+ *
+ * @param job the JobConf for this job
+ */
+ public void configure(JobConf job) {
+ try {
+ measure = Class.forName(job.get(MeanShiftCanopyConfigKeys.DISTANCE_MEASURE_KEY)).asSubclass(
+ DistanceMeasure.class).newInstance();
+ measure.configure(job);
+ } catch (ClassNotFoundException e) {
+ throw new IllegalStateException(e);
+ } catch (IllegalAccessException e) {
+ throw new IllegalStateException(e);
+ } catch (InstantiationException e) {
+ throw new IllegalStateException(e);
+ }
+ nextCanopyId = 0;
+ t1 = Double.parseDouble(job.get(MeanShiftCanopyConfigKeys.T1_KEY));
+ t2 = Double.parseDouble(job.get(MeanShiftCanopyConfigKeys.T2_KEY));
+ convergenceDelta = Double.parseDouble(job.get(MeanShiftCanopyConfigKeys.CLUSTER_CONVERGENCE_KEY));
+ }
+ /**
+ * Configure the Canopy for unit tests
+ *
+ * @param aDelta the convergence criteria
+ */
+ public void config(DistanceMeasure aMeasure, double aT1, double aT2,
+ double aDelta) {
+ nextCanopyId = 100; // so canopyIds will sort properly
+ measure = aMeasure;
+ t1 = aT1;
+ t2 = aT2;
+ convergenceDelta = aDelta;
+ }
+
+ /**
+ * Merge the given canopy into the canopies list. If it touches any existing canopy (norm<T1) then add the center of
+ * each to the other. If it covers any other canopies (norm<T2), then merge the given canopy with the closest covering
+ * canopy. If the given canopy does not cover any other canopies, add it to the canopies list.
+ *
+ * @param aCanopy a MeanShiftCanopy to be merged
+ * @param canopies the List<Canopy> to be appended
+ */
+ public void mergeCanopy(MeanShiftCanopy aCanopy,
+ List<MeanShiftCanopy> canopies) {
+ MeanShiftCanopy closestCoveringCanopy = null;
+ double closestNorm = Double.MAX_VALUE;
+ for (MeanShiftCanopy canopy : canopies) {
+ double norm = measure.distance(canopy.getCenter(), aCanopy.getCenter());
+ if (norm < t1) {
+ aCanopy.touch(canopy);
+ }
+ if (norm < t2) {
+ if (closestCoveringCanopy == null || norm < closestNorm) {
+ closestNorm = norm;
+ closestCoveringCanopy = canopy;
+ }
+ }
+ }
+ if (closestCoveringCanopy == null) {
+ canopies.add(aCanopy);
+ } else {
+ closestCoveringCanopy.merge(aCanopy);
+ }
+ }
+
+ /** Emit the new canopy to the collector, keyed by the canopy's Id */
+ void emitCanopy(MeanShiftCanopy canopy,
+ OutputCollector<Text, WritableComparable<?>> collector)
+ throws IOException {
+ String identifier = canopy.getIdentifier();
+ collector.collect(new Text(identifier), new Text("new " + canopy.toString()));
+ }
+
+ /**
+ * Shift the center to the new centroid of the cluster
+ *
+ * @param the canopy to shift.
+ * @return if the cluster is converged
+ */
+ public boolean shiftToMean(MeanShiftCanopy canopy) {
+ Vector centroid = canopy.computeCentroid();
+ canopy.setConverged(new EuclideanDistanceMeasure().distance(centroid, canopy.getCenter()) < convergenceDelta);
+ canopy.setCenter(centroid);
+ canopy.setNumPoints(1);
+ canopy.setPointTotal(centroid.clone());
+ return canopy.isConverged();
+ }
+
+ /**
+ * Return if the point is covered by this canopy
+ *
+ * @param canopy a canopy.
+ * @param point a Vector point
+ * @return if the point is covered
+ */
+ boolean covers(MeanShiftCanopy canopy, Vector point) {
+ return measure.distance(canopy.getCenter(), point) < t1;
+ }
+
+ /**
+ * Return if the point is closely covered by the canopy
+ *
+ * @param canopy a canopy.
+ * @param point a Vector point
+ * @return if the point is covered
+ */
+ public boolean closelyBound(MeanShiftCanopy canopy, Vector point) {
+ return measure.distance(canopy.getCenter(), point) < t2;
+ }
+}
Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyConfigKeys.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyConfigKeys.java?rev=889184&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyConfigKeys.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyConfigKeys.java Thu Dec 10 11:01:28 2009
@@ -0,0 +1,12 @@
+package org.apache.mahout.clustering.meanshift;
+
+public class MeanShiftCanopyConfigKeys {
+
+ // keys used by Driver, Mapper, Combiner & Reducer
+ public static final String DISTANCE_MEASURE_KEY = "org.apache.mahout.clustering.canopy.measure";
+ public static final String T1_KEY = "org.apache.mahout.clustering.canopy.t1";
+ public static final String T2_KEY = "org.apache.mahout.clustering.canopy.t2";
+ public static final String CONTROL_PATH_KEY = "org.apache.mahout.clustering.control.path";
+ public static final String CLUSTER_CONVERGENCE_KEY = "org.apache.mahout.clustering.canopy.convergence";
+
+}
Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/package.html
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/package.html?rev=889184&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/package.html (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/package.html Thu Dec 10 11:01:28 2009
@@ -0,0 +1,37 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
+<html>
+<head>
+<!--
+
+ @(#)package.html
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+-->
+</head>
+<body bgcolor="white">
+This package provides several clustering algorithm implementations. Clustering usually groups a set of objects into groups
+of similar items. The definition of similarity usually is up to you - for text documents, cosine-distance/-similarity is
+recommended. Mahout also features other types of distance measure like Euclidean distance.
+<br>
+Input of each clustering algorithm is a set of vectors representing your items. For texts in general these are
+<a href="http://en.wikipedia.org/wiki/TFIDF">TFIDF</a> or <a href="http://en.wikipedia.org/wiki/Bag_of_words">Bag of words</a>
+representations of the documents.
+<br>
+Output of each clustering algorithm is either a hard or soft assigment of items to clusters.
+
+</body>
+</html>
\ No newline at end of file
Added: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestRandomSeedGenerator.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestRandomSeedGenerator.java?rev=889184&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestRandomSeedGenerator.java (added)
+++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestRandomSeedGenerator.java Thu Dec 10 11:01:28 2009
@@ -0,0 +1,101 @@
+package org.apache.mahout.clustering.kmeans;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import junit.framework.TestCase;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.mahout.clustering.ClusteringTestUtils;
+import org.apache.mahout.matrix.SparseVector;
+import org.apache.mahout.matrix.Vector;
+
+public class TestRandomSeedGenerator extends TestCase {
+
+ static final double[][] raw = {{1, 1}, {2, 1}, {1, 2}, {2, 2},
+ {3, 3}, {4, 4}, {5, 4}, {4, 5}, {5, 5}};
+
+ FileSystem fs;
+
+ private static List<Vector> getPoints(double[][] raw) {
+ List<Vector> points = new ArrayList<Vector>();
+ int i = 0;
+ for (double[] fr : raw) {
+ Vector vec = new SparseVector(String.valueOf(i++), fr.length);
+ vec.assign(fr);
+ points.add(vec);
+ }
+ return points;
+ }
+
+ private static void rmr(String path) throws Exception {
+ File f = new File(path);
+ if (f.exists()) {
+ if (f.isDirectory()) {
+ String[] contents = f.list();
+ for (String content : contents) {
+ rmr(f.toString() + File.separator + content);
+ }
+ }
+ f.delete();
+ }
+ }
+
+ public void setUp() throws Exception {
+ super.setUp();
+ rmr("testdata");
+ Configuration conf = new Configuration();
+ fs = FileSystem.get(conf);
+ }
+
+ /** Story: test random seed generation generates 4 clusters with proper ids and data */
+ public void testRandomSeedGenerator() throws Exception {
+ List<Vector> points = getPoints(raw);
+ File testData = new File("testdata");
+ if (!testData.exists()) {
+ testData.mkdir();
+ }
+
+ File randomOutput = new File("testdata/random-output");
+ if (!randomOutput.exists()) {
+ randomOutput.mkdir();
+ }
+
+ JobConf job = new JobConf(RandomSeedGenerator.class);
+ ClusteringTestUtils.writePointsToFile(points, "testdata/random-input", fs, job);
+
+ RandomSeedGenerator.buildRandom("testdata/random-input", "testdata/random-output", 4);
+
+ SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path("testdata/random-output/part-randomSeed"), job);
+ Writable key = (Writable) reader.getKeyClass().newInstance();
+ Cluster value = (Cluster) reader.getValueClass().newInstance();
+
+ int clusterCount = 0;
+ Set<Integer> set = new HashSet<Integer>();
+ while (reader.next(key, value)) {
+ clusterCount++;
+ int id = value.getId();
+ TestCase.assertTrue(set.add(id)); // validate unique id's
+
+ Vector v = value.getCenter();
+ assertVectorEquals(raw[id], v); // validate values match
+ }
+
+ TestCase.assertEquals(4, clusterCount); // validate sample count
+ }
+
+ public void assertVectorEquals(double[] raw, Vector v) {
+ TestCase.assertEquals(raw.length, v.size());
+ for (int i=0; i < raw.length; i++) {
+ TestCase.assertEquals(raw[i], v.getQuick(i));
+ }
+ }
+}