You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by sr...@apache.org on 2009/07/10 11:35:28 UTC
svn commit: r792856 [8/13] - in /lucene/mahout/trunk/core/src:
main/java/org/apache/mahout/cf/taste/common/
main/java/org/apache/mahout/cf/taste/eval/
main/java/org/apache/mahout/cf/taste/hadoop/
main/java/org/apache/mahout/cf/taste/impl/common/ main/j...
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/UncommonDistributions.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/UncommonDistributions.java?rev=792856&r1=792855&r2=792856&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/UncommonDistributions.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/UncommonDistributions.java Fri Jul 10 09:35:19 2009
@@ -17,13 +17,13 @@
* limitations under the License.
*/
-import java.util.Random;
-
import org.apache.mahout.matrix.DenseVector;
import org.apache.mahout.matrix.Vector;
import org.uncommons.maths.random.GaussianGenerator;
import org.uncommons.maths.random.MersenneTwisterRNG;
+import java.util.Random;
+
public final class UncommonDistributions {
private static final double sqrt2pi = Math.sqrt(2.0 * Math.PI);
@@ -38,12 +38,10 @@
}
//=============== start of BSD licensed code. See LICENSE.txt
- /**
- * Returns a double sampled according to this distribution. Uniformly
- * fast for all k > 0. (Reference: Non-Uniform Random Variate Generation,
- * Devroye http://cgm.cs.mcgill.ca/~luc/rnbookindex.html) Uses Cheng's
- * rejection algorithm (GB) for k>=1, rejection from Weibull distribution
- * for 0 < k < 1.
+ /**
+ * Returns a double sampled according to this distribution. Uniformly fast for all k > 0. (Reference: Non-Uniform
+ * Random Variate Generation, Devroye http://cgm.cs.mcgill.ca/~luc/rnbookindex.html) Uses Cheng's rejection algorithm
+ * (GB) for k>=1, rejection from Weibull distribution for 0 < k < 1.
*/
public static double rGamma(double k, double lambda) {
boolean accept = false;
@@ -88,9 +86,8 @@
//============= end of BSD licensed code
/**
- * Returns a random sample from a beta distribution with
- * the given shapes
- *
+ * Returns a random sample from a beta distribution with the given shapes
+ *
* @param shape1 a double representing shape1
* @param shape2 a double representing shape2
* @return a Vector of samples
@@ -103,10 +100,9 @@
}
/**
- * Returns a vector of random samples from a beta distribution with
- * the given shapes
- *
- * @param K the number of samples to return
+ * Returns a vector of random samples from a beta distribution with the given shapes
+ *
+ * @param K the number of samples to return
* @param shape1 a double representing shape1
* @param shape2 a double representing shape2
* @return a Vector of samples
@@ -116,15 +112,15 @@
//params.add(shape1);
//params.add(Math.max(0, shape2));
Vector result = new DenseVector(K);
- for (int i = 0; i < K; i++)
+ for (int i = 0; i < K; i++) {
result.set(i, rBeta(shape1, shape2));
+ }
return result;
}
/**
- * Return a random sample from the chi-squared (chi^2) distribution with df
- * degrees of freedom.
- * @param df
+ * Return a random sample from the chi-squared (chi^2) distribution with df degrees of freedom.
+ *
* @return a double sample
*/
public static double rChisq(double df) {
@@ -137,11 +133,10 @@
}
/**
- * Return a random value from a normal distribution with the given mean and
- * standard deviation
- *
+ * Return a random value from a normal distribution with the given mean and standard deviation
+ *
* @param mean a double mean value
- * @param sd a double standard deviation
+ * @param sd a double standard deviation
* @return a double sample
*/
public static double rNorm(double mean, double sd) {
@@ -151,10 +146,9 @@
/**
* Return the normal density function value for the sample x
- *
+ *
* pdf = 1/[sqrt(2*p)*s] * e^{-1/2*[(x-m)/s]^2}
- *
- *
+ *
* @param x a double sample value
* @param m a double mean value
* @param s a double standard deviation
@@ -167,9 +161,7 @@
return exp / (sqrt2pi * s);
}
- /**
- * Returns one sample from a multinomial.
- */
+ /** Returns one sample from a multinomial. */
public static int rMultinom(Vector probabilities) {
// our probability argument are not normalized.
double total = probabilities.zSum();
@@ -189,19 +181,15 @@
/**
* Returns a multinomial vector sampled from the given probabilities
- *
+ *
* rmultinom should be implemented as successive binomial sampling.
*
- *Keep a normalizing amount that starts with 1 (I call it total).
+ * Keep a normalizing amount that starts with 1 (I call it total).
+ *
+ * For each i k[i] = rbinom(p[i] / total, size); total -= p[i]; size -= k[i];
*
- * For each i
- * k[i] = rbinom(p[i] / total, size);
- * total -= p[i];
- * size -= k[i];
- *
- * @param size the size parameter of the binomial distribution
+ * @param size the size parameter of the binomial distribution
* @param probabilities a Vector of probabilities
- *
* @return a multinomial distribution Vector
*/
public static Vector rMultinom(int size, Vector probabilities) {
@@ -220,14 +208,14 @@
}
/**
- * Returns an integer sampled according to this distribution. Takes time
- * proprotional to np + 1. (Reference: Non-Uniform Random Variate
- * Generation, Devroye http://cgm.cs.mcgill.ca/~luc/rnbookindex.html)
- * Second time-waiting algorithm.
+ * Returns an integer sampled according to this distribution. Takes time proportional to np + 1. (Reference:
+ * Non-Uniform Random Variate Generation, Devroye http://cgm.cs.mcgill.ca/~luc/rnbookindex.html) Second time-waiting
+ * algorithm.
*/
public static int rBinomial(int n, double p) {
- if (p >= 1)
+ if (p >= 1) {
return n; // needed to avoid infinite loops and negative results
+ }
double q = -Math.log(1 - p);
double sum = 0;
int x = 0;
@@ -237,15 +225,18 @@
sum += (e / (n - x));
x += 1;
}
- if (x == 0)
- return 0;
+ if (x == 0) {
+ {
+ return 0;
+ }
+ }
return x - 1;
}
/**
- * Sample from a Dirichlet distribution over the given alpha,
- * returning a vector of probabilities using a stick-breaking algorithm
- *
+ * Sample from a Dirichlet distribution over the given alpha, returning a vector of probabilities using a
+ * stick-breaking algorithm
+ *
* @param alpha an unnormalized count Vector
* @return a Vector of probabilities
*/
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/AsymmetricSampledNormalDistribution.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/AsymmetricSampledNormalDistribution.java?rev=792856&r1=792855&r2=792856&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/AsymmetricSampledNormalDistribution.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/AsymmetricSampledNormalDistribution.java Fri Jul 10 09:35:19 2009
@@ -22,10 +22,9 @@
import org.apache.mahout.matrix.Vector;
/**
- * An implementation of the ModelDistribution interface suitable for testing the
- * DirichletCluster algorithm. Uses a Normal Distribution to sample the prior
- * model values. Model values have a vector standard deviation, allowing assymetrical
- * regions to be covered by a model.
+ * An implementation of the ModelDistribution interface suitable for testing the DirichletCluster algorithm. Uses a
+ * Normal Distribution to sample the prior model values. Model values have a vector standard deviation, allowing
+ * asymmetrical regions to be covered by a model.
*/
public class AsymmetricSampledNormalDistribution implements
ModelDistribution<Vector> {
@@ -34,11 +33,11 @@
public Model<Vector>[] sampleFromPrior(int howMany) {
Model<Vector>[] result = new AsymmetricSampledNormalModel[howMany];
for (int i = 0; i < howMany; i++) {
- double[] m = { UncommonDistributions.rNorm(0, 1),
- UncommonDistributions.rNorm(0, 1) };
+ double[] m = {UncommonDistributions.rNorm(0, 1),
+ UncommonDistributions.rNorm(0, 1)};
DenseVector mean = new DenseVector(m);
- double[] s = { UncommonDistributions.rNorm(1, 1),
- UncommonDistributions.rNorm(1, 1) };
+ double[] s = {UncommonDistributions.rNorm(1, 1),
+ UncommonDistributions.rNorm(1, 1)};
DenseVector sd = new DenseVector(s);
result[i] = new AsymmetricSampledNormalModel(mean, sd);
}
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/AsymmetricSampledNormalModel.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/AsymmetricSampledNormalModel.java?rev=792856&r1=792855&r2=792856&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/AsymmetricSampledNormalModel.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/AsymmetricSampledNormalModel.java Fri Jul 10 09:35:19 2009
@@ -16,14 +16,14 @@
*/
package org.apache.mahout.clustering.dirichlet.models;
-import java.io.DataInput;
-import java.io.DataOutput;
-import java.io.IOException;
-
import org.apache.mahout.matrix.AbstractVector;
import org.apache.mahout.matrix.SquareRootFunction;
import org.apache.mahout.matrix.Vector;
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
public class AsymmetricSampledNormalModel implements Model<Vector> {
private static final double sqrt2pi = Math.sqrt(2.0 * Math.PI);
@@ -55,7 +55,7 @@
/**
* Return an instance with the same parameters
- *
+ *
* @return an AsymmetricSampledNormalModel
*/
AsymmetricSampledNormalModel sample() {
@@ -65,20 +65,23 @@
@Override
public void observe(Vector x) {
s0++;
- if (s1 == null)
+ if (s1 == null) {
s1 = x.clone();
- else
+ } else {
s1 = s1.plus(x);
- if (s2 == null)
+ }
+ if (s2 == null) {
s2 = x.times(x);
- else
+ } else {
s2 = s2.plus(x.times(x));
+ }
}
@Override
public void computeParameters() {
- if (s0 == 0)
+ if (s0 == 0) {
return;
+ }
mean = s1.divide(s0);
// compute the two component stds
if (s0 > 1) {
@@ -91,10 +94,9 @@
/**
* Calculate a pdf using the supplied sample and sd
- *
- * @param x a Vector sample
+ *
+ * @param x a Vector sample
* @param sd a double std deviation
- * @return
*/
private double pdf(Vector x, double sd) {
assert x.getNumNondefaultElements() == 2;
@@ -124,13 +126,17 @@
public String toString() {
StringBuilder buf = new StringBuilder();
buf.append("asnm{n=").append(s0).append(" m=[");
- if (mean != null)
- for (int i = 0; i < mean.size(); i++)
+ if (mean != null) {
+ for (int i = 0; i < mean.size(); i++) {
buf.append(String.format("%.2f", mean.get(i))).append(", ");
+ }
+ }
buf.append("] sd=[");
- if (sd != null)
- for (int i = 0; i < sd.size(); i++)
+ if (sd != null) {
+ for (int i = 0; i < sd.size(); i++) {
buf.append(String.format("%.2f", sd.get(i))).append(", ");
+ }
+ }
buf.append("]}");
return buf.toString();
}
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/Model.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/Model.java?rev=792856&r1=792855&r2=792856&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/Model.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/Model.java Fri Jul 10 09:35:19 2009
@@ -20,35 +20,32 @@
*/
/**
- * A model is a probability distribution over observed data points and allows
- * the probability of any data point to be computed.
+ * A model is a probability distribution over observed data points and allows the probability of any data point to be
+ * computed.
*/
public interface Model<O> extends Writable {
/**
* Observe the given observation, retaining information about it
- *
+ *
* @param x an Observation from the posterior
*/
void observe(O x);
- /**
- * Compute a new set of posterior parameters based upon the Observations
- * that have been observed since my creation
- */
+ /** Compute a new set of posterior parameters based upon the Observations that have been observed since my creation */
void computeParameters();
/**
- * Return the probability that the observation is described by this model
- *
- * @param x an Observation from the posterior
- * @return the probability that x is in the receiver
- */
+ * Return the probability that the observation is described by this model
+ *
+ * @param x an Observation from the posterior
+ * @return the probability that x is in the receiver
+ */
double pdf(O x);
/**
* Return the number of observations that have been observed by this model
- *
+ *
* @return an int
*/
int count();
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/ModelDistribution.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/ModelDistribution.java?rev=792856&r1=792855&r2=792856&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/ModelDistribution.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/ModelDistribution.java Fri Jul 10 09:35:19 2009
@@ -17,14 +17,12 @@
* limitations under the License.
*/
-/**
- * A model distribution allows us to sample a model from its prior distribution.
- */
+/** A model distribution allows us to sample a model from its prior distribution. */
public interface ModelDistribution<O> {
/**
* Return a list of models sampled from the prior
- *
+ *
* @param howMany the int number of models to return
* @return a Model<Observation>[] representing what is known apriori
*/
@@ -32,7 +30,7 @@
/**
* Return a list of models sampled from the posterior
- *
+ *
* @param posterior the Model<Observation>[] after observations
* @return a Model<Observation>[] representing what is known apriori
*/
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/NormalModel.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/NormalModel.java?rev=792856&r1=792855&r2=792856&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/NormalModel.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/NormalModel.java Fri Jul 10 09:35:19 2009
@@ -16,14 +16,14 @@
*/
package org.apache.mahout.clustering.dirichlet.models;
-import java.io.DataInput;
-import java.io.DataOutput;
-import java.io.IOException;
-
import org.apache.mahout.matrix.AbstractVector;
import org.apache.mahout.matrix.SquareRootFunction;
import org.apache.mahout.matrix.Vector;
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
public class NormalModel implements Model<Vector> {
private static final double sqrt2pi = Math.sqrt(2.0 * Math.PI);
@@ -52,9 +52,8 @@
}
/**
- * TODO: Return a proper sample from the posterior. For now, return an
- * instance with the same parameters
- *
+ * TODO: Return a proper sample from the posterior. For now, return an instance with the same parameters
+ *
* @return an NormalModel
*/
public NormalModel sample() {
@@ -64,28 +63,32 @@
@Override
public void observe(Vector x) {
s0++;
- if (s1 == null)
+ if (s1 == null) {
s1 = x.clone();
- else
+ } else {
s1 = s1.plus(x);
- if (s2 == null)
+ }
+ if (s2 == null) {
s2 = x.times(x);
- else
+ } else {
s2 = s2.plus(x.times(x));
+ }
}
@Override
public void computeParameters() {
- if (s0 == 0)
+ if (s0 == 0) {
return;
+ }
mean = s1.divide(s0);
// compute the average of the component stds
if (s0 > 1) {
Vector std = s2.times(s0).minus(s1.times(s1)).assign(
new SquareRootFunction()).divide(s0);
sd = std.zSum() / s1.size();
- } else
+ } else {
sd = Double.MIN_VALUE;
+ }
}
@Override
@@ -105,9 +108,11 @@
public String toString() {
StringBuilder buf = new StringBuilder();
buf.append("nm{n=").append(s0).append(" m=[");
- if (mean != null)
- for (int i = 0; i < mean.size(); i++)
+ if (mean != null) {
+ for (int i = 0; i < mean.size(); i++) {
buf.append(String.format("%.2f", mean.get(i))).append(", ");
+ }
+ }
buf.append("] sd=").append(String.format("%.2f", sd)).append('}');
return buf.toString();
}
@@ -127,6 +132,6 @@
out.writeDouble(sd);
out.writeInt(s0);
AbstractVector.writeVector(out, s1);
- AbstractVector.writeVector(out, s2);
+ AbstractVector.writeVector(out, s2);
}
}
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/NormalModelDistribution.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/NormalModelDistribution.java?rev=792856&r1=792855&r2=792856&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/NormalModelDistribution.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/NormalModelDistribution.java Fri Jul 10 09:35:19 2009
@@ -21,16 +21,17 @@
import org.apache.mahout.matrix.Vector;
/**
- * An implementation of the ModelDistribution interface suitable for testing the
- * DirichletCluster algorithm. Uses a Normal Distribution
+ * An implementation of the ModelDistribution interface suitable for testing the DirichletCluster algorithm. Uses a
+ * Normal Distribution
*/
public class NormalModelDistribution implements ModelDistribution<Vector> {
@Override
public Model<Vector>[] sampleFromPrior(int howMany) {
Model<Vector>[] result = new NormalModel[howMany];
- for (int i = 0; i < howMany; i++)
+ for (int i = 0; i < howMany; i++) {
result[i] = new NormalModel(new DenseVector(2), 1);
+ }
return result;
}
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/SampledNormalDistribution.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/SampledNormalDistribution.java?rev=792856&r1=792855&r2=792856&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/SampledNormalDistribution.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/SampledNormalDistribution.java Fri Jul 10 09:35:19 2009
@@ -22,9 +22,8 @@
import org.apache.mahout.matrix.Vector;
/**
- * An implementation of the ModelDistribution interface suitable for testing the
- * DirichletCluster algorithm. Uses a Normal Distribution to sample the prior
- * model values.
+ * An implementation of the ModelDistribution interface suitable for testing the DirichletCluster algorithm. Uses a
+ * Normal Distribution to sample the prior model values.
*/
public class SampledNormalDistribution extends NormalModelDistribution {
@@ -32,8 +31,8 @@
public Model<Vector>[] sampleFromPrior(int howMany) {
Model<Vector>[] result = new SampledNormalModel[howMany];
for (int i = 0; i < howMany; i++) {
- double[] m = { UncommonDistributions.rNorm(0, 1),
- UncommonDistributions.rNorm(0, 1) };
+ double[] m = {UncommonDistributions.rNorm(0, 1),
+ UncommonDistributions.rNorm(0, 1)};
DenseVector mean = new DenseVector(m);
result[i] = new SampledNormalModel(mean, 1);
}
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/SampledNormalModel.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/SampledNormalModel.java?rev=792856&r1=792855&r2=792856&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/SampledNormalModel.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/SampledNormalModel.java Fri Jul 10 09:35:19 2009
@@ -32,15 +32,18 @@
public String toString() {
StringBuilder buf = new StringBuilder();
buf.append("snm{n=").append(s0).append(" m=[");
- if (mean != null)
- for (int i = 0; i < mean.size(); i++)
+ if (mean != null) {
+ for (int i = 0; i < mean.size(); i++) {
buf.append(String.format("%.2f", mean.get(i))).append(", ");
+ }
+ }
buf.append("] sd=").append(String.format("%.2f", sd)).append('}');
return buf.toString();
}
/**
* Return an instance with the same parameters
+ *
* @return an SampledNormalModel
*/
@Override
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansClusterMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansClusterMapper.java?rev=792856&r1=792855&r2=792856&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansClusterMapper.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansClusterMapper.java Fri Jul 10 09:35:19 2009
@@ -17,26 +17,26 @@
package org.apache.mahout.clustering.fuzzykmeans;
-import java.io.IOException;
-import java.util.List;
-import java.util.ArrayList;
-
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
-import org.apache.hadoop.mapred.OutputCollector;
-import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
-import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
import org.apache.mahout.matrix.Vector;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
public class FuzzyKMeansClusterMapper extends MapReduceBase implements
- Mapper<WritableComparable<?>, Vector, Text, FuzzyKMeansOutput> {
+ Mapper<WritableComparable<?>, Vector, Text, FuzzyKMeansOutput> {
protected List<SoftCluster> clusters;
+
@Override
public void map(WritableComparable<?> key, Vector point,
- OutputCollector<Text, FuzzyKMeansOutput> output, Reporter reporter) throws IOException
- {
+ OutputCollector<Text, FuzzyKMeansOutput> output, Reporter reporter) throws IOException {
SoftCluster.outputPointWithClusterProbabilities(key.toString(), point, clusters, output);
}
@@ -59,8 +59,9 @@
FuzzyKMeansUtil.configureWithClusterInfo(job
.get(SoftCluster.CLUSTER_PATH_KEY), clusters);
- if (clusters.isEmpty())
+ if (clusters.isEmpty()) {
throw new NullPointerException("Cluster is empty!!!");
+ }
}
}
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansCombiner.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansCombiner.java?rev=792856&r1=792855&r2=792856&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansCombiner.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansCombiner.java Fri Jul 10 09:35:19 2009
@@ -17,9 +17,6 @@
package org.apache.mahout.clustering.fuzzykmeans;
-import java.io.IOException;
-import java.util.Iterator;
-
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
@@ -27,12 +24,15 @@
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
+import java.io.IOException;
+import java.util.Iterator;
+
public class FuzzyKMeansCombiner extends MapReduceBase implements
Reducer<Text, FuzzyKMeansInfo, Text, FuzzyKMeansInfo> {
@Override
public void reduce(Text key, Iterator<FuzzyKMeansInfo> values,
- OutputCollector<Text, FuzzyKMeansInfo> output, Reporter reporter) throws IOException {
+ OutputCollector<Text, FuzzyKMeansInfo> output, Reporter reporter) throws IOException {
SoftCluster cluster = new SoftCluster(key.toString().trim());
while (values.hasNext()) {
//String pointInfo = values.next().toString();
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java?rev=792856&r1=792855&r2=792856&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java Fri Jul 10 09:35:19 2009
@@ -25,6 +25,7 @@
import org.apache.commons.cli2.builder.DefaultOptionBuilder;
import org.apache.commons.cli2.builder.GroupBuilder;
import org.apache.commons.cli2.commandline.Parser;
+import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
@@ -38,7 +39,6 @@
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
-import org.apache.hadoop.conf.Configuration;
import org.apache.mahout.clustering.kmeans.RandomSeedGenerator;
import org.apache.mahout.matrix.SparseVector;
import org.apache.mahout.matrix.Vector;
@@ -56,7 +56,7 @@
public class FuzzyKMeansDriver {
private static final Logger log = LoggerFactory
- .getLogger(FuzzyKMeansDriver.class);
+ .getLogger(FuzzyKMeansDriver.class);
private FuzzyKMeansDriver() {
@@ -68,65 +68,65 @@
ArgumentBuilder abuilder = new ArgumentBuilder();
GroupBuilder gbuilder = new GroupBuilder();
Option inputOpt = obuilder.withLongName("input").withRequired(true).withArgument(
- abuilder.withName("input").withMinimum(1).withMaximum(1).create()).
- withDescription("The Path for input Vectors. Must be a SequenceFile of Writable, Vector").withShortName("i").create();
+ abuilder.withName("input").withMinimum(1).withMaximum(1).create()).
+ withDescription("The Path for input Vectors. Must be a SequenceFile of Writable, Vector").withShortName("i").create();
Option clustersOpt = obuilder.withLongName("clusters").withRequired(true).withArgument(
- abuilder.withName("clusters").withMinimum(1).withMaximum(1).create()).
- withDescription("The input centroids, as Vectors. Must be a SequenceFile of Writable, Cluster/Canopy. " +
- "If k is also specified, then a random set of vectors will be selected and written out to this path first").withShortName("c").create();
+ abuilder.withName("clusters").withMinimum(1).withMaximum(1).create()).
+ withDescription("The input centroids, as Vectors. Must be a SequenceFile of Writable, Cluster/Canopy. " +
+ "If k is also specified, then a random set of vectors will be selected and written out to this path first").withShortName("c").create();
Option kOpt = obuilder.withLongName("k").withRequired(false).withArgument(
- abuilder.withName("k").withMinimum(1).withMaximum(1).create()).
- withDescription("The k in k-Means. If specified, then a random selection of k Vectors will be chosen as the Centroid and written to the clusters output path.").withShortName("k").create();
+ abuilder.withName("k").withMinimum(1).withMaximum(1).create()).
+ withDescription("The k in k-Means. If specified, then a random selection of k Vectors will be chosen as the Centroid and written to the clusters output path.").withShortName("k").create();
Option outputOpt = obuilder.withLongName("output").withRequired(true).withArgument(
- abuilder.withName("output").withMinimum(1).withMaximum(1).create()).
- withDescription("The Path to put the output in").withShortName("o").create();
+ abuilder.withName("output").withMinimum(1).withMaximum(1).create()).
+ withDescription("The Path to put the output in").withShortName("o").create();
Option measureClassOpt = obuilder.withLongName("distance").withRequired(false).withArgument(
- abuilder.withName("distance").withMinimum(1).withMaximum(1).create()).
- withDescription("The Distance Measure to use. Default is SquaredEuclidean").withShortName("dm").create();
+ abuilder.withName("distance").withMinimum(1).withMaximum(1).create()).
+ withDescription("The Distance Measure to use. Default is SquaredEuclidean").withShortName("dm").create();
Option convergenceDeltaOpt = obuilder.withLongName("convergence").withRequired(false).withArgument(
- abuilder.withName("convergence").withMinimum(1).withMaximum(1).create()).
- withDescription("The threshold below which the clusters are considered to be converged. Default is 0.5").withShortName("d").create();
+ abuilder.withName("convergence").withMinimum(1).withMaximum(1).create()).
+ withDescription("The threshold below which the clusters are considered to be converged. Default is 0.5").withShortName("d").create();
Option maxIterationsOpt = obuilder.withLongName("max").withRequired(false).withArgument(
- abuilder.withName("max").withMinimum(1).withMaximum(1).create()).
- withDescription("The maximum number of iterations to perform. Default is 20").withShortName("x").create();
+ abuilder.withName("max").withMinimum(1).withMaximum(1).create()).
+ withDescription("The maximum number of iterations to perform. Default is 20").withShortName("x").create();
Option vectorClassOpt = obuilder.withLongName("vectorClass").withRequired(false).withArgument(
- abuilder.withName("vectorClass").withMinimum(1).withMaximum(1).create()).
- withDescription("The Vector implementation class name. Default is SparseVector.class").withShortName("v").create();
+ abuilder.withName("vectorClass").withMinimum(1).withMaximum(1).create()).
+ withDescription("The Vector implementation class name. Default is SparseVector.class").withShortName("v").create();
Option helpOpt = obuilder.withLongName("help").
- withDescription("Print out help").withShortName("h").create();
+ withDescription("Print out help").withShortName("h").create();
Option overwriteOutput = obuilder.withLongName("overwrite").withRequired(false).
- withDescription("If set, overwrite the output directory").withShortName("w").create();
+ withDescription("If set, overwrite the output directory").withShortName("w").create();
Option clusteringOpt = obuilder.withLongName("clustering").withRequired(false).
- withDescription("If true, run clustering only (assumes the iterations have already taken place").withShortName("l").create();
+ withDescription("If true, run clustering only (assumes the iterations have already taken place").withShortName("l").create();
Option mOpt = obuilder.withLongName("m").withRequired(true).withArgument(
- abuilder.withName("m").withMinimum(1).withMaximum(1).create()).
- withDescription("coefficient normalization factor, must be greater than 1").withShortName("m").create();
+ abuilder.withName("m").withMinimum(1).withMaximum(1).create()).
+ withDescription("coefficient normalization factor, must be greater than 1").withShortName("m").create();
Option numReduceTasksOpt = obuilder.withLongName("numReduce").withRequired(false).withArgument(
- abuilder.withName("numReduce").withMinimum(1).withMaximum(1).create()).
- withDescription("The number of reduce tasks").withShortName("r").create();
+ abuilder.withName("numReduce").withMinimum(1).withMaximum(1).create()).
+ withDescription("The number of reduce tasks").withShortName("r").create();
Option numMapTasksOpt = obuilder.withLongName("numMap").withRequired(false).withArgument(
- abuilder.withName("numMap").withMinimum(1).withMaximum(1).create()).
- withDescription("The number of map tasks").withShortName("u").create();
+ abuilder.withName("numMap").withMinimum(1).withMaximum(1).create()).
+ withDescription("The number of map tasks").withShortName("u").create();
Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(clustersOpt).withOption(outputOpt).withOption(measureClassOpt)
- .withOption(convergenceDeltaOpt).withOption(maxIterationsOpt).withOption(kOpt).withOption(mOpt)
- .withOption(vectorClassOpt).withOption(overwriteOutput).withOption(helpOpt).create();
+ .withOption(convergenceDeltaOpt).withOption(maxIterationsOpt).withOption(kOpt).withOption(mOpt)
+ .withOption(vectorClassOpt).withOption(overwriteOutput).withOption(helpOpt).create();
try {
Parser parser = new Parser();
@@ -150,8 +150,8 @@
float m = Float.parseFloat(cmdLine.getValue(mOpt).toString());
Class<? extends Vector> vectorClass = cmdLine.hasOption(vectorClassOpt) == false ?
- SparseVector.class
- : (Class<? extends Vector>) Class.forName(cmdLine.getValue(vectorClassOpt).toString());
+ SparseVector.class
+ : (Class<? extends Vector>) Class.forName(cmdLine.getValue(vectorClassOpt).toString());
int numReduceTasks = 10;
@@ -175,14 +175,14 @@
if (cmdLine.hasOption(kOpt)) {
clusters = RandomSeedGenerator.buildRandom(input, clusters,
- Integer.parseInt(cmdLine.getValue(kOpt).toString())).toString();
+ Integer.parseInt(cmdLine.getValue(kOpt).toString())).toString();
}
if (cmdLine.hasOption(clusteringOpt)) {
runClustering(input, clusters, output, measureClass, convergenceDelta, numMapTasks, m, vectorClass);
} else {
runJob(input, clusters, output, measureClass, convergenceDelta,
- maxIterations, numMapTasks, numReduceTasks, m, vectorClass);
+ maxIterations, numMapTasks, numReduceTasks, m, vectorClass);
}
@@ -206,7 +206,7 @@
* @param numMapTasks the number of mapper tasks
* @param numReduceTasks the number of reduce tasks
* @param m the fuzzification factor, see http://en.wikipedia.org/wiki/Data_clustering#Fuzzy_c-means_clustering
- * @param vectorClass the {@link org.apache.mahout.matrix.Vector} implementation to use
+ * @param vectorClass the {@link org.apache.mahout.matrix.Vector} implementation to use
*/
public static void runJob(String input, String clustersIn, String output,
String measureClass, double convergenceDelta, int maxIterations,
@@ -222,7 +222,7 @@
// point the output to a new directory per iteration
String clustersOut = output + File.separator + "clusters-" + iteration;
converged = runIteration(input, clustersIn, clustersOut, measureClass,
- convergenceDelta, numMapTasks, numReduceTasks, iteration, m);
+ convergenceDelta, numMapTasks, numReduceTasks, iteration, m);
// now point the input to the old output directory
clustersIn = output + File.separator + "clusters-" + iteration;
@@ -233,7 +233,7 @@
log.info("Clustering ");
runClustering(input, clustersIn, output + File.separator + "points",
- measureClass, convergenceDelta, numMapTasks, m, vectorClass);
+ measureClass, convergenceDelta, numMapTasks, m, vectorClass);
}
/**
@@ -361,7 +361,7 @@
};
FileStatus[] matches = fs.listStatus(FileUtil.stat2Paths(fs.globStatus(
- clusterPath, clusterFileFilter)), clusterFileFilter);
+ clusterPath, clusterFileFilter)), clusterFileFilter);
for (FileStatus match : matches) {
result.add(fs.makeQualified(match.getPath()));
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansInfo.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansInfo.java?rev=792856&r1=792855&r2=792856&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansInfo.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansInfo.java Fri Jul 10 09:35:19 2009
@@ -17,14 +17,14 @@
package org.apache.mahout.clustering.fuzzykmeans;
-import java.io.DataInput;
-import java.io.DataOutput;
-import java.io.IOException;
-
import org.apache.hadoop.io.Writable;
import org.apache.mahout.matrix.AbstractVector;
import org.apache.mahout.matrix.Vector;
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
public class FuzzyKMeansInfo implements Writable {
private double probability;
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansJob.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansJob.java?rev=792856&r1=792855&r2=792856&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansJob.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansJob.java Fri Jul 10 09:35:19 2009
@@ -17,14 +17,14 @@
package org.apache.mahout.clustering.fuzzykmeans;
-import java.io.IOException;
-
import org.apache.mahout.clustering.canopy.CanopyDriver;
-import org.apache.mahout.utils.ManhattanDistanceMeasure;
import org.apache.mahout.matrix.Vector;
+import org.apache.mahout.utils.ManhattanDistanceMeasure;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import java.io.IOException;
+
public class FuzzyKMeansJob {
private static final Logger log = LoggerFactory
@@ -57,9 +57,7 @@
maxIterations, numMapTasks, numReduceTasks, doCanopy, m, vectorClass);
}
- /**
- * Prints Error Message
- */
+ /** Prints Error Message */
private static void printMessage() {
log
.warn("Usage: inputDir clusterDir OutputDir measureClass ConvergenceDelata maxIterations numMapTasks numReduceTasks doCanopy m");
@@ -67,20 +65,20 @@
/**
* Run the job using supplied arguments
- *
- * @param input the directory pathname for input points
- * @param clustersIn the directory pathname for initial clusters
- * @param output the directory pathname for output points
- * @param measureClass the classname of the DistanceMeasure
+ *
+ * @param input the directory pathname for input points
+ * @param clustersIn the directory pathname for initial clusters
+ * @param output the directory pathname for output points
+ * @param measureClass the classname of the DistanceMeasure
* @param convergenceDelta the convergence delta value
- * @param maxIterations the maximum number of iterations
- * @param numMapTasks the number of maptasks
- * @param doCanopy does canopy needed for initial clusters
- * @param m param needed to fuzzify the cluster membership values
+ * @param maxIterations the maximum number of iterations
+ * @param numMapTasks the number of maptasks
+ * @param doCanopy whether canopy clustering is needed to generate the initial clusters
+ * @param m param needed to fuzzify the cluster membership values
*/
public static void runJob(String input, String clustersIn, String output,
- String measureClass, double convergenceDelta, int maxIterations,
- int numMapTasks, int numReduceTasks, boolean doCanopy, float m, Class<? extends Vector> vectorClass)
+ String measureClass, double convergenceDelta, int maxIterations,
+ int numMapTasks, int numReduceTasks, boolean doCanopy, float m, Class<? extends Vector> vectorClass)
throws IOException {
// run canopy to find initial clusters
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansMapper.java?rev=792856&r1=792855&r2=792856&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansMapper.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansMapper.java Fri Jul 10 09:35:19 2009
@@ -17,10 +17,6 @@
package org.apache.mahout.clustering.fuzzykmeans;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.JobConf;
@@ -32,6 +28,10 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
public class FuzzyKMeansMapper extends MapReduceBase implements
Mapper<WritableComparable<?>, Vector, Text, FuzzyKMeansInfo> {
@@ -41,13 +41,13 @@
@Override
public void map(WritableComparable<?> key, Vector point,
- OutputCollector<Text, FuzzyKMeansInfo> output, Reporter reporter) throws IOException {
+ OutputCollector<Text, FuzzyKMeansInfo> output, Reporter reporter) throws IOException {
SoftCluster.emitPointProbToCluster(point, clusters, output);
}
/**
* Configure the mapper by providing its clusters. Used by unit tests.
- *
+ *
* @param clusters a List<Cluster>
*/
void config(List<SoftCluster> clusters) {
@@ -66,8 +66,9 @@
FuzzyKMeansUtil.configureWithClusterInfo(job
.get(SoftCluster.CLUSTER_PATH_KEY), clusters);
- if (clusters.isEmpty())
+ if (clusters.isEmpty()) {
throw new NullPointerException("Cluster is empty!!!");
+ }
}
}
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansReducer.java?rev=792856&r1=792855&r2=792856&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansReducer.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansReducer.java Fri Jul 10 09:35:19 2009
@@ -17,13 +17,6 @@
package org.apache.mahout.clustering.fuzzykmeans;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
@@ -31,6 +24,13 @@
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
public class FuzzyKMeansReducer extends MapReduceBase implements
Reducer<Text, FuzzyKMeansInfo, Text, SoftCluster> {
@@ -38,7 +38,7 @@
@Override
public void reduce(Text key, Iterator<FuzzyKMeansInfo> values,
- OutputCollector<Text, SoftCluster> output, Reporter reporter) throws IOException {
+ OutputCollector<Text, SoftCluster> output, Reporter reporter) throws IOException {
SoftCluster cluster = clusterMap.get(key.toString());
@@ -55,7 +55,7 @@
}
// force convergence calculation
cluster.computeConvergence();
- output.collect(new Text(cluster.getIdentifier()), cluster);
+ output.collect(new Text(cluster.getIdentifier()), cluster);
}
@Override
@@ -70,8 +70,9 @@
.get(SoftCluster.CLUSTER_PATH_KEY), clusters);
setClusterMap(clusters);
- if (clusterMap.isEmpty())
+ if (clusterMap.isEmpty()) {
throw new NullPointerException("Cluster is empty!!!");
+ }
}
private void setClusterMap(List<SoftCluster> clusters) {
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansUtil.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansUtil.java?rev=792856&r1=792855&r2=792856&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansUtil.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansUtil.java Fri Jul 10 09:35:19 2009
@@ -17,6 +17,7 @@
*/
+import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
@@ -24,7 +25,6 @@
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.conf.Configuration;
import org.apache.mahout.clustering.kmeans.Cluster;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -39,12 +39,7 @@
private FuzzyKMeansUtil() {
}
- /**
- * Configure the mapper with the cluster info
- *
- * @param clusterPathStr
- * @param clusters
- */
+ /** Configure the mapper with the cluster info */
public static void configureWithClusterInfo(String clusterPathStr, List<SoftCluster> clusters) {
//Get the path location where the cluster Info is stored
Configuration job = new Configuration();
@@ -63,7 +58,7 @@
//get all filtered file names in result list
FileSystem fs = clusterPath.getFileSystem(job);
FileStatus[] matches = fs.listStatus(FileUtil.stat2Paths(fs.globStatus(
- clusterPath, clusterFileFilter)), clusterFileFilter);
+ clusterPath, clusterFileFilter)), clusterFileFilter);
for (FileStatus match : matches) {
result.add(fs.makeQualified(match.getPath()));
@@ -86,7 +81,7 @@
log.error("Exception", e);
throw new RuntimeException(e);
}
- if (valueClass.equals(Cluster.class)){
+ if (valueClass.equals(Cluster.class)) {
Cluster value = new Cluster();
while (reader.next(key, value)) {
// get the cluster info
@@ -94,7 +89,7 @@
clusters.add(theCluster);
value = new Cluster();
}
- } else if (valueClass.equals(SoftCluster.class)){
+ } else if (valueClass.equals(SoftCluster.class)) {
SoftCluster value = new SoftCluster();
while (reader.next(key, value)) {
// get the cluster info
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/SoftCluster.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/SoftCluster.java?rev=792856&r1=792855&r2=792856&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/SoftCluster.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/SoftCluster.java Fri Jul 10 09:35:19 2009
@@ -17,12 +17,6 @@
package org.apache.mahout.clustering.fuzzykmeans;
-import java.io.DataInput;
-import java.io.DataOutput;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobConf;
@@ -33,6 +27,12 @@
import org.apache.mahout.matrix.Vector;
import org.apache.mahout.utils.DistanceMeasure;
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
public class SoftCluster implements Writable {
public static final String DISTANCE_MEASURE_KEY = "org.apache.mahout.clustering.kmeans.measure";
@@ -88,7 +88,7 @@
/**
* Format the SoftCluster for output
- *
+ *
* @param cluster the Cluster
*/
public static String formatCluster(SoftCluster cluster) {
@@ -98,7 +98,7 @@
/**
* Decodes and returns a SoftCluster from the formattedString
- *
+ *
* @param formattedString a String produced by formatCluster
*/
public static SoftCluster decodeCluster(String formattedString) {
@@ -138,7 +138,7 @@
/**
* Configure the distance measure from the job
- *
+ *
* @param job the JobConf for the job
*/
public static void configure(JobConf job) {
@@ -161,8 +161,8 @@
/**
* Configure the distance measure directly. Used by unit tests.
- *
- * @param aMeasure the DistanceMeasure
+ *
+ * @param aMeasure the DistanceMeasure
* @param aConvergenceDelta the delta value used to define convergence
*/
public static void config(DistanceMeasure aMeasure, double aConvergenceDelta) {
@@ -173,15 +173,14 @@
/**
* Emit the point and its probability of belongingness to each cluster
- *
- * @param point a point
+ *
+ * @param point a point
* @param clusters a List<SoftCluster>
- * @param output the OutputCollector to emit into
- * @throws IOException
+ * @param output the OutputCollector to emit into
*/
public static void emitPointProbToCluster(Vector point,
- List<SoftCluster> clusters,
- OutputCollector<Text, FuzzyKMeansInfo> output) throws IOException {
+ List<SoftCluster> clusters,
+ OutputCollector<Text, FuzzyKMeansInfo> output) throws IOException {
List<Double> clusterDistanceList = new ArrayList<Double>();
for (SoftCluster cluster : clusters) {
clusterDistanceList.add(measure.distance(cluster.getCenter(), point));
@@ -203,15 +202,14 @@
/**
* Output point with cluster info (Cluster and probability)
- *
- * @param point a point
+ *
+ * @param point a point
* @param clusters a List<SoftCluster> to test
- * @param output the OutputCollector to emit into
- * @throws IOException
+ * @param output the OutputCollector to emit into
*/
public static void outputPointWithClusterProbabilities(String key,
- Vector point, List<SoftCluster> clusters,
- OutputCollector<Text, FuzzyKMeansOutput> output) throws IOException {
+ Vector point, List<SoftCluster> clusters,
+ OutputCollector<Text, FuzzyKMeansOutput> output) throws IOException {
List<Double> clusterDistanceList = new ArrayList<Double>();
for (SoftCluster cluster : clusters) {
@@ -229,25 +227,21 @@
}
String name = point.getName();
output.collect(new Text(name != null && name.length() != 0 ? name
- : point.asFormatString()),
- fOutput);
+ : point.asFormatString()),
+ fOutput);
}
- /**
- * Computes the probability of a point belonging to a cluster
- *
- * @param clusterDistance
- * @param clusterDistanceList
- */
+ /** Computes the probability of a point belonging to a cluster */
public static double computeProbWeight(double clusterDistance,
- List<Double> clusterDistanceList) {
+ List<Double> clusterDistanceList) {
if (clusterDistance == 0) {
clusterDistance = MINIMAL_VALUE;
}
double denom = 0.0;
for (double eachCDist : clusterDistanceList) {
- if (eachCDist == 0.0)
+ if (eachCDist == 0.0) {
eachCDist = MINIMAL_VALUE;
+ }
denom += Math.pow(clusterDistance / eachCDist, 2.0 / (m - 1));
@@ -257,13 +251,13 @@
/**
* Compute the centroid
- *
+ *
* @return the new centroid
*/
private Vector computeCentroid() {
- if (pointProbSum == 0)
+ if (pointProbSum == 0) {
return weightedPointTotal;
- else if (centroid == null) {
+ } else if (centroid == null) {
// lazy compute new centroid
centroid = weightedPointTotal.divide(pointProbSum);
}
@@ -276,7 +270,7 @@
/**
* Construct a new SoftCluster with the given point as its center
- *
+ *
* @param center the center point
*/
public SoftCluster(Vector center) {
@@ -289,7 +283,7 @@
/**
* Construct a new SoftCluster with the given point as its center
- *
+ *
* @param center the center point
*/
public SoftCluster(Vector center, int clusterId) {
@@ -299,11 +293,7 @@
this.weightedPointTotal = center.like();
}
- /**
- * Construct a new softcluster with the given clusterID
- *
- * @param clusterId
- */
+ /** Construct a new softcluster with the given clusterID */
public SoftCluster(String clusterId) {
this.clusterId = Integer.parseInt((clusterId.substring(1)));
@@ -318,72 +308,70 @@
}
public String getIdentifier() {
- if (converged)
+ if (converged) {
return "V" + clusterId;
- else
+ } else {
return "C" + clusterId;
+ }
}
- /**
- * Observe the point, accumulating weighted variables for std() calculation
- * @param point
- * @param ptProb
- */
+ /** Observe the point, accumulating weighted variables for std() calculation */
private void observePoint(Vector point, double ptProb) {
s0 += ptProb;
Vector wtPt = point.times(ptProb);
- if (s1 == null)
+ if (s1 == null) {
s1 = point.clone();
- else
+ } else {
s1 = s1.plus(wtPt);
- if (s2 == null)
+ }
+ if (s2 == null) {
s2 = wtPt.times(wtPt);
- else
+ } else {
s2 = s2.plus(wtPt.times(wtPt));
+ }
}
- /**
- * Compute a "standard deviation" value to use as the "radius" of the cluster for display purposes
- * @return
- */
+ /** Compute a "standard deviation" value to use as the "radius" of the cluster for display purposes */
public double std() {
if (s0 > 0) {
Vector radical = s2.times(s0).minus(s1.times(s1));
radical = radical.times(radical).assign(new SquareRootFunction());
Vector stds = radical.assign(new SquareRootFunction()).divide(s0);
return stds.zSum() / stds.size();
- } else
+ } else {
return 0;
+ }
}
/**
* Add the point to the SoftCluster
- *
+ *
* @param point a point to add
- * @param ptProb
*/
public void addPoint(Vector point, double ptProb) {
observePoint(point, ptProb);
centroid = null;
pointProbSum += ptProb;
- if (weightedPointTotal == null)
+ if (weightedPointTotal == null) {
weightedPointTotal = point.clone().times(ptProb);
- else
+ } else {
weightedPointTotal = weightedPointTotal.plus(point.times(ptProb));
+ }
}
/**
* Add the point to the cluster
- *
+ *
* @param delta a point to add
*/
public void addPoints(Vector delta, double partialSumPtProb) {
centroid = null;
pointProbSum += partialSumPtProb;
- if (weightedPointTotal == null)
+ if (weightedPointTotal == null) {
weightedPointTotal = delta.clone();
- else
+ } else {
weightedPointTotal = weightedPointTotal.plus(delta);
+ }
}
public Vector getCenter() {
@@ -394,9 +382,7 @@
return pointProbSum;
}
- /**
- * Compute the centroid and set the center to it.
- */
+ /** Compute the centroid and set the center to it. */
public void recomputeCenter() {
center = computeCentroid();
pointProbSum = 0;
@@ -405,7 +391,7 @@
/**
* Return if the cluster is converged by comparing its center and centroid.
- *
+ *
* @return if the cluster is converged
*/
public boolean computeConvergence() {
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/Cluster.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/Cluster.java?rev=792856&r1=792855&r2=792856&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/Cluster.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/Cluster.java Fri Jul 10 09:35:19 2009
@@ -16,11 +16,6 @@
*/
package org.apache.mahout.clustering.kmeans;
-import java.io.DataInput;
-import java.io.DataOutput;
-import java.io.IOException;
-import java.util.List;
-
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobConf;
@@ -31,6 +26,11 @@
import org.apache.mahout.matrix.Vector;
import org.apache.mahout.utils.DistanceMeasure;
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.List;
+
public class Cluster extends ClusterBase implements Writable {
private static final String ERROR_UNKNOWN_CLUSTER_FORMAT = "Unknown cluster format:\n";
@@ -41,13 +41,9 @@
public static final String CLUSTER_CONVERGENCE_KEY = "org.apache.mahout.clustering.kmeans.convergence";
- /**
- * The number of iterations that have taken place
- */
+ /** The number of iterations that have taken place */
public static final String ITERATION_NUMBER = "org.apache.mahout.clustering.kmeans.iteration";
- /**
- * Boolean value indicating whether the initial input is from Canopy clustering
- */
+ /** Boolean value indicating whether the initial input is from Canopy clustering */
public static final String CANOPY_INPUT = "org.apache.mahout.clustering.kmeans.canopyInput";
private static int nextClusterId = 0;
@@ -60,7 +56,6 @@
private double std;
-
// the total of all the points squared, used for std computation
private Vector pointSquaredTotal = null;
@@ -71,7 +66,7 @@
/**
* Format the cluster for output
- *
+ *
* @param cluster the Cluster
* @return the String representation of the Cluster
*/
@@ -87,7 +82,7 @@
/**
* Decodes and returns a Cluster from the formattedString
- *
+ *
* @param formattedString a String produced by formatCluster
* @return a decoded Cluster, not null
* @throws IllegalArgumentException when the string is wrongly formatted
@@ -105,9 +100,10 @@
Cluster cluster = new Cluster(clusterCenter, clusterId);
cluster.converged = startsWithV;
return cluster;
- } else
+ } else {
throw new IllegalArgumentException(ERROR_UNKNOWN_CLUSTER_FORMAT
+ formattedString);
+ }
}
@@ -130,7 +126,7 @@
/**
* Configure the distance measure from the job
- *
+ *
* @param job the JobConf for the job
*/
public static void configure(JobConf job) {
@@ -152,8 +148,8 @@
/**
* Configure the distance measure directly. Used by unit tests.
- *
- * @param aMeasure the DistanceMeasure
+ *
+ * @param aMeasure the DistanceMeasure
* @param aConvergenceDelta the delta value used to define convergence
*/
public static void config(DistanceMeasure aMeasure, double aConvergenceDelta) {
@@ -164,14 +160,13 @@
/**
* Emit the point to the nearest cluster center
- *
- * @param point a point
+ *
+ * @param point a point
* @param clusters a List<Cluster> to test
- * @param output the OutputCollector to emit into
- * @throws IOException
+ * @param output the OutputCollector to emit into
*/
public static void emitPointToNearestCluster(Vector point,
- List<Cluster> clusters, OutputCollector<Text, KMeansInfo> output)
+ List<Cluster> clusters, OutputCollector<Text, KMeansInfo> output)
throws IOException {
Cluster nearestCluster = null;
double nearestDistance = Double.MAX_VALUE;
@@ -187,7 +182,7 @@
}
public static void outputPointWithClusterInfo(Vector point,
- List<Cluster> clusters, OutputCollector<Text, Text> output)
+ List<Cluster> clusters, OutputCollector<Text, Text> output)
throws IOException {
Cluster nearestCluster = null;
double nearestDistance = Double.MAX_VALUE;
@@ -205,13 +200,13 @@
/**
* Compute the centroid by averaging the pointTotals
- *
+ *
* @return the new centroid
*/
private Vector computeCentroid() {
- if (numPoints == 0)
+ if (numPoints == 0) {
return center;
- else if (centroid == null) {
+ } else if (centroid == null) {
// lazy compute new centroid
centroid = pointTotal.divide(numPoints);
Vector stds = pointSquaredTotal.times(numPoints).minus(
@@ -224,9 +219,8 @@
/**
* Construct a new cluster with the given point as its center
- *
- * @param center
- * the center point
+ *
+ * @param center the center point
*/
public Cluster(Vector center) {
super();
@@ -237,15 +231,13 @@
this.pointSquaredTotal = center.like();
}
- /**
- * For (de)serialization as a Writable
- */
+ /** For (de)serialization as a Writable */
public Cluster() {
}
/**
* Construct a new cluster with the given point as its center
- *
+ *
* @param center the center point
*/
public Cluster(Vector center, int clusterId) {
@@ -257,9 +249,7 @@
this.pointSquaredTotal = center.like();
}
- /**
- * Construct a new clsuter with the given id as identifier
- */
+ /** Construct a new cluster with the given id as identifier */
public Cluster(String clusterId) {
this.id = Integer.parseInt((clusterId.substring(1)));
@@ -273,15 +263,16 @@
}
public String getIdentifier() {
- if (converged)
+ if (converged) {
return "V" + id;
- else
+ } else {
return "C" + id;
+ }
}
/**
* Add the point to the cluster
- *
+ *
* @param point a point to add
*/
public void addPoint(Vector point) {
@@ -290,7 +281,7 @@
/**
* Add the point to the cluster
- *
+ *
* @param count the number of points in the delta
* @param delta a point to add
*/
@@ -306,12 +297,8 @@
}
}
-
-
- /**
- * Compute the centroid and set the center to it.
- */
+ /** Compute the centroid and set the center to it. */
public void recomputeCenter() {
center = computeCentroid();
numPoints = 0;
@@ -320,7 +307,7 @@
/**
* Return if the cluster is converged by comparing its center and centroid.
- *
+ *
* @return if the cluster is converged
*/
public boolean computeConvergence() {
@@ -330,14 +317,11 @@
}
-
public boolean isConverged() {
return converged;
}
- /**
- * @return the std
- */
+ /** @return the std */
public double getStd() {
return std;
}
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterMapper.java?rev=792856&r1=792855&r2=792856&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterMapper.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterMapper.java Fri Jul 10 09:35:19 2009
@@ -19,18 +19,18 @@
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
-import org.apache.hadoop.mapred.Mapper;
-import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.mahout.matrix.Vector;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
-public class KMeansClusterMapper extends MapReduceBase implements
- Mapper<WritableComparable<?>, Vector, Text, Text> {
+public class KMeansClusterMapper extends MapReduceBase implements
+ Mapper<WritableComparable<?>, Vector, Text, Text> {
protected List<Cluster> clusters;
@@ -56,10 +56,11 @@
clusters = new ArrayList<Cluster>();
KMeansUtil.configureWithClusterInfo(job.get(Cluster.CLUSTER_PATH_KEY),
- clusters);
+ clusters);
- if (clusters.isEmpty())
+ if (clusters.isEmpty()) {
throw new NullPointerException("Cluster is empty!!!");
+ }
}
}
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansCombiner.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansCombiner.java?rev=792856&r1=792855&r2=792856&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansCombiner.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansCombiner.java Fri Jul 10 09:35:19 2009
@@ -16,9 +16,6 @@
*/
package org.apache.mahout.clustering.kmeans;
-import java.io.IOException;
-import java.util.Iterator;
-
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
@@ -26,19 +23,22 @@
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
+import java.io.IOException;
+import java.util.Iterator;
+
public class KMeansCombiner extends MapReduceBase implements
Reducer<Text, KMeansInfo, Text, KMeansInfo> {
@Override
public void reduce(Text key, Iterator<KMeansInfo> values,
- OutputCollector<Text, KMeansInfo> output, Reporter reporter) throws IOException {
+ OutputCollector<Text, KMeansInfo> output, Reporter reporter) throws IOException {
Cluster cluster = new Cluster(key.toString());
while (values.hasNext()) {
KMeansInfo next = values.next();
cluster.addPoints(next.getPoints(),
next.getPointTotal());
}
- output.collect(key, new KMeansInfo(cluster.getNumPoints(), cluster.getPointTotal()));
+ output.collect(key, new KMeansInfo(cluster.getNumPoints(), cluster.getPointTotal()));
}
@Override
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java?rev=792856&r1=792855&r2=792856&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java Fri Jul 10 09:35:19 2009
@@ -47,9 +47,7 @@
public class KMeansDriver {
- /**
- * The name of the directory used to output final results.
- */
+ /** The name of the directory used to output final results. */
public static final String DEFAULT_OUTPUT_DIRECTORY = "/points";
private static final Logger log = LoggerFactory.getLogger(KMeansDriver.class);
@@ -57,9 +55,7 @@
private KMeansDriver() {
}
- /**
- * @param args Expects 7 args and they all correspond to the order of the params in {@link #runJob}
- */
+ /** @param args Expects 7 args and they all correspond to the order of the params in {@link #runJob} */
public static void main(String[] args) throws ClassNotFoundException, IOException, IllegalAccessException, InstantiationException {
DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
@@ -67,51 +63,51 @@
GroupBuilder gbuilder = new GroupBuilder();
Option inputOpt = obuilder.withLongName("input").withRequired(true).withArgument(
- abuilder.withName("input").withMinimum(1).withMaximum(1).create()).
- withDescription("The Path for input Vectors. Must be a SequenceFile of Writable, Vector").withShortName("i").create();
+ abuilder.withName("input").withMinimum(1).withMaximum(1).create()).
+ withDescription("The Path for input Vectors. Must be a SequenceFile of Writable, Vector").withShortName("i").create();
Option clustersOpt = obuilder.withLongName("clusters").withRequired(true).withArgument(
- abuilder.withName("clusters").withMinimum(1).withMaximum(1).create()).
- withDescription("The input centroids, as Vectors. Must be a SequenceFile of Writable, Cluster/Canopy. " +
- "If k is also specified, then a random set of vectors will be selected and written out to this path first").withShortName("c").create();
+ abuilder.withName("clusters").withMinimum(1).withMaximum(1).create()).
+ withDescription("The input centroids, as Vectors. Must be a SequenceFile of Writable, Cluster/Canopy. " +
+ "If k is also specified, then a random set of vectors will be selected and written out to this path first").withShortName("c").create();
Option kOpt = obuilder.withLongName("k").withRequired(false).withArgument(
- abuilder.withName("k").withMinimum(1).withMaximum(1).create()).
- withDescription("The k in k-Means. If specified, then a random selection of k Vectors will be chosen as the Centroid and written to the clusters output path.").withShortName("k").create();
+ abuilder.withName("k").withMinimum(1).withMaximum(1).create()).
+ withDescription("The k in k-Means. If specified, then a random selection of k Vectors will be chosen as the Centroid and written to the clusters output path.").withShortName("k").create();
Option outputOpt = obuilder.withLongName("output").withRequired(true).withArgument(
- abuilder.withName("output").withMinimum(1).withMaximum(1).create()).
- withDescription("The Path to put the output in").withShortName("o").create();
+ abuilder.withName("output").withMinimum(1).withMaximum(1).create()).
+ withDescription("The Path to put the output in").withShortName("o").create();
Option overwriteOutput = obuilder.withLongName("overwrite").withRequired(false).
- withDescription("If set, overwrite the output directory").withShortName("w").create();
+ withDescription("If set, overwrite the output directory").withShortName("w").create();
Option measureClassOpt = obuilder.withLongName("distance").withRequired(false).withArgument(
- abuilder.withName("distance").withMinimum(1).withMaximum(1).create()).
- withDescription("The Distance Measure to use. Default is SquaredEuclidean").withShortName("m").create();
+ abuilder.withName("distance").withMinimum(1).withMaximum(1).create()).
+ withDescription("The Distance Measure to use. Default is SquaredEuclidean").withShortName("m").create();
Option convergenceDeltaOpt = obuilder.withLongName("convergence").withRequired(false).withArgument(
- abuilder.withName("convergence").withMinimum(1).withMaximum(1).create()).
- withDescription("The threshold below which the clusters are considered to be converged. Default is 0.5").withShortName("d").create();
+ abuilder.withName("convergence").withMinimum(1).withMaximum(1).create()).
+ withDescription("The threshold below which the clusters are considered to be converged. Default is 0.5").withShortName("d").create();
Option maxIterationsOpt = obuilder.withLongName("max").withRequired(false).withArgument(
- abuilder.withName("max").withMinimum(1).withMaximum(1).create()).
- withDescription("The maximum number of iterations to perform. Default is 20").withShortName("x").create();
+ abuilder.withName("max").withMinimum(1).withMaximum(1).create()).
+ withDescription("The maximum number of iterations to perform. Default is 20").withShortName("x").create();
Option vectorClassOpt = obuilder.withLongName("vectorClass").withRequired(false).withArgument(
- abuilder.withName("vectorClass").withMinimum(1).withMaximum(1).create()).
- withDescription("The Vector implementation class name. Default is SparseVector.class").withShortName("v").create();
+ abuilder.withName("vectorClass").withMinimum(1).withMaximum(1).create()).
+ withDescription("The Vector implementation class name. Default is SparseVector.class").withShortName("v").create();
Option numReduceTasksOpt = obuilder.withLongName("numReduce").withRequired(false).withArgument(
- abuilder.withName("numReduce").withMinimum(1).withMaximum(1).create()).
- withDescription("The number of reduce tasks").withShortName("r").create();
+ abuilder.withName("numReduce").withMinimum(1).withMaximum(1).create()).
+ withDescription("The number of reduce tasks").withShortName("r").create();
Option helpOpt = obuilder.withLongName("help").
- withDescription("Print out help").withShortName("h").create();
+ withDescription("Print out help").withShortName("h").create();
Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(clustersOpt).withOption(outputOpt).withOption(measureClassOpt)
- .withOption(convergenceDeltaOpt).withOption(maxIterationsOpt).withOption(numReduceTasksOpt).withOption(kOpt)
- .withOption(vectorClassOpt).withOption(overwriteOutput).withOption(helpOpt).create();
+ .withOption(convergenceDeltaOpt).withOption(maxIterationsOpt).withOption(numReduceTasksOpt).withOption(kOpt)
+ .withOption(vectorClassOpt).withOption(overwriteOutput).withOption(helpOpt).create();
try {
Parser parser = new Parser();
parser.setGroup(group);
@@ -134,8 +130,8 @@
}
Class<? extends Vector> vectorClass = cmdLine.hasOption(vectorClassOpt) == false ?
- SparseVector.class
- : (Class<? extends Vector>) Class.forName(cmdLine.getValue(vectorClassOpt).toString());
+ SparseVector.class
+ : (Class<? extends Vector>) Class.forName(cmdLine.getValue(vectorClassOpt).toString());
int maxIterations = 20;
@@ -153,7 +149,7 @@
clusters = RandomSeedGenerator.buildRandom(input, clusters, Integer.parseInt(cmdLine.getValue(kOpt).toString())).toString();
}
runJob(input, clusters, output, measureClass, convergenceDelta,
- maxIterations, numReduceTasks, vectorClass);
+ maxIterations, numReduceTasks, vectorClass);
} catch (OptionException e) {
log.error("Exception", e);
CommandLineUtil.printHelp(group);
@@ -161,7 +157,6 @@
}
-
/**
* Run the job using supplied arguments
*
@@ -172,7 +167,6 @@
* @param convergenceDelta the convergence delta value
* @param maxIterations the maximum number of iterations
* @param numReduceTasks the number of reducers
- * @param vectorClass
*/
public static void runJob(String input, String clustersIn, String output,
String measureClass, double convergenceDelta, int maxIterations,
@@ -190,7 +184,7 @@
// point the output to a new directory per iteration
String clustersOut = output + "/clusters-" + iteration;
converged = runIteration(input, clustersIn, clustersOut, measureClass,
- delta, numReduceTasks, iteration);
+ delta, numReduceTasks, iteration);
// now point the input to the old output directory
clustersIn = output + "/clusters-" + iteration;
iteration++;
@@ -299,7 +293,7 @@
* @throws IOException if there was an IO error
*/
private static boolean isConverged(String filePath, JobConf conf, FileSystem fs)
- throws IOException {
+ throws IOException {
Path outPart = new Path(filePath + "/*");
SequenceFile.Reader reader = new SequenceFile.Reader(fs, outPart, conf);
Writable key;
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansInfo.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansInfo.java?rev=792856&r1=792855&r2=792856&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansInfo.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansInfo.java Fri Jul 10 09:35:19 2009
@@ -21,9 +21,9 @@
import org.apache.mahout.matrix.AbstractVector;
import org.apache.mahout.matrix.Vector;
+import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
-import java.io.DataInput;
public class KMeansInfo implements Writable {
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansMapper.java?rev=792856&r1=792855&r2=792856&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansMapper.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansMapper.java Fri Jul 10 09:35:19 2009
@@ -16,10 +16,6 @@
*/
package org.apache.mahout.clustering.kmeans;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.JobConf;
@@ -29,6 +25,10 @@
import org.apache.hadoop.mapred.Reporter;
import org.apache.mahout.matrix.Vector;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
public class KMeansMapper extends MapReduceBase implements
Mapper<WritableComparable<?>, Vector, Text, KMeansInfo> {
@@ -36,13 +36,13 @@
@Override
public void map(WritableComparable<?> key, Vector point,
- OutputCollector<Text, KMeansInfo> output, Reporter reporter) throws IOException {
- Cluster.emitPointToNearestCluster(point, clusters, output);
+ OutputCollector<Text, KMeansInfo> output, Reporter reporter) throws IOException {
+ Cluster.emitPointToNearestCluster(point, clusters, output);
}
/**
* Configure the mapper by providing its clusters. Used by unit tests.
- *
+ *
* @param clusters a List<Cluster>
*/
void config(List<Cluster> clusters) {
@@ -58,7 +58,8 @@
KMeansUtil.configureWithClusterInfo(job.get(Cluster.CLUSTER_PATH_KEY),
clusters);
- if (clusters.isEmpty())
+ if (clusters.isEmpty()) {
throw new NullPointerException("Cluster is empty!!!");
+ }
}
}
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansReducer.java?rev=792856&r1=792855&r2=792856&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansReducer.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansReducer.java Fri Jul 10 09:35:19 2009
@@ -16,13 +16,6 @@
*/
package org.apache.mahout.clustering.kmeans;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
@@ -30,6 +23,13 @@
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
public class KMeansReducer extends MapReduceBase implements
Reducer<Text, KMeansInfo, Text, Cluster> {
@@ -37,7 +37,7 @@
@Override
public void reduce(Text key, Iterator<KMeansInfo> values,
- OutputCollector<Text, Cluster> output, Reporter reporter) throws IOException {
+ OutputCollector<Text, Cluster> output, Reporter reporter) throws IOException {
Cluster cluster = clusterMap.get(key.toString());
while (values.hasNext()) {
@@ -61,8 +61,9 @@
clusters);
setClusterMap(clusters);
- if (clusterMap.isEmpty())
+ if (clusterMap.isEmpty()) {
throw new NullPointerException("Cluster is empty!!!");
+ }
}
private void setClusterMap(List<Cluster> clusters) {
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansUtil.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansUtil.java?rev=792856&r1=792855&r2=792856&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansUtil.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansUtil.java Fri Jul 10 09:35:19 2009
@@ -16,10 +16,6 @@
* limitations under the License.
*/
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
@@ -32,6 +28,10 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
final class KMeansUtil {
private static final Logger log = LoggerFactory.getLogger(KMeansUtil.class);
@@ -39,12 +39,10 @@
private KMeansUtil() {
}
- /**
- * Configure the mapper with the cluster info
- */
+ /** Configure the mapper with the cluster info */
public static void configureWithClusterInfo(String clusterPathStr,
- List<Cluster> clusters) {
-
+ List<Cluster> clusters) {
+
// Get the path location where the cluster Info is stored
JobConf job = new JobConf(KMeansUtil.class);
Path clusterPath = new Path(clusterPathStr + "/*");
@@ -72,7 +70,7 @@
for (Path path : result) {
SequenceFile.Reader reader = null;
try {
- reader =new SequenceFile.Reader(fs, path, job);
+ reader = new SequenceFile.Reader(fs, path, job);
Class<?> valueClass = reader.getValueClass();
Writable key;
try {
@@ -84,14 +82,14 @@
log.error("Exception", e);
throw new RuntimeException(e);
}
- if (valueClass.equals(Cluster.class)){
+ if (valueClass.equals(Cluster.class)) {
Cluster value = new Cluster();
while (reader.next(key, value)) {
// get the cluster info
clusters.add(value);
value = new Cluster();
}
- } else if (valueClass.equals(Canopy.class)){
+ } else if (valueClass.equals(Canopy.class)) {
Canopy value = new Canopy();
while (reader.next(key, value)) {
// get the cluster info
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/RandomSeedGenerator.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/RandomSeedGenerator.java?rev=792856&r1=792855&r2=792856&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/RandomSeedGenerator.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/RandomSeedGenerator.java Fri Jul 10 09:35:19 2009
@@ -28,17 +28,15 @@
import org.slf4j.LoggerFactory;
import java.io.IOException;
-import java.util.Random;
-import java.util.List;
import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
/**
- * Given an Input Path containing a {@link org.apache.hadoop.io.SequenceFile}, randomly select k vectors
- * and write them to the output file as a {@link org.apache.mahout.clustering.kmeans.Cluster} representing
- * the initial centroid to use.
+ * Given an Input Path containing a {@link org.apache.hadoop.io.SequenceFile}, randomly select k vectors and write them
+ * to the output file as a {@link org.apache.mahout.clustering.kmeans.Cluster} representing the initial centroid to use.
* <p/>
- *
*/
public final class RandomSeedGenerator {
@@ -46,7 +44,8 @@
public static final String K = "k";
- private RandomSeedGenerator() {}
+ private RandomSeedGenerator() {
+ }
public static Path buildRandom(String input, String output,
int k) throws IOException, IllegalAccessException, InstantiationException {
@@ -59,12 +58,12 @@
}
fs.mkdirs(outPath);
Path outFile = new Path(outPath, "part-randomSeed");
- if (fs.exists(outFile) == true){
+ if (fs.exists(outFile) == true) {
log.warn("Deleting " + outFile);
fs.delete(outFile, false);
}
boolean newFile = fs.createNewFile(outFile);
- if (newFile == true){
+ if (newFile == true) {
SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path(input), conf);
Writable key = (Writable) reader.getKeyClass().newInstance();
Vector value = (Vector) reader.getValueClass().newInstance();
@@ -73,7 +72,7 @@
List<Text> chosenTexts = new ArrayList<Text>(k);
List<Cluster> chosenClusters = new ArrayList<Cluster>(k);
- while (reader.next(key, value)){
+ while (reader.next(key, value)) {
Cluster newCluster = new Cluster(value);
newCluster.addPoint(value);
Text newText = new Text(key.toString());