You are viewing a plain text version of this content. The canonical link for it is here.
Posted to user@mahout.apache.org by Aleksander Sadecki <al...@pi.esisar.grenoble-inp.fr> on 2014/05/21 13:19:22 UTC
Apache Mahout - KMeans Clustering
Hi,
I am following the book Mahout In Action.
I downloaded sources and I am trying to run this piece of code:
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.mahout.clustering.WeightedVectorWritable;
import org.apache.mahout.clustering.kmeans.Cluster;
import org.apache.mahout.clustering.kmeans.KMeansDriver;
import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
public class KMeansClustering {
public static final double[][] points = { { 1, 1 }, { 2, 1 }, { 1, 2 },
{ 2, 2 }, { 3, 3 }, { 8, 8 }, { 9, 8 }, { 8, 9 }, { 9, 9 } };
public static void writePointsToFile(List<Vector> points, String fileName,
FileSystem fs, Configuration conf) throws IOException {
Path path = new Path(fileName);
SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path,
LongWritable.class, VectorWritable.class);
long recNum = 0;
VectorWritable vec = new VectorWritable();
for (Vector point : points) {
vec.set(point);
writer.append(new LongWritable(recNum++), vec);
}
writer.close();
}
public static List<Vector> getPoints(double[][] raw) {
List<Vector> points = new ArrayList<Vector>();
for (int i = 0; i < raw.length; i++) {
double[] fr = raw[i];
Vector vec = new RandomAccessSparseVector(fr.length);
vec.assign(fr);
points.add(vec);
}
return points;
}
public static void main(String args[]) throws Exception {
int k = 2;
List<Vector> vectors = getPoints(points);
File testData = new File("testdata");
if (!testData.exists()) {
testData.mkdir();
}
testData = new File("testdata/points");
if (!testData.exists()) {
testData.mkdir();
}
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
writePointsToFile(vectors, "testdata/points/file1", fs, conf);
Path path = new Path("testdata/clusters/part-00000");
SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path,
Text.class, Cluster.class);
for (int i = 0; i < k; i++) {
Vector vec = vectors.get(i);
Cluster cluster = new Cluster(vec, i,
new EuclideanDistanceMeasure());
writer.append(new Text(cluster.getIdentifier()), cluster);
}
writer.close();
KMeansDriver.run(conf, new Path("testdata/points"), new Path(
"testdata/clusters"), new Path("output"),
new EuclideanDistanceMeasure(), 0.001, 10, true, false);
SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path(
"output/" + Cluster.CLUSTERED_POINTS_DIR + "/part-m-00000"),
conf);
IntWritable key = new IntWritable();
WeightedVectorWritable value = new WeightedVectorWritable();
while (reader.next(key, value)) {
System.out.println(value.toString() + " belongs to cluster "
+ key.toString());
}
reader.close();
}
}
In fact, I have got a problem with 2 imports:
import org.apache.mahout.clustering.WeightedVectorWritable;
import org.apache.mahout.clustering.kmeans.Cluster;
and with line:
KMeansDriver.run(...)
which gives an error
The method run(Configuration, Path, Path, Path, double, int, boolean, double, boolean) in the type KMeansDriver is not applicable for the arguments
(Configuration, Path, Path, Path, EuclideanDistanceMeasure, double, int, boolean, boolean)
I think I solved it a little bit. I changed
import org.apache.mahout.clustering.WeightedVectorWritable;
to
import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable;
but I cannot find a solution for the other ones. I found Cluster in the package org.apache.mahout.clustering, but it is an interface.
Thank you for any help.
My pom.xml
<dependency>
<groupId>org.apache.mahout</groupId>
<artifactId>mahout-core</artifactId>
<version>0.9</version>
</dependency>
<dependency>
<groupId>org.apache.mahout</groupId>
<artifactId>mahout-math</artifactId>
<version>0.9</version>
</dependency>
<dependency>
<groupId>org.apache.mahout</groupId>
<artifactId>mahout-collections</artifactId>
<version>1.0</version>
</dependency>
Apache Mahout - KMeans Clustering
Posted by Aleksander Sadecki <al...@pi.esisar.grenoble-inp.fr>.
Hi,
Thank you for your answer.
I changed my pom.xml:
<mahout.version>0.7</mahout.version>
<mahout.groupid>org.apache.mahout</mahout.groupid>
<dependency>
<groupId>${mahout.groupid}</groupId>
<artifactId>mahout-core</artifactId>
<version>${mahout.version}</version>
</dependency>
<dependency>
<groupId>${mahout.groupid}</groupId>
<artifactId>mahout-core</artifactId>
<type>test-jar</type>
<scope>test</scope>
<version>${mahout.version}</version>
</dependency>
<dependency>
<groupId>${mahout.groupid}</groupId>
<artifactId>mahout-math</artifactId>
<version>${mahout.version}</version>
</dependency>
<dependency>
<groupId>${mahout.groupid}</groupId>
<artifactId>mahout-math</artifactId>
<type>test-jar</type>
<scope>test</scope>
<version>${mahout.version}</version>
</dependency>
<dependency>
<groupId>${mahout.groupid}</groupId>
<artifactId>mahout-examples</artifactId>
<version>${mahout.version}</version>
</dependency>
I copied classes:
mia.clustering.ClusterHelper
mia.clustering.ClustersFilter
mia.clustering.ch07.SimpleKMeansClustering
and finally I can compile it.
When I run it, I can see an error:
DEBUG Configuration - java.io.IOException: config()
at org.apache.hadoop.conf.Configuration.<init>(Configuration.java:226)
at org.apache.hadoop.conf.Configuration.<init>(Configuration.java:213)
at com.sicap.targetme.bi.kmeans.KMeansClustering.main(KMeansClustering.java:53)
DEBUG Configuration - java.io.IOException: config()
at org.apache.hadoop.conf.Configuration.<init>(Configuration.java:226)
at org.apache.hadoop.conf.Configuration.<init>(Configuration.java:213)
at org.apache.hadoop.security.UserGroupInformation.ensureInitialized(UserGroupInformation.java:159)
at org.apache.hadoop.security.UserGroupInformation.isSecurityEnabled(UserGroupInformation.java:216)
at org.apache.hadoop.security.UserGroupInformation.getLoginUser(UserGroupInformation.java:409)
at org.apache.hadoop.security.UserGroupInformation.getCurrentUser(UserGroupInformation.java:395)
at org.apache.hadoop.fs.FileSystem$Cache$Key.<init>(FileSystem.java:1418)
at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:1319)
at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:226)
at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:109)
at com.sicap.targetme.bi.kmeans.KMeansClustering.main(KMeansClustering.java:54)
DEBUG Groups - Creating new Groups object
DEBUG Groups - Group mapping impl=org.apache.hadoop.security.ShellBasedUnixGroupsMapping; cacheTimeout=300000
DEBUG FileSystem - Creating filesystem for file:///
INFO HadoopUtil - Deleting output
INFO KMeansDriver - Input: testdata/points Clusters In: testdata/clusters Out: output Distance: org.apache.mahout.common.distance.EuclideanDistanceMeasure
INFO KMeansDriver - convergence: 0.001 max Iterations: 10 num Reduce Tasks: org.apache.mahout.math.VectorWritable Input Vectors: {}
DEBUG KMeansUtil - Read 1 Cluster from testdata/clusters
DEBUG KMeansUtil - Read 1 Cluster from testdata/clusters
DEBUG Configuration - java.io.IOException: config()
at org.apache.hadoop.conf.Configuration.<init>(Configuration.java:226)
at org.apache.hadoop.conf.Configuration.<init>(Configuration.java:213)
at org.apache.mahout.clustering.classify.ClusterClassifier.writePolicy(ClusterClassifier.java:232)
at org.apache.mahout.clustering.classify.ClusterClassifier.writeToSeqFiles(ClusterClassifier.java:185)
at org.apache.mahout.clustering.kmeans.KMeansDriver.buildClusters(KMeansDriver.java:224)
at org.apache.mahout.clustering.kmeans.KMeansDriver.run(KMeansDriver.java:149)
at com.sicap.targetme.bi.kmeans.KMeansClustering.main(KMeansClustering.java:73)
DEBUG Configuration - java.io.IOException: config()
at org.apache.hadoop.conf.Configuration.<init>(Configuration.java:226)
at org.apache.hadoop.conf.Configuration.<init>(Configuration.java:213)
at org.apache.mahout.clustering.classify.ClusterClassifier.writeToSeqFiles(ClusterClassifier.java:186)
at org.apache.mahout.clustering.kmeans.KMeansDriver.buildClusters(KMeansDriver.java:224)
at org.apache.mahout.clustering.kmeans.KMeansDriver.run(KMeansDriver.java:149)
at com.sicap.targetme.bi.kmeans.KMeansClustering.main(KMeansClustering.java:73)
DEBUG Configuration - java.io.IOException: config()
at org.apache.hadoop.conf.Configuration.<init>(Configuration.java:226)
at org.apache.hadoop.conf.Configuration.<init>(Configuration.java:213)
at org.apache.mahout.clustering.classify.ClusterClassifier.readPolicy(ClusterClassifier.java:221)
at org.apache.mahout.clustering.iterator.ClusterIterator.iterateMR(ClusterIterator.java:160)
at org.apache.mahout.clustering.kmeans.KMeansDriver.buildClusters(KMeansDriver.java:229)
at org.apache.mahout.clustering.kmeans.KMeansDriver.run(KMeansDriver.java:149)
at com.sicap.targetme.bi.kmeans.KMeansClustering.main(KMeansClustering.java:73)
Cluster Iterator running iteration 1 over priorPath: output/clusters-0
DEBUG Configuration - java.io.IOException: config(config)
at org.apache.hadoop.conf.Configuration.<init>(Configuration.java:259)
at org.apache.hadoop.mapred.JobConf.<init>(JobConf.java:343)
at org.apache.hadoop.mapreduce.JobContext.<init>(JobContext.java:76)
at org.apache.hadoop.mapreduce.Job.<init>(Job.java:50)
at org.apache.hadoop.mapreduce.Job.<init>(Job.java:54)
at org.apache.mahout.clustering.iterator.ClusterIterator.iterateMR(ClusterIterator.java:168)
at org.apache.mahout.clustering.kmeans.KMeansDriver.buildClusters(KMeansDriver.java:229)
at org.apache.mahout.clustering.kmeans.KMeansDriver.run(KMeansDriver.java:149)
at com.sicap.targetme.bi.kmeans.KMeansClustering.main(KMeansClustering.java:73)
DEBUG Configuration - java.io.IOException: config(config)
at org.apache.hadoop.conf.Configuration.<init>(Configuration.java:259)
at org.apache.hadoop.mapred.JobConf.<init>(JobConf.java:343)
at org.apache.hadoop.mapred.LocalJobRunner.<init>(LocalJobRunner.java:420)
at org.apache.hadoop.mapred.JobClient.init(JobClient.java:468)
at org.apache.hadoop.mapred.JobClient.<init>(JobClient.java:453)
at org.apache.hadoop.mapreduce.Job$1.run(Job.java:478)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Unknown Source)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1059)
at org.apache.hadoop.mapreduce.Job.connect(Job.java:476)
at org.apache.hadoop.mapreduce.Job.submit(Job.java:464)
at org.apache.hadoop.mapreduce.Job.waitForCompletion(Job.java:495)
at org.apache.mahout.clustering.iterator.ClusterIterator.iterateMR(ClusterIterator.java:185)
at org.apache.mahout.clustering.kmeans.KMeansDriver.buildClusters(KMeansDriver.java:229)
at org.apache.mahout.clustering.kmeans.KMeansDriver.run(KMeansDriver.java:149)
at com.sicap.targetme.bi.kmeans.KMeansClustering.main(KMeansClustering.java:73)
DEBUG Configuration - java.io.IOException: config(config)
at org.apache.hadoop.conf.Configuration.<init>(Configuration.java:259)
at org.apache.hadoop.mapred.JobConf.<init>(JobConf.java:343)
at org.apache.hadoop.mapred.LocalJobRunner.<init>(LocalJobRunner.java:421)
at org.apache.hadoop.mapred.JobClient.init(JobClient.java:468)
at org.apache.hadoop.mapred.JobClient.<init>(JobClient.java:453)
at org.apache.hadoop.mapreduce.Job$1.run(Job.java:478)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Unknown Source)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1059)
at org.apache.hadoop.mapreduce.Job.connect(Job.java:476)
at org.apache.hadoop.mapreduce.Job.submit(Job.java:464)
at org.apache.hadoop.mapreduce.Job.waitForCompletion(Job.java:495)
at org.apache.mahout.clustering.iterator.ClusterIterator.iterateMR(ClusterIterator.java:185)
at org.apache.mahout.clustering.kmeans.KMeansDriver.buildClusters(KMeansDriver.java:229)
at org.apache.mahout.clustering.kmeans.KMeansDriver.run(KMeansDriver.java:149)
at com.sicap.targetme.bi.kmeans.KMeansClustering.main(KMeansClustering.java:73)
Exception in thread "main" java.io.IOException: Failed to set permissions of path: \tmp\hadoop-mynamehere\mapred\staging\mynamehere-965149672\.staging to 0700
at org.apache.hadoop.fs.FileUtil.checkReturnValue(FileUtil.java:680)
at org.apache.hadoop.fs.FileUtil.setPermission(FileUtil.java:653)
at org.apache.hadoop.fs.RawLocalFileSystem.setPermission(RawLocalFileSystem.java:483)
at org.apache.hadoop.fs.RawLocalFileSystem.mkdirs(RawLocalFileSystem.java:318)
at org.apache.hadoop.fs.FilterFileSystem.mkdirs(FilterFileSystem.java:183)
at org.apache.hadoop.mapreduce.JobSubmissionFiles.getStagingDir(JobSubmissionFiles.java:116)
at org.apache.hadoop.mapred.JobClient$2.run(JobClient.java:813)
at org.apache.hadoop.mapred.JobClient$2.run(JobClient.java:807)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Unknown Source)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1059)
at org.apache.hadoop.mapred.JobClient.submitJobInternal(JobClient.java:807)
at org.apache.hadoop.mapreduce.Job.submit(Job.java:465)
at org.apache.hadoop.mapreduce.Job.waitForCompletion(Job.java:495)
at org.apache.mahout.clustering.iterator.ClusterIterator.iterateMR(ClusterIterator.java:185)
at org.apache.mahout.clustering.kmeans.KMeansDriver.buildClusters(KMeansDriver.java:229)
at org.apache.mahout.clustering.kmeans.KMeansDriver.run(KMeansDriver.java:149)
at com.sicap.targetme.bi.kmeans.KMeansClustering.main(KMeansClustering.java:73)
DEBUG FileSystem - Starting clear of FileSystem cache with 1 elements.
DEBUG FileSystem - Removing filesystem for file:///
DEBUG FileSystem - Removing filesystem for file:///
DEBUG FileSystem - Done clearing cache
So, what should I do now? Is this not a problem with the Windows platform?
Thank you in advance
Aleksander Sadecki
Re: Apache Mahout - KMeans Clustering
Posted by tuxdna <tu...@gmail.com>.
You are using the 0.9 version of Mahout and the 1.0 version of
mahout-collections. The API might have changed considerably.
I suggest you check out the code from here:
https://github.com/tdunning/MiA/tree/mahout-0.7
This code works with mahout-0.7
Regards,
Saleem
On Wed, May 21, 2014 at 4:49 PM, Aleksander Sadecki
<al...@pi.esisar.grenoble-inp.fr> wrote:
> Hi,
>
> I am following the book Mahout In Action.
>
> I downloaded sources and I am trying to run this piece of code:
>
> import java.io.File;
> import java.io.IOException;
> import java.util.ArrayList;
> import java.util.List;
>
> import org.apache.hadoop.conf.Configuration;
> import org.apache.hadoop.fs.FileSystem;
> import org.apache.hadoop.fs.Path;
> import org.apache.hadoop.io.IntWritable;
> import org.apache.hadoop.io.LongWritable;
> import org.apache.hadoop.io.SequenceFile;
> import org.apache.hadoop.io.Text;
> import org.apache.mahout.clustering.WeightedVectorWritable;
> import org.apache.mahout.clustering.kmeans.Cluster;
> import org.apache.mahout.clustering.kmeans.KMeansDriver;
> import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
> import org.apache.mahout.math.RandomAccessSparseVector;
> import org.apache.mahout.math.Vector;
> import org.apache.mahout.math.VectorWritable;
>
> public class KMeansClustering {
> public static final double[][] points = { { 1, 1 }, { 2, 1 }, { 1, 2 },
> { 2, 2 }, { 3, 3 }, { 8, 8 }, { 9, 8 }, { 8, 9 }, { 9, 9 } };
>
> public static void writePointsToFile(List<Vector> points, String fileName,
> FileSystem fs, Configuration conf) throws IOException {
> Path path = new Path(fileName);
> SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path,
> LongWritable.class, VectorWritable.class);
> long recNum = 0;
> VectorWritable vec = new VectorWritable();
> for (Vector point : points) {
> vec.set(point);
> writer.append(new LongWritable(recNum++), vec);
> }
> writer.close();
> }
>
> public static List<Vector> getPoints(double[][] raw) {
> List<Vector> points = new ArrayList<Vector>();
> for (int i = 0; i < raw.length; i++) {
> double[] fr = raw[i];
> Vector vec = new RandomAccessSparseVector(fr.length);
> vec.assign(fr);
> points.add(vec);
> }
> return points;
> }
>
> public static void main(String args[]) throws Exception {
>
> int k = 2;
>
> List<Vector> vectors = getPoints(points);
>
> File testData = new File("testdata");
> if (!testData.exists()) {
> testData.mkdir();
> }
> testData = new File("testdata/points");
> if (!testData.exists()) {
> testData.mkdir();
> }
>
> Configuration conf = new Configuration();
> FileSystem fs = FileSystem.get(conf);
> writePointsToFile(vectors, "testdata/points/file1", fs, conf);
>
> Path path = new Path("testdata/clusters/part-00000");
> SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path,
> Text.class, Cluster.class);
>
> for (int i = 0; i < k; i++) {
> Vector vec = vectors.get(i);
> Cluster cluster = new Cluster(vec, i,
> new EuclideanDistanceMeasure());
> writer.append(new Text(cluster.getIdentifier()), cluster);
> }
> writer.close();
>
> KMeansDriver.run(conf, new Path("testdata/points"), new Path(
> "testdata/clusters"), new Path("output"),
> new EuclideanDistanceMeasure(), 0.001, 10, true, false);
>
> SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path(
> "output/" + Cluster.CLUSTERED_POINTS_DIR + "/part-m-00000"),
> conf);
>
> IntWritable key = new IntWritable();
> WeightedVectorWritable value = new WeightedVectorWritable();
> while (reader.next(key, value)) {
> System.out.println(value.toString() + " belongs to cluster "
> + key.toString());
> }
> reader.close();
> }
>
> }
>
> In fact, I have got a problem with 2 imports:
>
> import org.apache.mahout.clustering.WeightedVectorWritable;
> import org.apache.mahout.clustering.kmeans.Cluster;
>
> and with line:
>
> KMeansDriver.run(...)
>
> which gives an error
>
> The method run(Configuration, Path, Path, Path, double, int, boolean, double, boolean) in the type KMeansDriver is not applicable for the arguments
> (Configuration, Path, Path, Path, EuclideanDistanceMeasure, double, int, boolean, boolean)
>
> I think I solved it a little bit. I changed
>
> import org.apache.mahout.clustering.WeightedVectorWritable;
>
> to
>
> import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable;
>
> but I cannot find a solution for other ones. I found Cluster in package org.apache.mahout.clustering but it is an Interface.
>
> Thank you for any help.
>
> My pom.xml
>
> <dependency>
> <groupId>org.apache.mahout</groupId>
> <artifactId>mahout-core</artifactId>
> <version>0.9</version>
> </dependency>
> <dependency>
> <groupId>org.apache.mahout</groupId>
> <artifactId>mahout-math</artifactId>
> <version>0.9</version>
> </dependency>
> <dependency>
> <groupId>org.apache.mahout</groupId>
> <artifactId>mahout-collections</artifactId>
> <version>1.0</version>
> </dependency>
>