You are viewing a plain text version of this content. The canonical link for it is here.
Posted to user@mahout.apache.org by Aleksander Sadecki <al...@pi.esisar.grenoble-inp.fr> on 2014/05/21 13:19:22 UTC

Apache Mahout - KMeans Clustering

Hi,

I am following the book Mahout In Action.

I downloaded sources and I am trying to run this piece of code:

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.mahout.clustering.WeightedVectorWritable;
import org.apache.mahout.clustering.kmeans.Cluster;
import org.apache.mahout.clustering.kmeans.KMeansDriver;
import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;

/**
 * Small end-to-end k-means example: writes a handful of 2-D points and the
 * initial cluster centres as Hadoop sequence files under {@code testdata/},
 * runs {@link KMeansDriver} on them and prints the cluster assignment of every
 * point read back from {@code output/}.
 */
public class KMeansClustering {
    /** Raw input points, one {@code {x, y}} pair per row. */
    public static final double[][] points = { { 1, 1 }, { 2, 1 }, { 1, 2 },
            { 2, 2 }, { 3, 3 }, { 8, 8 }, { 9, 8 }, { 8, 9 }, { 9, 9 } };

    /**
     * Writes the given vectors to a sequence file, keyed by a running record
     * number starting at 0.
     *
     * @param points   vectors to write, in output order
     * @param fileName path of the sequence file to create
     * @param fs       file system the file is created on
     * @param conf     Hadoop configuration used by the writer
     * @throws IOException if the file cannot be created or written
     */
    public static void writePointsToFile(List<Vector> points, String fileName,
            FileSystem fs, Configuration conf) throws IOException {
        Path path = new Path(fileName);
        SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path,
                LongWritable.class, VectorWritable.class);
        // try/finally so the file handle is released even when append() fails.
        try {
            long recNum = 0;
            // A single VectorWritable is reused as the value holder for every record.
            VectorWritable vec = new VectorWritable();
            for (Vector point : points) {
                vec.set(point);
                writer.append(new LongWritable(recNum++), vec);
            }
        } finally {
            writer.close();
        }
    }

    /**
     * Converts each row of {@code raw} into a {@link RandomAccessSparseVector}
     * of the same length.
     *
     * @param raw rows of coordinates
     * @return one vector per row, in the same order
     */
    public static List<Vector> getPoints(double[][] raw) {
        List<Vector> points = new ArrayList<Vector>();
        for (double[] fr : raw) {
            Vector vec = new RandomAccessSparseVector(fr.length);
            vec.assign(fr);
            points.add(vec);
        }
        return points;
    }

    /**
     * Runs the example: prepares the input files, runs k-means with
     * {@code k = 2} and prints the resulting assignments to standard output.
     *
     * @param args ignored
     * @throws Exception if preparing the data, clustering or reading the
     *                   result fails
     */
    public static void main(String[] args) throws Exception {

        int k = 2;

        List<Vector> vectors = getPoints(points);

        // Creates "testdata" and "testdata/points" in one step and fails
        // loudly instead of ignoring a failed mkdir.
        File pointsDir = new File("testdata/points");
        if (!pointsDir.mkdirs() && !pointsDir.isDirectory()) {
            throw new IOException("Could not create directory " + pointsDir);
        }

        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        writePointsToFile(vectors, "testdata/points/file1", fs, conf);

        writeInitialClusters(vectors, k, fs, conf);

        // Convergence delta 0.001, at most 10 iterations. The two trailing
        // flags are presumably runClustering / runSequential -- confirm
        // against the KMeansDriver version on the classpath.
        KMeansDriver.run(conf, new Path("testdata/points"), new Path(
                "testdata/clusters"), new Path("output"),
                new EuclideanDistanceMeasure(), 0.001, 10, true, false);

        printClusteredPoints(fs, conf);
    }

    /**
     * Writes the first {@code k} vectors as initial clusters (ids 0..k-1) to
     * {@code testdata/clusters/part-00000}.
     */
    private static void writeInitialClusters(List<Vector> vectors, int k,
            FileSystem fs, Configuration conf) throws IOException {
        Path path = new Path("testdata/clusters/part-00000");
        SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path,
                Text.class, Cluster.class);
        try {
            for (int i = 0; i < k; i++) {
                Vector vec = vectors.get(i);
                Cluster cluster = new Cluster(vec, i,
                        new EuclideanDistanceMeasure());
                writer.append(new Text(cluster.getIdentifier()), cluster);
            }
        } finally {
            writer.close();
        }
    }

    /**
     * Reads the clustered points from the {@code part-m-00000} file below
     * {@code output/} and prints one line per point.
     */
    private static void printClusteredPoints(FileSystem fs, Configuration conf)
            throws IOException {
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path(
                "output/" + Cluster.CLUSTERED_POINTS_DIR + "/part-m-00000"),
                conf);
        try {
            IntWritable key = new IntWritable();
            WeightedVectorWritable value = new WeightedVectorWritable();
            while (reader.next(key, value)) {
                System.out.println(value.toString() + " belongs to cluster "
                        + key.toString());
            }
        } finally {
            reader.close();
        }
    }

}

In fact, I have got a problem with 2 imports:

import org.apache.mahout.clustering.WeightedVectorWritable;
import org.apache.mahout.clustering.kmeans.Cluster;

and with line:

KMeansDriver.run(...)

which gives an error

The method run(Configuration, Path, Path, Path, double, int, boolean, double, boolean) in the type KMeansDriver is not applicable for the arguments 
 (Configuration, Path, Path, Path, EuclideanDistanceMeasure, double, int, boolean, boolean)

I think I solved it a little bit. I changed

import org.apache.mahout.clustering.WeightedVectorWritable;

to

import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable;

but I cannot find a solution for the other ones. I found Cluster in the package org.apache.mahout.clustering, but it is an interface.

Thank you for any help.

My pom.xml

    <dependency>
        <groupId>org.apache.mahout</groupId>
        <artifactId>mahout-core</artifactId>
        <version>0.9</version>
    </dependency>
    <dependency>
        <groupId>org.apache.mahout</groupId>
        <artifactId>mahout-math</artifactId>
        <version>0.9</version>
    </dependency>
    <dependency>
        <groupId>org.apache.mahout</groupId>
        <artifactId>mahout-collections</artifactId>
        <version>1.0</version>
    </dependency>


Apache Mahout - KMeans Clustering

Posted by Aleksander Sadecki <al...@pi.esisar.grenoble-inp.fr>.
Hi,

Thank you for your answer.

I changed my pom.xml:

		<mahout.version>0.7</mahout.version>
		
		<mahout.groupid>org.apache.mahout</mahout.groupid>

	        <dependency>
			<groupId>${mahout.groupid}</groupId>
			<artifactId>mahout-core</artifactId>
			<version>${mahout.version}</version>
		</dependency>
		<dependency>
			<groupId>${mahout.groupid}</groupId>
			<artifactId>mahout-core</artifactId>
			<type>test-jar</type>
			<scope>test</scope>
			<version>${mahout.version}</version>
		</dependency>
		<dependency>
			<groupId>${mahout.groupid}</groupId>
			<artifactId>mahout-math</artifactId>
			<version>${mahout.version}</version>
		</dependency>
		<dependency>
			<groupId>${mahout.groupid}</groupId>
			<artifactId>mahout-math</artifactId>
			<type>test-jar</type>
			<scope>test</scope>
			<version>${mahout.version}</version>
		</dependency>
		<dependency>
			<groupId>${mahout.groupid}</groupId>
			<artifactId>mahout-examples</artifactId>
			<version>${mahout.version}</version>
		</dependency>

I copied classes:

mia.clustering.ClusterHelper
mia.clustering.ClustersFilter
mia.clustering.ch07.SimpleKMeansClustering

and finally I can compile it.

When I run it, I can see an error:

DEBUG Configuration - java.io.IOException: config()
	at org.apache.hadoop.conf.Configuration.<init>(Configuration.java:226)
	at org.apache.hadoop.conf.Configuration.<init>(Configuration.java:213)
	at com.sicap.targetme.bi.kmeans.KMeansClustering.main(KMeansClustering.java:53)

DEBUG Configuration - java.io.IOException: config()
	at org.apache.hadoop.conf.Configuration.<init>(Configuration.java:226)
	at org.apache.hadoop.conf.Configuration.<init>(Configuration.java:213)
	at org.apache.hadoop.security.UserGroupInformation.ensureInitialized(UserGroupInformation.java:159)
	at org.apache.hadoop.security.UserGroupInformation.isSecurityEnabled(UserGroupInformation.java:216)
	at org.apache.hadoop.security.UserGroupInformation.getLoginUser(UserGroupInformation.java:409)
	at org.apache.hadoop.security.UserGroupInformation.getCurrentUser(UserGroupInformation.java:395)
	at org.apache.hadoop.fs.FileSystem$Cache$Key.<init>(FileSystem.java:1418)
	at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:1319)
	at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:226)
	at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:109)
	at com.sicap.targetme.bi.kmeans.KMeansClustering.main(KMeansClustering.java:54)

DEBUG Groups -  Creating new Groups object
DEBUG Groups - Group mapping impl=org.apache.hadoop.security.ShellBasedUnixGroupsMapping; cacheTimeout=300000
DEBUG FileSystem - Creating filesystem for file:///
INFO  HadoopUtil - Deleting output
INFO  KMeansDriver - Input: testdata/points Clusters In: testdata/clusters Out: output Distance: org.apache.mahout.common.distance.EuclideanDistanceMeasure
INFO  KMeansDriver - convergence: 0.001 max Iterations: 10 num Reduce Tasks: org.apache.mahout.math.VectorWritable Input Vectors: {}
DEBUG KMeansUtil - Read 1 Cluster from testdata/clusters
DEBUG KMeansUtil - Read 1 Cluster from testdata/clusters
DEBUG Configuration - java.io.IOException: config()
	at org.apache.hadoop.conf.Configuration.<init>(Configuration.java:226)
	at org.apache.hadoop.conf.Configuration.<init>(Configuration.java:213)
	at org.apache.mahout.clustering.classify.ClusterClassifier.writePolicy(ClusterClassifier.java:232)
	at org.apache.mahout.clustering.classify.ClusterClassifier.writeToSeqFiles(ClusterClassifier.java:185)
	at org.apache.mahout.clustering.kmeans.KMeansDriver.buildClusters(KMeansDriver.java:224)
	at org.apache.mahout.clustering.kmeans.KMeansDriver.run(KMeansDriver.java:149)
	at com.sicap.targetme.bi.kmeans.KMeansClustering.main(KMeansClustering.java:73)

DEBUG Configuration - java.io.IOException: config()
	at org.apache.hadoop.conf.Configuration.<init>(Configuration.java:226)
	at org.apache.hadoop.conf.Configuration.<init>(Configuration.java:213)
	at org.apache.mahout.clustering.classify.ClusterClassifier.writeToSeqFiles(ClusterClassifier.java:186)
	at org.apache.mahout.clustering.kmeans.KMeansDriver.buildClusters(KMeansDriver.java:224)
	at org.apache.mahout.clustering.kmeans.KMeansDriver.run(KMeansDriver.java:149)
	at com.sicap.targetme.bi.kmeans.KMeansClustering.main(KMeansClustering.java:73)

DEBUG Configuration - java.io.IOException: config()
	at org.apache.hadoop.conf.Configuration.<init>(Configuration.java:226)
	at org.apache.hadoop.conf.Configuration.<init>(Configuration.java:213)
	at org.apache.mahout.clustering.classify.ClusterClassifier.readPolicy(ClusterClassifier.java:221)
	at org.apache.mahout.clustering.iterator.ClusterIterator.iterateMR(ClusterIterator.java:160)
	at org.apache.mahout.clustering.kmeans.KMeansDriver.buildClusters(KMeansDriver.java:229)
	at org.apache.mahout.clustering.kmeans.KMeansDriver.run(KMeansDriver.java:149)
	at com.sicap.targetme.bi.kmeans.KMeansClustering.main(KMeansClustering.java:73)

Cluster Iterator running iteration 1 over priorPath: output/clusters-0
DEBUG Configuration - java.io.IOException: config(config)
	at org.apache.hadoop.conf.Configuration.<init>(Configuration.java:259)
	at org.apache.hadoop.mapred.JobConf.<init>(JobConf.java:343)
	at org.apache.hadoop.mapreduce.JobContext.<init>(JobContext.java:76)
	at org.apache.hadoop.mapreduce.Job.<init>(Job.java:50)
	at org.apache.hadoop.mapreduce.Job.<init>(Job.java:54)
	at org.apache.mahout.clustering.iterator.ClusterIterator.iterateMR(ClusterIterator.java:168)
	at org.apache.mahout.clustering.kmeans.KMeansDriver.buildClusters(KMeansDriver.java:229)
	at org.apache.mahout.clustering.kmeans.KMeansDriver.run(KMeansDriver.java:149)
	at com.sicap.targetme.bi.kmeans.KMeansClustering.main(KMeansClustering.java:73)

DEBUG Configuration - java.io.IOException: config(config)
	at org.apache.hadoop.conf.Configuration.<init>(Configuration.java:259)
	at org.apache.hadoop.mapred.JobConf.<init>(JobConf.java:343)
	at org.apache.hadoop.mapred.LocalJobRunner.<init>(LocalJobRunner.java:420)
	at org.apache.hadoop.mapred.JobClient.init(JobClient.java:468)
	at org.apache.hadoop.mapred.JobClient.<init>(JobClient.java:453)
	at org.apache.hadoop.mapreduce.Job$1.run(Job.java:478)
	at java.security.AccessController.doPrivileged(Native Method)
	at javax.security.auth.Subject.doAs(Unknown Source)
	at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1059)
	at org.apache.hadoop.mapreduce.Job.connect(Job.java:476)
	at org.apache.hadoop.mapreduce.Job.submit(Job.java:464)
	at org.apache.hadoop.mapreduce.Job.waitForCompletion(Job.java:495)
	at org.apache.mahout.clustering.iterator.ClusterIterator.iterateMR(ClusterIterator.java:185)
	at org.apache.mahout.clustering.kmeans.KMeansDriver.buildClusters(KMeansDriver.java:229)
	at org.apache.mahout.clustering.kmeans.KMeansDriver.run(KMeansDriver.java:149)
	at com.sicap.targetme.bi.kmeans.KMeansClustering.main(KMeansClustering.java:73)

DEBUG Configuration - java.io.IOException: config(config)
	at org.apache.hadoop.conf.Configuration.<init>(Configuration.java:259)
	at org.apache.hadoop.mapred.JobConf.<init>(JobConf.java:343)
	at org.apache.hadoop.mapred.LocalJobRunner.<init>(LocalJobRunner.java:421)
	at org.apache.hadoop.mapred.JobClient.init(JobClient.java:468)
	at org.apache.hadoop.mapred.JobClient.<init>(JobClient.java:453)
	at org.apache.hadoop.mapreduce.Job$1.run(Job.java:478)
	at java.security.AccessController.doPrivileged(Native Method)
	at javax.security.auth.Subject.doAs(Unknown Source)
	at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1059)
	at org.apache.hadoop.mapreduce.Job.connect(Job.java:476)
	at org.apache.hadoop.mapreduce.Job.submit(Job.java:464)
	at org.apache.hadoop.mapreduce.Job.waitForCompletion(Job.java:495)
	at org.apache.mahout.clustering.iterator.ClusterIterator.iterateMR(ClusterIterator.java:185)
	at org.apache.mahout.clustering.kmeans.KMeansDriver.buildClusters(KMeansDriver.java:229)
	at org.apache.mahout.clustering.kmeans.KMeansDriver.run(KMeansDriver.java:149)
	at com.sicap.targetme.bi.kmeans.KMeansClustering.main(KMeansClustering.java:73)

Exception in thread "main" java.io.IOException: Failed to set permissions of path: \tmp\hadoop-mynamehere\mapred\staging\mynamehere-965149672\.staging to 0700
	at org.apache.hadoop.fs.FileUtil.checkReturnValue(FileUtil.java:680)
	at org.apache.hadoop.fs.FileUtil.setPermission(FileUtil.java:653)
	at org.apache.hadoop.fs.RawLocalFileSystem.setPermission(RawLocalFileSystem.java:483)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirs(RawLocalFileSystem.java:318)
	at org.apache.hadoop.fs.FilterFileSystem.mkdirs(FilterFileSystem.java:183)
	at org.apache.hadoop.mapreduce.JobSubmissionFiles.getStagingDir(JobSubmissionFiles.java:116)
	at org.apache.hadoop.mapred.JobClient$2.run(JobClient.java:813)
	at org.apache.hadoop.mapred.JobClient$2.run(JobClient.java:807)
	at java.security.AccessController.doPrivileged(Native Method)
	at javax.security.auth.Subject.doAs(Unknown Source)
	at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1059)
	at org.apache.hadoop.mapred.JobClient.submitJobInternal(JobClient.java:807)
	at org.apache.hadoop.mapreduce.Job.submit(Job.java:465)
	at org.apache.hadoop.mapreduce.Job.waitForCompletion(Job.java:495)
	at org.apache.mahout.clustering.iterator.ClusterIterator.iterateMR(ClusterIterator.java:185)
	at org.apache.mahout.clustering.kmeans.KMeansDriver.buildClusters(KMeansDriver.java:229)
	at org.apache.mahout.clustering.kmeans.KMeansDriver.run(KMeansDriver.java:149)
	at com.sicap.targetme.bi.kmeans.KMeansClustering.main(KMeansClustering.java:73)
DEBUG FileSystem - Starting clear of FileSystem cache with 1 elements.
DEBUG FileSystem - Removing filesystem for file:///
DEBUG FileSystem - Removing filesystem for file:///
DEBUG FileSystem - Done clearing cache

So, what should I do now? Is this not a problem with the Windows platform?

Thank you in advance
Aleksander Sadecki

Re: Apache Mahout - KMeans Clustering

Posted by tuxdna <tu...@gmail.com>.
You are using the 0.9 version of Mahout and the 1.0 version of
mahout-collections. The API might have changed considerably.

I suggest you checkout the code from here:
https://github.com/tdunning/MiA/tree/mahout-0.7

This code works with mahout-0.7

Regards,
Saleem


On Wed, May 21, 2014 at 4:49 PM, Aleksander Sadecki
<al...@pi.esisar.grenoble-inp.fr> wrote:
> Hi,
>
> I am following the book Mahout In Action.
>
> I downloaded sources and I am trying to run this piece of code:
>
> import java.io.File;
> import java.io.IOException;
> import java.util.ArrayList;
> import java.util.List;
>
> import org.apache.hadoop.conf.Configuration;
> import org.apache.hadoop.fs.FileSystem;
> import org.apache.hadoop.fs.Path;
> import org.apache.hadoop.io.IntWritable;
> import org.apache.hadoop.io.LongWritable;
> import org.apache.hadoop.io.SequenceFile;
> import org.apache.hadoop.io.Text;
> import org.apache.mahout.clustering.WeightedVectorWritable;
> import org.apache.mahout.clustering.kmeans.Cluster;
> import org.apache.mahout.clustering.kmeans.KMeansDriver;
> import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
> import org.apache.mahout.math.RandomAccessSparseVector;
> import org.apache.mahout.math.Vector;
> import org.apache.mahout.math.VectorWritable;
>
> public class KMeansClustering {
>     public static final double[][] points = { { 1, 1 }, { 2, 1 }, { 1, 2 },
>             { 2, 2 }, { 3, 3 }, { 8, 8 }, { 9, 8 }, { 8, 9 }, { 9, 9 } };
>
>     public static void writePointsToFile(List<Vector> points, String fileName,
>             FileSystem fs, Configuration conf) throws IOException {
>         Path path = new Path(fileName);
>         SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path,
>                 LongWritable.class, VectorWritable.class);
>         long recNum = 0;
>         VectorWritable vec = new VectorWritable();
>         for (Vector point : points) {
>             vec.set(point);
>             writer.append(new LongWritable(recNum++), vec);
>         }
>         writer.close();
>     }
>
>     public static List<Vector> getPoints(double[][] raw) {
>         List<Vector> points = new ArrayList<Vector>();
>         for (int i = 0; i < raw.length; i++) {
>             double[] fr = raw[i];
>             Vector vec = new RandomAccessSparseVector(fr.length);
>             vec.assign(fr);
>             points.add(vec);
>         }
>         return points;
>     }
>
>     public static void main(String args[]) throws Exception {
>
>         int k = 2;
>
>         List<Vector> vectors = getPoints(points);
>
>         File testData = new File("testdata");
>         if (!testData.exists()) {
>             testData.mkdir();
>         }
>         testData = new File("testdata/points");
>         if (!testData.exists()) {
>             testData.mkdir();
>         }
>
>         Configuration conf = new Configuration();
>         FileSystem fs = FileSystem.get(conf);
>         writePointsToFile(vectors, "testdata/points/file1", fs, conf);
>
>         Path path = new Path("testdata/clusters/part-00000");
>         SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path,
>                 Text.class, Cluster.class);
>
>         for (int i = 0; i < k; i++) {
>             Vector vec = vectors.get(i);
>             Cluster cluster = new Cluster(vec, i,
>                     new EuclideanDistanceMeasure());
>             writer.append(new Text(cluster.getIdentifier()), cluster);
>         }
>         writer.close();
>
>         KMeansDriver.run(conf, new Path("testdata/points"), new Path(
>                 "testdata/clusters"), new Path("output"),
>                 new EuclideanDistanceMeasure(), 0.001, 10, true, false);
>
>         SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path(
>                 "output/" + Cluster.CLUSTERED_POINTS_DIR + "/part-m-00000"),
>                 conf);
>
>         IntWritable key = new IntWritable();
>         WeightedVectorWritable value = new WeightedVectorWritable();
>         while (reader.next(key, value)) {
>             System.out.println(value.toString() + " belongs to cluster "
>                     + key.toString());
>         }
>         reader.close();
>     }
>
> }
>
> In fact, I have got a problem with 2 imports:
>
> import org.apache.mahout.clustering.WeightedVectorWritable;
> import org.apache.mahout.clustering.kmeans.Cluster;
>
> and with line:
>
> KMeansDriver.run(...)
>
> which gives an error
>
> The method run(Configuration, Path, Path, Path, double, int, boolean, double, boolean) in the type KMeansDriver is not applicable for the arguments
>  (Configuration, Path, Path, Path, EuclideanDistanceMeasure, double, int, boolean, boolean)
>
> I think I solved it a little bit. I changed
>
> import org.apache.mahout.clustering.WeightedVectorWritable;
>
> to
>
> import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable;
>
> but I cannot find a solution for other ones. I found Cluster in package org.apache.mahout.clustering but it is an Interface.
>
> Thank you for any help.
>
> My pom.xml
>
>     <dependency>
>         <groupId>org.apache.mahout</groupId>
>         <artifactId>mahout-core</artifactId>
>         <version>0.9</version>
>     </dependency>
>     <dependency>
>         <groupId>org.apache.mahout</groupId>
>         <artifactId>mahout-math</artifactId>
>         <version>0.9</version>
>     </dependency>
>     <dependency>
>         <groupId>org.apache.mahout</groupId>
>         <artifactId>mahout-collections</artifactId>
>         <version>1.0</version>
>     </dependency>
>