Posted to dev@mahout.apache.org by tom pierce <tc...@apache.org> on 2012/03/12 19:30:55 UTC
Re: svn commit: r1299770 - in /mahout/trunk: ./ core/ core/src/main/java/org/apache/mahout/common/ core/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/ core/src/test/java/org/apache/mahout/clustering/canopy/ core/src/test/java/org/apache/m...
Can someone hook me up with JIRA privs so I can close tickets?
(Or, if that isn't something all committers get, someone pls mark -822 and -980 closed)
-tom
On 03/12/2012 02:25 PM, tcp@apache.org wrote:
> Author: tcp
> Date: Mon Mar 12 18:25:45 2012
> New Revision: 1299770
>
> URL: http://svn.apache.org/viewvc?rev=1299770&view=rev
> Log:
> MAHOUT-822: Make Mahout compatible with Hadoop 0.23.1.
>
> Modified:
> mahout/trunk/core/pom.xml
> mahout/trunk/core/src/main/java/org/apache/mahout/common/HadoopUtil.java
> mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/MockContext.java
> mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/PartialSequentialBuilder.java
> mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/Step1MapperTest.java
> mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java
> mahout/trunk/core/src/test/java/org/apache/mahout/clustering/classify/ClusterClassificationDriverTest.java
> mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java
> mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java
> mahout/trunk/core/src/test/java/org/apache/mahout/common/DummyCounter.java
> mahout/trunk/core/src/test/java/org/apache/mahout/common/DummyRecordWriter.java
> mahout/trunk/core/src/test/java/org/apache/mahout/common/DummyStatusReporter.java
> mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/TestDistributedRowMatrix.java
> mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
> mahout/trunk/pom.xml
>
> Modified: mahout/trunk/core/pom.xml
> URL: http://svn.apache.org/viewvc/mahout/trunk/core/pom.xml?rev=1299770&r1=1299769&r2=1299770&view=diff
> ==============================================================================
> --- mahout/trunk/core/pom.xml (original)
> +++ mahout/trunk/core/pom.xml Mon Mar 12 18:25:45 2012
> @@ -140,10 +140,6 @@
>
> <!-- Third Party -->
> <dependency>
> -<groupId>org.apache.hadoop</groupId>
> -<artifactId>hadoop-core</artifactId>
> -</dependency>
> -<dependency>
> <groupId>org.codehaus.jackson</groupId>
> <artifactId>jackson-core-asl</artifactId>
> </dependency>
> @@ -211,4 +207,43 @@
> </dependency>
>
> </dependencies>
> +
> +<profiles>
> +<profile>
> +<id>hadoop-0.20</id>
> +<activation>
> +<property>
> +<name>!hadoop.version</name>
> +</property>
> +</activation>
> +<dependencies>
> +<dependency>
> +<groupId>org.apache.hadoop</groupId>
> +<artifactId>hadoop-core</artifactId>
> +</dependency>
> +</dependencies>
> +</profile>
> +<profile>
> +<id>hadoop-0.23</id>
> +<activation>
> +<property>
> +<name>hadoop.version</name>
> +</property>
> +</activation>
> +<dependencies>
> +<dependency>
> +<groupId>org.apache.hadoop</groupId>
> +<artifactId>hadoop-common</artifactId>
> +</dependency>
> +<dependency>
> +<groupId>org.apache.hadoop</groupId>
> +<artifactId>hadoop-mapreduce-client-common</artifactId>
> +</dependency>
> +<dependency>
> +<groupId>org.apache.hadoop</groupId>
> +<artifactId>hadoop-mapreduce-client-core</artifactId>
> +</dependency>
> +</dependencies>
> +</profile>
> +</profiles>
> </project>
>
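The two profiles key off the hadoop.version property: leaving it unset activates hadoop-0.20 and the monolithic hadoop-core artifact, while defining it activates hadoop-0.23 and the split common/mapreduce-client artifacts. So a 0.23 build would presumably be selected with something like

    mvn clean install -Dhadoop.version=0.23.1

while a plain "mvn clean install" keeps the 0.20 default.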
> Modified: mahout/trunk/core/src/main/java/org/apache/mahout/common/HadoopUtil.java
> URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/common/HadoopUtil.java?rev=1299770&r1=1299769&r2=1299770&view=diff
> ==============================================================================
> --- mahout/trunk/core/src/main/java/org/apache/mahout/common/HadoopUtil.java (original)
> +++ mahout/trunk/core/src/main/java/org/apache/mahout/common/HadoopUtil.java Mon Mar 12 18:25:45 2012
> @@ -17,6 +17,7 @@
>
> package org.apache.mahout.common;
>
> +import java.io.FileNotFoundException;
> import java.io.IOException;
> import java.io.InputStream;
> import java.net.URI;
> @@ -229,9 +230,9 @@ public final class HadoopUtil {
> FileStatus[] statuses;
> FileSystem fs = path.getFileSystem(conf);
> if (filter == null) {
> - statuses = pathType == PathType.GLOB ? fs.globStatus(path) : fs.listStatus(path);
> + statuses = pathType == PathType.GLOB ? fs.globStatus(path) : listStatus(fs, path);
> } else {
> - statuses = pathType == PathType.GLOB ? fs.globStatus(path, filter) : fs.listStatus(path, filter);
> + statuses = pathType == PathType.GLOB ? fs.globStatus(path, filter) : listStatus(fs, path, filter);
> }
> if (ordering != null) {
> Arrays.sort(statuses, ordering);
> @@ -239,6 +240,22 @@ public final class HadoopUtil {
> return statuses;
> }
>
> + public static FileStatus[] listStatus(FileSystem fs, Path path) throws IOException {
> + try {
> + return fs.listStatus(path);
> + } catch (FileNotFoundException e) {
> + return new FileStatus[0];
> + }
> + }
> +
> + public static FileStatus[] listStatus(FileSystem fs, Path path, PathFilter filter) throws IOException {
> + try {
> + return fs.listStatus(path, filter);
> + } catch (FileNotFoundException e) {
> + return new FileStatus[0];
> + }
> + }
> +
> public static void cacheFiles(Path fileToCache, Configuration conf) {
> DistributedCache.setCacheFiles(new URI[]{fileToCache.toUri()}, conf);
> }
>
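These listStatus() wrappers paper over a behavior change: on 0.23, FileSystem.listStatus() throws FileNotFoundException for a missing path, where 0.20 simply returned nothing. Callers probing output directories that may not exist yet get the old behavior back on both versions; a minimal sketch of the intended call pattern (the path here is hypothetical):

    FileSystem fs = FileSystem.get(conf);
    // returns an empty array instead of throwing if the directory is absent
    FileStatus[] parts = HadoopUtil.listStatus(fs, new Path("output/clusters-0"));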
> Modified: mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/MockContext.java
> URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/MockContext.java?rev=1299770&r1=1299769&r2=1299770&view=diff
> ==============================================================================
> --- mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/MockContext.java (original)
> +++ mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/MockContext.java Mon Mar 12 18:25:45 2012
> @@ -1,70 +0,0 @@
> -/**
> - * Licensed to the Apache Software Foundation (ASF) under one or more
> - * contributor license agreements. See the NOTICE file distributed with
> - * this work for additional information regarding copyright ownership.
> - * The ASF licenses this file to You under the Apache License, Version 2.0
> - * (the "License"); you may not use this file except in compliance with
> - * the License. You may obtain a copy of the License at
> - *
> - * http://www.apache.org/licenses/LICENSE-2.0
> - *
> - * Unless required by applicable law or agreed to in writing, software
> - * distributed under the License is distributed on an "AS IS" BASIS,
> - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> - * See the License for the specific language governing permissions and
> - * limitations under the License.
> - */
> -package org.apache.mahout.classifier.df.mapreduce.partial;
> -
> -import java.io.IOException;
> -
> -import org.apache.hadoop.conf.Configuration;
> -import org.apache.hadoop.mapreduce.Mapper;
> -import org.apache.hadoop.mapreduce.TaskAttemptID;
> -import org.apache.hadoop.mapreduce.Mapper.Context;
> -import org.apache.mahout.classifier.df.mapreduce.MapredOutput;
> -
> -/**
> - * Special implementation that collects the output of the mappers
> - */
> -final class MockContext extends Context {
> -
> - private final TreeID[] keys;
> - private final MapredOutput[] values;
> - private int index;
> -
> - MockContext(Mapper<?,?,?,?> mapper, Configuration conf, TaskAttemptID taskid, int nbTrees)
> - throws IOException, InterruptedException {
> - mapper.super(conf, taskid, null, null, null, null, null);
> -
> - keys = new TreeID[nbTrees];
> - values = new MapredOutput[nbTrees];
> - }
> -
> - @Override
> - public void write(Object key, Object value) throws IOException {
> - if (index == keys.length) {
> - throw new IOException("Received more output than expected : " + index);
> - }
> -
> - keys[index] = ((TreeID) key).clone();
> - values[index] = ((MapredOutput) value).clone();
> -
> - index++;
> - }
> -
> - /**
> - * @return number of outputs collected
> - */
> - public int nbOutputs() {
> - return index;
> - }
> -
> - public TreeID[] getKeys() {
> - return keys;
> - }
> -
> - public MapredOutput[] getValues() {
> - return values;
> - }
> -}
>
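MockContext worked by subclassing Mapper.Context, which no longer compiles against 0.23: there the Context type is abstract and real instances come from MapContextImpl wrapped by WrappedMapper. The tests below switch to an EasyMock-built context instead.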
> Modified: mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/PartialSequentialBuilder.java
> URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/PartialSequentialBuilder.java?rev=1299770&r1=1299769&r2=1299770&view=diff
> ==============================================================================
> --- mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/PartialSequentialBuilder.java (original)
> +++ mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/PartialSequentialBuilder.java Mon Mar 12 18:25:45 2012
> @@ -1,176 +0,0 @@
> -/**
> - * Licensed to the Apache Software Foundation (ASF) under one or more
> - * contributor license agreements. See the NOTICE file distributed with
> - * this work for additional information regarding copyright ownership.
> - * The ASF licenses this file to You under the Apache License, Version 2.0
> - * (the "License"); you may not use this file except in compliance with
> - * the License. You may obtain a copy of the License at
> - *
> - * http://www.apache.org/licenses/LICENSE-2.0
> - *
> - * Unless required by applicable law or agreed to in writing, software
> - * distributed under the License is distributed on an "AS IS" BASIS,
> - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> - * See the License for the specific language governing permissions and
> - * limitations under the License.
> - */
> -
> -package org.apache.mahout.classifier.df.mapreduce.partial;
> -
> -import java.io.IOException;
> -import java.util.List;
> -
> -import org.apache.commons.lang.ArrayUtils;
> -import org.apache.hadoop.conf.Configuration;
> -import org.apache.hadoop.fs.Path;
> -import org.apache.hadoop.io.LongWritable;
> -import org.apache.hadoop.io.Text;
> -import org.apache.hadoop.mapreduce.InputSplit;
> -import org.apache.hadoop.mapreduce.Job;
> -import org.apache.hadoop.mapreduce.RecordReader;
> -import org.apache.hadoop.mapreduce.TaskAttemptContext;
> -import org.apache.hadoop.mapreduce.TaskAttemptID;
> -import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
> -import org.apache.mahout.classifier.df.DFUtils;
> -import org.apache.mahout.classifier.df.DecisionForest;
> -import org.apache.mahout.classifier.df.builder.TreeBuilder;
> -import org.apache.mahout.classifier.df.data.Dataset;
> -import org.apache.mahout.classifier.df.mapreduce.Builder;
> -import org.apache.mahout.classifier.df.mapreduce.MapredOutput;
> -import org.apache.mahout.classifier.df.node.Node;
> -import org.slf4j.Logger;
> -import org.slf4j.LoggerFactory;
> -
> -import com.google.common.collect.Lists;
> -
> -/**
> - * Simulates the Partial mapreduce implementation in a sequential manner. Must
> - * receive a seed
> - */
> -public class PartialSequentialBuilder extends PartialBuilder {
> -
> - private static final Logger log = LoggerFactory.getLogger(PartialSequentialBuilder.class);
> -
> - private MockContext firstOutput;
> -
> - private final Dataset dataset;
> -
> - public PartialSequentialBuilder(TreeBuilder treeBuilder, Path dataPath,
> - Dataset dataset, long seed, Configuration conf) {
> - super(treeBuilder, dataPath, new Path("notUsed"), seed, conf);
> - this.dataset = dataset;
> - }
> -
> - public PartialSequentialBuilder(TreeBuilder treeBuilder, Path dataPath,
> - Dataset dataset, long seed) {
> - this(treeBuilder, dataPath, dataset, seed, new Configuration());
> - }
> -
> - @Override
> - protected void configureJob(Job job)
> - throws IOException {
> - Configuration conf = job.getConfiguration();
> -
> - int num = conf.getInt("mapred.map.tasks", -1);
> -
> - super.configureJob(job);
> -
> - // PartialBuilder sets the number of maps to 1 if we are running in 'local'
> - conf.setInt("mapred.map.tasks", num);
> - }
> -
> - @Override
> - protected boolean runJob(Job job) throws IOException, InterruptedException {
> - Configuration conf = job.getConfiguration();
> -
> - // retrieve the splits
> - TextInputFormat input = new TextInputFormat();
> - List<InputSplit> splits = input.getSplits(job);
> -
> - int nbSplits = splits.size();
> - log.debug("Nb splits : {}", nbSplits);
> -
> - InputSplit[] sorted = new InputSplit[nbSplits];
> - splits.toArray(sorted);
> - Builder.sortSplits(sorted);
> -
> - int numTrees = Builder.getNbTrees(conf); // total number of trees
> -
> - TaskAttemptContext task = new TaskAttemptContext(conf, new TaskAttemptID());
> -
> - firstOutput = new MockContext(new Step1Mapper(), conf, task.getTaskAttemptID(), numTrees);
> -
> - /* first instance id in hadoop's order */
> - //int[] firstIds = new int[nbSplits];
> - /* partitions' sizes in hadoop order */
> - int[] sizes = new int[nbSplits];
> -
> - // to compute firstIds, process the splits in file order
> - long slowest = 0; // duration of slowest map
> - int firstId = 0;
> - for (InputSplit split : splits) {
> - int hp = ArrayUtils.indexOf(sorted, split); // hadoop's partition
> -
> - RecordReader<LongWritable, Text> reader = input.createRecordReader(split, task);
> - reader.initialize(split, task);
> -
> - Step1Mapper mapper = new MockStep1Mapper(getTreeBuilder(), dataset, getSeed(),
> - hp, nbSplits, numTrees);
> -
> - long time = System.currentTimeMillis();
> -
> - //firstIds[hp] = firstId;
> -
> - while (reader.nextKeyValue()) {
> - mapper.map(reader.getCurrentKey(), reader.getCurrentValue(), firstOutput);
> - firstId++;
> - sizes[hp]++;
> - }
> -
> - mapper.cleanup(firstOutput);
> -
> - time = System.currentTimeMillis() - time;
> - log.info("Duration : {}", DFUtils.elapsedTime(time));
> -
> - if (time > slowest) {
> - slowest = time;
> - }
> - }
> -
> - log.info("Longest duration : {}", DFUtils.elapsedTime(slowest));
> - return true;
> - }
> -
> - @Override
> - protected DecisionForest parseOutput(Job job) throws IOException {
> - return processOutput(firstOutput.getKeys(), firstOutput.getValues());
> - }
> -
> - /**
> - * extract the decision forest
> - */
> - protected static DecisionForest processOutput(TreeID[] keys, MapredOutput[] values) {
> - List<Node> trees = Lists.newArrayList();
> -
> - for (int index = 0; index < keys.length; index++) {
> - MapredOutput value = values[index];
> - trees.add(value.getTree());
> - }
> -
> - return new DecisionForest(trees);
> - }
> -
> - /**
> - * Special Step1Mapper that can be configured without using a Configuration
> - *
> - */
> - private static class MockStep1Mapper extends Step1Mapper {
> - protected MockStep1Mapper(TreeBuilder treeBuilder, Dataset dataset, Long seed,
> - int partition, int numMapTasks, int numTrees) {
> - configure(false, treeBuilder, dataset);
> - configure(seed, partition, numMapTasks, numTrees);
> - }
> -
> - }
> -
> -}
>
> Modified: mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/Step1MapperTest.java
> URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/Step1MapperTest.java?rev=1299770&r1=1299769&r2=1299770&view=diff
> ==============================================================================
> --- mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/Step1MapperTest.java (original)
> +++ mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/Step1MapperTest.java Mon Mar 12 18:25:45 2012
> @@ -17,21 +17,30 @@
>
> package org.apache.mahout.classifier.df.mapreduce.partial;
>
> +import static org.easymock.EasyMock.anyObject;
> +import static org.easymock.EasyMock.capture;
> +import static org.easymock.EasyMock.createMock;
> +import static org.easymock.EasyMock.expectLastCall;
> +import static org.easymock.EasyMock.replay;
> +import static org.easymock.EasyMock.verify;
> +
> import java.util.Random;
>
> -import org.apache.hadoop.conf.Configuration;
> import org.apache.hadoop.io.LongWritable;
> import org.apache.hadoop.io.Text;
> -import org.apache.hadoop.mapreduce.TaskAttemptID;
> -import org.apache.mahout.common.MahoutTestCase;
> +import org.apache.hadoop.mapreduce.Mapper;
> import org.apache.mahout.common.RandomUtils;
> import org.apache.mahout.classifier.df.builder.TreeBuilder;
> import org.apache.mahout.classifier.df.data.Data;
> import org.apache.mahout.classifier.df.data.DataLoader;
> import org.apache.mahout.classifier.df.data.Dataset;
> import org.apache.mahout.classifier.df.data.Utils;
> +import org.apache.mahout.classifier.df.mapreduce.MapredOutput;
> import org.apache.mahout.classifier.df.node.Leaf;
> import org.apache.mahout.classifier.df.node.Node;
> +import org.apache.mahout.common.MahoutTestCase;
> +import org.easymock.Capture;
> +import org.easymock.CaptureType;
> import org.junit.Test;
>
> public final class Step1MapperTest extends MahoutTestCase {
> @@ -71,6 +80,17 @@ public final class Step1MapperTest exten
> }
> }
>
> + private static class TreeIDCapture extends Capture<TreeID> {
> +
> + public TreeIDCapture() {
> + super(CaptureType.ALL);
> + }
> +
> + public void setValue(final TreeID value) {
> + super.setValue(value.clone());
> + }
> + }
> +
> /** nb attributes per generated data instance */
> static final int NUM_ATTRIBUTES = 4;
>
> @@ -83,6 +103,7 @@ public final class Step1MapperTest exten
> /** nb mappers to use */
> static final int NUM_MAPPERS = 2;
>
> + @SuppressWarnings({ "rawtypes", "unchecked" })
> @Test
> public void testMapper() throws Exception {
> Long seed = null;
> @@ -109,8 +130,13 @@ public final class Step1MapperTest exten
> // expected number of trees that this mapper will build
> int mapNbTrees = Step1Mapper.nbTrees(NUM_MAPPERS, NUM_TREES, partition);
>
> - MockContext context = new MockContext(new Step1Mapper(),
> - new Configuration(), new TaskAttemptID(), mapNbTrees);
> + Mapper.Context context =
> + createMock(Mapper.Context.class);
> + Capture<TreeID> capturedKeys = new TreeIDCapture();
> + context.write(capture(capturedKeys), anyObject());
> + expectLastCall().anyTimes();
> +
> + replay(context);
>
> MockStep1Mapper mapper = new MockStep1Mapper(treeBuilder, dataset, seed,
> partition, NUM_MAPPERS, NUM_TREES);
> @@ -125,12 +151,13 @@ public final class Step1MapperTest exten
> }
>
> mapper.cleanup(context);
> + verify(context);
>
> // make sure the mapper built all its trees
> - assertEquals(mapNbTrees, context.nbOutputs());
> + assertEquals(mapNbTrees, capturedKeys.getValues().size());
>
> // check the returned keys
> - for (TreeID k : context.getKeys()) {
> + for (TreeID k : capturedKeys.getValues()) {
> assertEquals(partition, k.partition());
> assertEquals(treeIndex, k.treeId());
>
>
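Note the TreeIDCapture above: it captures a clone rather than the reference, presumably because the mapper reuses and mutates the same TreeID key across write() calls (the deleted MockContext cloned for the same reason). A condensed sketch of the resulting pattern, with expectedTrees standing in for the computed count:

    Capture<TreeID> keys = new TreeIDCapture();   // CaptureType.ALL, clones on capture
    context.write(capture(keys), anyObject());
    expectLastCall().anyTimes();
    replay(context);
    // ... run the mapper against context ...
    verify(context);
    assertEquals(expectedTrees, keys.getValues().size());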
> Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java
> URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java?rev=1299770&r1=1299769&r2=1299770&view=diff
> ==============================================================================
> --- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java (original)
> +++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java Mon Mar 12 18:25:45 2012
> @@ -34,6 +34,7 @@ import org.apache.mahout.clustering.Clus
> import org.apache.mahout.common.DummyRecordWriter;
> import org.apache.mahout.common.HadoopUtil;
> import org.apache.mahout.common.MahoutTestCase;
> +import org.apache.mahout.common.Pair;
> import org.apache.mahout.common.commandline.DefaultOptionCreator;
> import org.apache.mahout.common.distance.DistanceMeasure;
> import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
> @@ -126,8 +127,8 @@ public final class TestCanopyCreation ex
> int[] expectedNumPoints = { 4, 4, 3 };
> double[][] expectedCentroids = { { 1.5, 1.5 }, { 4.0, 4.0 },
> { 4.666666666666667, 4.6666666666666667 } };
> - assertEquals("canopy points " + canopyIx, expectedNumPoints[canopyIx],
> - testCanopy.getNumObservations());
> + assertEquals("canopy points " + canopyIx, testCanopy.getNumObservations(),
> + expectedNumPoints[canopyIx]);
> double[] refCentroid = expectedCentroids[canopyIx];
> Vector testCentroid = testCanopy.computeCentroid();
> for (int pointIx = 0; pointIx < refCentroid.length; pointIx++) {
> @@ -151,8 +152,8 @@ public final class TestCanopyCreation ex
> { 4.666666666666667, 4.666666666666667 } };
> for (int canopyIx = 0; canopyIx < referenceEuclidean.size(); canopyIx++) {
> Canopy testCanopy = referenceEuclidean.get(canopyIx);
> - assertEquals("canopy points " + canopyIx, expectedNumPoints[canopyIx],
> - testCanopy.getNumObservations());
> + assertEquals("canopy points " + canopyIx, testCanopy.getNumObservations(),
> + expectedNumPoints[canopyIx]);
> double[] refCentroid = expectedCentroids[canopyIx];
> Vector testCentroid = testCanopy.computeCentroid();
> for (int pointIx = 0; pointIx < refCentroid.length; pointIx++) {
> @@ -328,20 +329,36 @@ public final class TestCanopyCreation ex
> Canopy canopy = new Canopy();
> assertTrue("more to come", reader.next(key, canopy));
> assertEquals("1st key", "C-0", key.toString());
> - assertEquals("1st x value", 1.5, canopy.getCenter().get(0), EPSILON);
> - assertEquals("1st y value", 1.5, canopy.getCenter().get(1), EPSILON);
> +
> + List<Pair<Double,Double>> refCenters = Lists.newArrayList();
> + refCenters.add(new Pair<Double,Double>(1.5,1.5));
> + refCenters.add(new Pair<Double,Double>(4.333333333333334,4.333333333333334));
> + Pair<Double,Double> c = new Pair<Double,Double>(canopy.getCenter().get(0),
> + canopy.getCenter().get(1));
> + assertTrue("center "+c+" not found", findAndRemove(c, refCenters, EPSILON));
> assertTrue("more to come", reader.next(key, canopy));
> assertEquals("2nd key", "C-1", key.toString());
> - assertEquals("2nd x value", 4.333333333333334, canopy.getCenter().get(0),
> - EPSILON);
> - assertEquals("2nd y value", 4.333333333333334, canopy.getCenter().get(1),
> - EPSILON);
> + c = new Pair<Double,Double>(canopy.getCenter().get(0),
> + canopy.getCenter().get(1));
> + assertTrue("center "+c+" not found", findAndRemove(c, refCenters, EPSILON));
> assertFalse("more to come", reader.next(key, canopy));
> } finally {
> Closeables.closeQuietly(reader);
> }
> }
>
> + boolean findAndRemove(Pair<Double,Double> target,
> + List<Pair<Double,Double>> list, double epsilon) {
> + for (Pair<Double,Double> curr : list) {
> + if ( (Math.abs(target.getFirst() - curr.getFirst()) < epsilon)
> +     && (Math.abs(target.getSecond() - curr.getSecond()) < epsilon) ) {
> + list.remove(curr);
> + return true;
> + }
> + }
> + return false;
> + }
> +
> /**
> * Story: User can produce final canopy centers using a Hadoop map/reduce job
> * and a EuclideanDistanceMeasure.
> @@ -368,14 +385,18 @@ public final class TestCanopyCreation ex
> Canopy value = new Canopy();
> assertTrue("more to come", reader.next(key, value));
> assertEquals("1st key", "C-0", key.toString());
> - assertEquals("1st x value", 1.8, value.getCenter().get(0), EPSILON);
> - assertEquals("1st y value", 1.8, value.getCenter().get(1), EPSILON);
> +
> + List<Pair<Double,Double>> refCenters = Lists.newArrayList();
> + refCenters.add(new Pair<Double,Double>(1.8,1.8));
> + refCenters.add(new Pair<Double,Double>(4.433333333333334, 4.433333333333334));
> + Pair<Double,Double> c = new Pair<Double,Double>(value.getCenter().get(0),
> + value.getCenter().get(1));
> + assertTrue("center "+c+" not found", findAndRemove(c, refCenters, EPSILON));
> assertTrue("more to come", reader.next(key, value));
> assertEquals("2nd key", "C-1", key.toString());
> - assertEquals("2nd x value", 4.433333333333334, value.getCenter().get(0),
> - EPSILON);
> - assertEquals("2nd y value", 4.433333333333334, value.getCenter().get(1),
> - EPSILON);
> + c = new Pair<Double,Double>(value.getCenter().get(0),
> + value.getCenter().get(1));
> + assertTrue("center "+c+" not found", findAndRemove(c, refCenters, EPSILON));
> assertFalse("more to come", reader.next(key, value));
> } finally {
> Closeables.closeQuietly(reader);
>
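The fixed-order center assertions are replaced, here and in the classification and k-means tests further down, with order-independent checks: build a list of reference centers and findAndRemove() each observed center within EPSILON. The clusters produced are the same, but evidently their output order is not stable across Hadoop versions.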
> Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/classify/ClusterClassificationDriverTest.java
> URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/classify/ClusterClassificationDriverTest.java?rev=1299770&r1=1299769&r2=1299770&view=diff
> ==============================================================================
> --- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/classify/ClusterClassificationDriverTest.java (original)
> +++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/classify/ClusterClassificationDriverTest.java Mon Mar 12 18:25:45 2012
> @@ -20,6 +20,9 @@ package org.apache.mahout.clustering.cla
> import java.io.IOException;
> import java.util.ArrayList;
> import java.util.List;
> +import java.util.Set;
> +
> +import com.google.common.collect.Sets;
>
> import junit.framework.Assert;
>
> @@ -195,9 +198,7 @@ public class ClusterClassificationDriver
> }
>
> private void assertVectorsWithOutlierRemoval() {
> - assertFirstClusterWithOutlierRemoval();
> - assertSecondClusterWithOutlierRemoval();
> - assertThirdClusterWithOutlierRemoval();
> + checkClustersWithOutlierRemoval();
> }
>
> private void assertVectorsWithoutOutlierRemoval() {
> @@ -230,25 +231,33 @@ public class ClusterClassificationDriver
> "{1:1.0,0:2.0}", "{1:2.0,0:1.0}"}, vector.asFormatString()));
> }
> }
> -
> - private void assertThirdClusterWithOutlierRemoval() {
> - Assert.assertEquals(1, thirdCluster.size());
> - for (Vector vector : thirdCluster) {
> - Assert.assertTrue(ArrayUtils.contains(new String[] {"{1:9.0,0:9.0}"},
> - vector.asFormatString()));
> - }
> - }
> -
> - private void assertSecondClusterWithOutlierRemoval() {
> - Assert.assertEquals(0, secondCluster.size());
> - }
> -
> - private void assertFirstClusterWithOutlierRemoval() {
> - Assert.assertEquals(1, firstCluster.size());
> - for (Vector vector : firstCluster) {
> - Assert.assertTrue(ArrayUtils.contains(new String[] {"{1:1.0,0:1.0}"},
> - vector.asFormatString()));
> - }
> +
> + private void checkClustersWithOutlierRemoval() {
> + Set<String> reference = Sets.newHashSet(new String[] {"{1:9.0,0:9.0}",
> + "{1:1.0,0:1.0}"});
> + int singletonCnt = 0;
> + int emptyCnt = 0;
> +
> + List<List<Vector>> clusters = Lists.newArrayList();
> + clusters.add(firstCluster);
> + clusters.add(secondCluster);
> + clusters.add(thirdCluster);
> +
> + for (List<Vector> vList : clusters) {
> + if (vList.size() == 0) {
> + emptyCnt++;
> + } else {
> + singletonCnt++;
> + Assert.assertTrue("expecting only singleton clusters; got size=" + vList.size(),
> + vList.size() == 1);
> + Assert.assertTrue("not expecting cluster:" + vList.get(0).asFormatString(),
> + reference.contains(vList.get(0).asFormatString()));
> + reference.remove(vList.get(0).asFormatString());
> + }
> + }
> + Assert.assertEquals("Different number of empty clusters than expected!", 1, emptyCnt);
> + Assert.assertEquals("Different number of singletons than expected!", 2, singletonCnt);
> + Assert.assertEquals("Didn't match all reference clusters!", 0, reference.size());
> }
> -
> +
> }
>
> Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java
> URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java?rev=1299770&r1=1299769&r2=1299770&view=diff
> ==============================================================================
> --- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java (original)
> +++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java Mon Mar 12 18:25:45 2012
> @@ -26,6 +26,7 @@ import com.google.common.collect.Lists;
> import com.google.common.collect.Maps;
> import com.google.common.io.Closeables;
> import org.apache.hadoop.conf.Configuration;
> +import org.apache.hadoop.fs.FileStatus;
> import org.apache.hadoop.fs.FileSystem;
> import org.apache.hadoop.fs.Path;
> import org.apache.hadoop.io.IntWritable;
> @@ -38,6 +39,7 @@ import org.apache.hadoop.util.ToolRunner
> import org.apache.mahout.clustering.AbstractCluster;
> import org.apache.mahout.clustering.ClusterObservations;
> import org.apache.mahout.clustering.ClusteringTestUtils;
> +import org.apache.mahout.clustering.canopy.Canopy;
> import org.apache.mahout.clustering.canopy.CanopyDriver;
> import org.apache.mahout.clustering.classify.WeightedVectorWritable;
> import org.apache.mahout.common.DummyOutputCollector;
> @@ -486,6 +488,42 @@ public final class TestKmeansClustering
> // now run the Canopy job
> CanopyDriver.run(conf, pointsPath, outputPath, new ManhattanDistanceMeasure(), 3.1, 2.1, false, 0.0, false);
>
> + DummyOutputCollector<Text, Canopy> collector1 =
> + new DummyOutputCollector<Text, Canopy>();
> +
> + FileStatus[] outParts = FileSystem.get(conf).globStatus(
> + new Path(outputPath, "clusters-0-final/*-0*"));
> + for (FileStatus outPartStat : outParts) {
> + for (Pair<Text,Canopy> record :
> + new SequenceFileIterable<Text,Canopy>(
> + outPartStat.getPath(), conf)) {
> + collector1.collect(record.getFirst(), record.getSecond());
> + }
> + }
> +
> + boolean got15 = false;
> + boolean got43 = false;
> + int count = 0;
> + for (Text k : collector1.getKeys()) {
> + count++;
> + List<Canopy> vl = collector1.getValue(k);
> + assertEquals("non-singleton centroid!", 1, vl.size());
> + Vector v = vl.get(0).getCenter();
> + assertEquals("cetriod vector is wrong length", 2, v.size());
> + if ( (Math.abs(v.get(0) - 1.5)< EPSILON)
> +&& (Math.abs(v.get(1) - 1.5)< EPSILON)
> +&& !got15) {
> + got15 = true;
> + } else if ( (Math.abs(v.get(0) - 4.333333333333334)< EPSILON)
> +&& (Math.abs(v.get(1) - 4.333333333333334)< EPSILON)
> +&& !got43) {
> + got43 = true;
> + } else {
> + assertTrue("got unexpected center: "+v+" ["+v.getClass().toString()+"]", false);
> + }
> + }
> + assertEquals("got unexpected number of centers", 2, count);
> +
> // now run the KMeans job
> KMeansDriver.run(pointsPath, new Path(outputPath, "clusters-0-final"), outputPath, new EuclideanDistanceMeasure(),
> 0.001, 10, true, false);
> @@ -500,7 +538,28 @@ public final class TestKmeansClustering
> collector.collect(record.getFirst(), record.getSecond());
> }
>
> - assertEquals("num points[0]", 4, collector.getValue(new IntWritable(0)).size());
> - assertEquals("num points[1]", 5, collector.getValue(new IntWritable(1)).size());
> + boolean gotLowClust = false; // clusters should be [1, *] and [2, *]
> + boolean gotHighClust = false; // vs [3 , *], [4 , *] and [5, *]
> + for (IntWritable k : collector.getKeys()) {
> + List<WeightedVectorWritable> wvList = collector.getValue(k);
> + assertTrue("empty cluster!", wvList.size() != 0);
> + if (wvList.get(0).getVector().get(0) <= 2.0) {
> + for (WeightedVectorWritable wv : wvList) {
> + Vector v = wv.getVector();
> + int idx = v.maxValueIndex();
> + assertTrue("bad cluster!", v.get(idx)<= 2.0);
> + }
> + assertEquals("Wrong size cluster", 4, wvList.size());
> + gotLowClust = true;
> + } else {
> + for (WeightedVectorWritable wv : wvList) {
> + Vector v = wv.getVector();
> + int idx = v.minValueIndex();
> + assertTrue("bad cluster!", v.get(idx)> 2.0);
> + }
> + assertEquals("Wrong size cluster", 5, wvList.size());
> + gotHighClust = true;
> + }
> + }
> }
> }
>
> Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java
> URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java?rev=1299770&r1=1299769&r2=1299770&view=diff
> ==============================================================================
> --- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java (original)
> +++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java Mon Mar 12 18:25:45 2012
> @@ -21,10 +21,12 @@ import java.util.Collection;
> import java.util.Iterator;
> import java.util.List;
> import java.util.Map;
> +import java.util.Random;
>
> import com.google.common.collect.Lists;
> import com.google.common.collect.Maps;
> import org.apache.hadoop.conf.Configuration;
> +import org.apache.hadoop.fs.FileStatus;
> import org.apache.hadoop.fs.FileSystem;
> import org.apache.hadoop.fs.Path;
> import org.apache.hadoop.io.Text;
> @@ -350,7 +352,13 @@ public final class TestMeanShift extends
> Configuration conf = new Configuration();
> FileSystem fs = FileSystem.get(input.toUri(), conf);
> Collection<VectorWritable> points = Lists.newArrayList();
> - for (Vector v : raw) {
> + Random r = new Random(123);
> + Vector[] permutedRaw = new Vector[raw.length];
> + for (int i = 0; i < raw.length; i++)
> + permutedRaw = raw;
> + for (int i = 0; i < permutedRaw.length; i++)
> + permutedRaw[i] = permutedRaw[i + r.nextInt(raw.length - i)];
> + for (Vector v : permutedRaw) {
> points.add(new VectorWritable(v));
> }
> ClusteringTestUtils.writePointsToFile(points,
> @@ -376,10 +384,12 @@ public final class TestMeanShift extends
> optKey(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION), "0.2",
> optKey(DefaultOptionCreator.OVERWRITE_OPTION) };
> ToolRunner.run(conf, new MeanShiftCanopyDriver(), args);
> - Path outPart = new Path(output, "clusters-4-final/part-r-00000");
> - long count = HadoopUtil.countRecords(outPart, conf);
> - assertEquals("count", 3, count);
> - outPart = new Path(output, "clusters-0/part-m-00000");
> + FileStatus[] outParts = FileSystem.get(conf).globStatus(
> + new Path(output, "clusters-?-final/part-r-*"));
> + assertEquals("Wrong number of matching final parts", 1, outParts.length);
> + long count = HadoopUtil.countRecords(outParts[0].getPath(), conf);
> + assertEquals("count", 5, count);
> + Path outPart = new Path(output, "clusters-0/part-m-00000");
> Iterator<?> iterator = new SequenceFileValueIterator<Writable>(outPart,
> true, conf);
> // now test the initial clusters to ensure the type of their centers has
>
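A caution on the permutation loop above, as quoted: "permutedRaw = raw;" only copies the array reference, and "permutedRaw[i] = permutedRaw[i + r.nextInt(raw.length - i)]" overwrites entries instead of swapping them, so some input vectors end up duplicated and others dropped. If the intent is just to present the points in shuffled order, a standard Fisher-Yates pass would do it:

    Vector[] permutedRaw = raw.clone();
    Random r = new Random(123);
    for (int i = permutedRaw.length - 1; i > 0; i--) {
      int j = r.nextInt(i + 1);        // uniform pick from the unshuffled prefix
      Vector tmp = permutedRaw[i];     // swap elements i and j
      permutedRaw[i] = permutedRaw[j];
      permutedRaw[j] = tmp;
    }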
> Modified: mahout/trunk/core/src/test/java/org/apache/mahout/common/DummyCounter.java
> URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/common/DummyCounter.java?rev=1299770&r1=1299769&r2=1299770&view=diff
> ==============================================================================
> --- mahout/trunk/core/src/test/java/org/apache/mahout/common/DummyCounter.java (original)
> +++ mahout/trunk/core/src/test/java/org/apache/mahout/common/DummyCounter.java Mon Mar 12 18:25:45 2012
> @@ -1,26 +0,0 @@
> -/**
> - * Licensed to the Apache Software Foundation (ASF) under one
> - * or more contributor license agreements. See the NOTICE file
> - * distributed with this work for additional information
> - * regarding copyright ownership. The ASF licenses this file
> - * to you under the Apache License, Version 2.0 (the
> - * "License"); you may not use this file except in compliance
> - * with the License. You may obtain a copy of the License at
> - *
> - * http://www.apache.org/licenses/LICENSE-2.0
> - *
> - * Unless required by applicable law or agreed to in writing,
> - * software distributed under the License is distributed on an
> - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
> - * KIND, either express or implied. See the License for the
> - * specific language governing permissions and limitations
> - * under the License.
> - */
> -
> -package org.apache.mahout.common;
> -
> -import org.apache.hadoop.mapreduce.Counter;
> -
> -final class DummyCounter extends Counter {
> -
> -}
>
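DummyCounter could extend Counter on 0.20, where Counter is a class; on 0.23 that type changed shape (counters there are backed by org.apache.hadoop.mapreduce.counters.GenericCounter), so a compile-time subclass cannot satisfy both versions. DummyStatusReporter below builds its counters with EasyMock instead, choosing GenericCounter when it is on the classpath and plain Counter otherwise.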
> Modified: mahout/trunk/core/src/test/java/org/apache/mahout/common/DummyRecordWriter.java
> URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/common/DummyRecordWriter.java?rev=1299770&r1=1299769&r2=1299770&view=diff
> ==============================================================================
> --- mahout/trunk/core/src/test/java/org/apache/mahout/common/DummyRecordWriter.java (original)
> +++ mahout/trunk/core/src/test/java/org/apache/mahout/common/DummyRecordWriter.java Mon Mar 12 18:25:45 2012
> @@ -17,16 +17,21 @@
>
> package org.apache.mahout.common;
>
> +import com.google.common.collect.Lists;
> +
> import java.io.IOException;
> +import java.lang.reflect.Constructor;
> +import java.lang.reflect.Method;
> import java.util.List;
> import java.util.Map;
> import java.util.Set;
> import java.util.TreeMap;
>
> -import com.google.common.collect.Lists;
> import org.apache.hadoop.conf.Configuration;
> +import org.apache.hadoop.mapreduce.MapContext;
> import org.apache.hadoop.mapreduce.Mapper;
> import org.apache.hadoop.mapreduce.RecordWriter;
> +import org.apache.hadoop.mapreduce.ReduceContext;
> import org.apache.hadoop.mapreduce.Reducer;
> import org.apache.hadoop.mapreduce.TaskAttemptContext;
> import org.apache.hadoop.mapreduce.TaskAttemptID;
> @@ -65,7 +70,18 @@ public final class DummyRecordWriter<K,
> Configuration configuration,
> RecordWriter<K2, V2> output)
> throws IOException, InterruptedException {
> - return mapper.new Context(configuration, new TaskAttemptID(), null, output, null, new DummyStatusReporter(), null);
> +
> + // Use reflection since the context types changed incompatibly between 0.20
> + // and 0.23.
> + try {
> + return buildNewMapperContext(configuration, output);
> + } catch (Exception e) {
> + try {
> + return buildOldMapperContext(mapper, configuration, output);
> + } catch (Exception ex) {
> + throw new IllegalStateException(ex);
> + }
> + }
> }
>
> public static <K1, V1, K2, V2> Reducer<K1, V1, K2, V2>.Context build(Reducer<K1, V1, K2, V2> reducer,
> @@ -74,17 +90,96 @@ public final class DummyRecordWriter<K,
> Class<K1> keyClass,
> Class<V1> valueClass)
> throws IOException, InterruptedException {
> - return reducer.new Context(configuration,
> - new TaskAttemptID(),
> - new MockIterator(),
> - null,
> - null,
> - output,
> - null,
> - new DummyStatusReporter(),
> - null,
> - keyClass,
> - valueClass);
> +
> + // Use reflection since the context types changed incompatibly between 0.20
> + // and 0.23.
> + try {
> + return buildNewReducerContext(configuration, output, keyClass, valueClass);
> + } catch (Exception e) {
> + try {
> + return buildOldReducerContext(reducer, configuration, output, keyClass, valueClass);
> + } catch (Exception ex) {
> + throw new IllegalStateException(ex);
> + }
> + }
> + }
> +
> + @SuppressWarnings({ "unchecked", "rawtypes" })
> + private static <K1, V1, K2, V2> Mapper<K1, V1, K2, V2>.Context buildNewMapperContext(
> + Configuration configuration, RecordWriter<K2, V2> output) throws Exception {
> + Class<?> mapContextImplClass = Class.forName("org.apache.hadoop.mapreduce.task.MapContextImpl");
> + Constructor<?> cons = mapContextImplClass.getConstructors()[0];
> + Object mapContextImpl = cons.newInstance(configuration,
> + new TaskAttemptID(), null, output, null, new DummyStatusReporter(), null);
> +
> + Class<?> wrappedMapperClass = Class.forName("org.apache.hadoop.mapreduce.lib.map.WrappedMapper");
> + Object wrappedMapper = wrappedMapperClass.newInstance();
> + Method getMapContext = wrappedMapperClass.getMethod("getMapContext", MapContext.class);
> + return (Mapper.Context) getMapContext.invoke(wrappedMapper, mapContextImpl);
> + }
> +
> + @SuppressWarnings({ "unchecked", "rawtypes" })
> + private static <K1, V1, K2, V2> Mapper<K1, V1, K2, V2>.Context buildOldMapperContext(
> + Mapper<K1, V1, K2, V2> mapper, Configuration configuration,
> + RecordWriter<K2, V2> output) throws Exception {
> + Constructor<?> cons = getNestedContextConstructor(mapper.getClass());
> + // first argument to the constructor is the enclosing instance
> + return (Mapper.Context) cons.newInstance(mapper, configuration,
> + new TaskAttemptID(), null, output, null, new DummyStatusReporter(), null);
> + }
> +
> + @SuppressWarnings({ "unchecked", "rawtypes" })
> + private static <K1, V1, K2, V2> Reducer<K1, V1, K2, V2>.Context buildNewReducerContext(
> + Configuration configuration, RecordWriter<K2, V2> output, Class<K1> keyClass,
> + Class<V1> valueClass) throws Exception {
> + Class<?> reduceContextImplClass = Class.forName("org.apache.hadoop.mapreduce.task.ReduceContextImpl");
> + Constructor<?> cons = reduceContextImplClass.getConstructors()[0];
> + Object reduceContextImpl = cons.newInstance(configuration,
> + new TaskAttemptID(),
> + new MockIterator(),
> + null,
> + null,
> + output,
> + null,
> + new DummyStatusReporter(),
> + null,
> + keyClass,
> + valueClass);
> +
> + Class<?> wrappedReducerClass = Class.forName("org.apache.hadoop.mapreduce.lib.reduce.WrappedReducer");
> + Object wrappedReducer = wrappedReducerClass.newInstance();
> + Method getReducerContext = wrappedReducerClass.getMethod("getReducerContext", ReduceContext.class);
> + return (Reducer.Context) getReducerContext.invoke(wrappedReducer, reduceContextImpl);
> + }
> +
> + @SuppressWarnings({ "unchecked", "rawtypes" })
> + private static <K1, V1, K2, V2> Reducer<K1, V1, K2, V2>.Context buildOldReducerContext(
> + Reducer<K1, V1, K2, V2> reducer, Configuration configuration,
> + RecordWriter<K2, V2> output, Class<K1> keyClass,
> + Class<V1> valueClass) throws Exception {
> + Constructor<?> cons = getNestedContextConstructor(reducer.getClass());
> + // first argument to the constructor is the enclosing instance
> + return (Reducer.Context) cons.newInstance(reducer,
> + configuration,
> + new TaskAttemptID(),
> + new MockIterator(),
> + null,
> + null,
> + output,
> + null,
> + new DummyStatusReporter(),
> + null,
> + keyClass,
> + valueClass);
> + }
> +
> + private static Constructor<?> getNestedContextConstructor(Class<?> outerClass) {
> + for (Class<?> nestedClass : outerClass.getClasses()) {
> + if ("Context".equals(nestedClass.getSimpleName())) {
> + return nestedClass.getConstructors()[0];
> + }
> + }
> + throw new IllegalStateException("Cannot find context class for " + outerClass);
> }
>
> }
>
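For reference, on 0.23 the reflective branch resolves to roughly the following, which cannot be written directly without breaking the 0.20 compile (type parameters elided):

    MapContext impl = new MapContextImpl(configuration, new TaskAttemptID(),
        null, output, null, new DummyStatusReporter(), null);
    Mapper.Context ctx = new WrappedMapper().getMapContext(impl);

The 0.20 branch is the old mapper.new Context(...) construction, located through the nested-class lookup at the bottom.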
> Modified: mahout/trunk/core/src/test/java/org/apache/mahout/common/DummyStatusReporter.java
> URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/common/DummyStatusReporter.java?rev=1299770&r1=1299769&r2=1299770&view=diff
> ==============================================================================
> --- mahout/trunk/core/src/test/java/org/apache/mahout/common/DummyStatusReporter.java (original)
> +++ mahout/trunk/core/src/test/java/org/apache/mahout/common/DummyStatusReporter.java Mon Mar 12 18:25:45 2012
> @@ -19,6 +19,8 @@
>
> package org.apache.mahout.common;
>
> +import static org.easymock.EasyMock.createMockBuilder;
> +
> import java.util.Map;
>
> import com.google.common.collect.Maps;
> @@ -30,10 +32,21 @@ public final class DummyStatusReporter e
> private final Map<Enum<?>, Counter> counters = Maps.newHashMap();
> private final Map<String, Counter> counterGroups = Maps.newHashMap();
>
> + private Counter newCounter() {
> + try {
> + // 0.23 case
> + String c = "org.apache.hadoop.mapreduce.counters.GenericCounter";
> + return (Counter) createMockBuilder(Class.forName(c)).createMock();
> + } catch (ClassNotFoundException e) {
> + // 0.20 case
> + return createMockBuilder(Counter.class).createMock();
> + }
> + }
> +
> @Override
> public Counter getCounter(Enum<?> name) {
> if (!counters.containsKey(name)) {
> - counters.put(name, new DummyCounter());
> + counters.put(name, newCounter());
> }
> return counters.get(name);
> }
> @@ -42,7 +55,7 @@ public final class DummyStatusReporter e
> @Override
> public Counter getCounter(String group, String name) {
> if (!counterGroups.containsKey(group + name)) {
> - counterGroups.put(group + name, new DummyCounter());
> + counterGroups.put(group + name, newCounter());
> }
> return counterGroups.get(group+name);
> }
> @@ -55,4 +68,8 @@ public final class DummyStatusReporter e
> public void setStatus(String status) {
> }
>
> + public float getProgress() {
> + return 0;
> + }
> +
> }
>
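The new getProgress() deliberately carries no @Override: StatusReporter only grew that abstract method in 0.23, so the annotation would break the 0.20 compile while the extra method is harmless there.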
> Modified: mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/TestDistributedRowMatrix.java
> URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/TestDistributedRowMatrix.java?rev=1299770&r1=1299769&r2=1299770&view=diff
> ==============================================================================
> --- mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/TestDistributedRowMatrix.java (original)
> +++ mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/TestDistributedRowMatrix.java Mon Mar 12 18:25:45 2012
> @@ -26,6 +26,7 @@ import org.apache.hadoop.fs.FileStatus;
> import org.apache.hadoop.fs.FileSystem;
> import org.apache.hadoop.fs.Path;
> import org.apache.mahout.clustering.ClusteringTestUtils;
> +import org.apache.mahout.common.HadoopUtil;
> import org.apache.mahout.common.MahoutTestCase;
> import org.apache.mahout.common.iterator.sequencefile.PathFilters;
> import org.apache.mahout.math.DenseVector;
> @@ -254,14 +255,14 @@ public final class TestDistributedRowMat
>
> deleteContentsOfPath(conf, outputPath);
>
> - assertEquals(0, fs.listStatus(outputPath).length);
> + assertEquals(0, HadoopUtil.listStatus(fs, outputPath).length);
>
> Vector result1 = dm.times(v);
>
> - assertEquals(0, fs.listStatus(outputPath).length);
> + assertEquals(0, HadoopUtil.listStatus(fs, outputPath).length);
>
> deleteContentsOfPath(conf, outputPath);
> - assertEquals(0, fs.listStatus(outputPath).length);
> + assertEquals(0, HadoopUtil.listStatus(fs, outputPath).length);
>
> conf.setBoolean(DistributedRowMatrix.KEEP_TEMP_FILES, true);
> dm.setConf(conf);
> @@ -291,14 +292,14 @@ public final class TestDistributedRowMat
>
> deleteContentsOfPath(conf, outputPath);
>
> - assertEquals(0, fs.listStatus(outputPath).length);
> + assertEquals(0, HadoopUtil.listStatus(fs, outputPath).length);
>
> Vector result1 = dm.timesSquared(v);
>
> - assertEquals(0, fs.listStatus(outputPath).length);
> + assertEquals(0, HadoopUtil.listStatus(fs, outputPath).length);
>
> deleteContentsOfPath(conf, outputPath);
> - assertEquals(0, fs.listStatus(outputPath).length);
> + assertEquals(0, HadoopUtil.listStatus(fs, outputPath).length);
>
> conf.setBoolean(DistributedRowMatrix.KEEP_TEMP_FILES, true);
> dm.setConf(conf);
> @@ -325,7 +326,7 @@ public final class TestDistributedRowMat
> private static void deleteContentsOfPath(Configuration conf, Path path) throws Exception {
> FileSystem fs = path.getFileSystem(conf);
>
> - FileStatus[] statuses = fs.listStatus(path);
> + FileStatus[] statuses = HadoopUtil.listStatus(fs, path);
> for (FileStatus status : statuses) {
> fs.delete(status.getPath(), true);
> }
>
> Modified: mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
> URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java?rev=1299770&r1=1299769&r2=1299770&view=diff
> ==============================================================================
> --- mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java (original)
> +++ mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java Mon Mar 12 18:25:45 2012
> @@ -193,7 +193,7 @@ public final class TestClusterDumper ext
> output, measure, 8, 4, true, 0.0, true);
> // run ClusterDumper
> ClusterDumper clusterDumper = new ClusterDumper(new Path(output,
> - "clusters-0"), new Path(output, "clusteredPoints"));
> + "clusters-0-final"), new Path(output, "clusteredPoints"));
> clusterDumper.printClusters(termDictionary);
> }
>
>
> Modified: mahout/trunk/pom.xml
> URL: http://svn.apache.org/viewvc/mahout/trunk/pom.xml?rev=1299770&r1=1299769&r2=1299770&view=diff
> ==============================================================================
> --- mahout/trunk/pom.xml (original)
> +++ mahout/trunk/pom.xml Mon Mar 12 18:25:45 2012
> @@ -107,6 +107,17 @@
> <url>https://issues.apache.org/jira/browse/MAHOUT</url>
> </issueManagement>
>
> +<repositories>
> +<repository>
> +<id>apache.snapshots</id>
> +<name>Apache Snapshot Repository</name>
> +<url>http://repository.apache.org/snapshots</url>
> +<releases>
> +<enabled>false</enabled>
> +</releases>
> +</repository>
> +</repositories>
> +
> <dependencyManagement>
> <dependencies>
>
> @@ -264,6 +275,100 @@
> </exclusions>
> </dependency>
> <dependency>
> +<groupId>org.apache.hadoop</groupId>
> +<artifactId>hadoop-common</artifactId>
> +<version>${hadoop.version}</version>
> +<exclusions>
> +<exclusion>
> +<groupId>net.sf.kosmosfs</groupId>
> +<artifactId>kfs</artifactId>
> +</exclusion>
> +<exclusion>
> +<groupId>org.mortbay.jetty</groupId>
> +<artifactId>jetty</artifactId>
> +</exclusion>
> +<exclusion>
> +<groupId>org.mortbay.jetty</groupId>
> +<artifactId>jetty-util</artifactId>
> +</exclusion>
> +<exclusion>
> +<groupId>hsqldb</groupId>
> +<artifactId>hsqldb</artifactId>
> +</exclusion>
> +<exclusion>
> +<groupId>commons-el</groupId>
> +<artifactId>commons-el</artifactId>
> +</exclusion>
> +<exclusion>
> +<groupId>junit</groupId>
> +<artifactId>junit</artifactId>
> +</exclusion>
> +<exclusion>
> +<groupId>oro</groupId>
> +<artifactId>oro</artifactId>
> +</exclusion>
> +<exclusion>
> +<groupId>org.mortbay.jetty</groupId>
> +<artifactId>jsp-2.1</artifactId>
> +</exclusion>
> +<exclusion>
> +<groupId>org.mortbay.jetty</groupId>
> +<artifactId>jsp-api-2.1</artifactId>
> +</exclusion>
> +<exclusion>
> +<groupId>org.mortbay.jetty</groupId>
> +<artifactId>servlet-api-2.5</artifactId>
> +</exclusion>
> +<exclusion>
> +<groupId>commons-net</groupId>
> +<artifactId>commons-net</artifactId>
> +</exclusion>
> +<exclusion>
> +<groupId>tomcat</groupId>
> +<artifactId>jasper-runtime</artifactId>
> +</exclusion>
> +<exclusion>
> +<groupId>tomcat</groupId>
> +<artifactId>jasper-compiler</artifactId>
> +</exclusion>
> +<exclusion>
> +<groupId>xmlenc</groupId>
> +<artifactId>xmlenc</artifactId>
> +</exclusion>
> +<exclusion>
> +<groupId>net.java.dev.jets3t</groupId>
> +<artifactId>jets3t</artifactId>
> +</exclusion>
> +<exclusion>
> +<groupId>org.eclipse.jdt</groupId>
> +<artifactId>core</artifactId>
> +</exclusion>
> +<exclusion>
> +<groupId>org.slf4j</groupId>
> +<artifactId>slf4j-api</artifactId>
> +</exclusion>
> +<exclusion>
> +<groupId>org.slf4j</groupId>
> +<artifactId>slf4j-jcl</artifactId>
> +</exclusion>
> +<exclusion>
> +<groupId>org.slf4j</groupId>
> +<artifactId>slf4j-log4j12</artifactId>
> +</exclusion>
> +</exclusions>
> +</dependency>
> +<dependency>
> +<groupId>org.apache.hadoop</groupId>
> +<artifactId>hadoop-mapreduce-client-core</artifactId>
> +<version>${hadoop.version}</version>
> +</dependency>
> +<dependency>
> +<groupId>org.apache.hadoop</groupId>
> +<artifactId>hadoop-mapreduce-client-common</artifactId>
> +<version>${hadoop.version}</version>
> +</dependency>
> +
> +<dependency>
> <groupId>org.codehaus.jackson</groupId>
> <artifactId>jackson-core-asl</artifactId>
> <version>1.8.2</version>
>
>
Re: svn commit: r1299770 - in /mahout/trunk: ./ core/ core/src/main/java/org/apache/mahout/common/ core/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/ core/src/test/java/org/apache/mahout/clustering/canopy/ core/src/test/java/org/apache/m...
Posted by Grant Ingersoll <gs...@apache.org>.
Done (tcp@a.o)
On Mar 12, 2012, at 2:30 PM, tom pierce wrote:
> Can someone hook me up with JIRA privs so I can close tickets?
>
> (Or, if that isn't something all committers get, someone pls mark -822 and -980 closed)
>
> -tom
>
> On 03/12/2012 02:25 PM, tcp@apache.org wrote:
>> Author: tcp
>> Date: Mon Mar 12 18:25:45 2012
>> New Revision: 1299770
>>
>> URL: http://svn.apache.org/viewvc?rev=1299770&view=rev
>> Log:
>> MAHOUT-822: Make Mahout compatible with Hadoop 0.23.1.
>>
>> Modified:
>> mahout/trunk/core/pom.xml
>> mahout/trunk/core/src/main/java/org/apache/mahout/common/HadoopUtil.java
>> mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/MockContext.java
>> mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/PartialSequentialBuilder.java
>> mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/Step1MapperTest.java
>> mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java
>> mahout/trunk/core/src/test/java/org/apache/mahout/clustering/classify/ClusterClassificationDriverTest.java
>> mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java
>> mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java
>> mahout/trunk/core/src/test/java/org/apache/mahout/common/DummyCounter.java
>> mahout/trunk/core/src/test/java/org/apache/mahout/common/DummyRecordWriter.java
>> mahout/trunk/core/src/test/java/org/apache/mahout/common/DummyStatusReporter.java
>> mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/TestDistributedRowMatrix.java
>> mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
>> mahout/trunk/pom.xml
>>
>> Modified: mahout/trunk/core/pom.xml
>> URL: http://svn.apache.org/viewvc/mahout/trunk/core/pom.xml?rev=1299770&r1=1299769&r2=1299770&view=diff
>> ==============================================================================
>> --- mahout/trunk/core/pom.xml (original)
>> +++ mahout/trunk/core/pom.xml Mon Mar 12 18:25:45 2012
>> @@ -140,10 +140,6 @@
>>
>> <!-- Third Party -->
>> <dependency>
>> -<groupId>org.apache.hadoop</groupId>
>> -<artifactId>hadoop-core</artifactId>
>> -</dependency>
>> -<dependency>
>> <groupId>org.codehaus.jackson</groupId>
>> <artifactId>jackson-core-asl</artifactId>
>> </dependency>
>> @@ -211,4 +207,43 @@
>> </dependency>
>>
>> </dependencies>
>> +
>> +<profiles>
>> +<profile>
>> +<id>hadoop-0.20</id>
>> +<activation>
>> +<property>
>> +<name>!hadoop.version</name>
>> +</property>
>> +</activation>
>> +<dependencies>
>> +<dependency>
>> +<groupId>org.apache.hadoop</groupId>
>> +<artifactId>hadoop-core</artifactId>
>> +</dependency>
>> +</dependencies>
>> +</profile>
>> +<profile>
>> +<id>hadoop-0.23</id>
>> +<activation>
>> +<property>
>> +<name>hadoop.version</name>
>> +</property>
>> +</activation>
>> +<dependencies>
>> +<dependency>
>> +<groupId>org.apache.hadoop</groupId>
>> +<artifactId>hadoop-common</artifactId>
>> +</dependency>
>> +<dependency>
>> +<groupId>org.apache.hadoop</groupId>
>> +<artifactId>hadoop-mapreduce-client-common</artifactId>
>> +</dependency>
>> +<dependency>
>> +<groupId>org.apache.hadoop</groupId>
>> +<artifactId>hadoop-mapreduce-client-core</artifactId>
>> +</dependency>
>> +</dependencies>
>> +</profile>
>> +</profiles>
>> </project>
>>
>> Modified: mahout/trunk/core/src/main/java/org/apache/mahout/common/HadoopUtil.java
>> URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/common/HadoopUtil.java?rev=1299770&r1=1299769&r2=1299770&view=diff
>> ==============================================================================
>> --- mahout/trunk/core/src/main/java/org/apache/mahout/common/HadoopUtil.java (original)
>> +++ mahout/trunk/core/src/main/java/org/apache/mahout/common/HadoopUtil.java Mon Mar 12 18:25:45 2012
>> @@ -17,6 +17,7 @@
>>
>> package org.apache.mahout.common;
>>
>> +import java.io.FileNotFoundException;
>> import java.io.IOException;
>> import java.io.InputStream;
>> import java.net.URI;
>> @@ -229,9 +230,9 @@ public final class HadoopUtil {
>> FileStatus[] statuses;
>> FileSystem fs = path.getFileSystem(conf);
>> if (filter == null) {
>> - statuses = pathType == PathType.GLOB ? fs.globStatus(path) : fs.listStatus(path);
>> + statuses = pathType == PathType.GLOB ? fs.globStatus(path) : listStatus(fs, path);
>> } else {
>> - statuses = pathType == PathType.GLOB ? fs.globStatus(path, filter) : fs.listStatus(path, filter);
>> + statuses = pathType == PathType.GLOB ? fs.globStatus(path, filter) : listStatus(fs, path, filter);
>> }
>> if (ordering != null) {
>> Arrays.sort(statuses, ordering);
>> @@ -239,6 +240,22 @@ public final class HadoopUtil {
>> return statuses;
>> }
>>
>> + public static FileStatus[] listStatus(FileSystem fs, Path path) throws IOException {
>> + try {
>> + return fs.listStatus(path);
>> + } catch (FileNotFoundException e) {
>> + return new FileStatus[0];
>> + }
>> + }
>> +
>> + public static FileStatus[] listStatus(FileSystem fs, Path path, PathFilter filter) throws IOException {
>> + try {
>> + return fs.listStatus(path, filter);
>> + } catch (FileNotFoundException e) {
>> + return new FileStatus[0];
>> + }
>> + }
>> +
>> public static void cacheFiles(Path fileToCache, Configuration conf) {
>> DistributedCache.setCacheFiles(new URI[]{fileToCache.toUri()}, conf);
>> }
>>
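Worth spelling out why the HadoopUtil wrappers exist: 0.20's
FileSystem.listStatus() quietly reports nothing for a missing path, whereas
0.23 throws FileNotFoundException. A minimal caller sketch (hypothetical
path, purely to show the contract):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileStatus;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.mahout.common.HadoopUtil;

    Path maybeMissing = new Path("/tmp/not-there");
    FileSystem fs = maybeMissing.getFileSystem(new Configuration());
    // empty array on both 0.20 and 0.23, instead of an FNFE on 0.23
    FileStatus[] statuses = HadoopUtil.listStatus(fs, maybeMissing);
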
>> Modified: mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/MockContext.java
>> URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/MockContext.java?rev=1299770&r1=1299769&r2=1299770&view=diff
>> ==============================================================================
>> --- mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/MockContext.java (original)
>> +++ mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/MockContext.java Mon Mar 12 18:25:45 2012
>> @@ -1,70 +0,0 @@
>> -/**
>> - * Licensed to the Apache Software Foundation (ASF) under one or more
>> - * contributor license agreements. See the NOTICE file distributed with
>> - * this work for additional information regarding copyright ownership.
>> - * The ASF licenses this file to You under the Apache License, Version 2.0
>> - * (the "License"); you may not use this file except in compliance with
>> - * the License. You may obtain a copy of the License at
>> - *
>> - * http://www.apache.org/licenses/LICENSE-2.0
>> - *
>> - * Unless required by applicable law or agreed to in writing, software
>> - * distributed under the License is distributed on an "AS IS" BASIS,
>> - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
>> - * See the License for the specific language governing permissions and
>> - * limitations under the License.
>> - */
>> -package org.apache.mahout.classifier.df.mapreduce.partial;
>> -
>> -import java.io.IOException;
>> -
>> -import org.apache.hadoop.conf.Configuration;
>> -import org.apache.hadoop.mapreduce.Mapper;
>> -import org.apache.hadoop.mapreduce.TaskAttemptID;
>> -import org.apache.hadoop.mapreduce.Mapper.Context;
>> -import org.apache.mahout.classifier.df.mapreduce.MapredOutput;
>> -
>> -/**
>> - * Special implementation that collects the output of the mappers
>> - */
>> -final class MockContext extends Context {
>> -
>> - private final TreeID[] keys;
>> - private final MapredOutput[] values;
>> - private int index;
>> -
>> - MockContext(Mapper<?,?,?,?> mapper, Configuration conf, TaskAttemptID taskid, int nbTrees)
>> - throws IOException, InterruptedException {
>> - mapper.super(conf, taskid, null, null, null, null, null);
>> -
>> - keys = new TreeID[nbTrees];
>> - values = new MapredOutput[nbTrees];
>> - }
>> -
>> - @Override
>> - public void write(Object key, Object value) throws IOException {
>> - if (index == keys.length) {
>> - throw new IOException("Received more output than expected : " + index);
>> - }
>> -
>> - keys[index] = ((TreeID) key).clone();
>> - values[index] = ((MapredOutput) value).clone();
>> -
>> - index++;
>> - }
>> -
>> - /**
>> - * @return number of outputs collected
>> - */
>> - public int nbOutputs() {
>> - return index;
>> - }
>> -
>> - public TreeID[] getKeys() {
>> - return keys;
>> - }
>> -
>> - public MapredOutput[] getValues() {
>> - return values;
>> - }
>> -}
>>
>> Modified: mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/PartialSequentialBuilder.java
>> URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/PartialSequentialBuilder.java?rev=1299770&r1=1299769&r2=1299770&view=diff
>> ==============================================================================
>> --- mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/PartialSequentialBuilder.java (original)
>> +++ mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/PartialSequentialBuilder.java Mon Mar 12 18:25:45 2012
>> @@ -1,176 +0,0 @@
>> -/**
>> - * Licensed to the Apache Software Foundation (ASF) under one or more
>> - * contributor license agreements. See the NOTICE file distributed with
>> - * this work for additional information regarding copyright ownership.
>> - * The ASF licenses this file to You under the Apache License, Version 2.0
>> - * (the "License"); you may not use this file except in compliance with
>> - * the License. You may obtain a copy of the License at
>> - *
>> - * http://www.apache.org/licenses/LICENSE-2.0
>> - *
>> - * Unless required by applicable law or agreed to in writing, software
>> - * distributed under the License is distributed on an "AS IS" BASIS,
>> - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
>> - * See the License for the specific language governing permissions and
>> - * limitations under the License.
>> - */
>> -
>> -package org.apache.mahout.classifier.df.mapreduce.partial;
>> -
>> -import java.io.IOException;
>> -import java.util.List;
>> -
>> -import org.apache.commons.lang.ArrayUtils;
>> -import org.apache.hadoop.conf.Configuration;
>> -import org.apache.hadoop.fs.Path;
>> -import org.apache.hadoop.io.LongWritable;
>> -import org.apache.hadoop.io.Text;
>> -import org.apache.hadoop.mapreduce.InputSplit;
>> -import org.apache.hadoop.mapreduce.Job;
>> -import org.apache.hadoop.mapreduce.RecordReader;
>> -import org.apache.hadoop.mapreduce.TaskAttemptContext;
>> -import org.apache.hadoop.mapreduce.TaskAttemptID;
>> -import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
>> -import org.apache.mahout.classifier.df.DFUtils;
>> -import org.apache.mahout.classifier.df.DecisionForest;
>> -import org.apache.mahout.classifier.df.builder.TreeBuilder;
>> -import org.apache.mahout.classifier.df.data.Dataset;
>> -import org.apache.mahout.classifier.df.mapreduce.Builder;
>> -import org.apache.mahout.classifier.df.mapreduce.MapredOutput;
>> -import org.apache.mahout.classifier.df.node.Node;
>> -import org.slf4j.Logger;
>> -import org.slf4j.LoggerFactory;
>> -
>> -import com.google.common.collect.Lists;
>> -
>> -/**
>> - * Simulates the Partial mapreduce implementation in a sequential manner. Must
>> - * receive a seed
>> - */
>> -public class PartialSequentialBuilder extends PartialBuilder {
>> -
>> - private static final Logger log = LoggerFactory.getLogger(PartialSequentialBuilder.class);
>> -
>> - private MockContext firstOutput;
>> -
>> - private final Dataset dataset;
>> -
>> - public PartialSequentialBuilder(TreeBuilder treeBuilder, Path dataPath,
>> - Dataset dataset, long seed, Configuration conf) {
>> - super(treeBuilder, dataPath, new Path("notUsed"), seed, conf);
>> - this.dataset = dataset;
>> - }
>> -
>> - public PartialSequentialBuilder(TreeBuilder treeBuilder, Path dataPath,
>> - Dataset dataset, long seed) {
>> - this(treeBuilder, dataPath, dataset, seed, new Configuration());
>> - }
>> -
>> - @Override
>> - protected void configureJob(Job job)
>> - throws IOException {
>> - Configuration conf = job.getConfiguration();
>> -
>> - int num = conf.getInt("mapred.map.tasks", -1);
>> -
>> - super.configureJob(job);
>> -
>> - // PartialBuilder sets the number of maps to 1 if we are running in 'local'
>> - conf.setInt("mapred.map.tasks", num);
>> - }
>> -
>> - @Override
>> - protected boolean runJob(Job job) throws IOException, InterruptedException {
>> - Configuration conf = job.getConfiguration();
>> -
>> - // retrieve the splits
>> - TextInputFormat input = new TextInputFormat();
>> - List<InputSplit> splits = input.getSplits(job);
>> -
>> - int nbSplits = splits.size();
>> - log.debug("Nb splits : {}", nbSplits);
>> -
>> - InputSplit[] sorted = new InputSplit[nbSplits];
>> - splits.toArray(sorted);
>> - Builder.sortSplits(sorted);
>> -
>> - int numTrees = Builder.getNbTrees(conf); // total number of trees
>> -
>> - TaskAttemptContext task = new TaskAttemptContext(conf, new TaskAttemptID());
>> -
>> - firstOutput = new MockContext(new Step1Mapper(), conf, task.getTaskAttemptID(), numTrees);
>> -
>> - /* first instance id in hadoop's order */
>> - //int[] firstIds = new int[nbSplits];
>> - /* partitions' sizes in hadoop order */
>> - int[] sizes = new int[nbSplits];
>> -
>> - // to compute firstIds, process the splits in file order
>> - long slowest = 0; // duration of slowest map
>> - int firstId = 0;
>> - for (InputSplit split : splits) {
>> - int hp = ArrayUtils.indexOf(sorted, split); // hadoop's partition
>> -
>> - RecordReader<LongWritable, Text> reader = input.createRecordReader(split, task);
>> - reader.initialize(split, task);
>> -
>> - Step1Mapper mapper = new MockStep1Mapper(getTreeBuilder(), dataset, getSeed(),
>> - hp, nbSplits, numTrees);
>> -
>> - long time = System.currentTimeMillis();
>> -
>> - //firstIds[hp] = firstId;
>> -
>> - while (reader.nextKeyValue()) {
>> - mapper.map(reader.getCurrentKey(), reader.getCurrentValue(), firstOutput);
>> - firstId++;
>> - sizes[hp]++;
>> - }
>> -
>> - mapper.cleanup(firstOutput);
>> -
>> - time = System.currentTimeMillis() - time;
>> - log.info("Duration : {}", DFUtils.elapsedTime(time));
>> -
>> - if (time > slowest) {
>> - slowest = time;
>> - }
>> - }
>> -
>> - log.info("Longest duration : {}", DFUtils.elapsedTime(slowest));
>> - return true;
>> - }
>> -
>> - @Override
>> - protected DecisionForest parseOutput(Job job) throws IOException {
>> - return processOutput(firstOutput.getKeys(), firstOutput.getValues());
>> - }
>> -
>> - /**
>> - * extract the decision forest
>> - */
>> - protected static DecisionForest processOutput(TreeID[] keys, MapredOutput[] values) {
>> - List<Node> trees = Lists.newArrayList();
>> -
>> - for (int index = 0; index < keys.length; index++) {
>> - MapredOutput value = values[index];
>> - trees.add(value.getTree());
>> - }
>> -
>> - return new DecisionForest(trees);
>> - }
>> -
>> - /**
>> - * Special Step1Mapper that can be configured without using a Configuration
>> - *
>> - */
>> - private static class MockStep1Mapper extends Step1Mapper {
>> - protected MockStep1Mapper(TreeBuilder treeBuilder, Dataset dataset, Long seed,
>> - int partition, int numMapTasks, int numTrees) {
>> - configure(false, treeBuilder, dataset);
>> - configure(seed, partition, numMapTasks, numTrees);
>> - }
>> -
>> - }
>> -
>> -}
>>
>> Modified: mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/Step1MapperTest.java
>> URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/Step1MapperTest.java?rev=1299770&r1=1299769&r2=1299770&view=diff
>> ==============================================================================
>> --- mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/Step1MapperTest.java (original)
>> +++ mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/Step1MapperTest.java Mon Mar 12 18:25:45 2012
>> @@ -17,21 +17,30 @@
>>
>> package org.apache.mahout.classifier.df.mapreduce.partial;
>>
>> +import static org.easymock.EasyMock.anyObject;
>> +import static org.easymock.EasyMock.capture;
>> +import static org.easymock.EasyMock.createMock;
>> +import static org.easymock.EasyMock.expectLastCall;
>> +import static org.easymock.EasyMock.replay;
>> +import static org.easymock.EasyMock.verify;
>> +
>> import java.util.Random;
>>
>> -import org.apache.hadoop.conf.Configuration;
>> import org.apache.hadoop.io.LongWritable;
>> import org.apache.hadoop.io.Text;
>> -import org.apache.hadoop.mapreduce.TaskAttemptID;
>> -import org.apache.mahout.common.MahoutTestCase;
>> +import org.apache.hadoop.mapreduce.Mapper;
>> import org.apache.mahout.common.RandomUtils;
>> import org.apache.mahout.classifier.df.builder.TreeBuilder;
>> import org.apache.mahout.classifier.df.data.Data;
>> import org.apache.mahout.classifier.df.data.DataLoader;
>> import org.apache.mahout.classifier.df.data.Dataset;
>> import org.apache.mahout.classifier.df.data.Utils;
>> +import org.apache.mahout.classifier.df.mapreduce.MapredOutput;
>> import org.apache.mahout.classifier.df.node.Leaf;
>> import org.apache.mahout.classifier.df.node.Node;
>> +import org.apache.mahout.common.MahoutTestCase;
>> +import org.easymock.Capture;
>> +import org.easymock.CaptureType;
>> import org.junit.Test;
>>
>> public final class Step1MapperTest extends MahoutTestCase {
>> @@ -71,6 +80,17 @@ public final class Step1MapperTest exten
>> }
>> }
>>
>> + private static class TreeIDCapture extends Capture<TreeID> {
>> +
>> + public TreeIDCapture() {
>> + super(CaptureType.ALL);
>> + }
>> +
>> + public void setValue(final TreeID value) {
>> + super.setValue(value.clone());
>> + }
>> + }
>> +
>> /** nb attributes per generated data instance */
>> static final int NUM_ATTRIBUTES = 4;
>>
>> @@ -83,6 +103,7 @@ public final class Step1MapperTest exten
>> /** nb mappers to use */
>> static final int NUM_MAPPERS = 2;
>>
>> + @SuppressWarnings({ "rawtypes", "unchecked" })
>> @Test
>> public void testMapper() throws Exception {
>> Long seed = null;
>> @@ -109,8 +130,13 @@ public final class Step1MapperTest exten
>> // expected number of trees that this mapper will build
>> int mapNbTrees = Step1Mapper.nbTrees(NUM_MAPPERS, NUM_TREES, partition);
>>
>> - MockContext context = new MockContext(new Step1Mapper(),
>> - new Configuration(), new TaskAttemptID(), mapNbTrees);
>> + Mapper.Context context =
>> + createMock(Mapper.Context.class);
>> + Capture<TreeID> capturedKeys = new TreeIDCapture();
>> + context.write(capture(capturedKeys), anyObject());
>> + expectLastCall().anyTimes();
>> +
>> + replay(context);
>>
>> MockStep1Mapper mapper = new MockStep1Mapper(treeBuilder, dataset, seed,
>> partition, NUM_MAPPERS, NUM_TREES);
>> @@ -125,12 +151,13 @@ public final class Step1MapperTest exten
>> }
>>
>> mapper.cleanup(context);
>> + verify(context);
>>
>> // make sure the mapper built all its trees
>> - assertEquals(mapNbTrees, context.nbOutputs());
>> + assertEquals(mapNbTrees, capturedKeys.getValues().size());
>>
>> // check the returned keys
>> - for (TreeID k : context.getKeys()) {
>> + for (TreeID k : capturedKeys.getValues()) {
>> assertEquals(partition, k.partition());
>> assertEquals(treeIndex, k.treeId());
>>
>>
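For anyone skimming the Step1MapperTest change: the EasyMock replacement for
the deleted MockContext condenses to the skeleton below. The clone in
TreeIDCapture.setValue() matters for the same reason the old
MockContext.write() cloned its key: the mapper reuses the TreeID instance,
so a stock Capture would record N references to one mutated object.

    Mapper.Context context = createMock(Mapper.Context.class);
    Capture<TreeID> keys = new TreeIDCapture(); // clones each captured key
    context.write(capture(keys), anyObject());  // record every write()
    expectLastCall().anyTimes();
    replay(context);
    // ... drive the mapper against 'context' ...
    verify(context);
    // keys.getValues() now holds one stable TreeID per write()
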
>> Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java
>> URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java?rev=1299770&r1=1299769&r2=1299770&view=diff
>> ==============================================================================
>> --- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java (original)
>> +++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java Mon Mar 12 18:25:45 2012
>> @@ -34,6 +34,7 @@ import org.apache.mahout.clustering.Clus
>> import org.apache.mahout.common.DummyRecordWriter;
>> import org.apache.mahout.common.HadoopUtil;
>> import org.apache.mahout.common.MahoutTestCase;
>> +import org.apache.mahout.common.Pair;
>> import org.apache.mahout.common.commandline.DefaultOptionCreator;
>> import org.apache.mahout.common.distance.DistanceMeasure;
>> import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
>> @@ -126,8 +127,8 @@ public final class TestCanopyCreation ex
>> int[] expectedNumPoints = { 4, 4, 3 };
>> double[][] expectedCentroids = { { 1.5, 1.5 }, { 4.0, 4.0 },
>> { 4.666666666666667, 4.6666666666666667 } };
>> - assertEquals("canopy points " + canopyIx, expectedNumPoints[canopyIx],
>> - testCanopy.getNumObservations());
>> + assertEquals("canopy points " + canopyIx, testCanopy.getNumObservations(),
>> + expectedNumPoints[canopyIx]);
>> double[] refCentroid = expectedCentroids[canopyIx];
>> Vector testCentroid = testCanopy.computeCentroid();
>> for (int pointIx = 0; pointIx < refCentroid.length; pointIx++) {
>> @@ -151,8 +152,8 @@ public final class TestCanopyCreation ex
>> { 4.666666666666667, 4.666666666666667 } };
>> for (int canopyIx = 0; canopyIx < referenceEuclidean.size(); canopyIx++) {
>> Canopy testCanopy = referenceEuclidean.get(canopyIx);
>> - assertEquals("canopy points " + canopyIx, expectedNumPoints[canopyIx],
>> - testCanopy.getNumObservations());
>> + assertEquals("canopy points " + canopyIx, testCanopy.getNumObservations(),
>> + expectedNumPoints[canopyIx]);
>> double[] refCentroid = expectedCentroids[canopyIx];
>> Vector testCentroid = testCanopy.computeCentroid();
>> for (int pointIx = 0; pointIx < refCentroid.length; pointIx++) {
>> @@ -328,20 +329,36 @@ public final class TestCanopyCreation ex
>> Canopy canopy = new Canopy();
>> assertTrue("more to come", reader.next(key, canopy));
>> assertEquals("1st key", "C-0", key.toString());
>> - assertEquals("1st x value", 1.5, canopy.getCenter().get(0), EPSILON);
>> - assertEquals("1st y value", 1.5, canopy.getCenter().get(1), EPSILON);
>> +
>> + List<Pair<Double,Double>> refCenters = Lists.newArrayList();
>> + refCenters.add(new Pair<Double,Double>(1.5,1.5));
>> + refCenters.add(new Pair<Double,Double>(4.333333333333334,4.333333333333334));
>> + Pair<Double,Double> c = new Pair<Double,Double>(canopy.getCenter().get(0),
>> + canopy.getCenter().get(1));
>> + assertTrue("center "+c+" not found", findAndRemove(c, refCenters, EPSILON));
>> assertTrue("more to come", reader.next(key, canopy));
>> assertEquals("2nd key", "C-1", key.toString());
>> - assertEquals("2nd x value", 4.333333333333334, canopy.getCenter().get(0),
>> - EPSILON);
>> - assertEquals("2nd y value", 4.333333333333334, canopy.getCenter().get(1),
>> - EPSILON);
>> + c = new Pair<Double,Double>(canopy.getCenter().get(0),
>> + canopy.getCenter().get(1));
>> + assertTrue("center "+c+" not found", findAndRemove(c, refCenters, EPSILON));
>> assertFalse("more to come", reader.next(key, canopy));
>> } finally {
>> Closeables.closeQuietly(reader);
>> }
>> }
>>
>> + boolean findAndRemove(Pair<Double,Double> target,
>> + List<Pair<Double,Double>> list, double epsilon) {
>> + for (Pair<Double,Double> curr : list) {
>> + if ( (Math.abs(target.getFirst() - curr.getFirst()) < epsilon)
>> + && (Math.abs(target.getSecond() - curr.getSecond()) < epsilon) ) {
>> + list.remove(curr);
>> + return true;
>> + }
>> + }
>> + return false;
>> + }
>> +
>> /**
>> * Story: User can produce final canopy centers using a Hadoop map/reduce job
>> * and a EuclideanDistanceMeasure.
>> @@ -368,14 +385,18 @@ public final class TestCanopyCreation ex
>> Canopy value = new Canopy();
>> assertTrue("more to come", reader.next(key, value));
>> assertEquals("1st key", "C-0", key.toString());
>> - assertEquals("1st x value", 1.8, value.getCenter().get(0), EPSILON);
>> - assertEquals("1st y value", 1.8, value.getCenter().get(1), EPSILON);
>> +
>> + List<Pair<Double,Double>> refCenters = Lists.newArrayList();
>> + refCenters.add(new Pair<Double,Double>(1.8,1.8));
>> + refCenters.add(new Pair<Double,Double>(4.433333333333334, 4.433333333333334));
>> + Pair<Double,Double> c = new Pair<Double,Double>(value.getCenter().get(0),
>> + value.getCenter().get(1));
>> + assertTrue("center "+c+" not found", findAndRemove(c, refCenters, EPSILON));
>> assertTrue("more to come", reader.next(key, value));
>> assertEquals("2nd key", "C-1", key.toString());
>> - assertEquals("2nd x value", 4.433333333333334, value.getCenter().get(0),
>> - EPSILON);
>> - assertEquals("2nd y value", 4.433333333333334, value.getCenter().get(1),
>> - EPSILON);
>> + c = new Pair<Double,Double>(value.getCenter().get(0),
>> + value.getCenter().get(1));
>> + assertTrue("center "+c+" not found", findAndRemove(c, refCenters, EPSILON));
>> assertFalse("more to come", reader.next(key, value));
>> } finally {
>> Closeables.closeQuietly(reader);
>>
>> Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/classify/ClusterClassificationDriverTest.java
>> URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/classify/ClusterClassificationDriverTest.java?rev=1299770&r1=1299769&r2=1299770&view=diff
>> ==============================================================================
>> --- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/classify/ClusterClassificationDriverTest.java (original)
>> +++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/classify/ClusterClassificationDriverTest.java Mon Mar 12 18:25:45 2012
>> @@ -20,6 +20,9 @@ package org.apache.mahout.clustering.cla
>> import java.io.IOException;
>> import java.util.ArrayList;
>> import java.util.List;
>> +import java.util.Set;
>> +
>> +import com.google.common.collect.Sets;
>>
>> import junit.framework.Assert;
>>
>> @@ -195,9 +198,7 @@ public class ClusterClassificationDriver
>> }
>>
>> private void assertVectorsWithOutlierRemoval() {
>> - assertFirstClusterWithOutlierRemoval();
>> - assertSecondClusterWithOutlierRemoval();
>> - assertThirdClusterWithOutlierRemoval();
>> + checkClustersWithOutlierRemoval();
>> }
>>
>> private void assertVectorsWithoutOutlierRemoval() {
>> @@ -230,25 +231,33 @@ public class ClusterClassificationDriver
>> "{1:1.0,0:2.0}", "{1:2.0,0:1.0}"}, vector.asFormatString()));
>> }
>> }
>> -
>> - private void assertThirdClusterWithOutlierRemoval() {
>> - Assert.assertEquals(1, thirdCluster.size());
>> - for (Vector vector : thirdCluster) {
>> - Assert.assertTrue(ArrayUtils.contains(new String[] {"{1:9.0,0:9.0}"},
>> - vector.asFormatString()));
>> - }
>> - }
>> -
>> - private void assertSecondClusterWithOutlierRemoval() {
>> - Assert.assertEquals(0, secondCluster.size());
>> - }
>> -
>> - private void assertFirstClusterWithOutlierRemoval() {
>> - Assert.assertEquals(1, firstCluster.size());
>> - for (Vector vector : firstCluster) {
>> - Assert.assertTrue(ArrayUtils.contains(new String[] {"{1:1.0,0:1.0}"},
>> - vector.asFormatString()));
>> - }
>> +
>> + private void checkClustersWithOutlierRemoval() {
>> + Set<String> reference = Sets.newHashSet(new String[] {"{1:9.0,0:9.0}",
>> + "{1:1.0,0:1.0}"});
>> + int singletonCnt = 0;
>> + int emptyCnt = 0;
>> +
>> + List<List<Vector>> clusters = Lists.newArrayList();
>> + clusters.add(firstCluster);
>> + clusters.add(secondCluster);
>> + clusters.add(thirdCluster);
>> +
>> + for (List<Vector> vList : clusters) {
>> + if (vList.size() == 0) {
>> + emptyCnt++;
>> + } else {
>> + singletonCnt++;
>> + Assert.assertTrue("expecting only singleton clusters; got size=" + vList.size(),
>> + vList.size() == 1);
>> + Assert.assertTrue("not expecting cluster:" + vList.get(0).asFormatString(),
>> + reference.contains(vList.get(0).asFormatString()));
>> + reference.remove(vList.get(0).asFormatString());
>> + }
>> + }
>> + Assert.assertEquals("Different number of empty clusters than expected!", 1, emptyCnt);
>> + Assert.assertEquals("Different number of singletons than expected!", 2, singletonCnt);
>> + Assert.assertEquals("Didn't match all reference clusters!", 0, reference.size());
>> }
>> -
>> +
>> }
>>
>> Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java
>> URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java?rev=1299770&r1=1299769&r2=1299770&view=diff
>> ==============================================================================
>> --- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java (original)
>> +++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java Mon Mar 12 18:25:45 2012
>> @@ -26,6 +26,7 @@ import com.google.common.collect.Lists;
>> import com.google.common.collect.Maps;
>> import com.google.common.io.Closeables;
>> import org.apache.hadoop.conf.Configuration;
>> +import org.apache.hadoop.fs.FileStatus;
>> import org.apache.hadoop.fs.FileSystem;
>> import org.apache.hadoop.fs.Path;
>> import org.apache.hadoop.io.IntWritable;
>> @@ -38,6 +39,7 @@ import org.apache.hadoop.util.ToolRunner
>> import org.apache.mahout.clustering.AbstractCluster;
>> import org.apache.mahout.clustering.ClusterObservations;
>> import org.apache.mahout.clustering.ClusteringTestUtils;
>> +import org.apache.mahout.clustering.canopy.Canopy;
>> import org.apache.mahout.clustering.canopy.CanopyDriver;
>> import org.apache.mahout.clustering.classify.WeightedVectorWritable;
>> import org.apache.mahout.common.DummyOutputCollector;
>> @@ -486,6 +488,42 @@ public final class TestKmeansClustering
>> // now run the Canopy job
>> CanopyDriver.run(conf, pointsPath, outputPath, new ManhattanDistanceMeasure(), 3.1, 2.1, false, 0.0, false);
>>
>> + DummyOutputCollector<Text, Canopy> collector1 =
>> + new DummyOutputCollector<Text, Canopy>();
>> +
>> + FileStatus[] outParts = FileSystem.get(conf).globStatus(
>> + new Path(outputPath, "clusters-0-final/*-0*"));
>> + for (FileStatus outPartStat : outParts) {
>> + for (Pair<Text,Canopy> record :
>> + new SequenceFileIterable<Text,Canopy>(
>> + outPartStat.getPath(), conf)) {
>> + collector1.collect(record.getFirst(), record.getSecond());
>> + }
>> + }
>> +
>> + boolean got15 = false;
>> + boolean got43 = false;
>> + int count = 0;
>> + for (Text k : collector1.getKeys()) {
>> + count++;
>> + List<Canopy> vl = collector1.getValue(k);
>> + assertEquals("non-singleton centroid!", 1, vl.size());
>> + Vector v = vl.get(0).getCenter();
>> + assertEquals("cetriod vector is wrong length", 2, v.size());
>> + if ( (Math.abs(v.get(0) - 1.5) < EPSILON)
>> + && (Math.abs(v.get(1) - 1.5) < EPSILON)
>> + && !got15) {
>> + got15 = true;
>> + } else if ( (Math.abs(v.get(0) - 4.333333333333334) < EPSILON)
>> + && (Math.abs(v.get(1) - 4.333333333333334) < EPSILON)
>> + && !got43) {
>> + got43 = true;
>> + } else {
>> + assertTrue("got unexpected center: "+v+" ["+v.getClass().toString()+"]", false);
>> + }
>> + }
>> + assertEquals("got unexpected number of centers", 2, count);
>> +
>> // now run the KMeans job
>> KMeansDriver.run(pointsPath, new Path(outputPath, "clusters-0-final"), outputPath, new EuclideanDistanceMeasure(),
>> 0.001, 10, true, false);
>> @@ -500,7 +538,28 @@ public final class TestKmeansClustering
>> collector.collect(record.getFirst(), record.getSecond());
>> }
>>
>> - assertEquals("num points[0]", 4, collector.getValue(new IntWritable(0)).size());
>> - assertEquals("num points[1]", 5, collector.getValue(new IntWritable(1)).size());
>> + boolean gotLowClust = false; // clusters should be [1, *] and [2, *]
>> + boolean gotHighClust = false; // vs [3 , *], [4 , *] and [5, *]
>> + for (IntWritable k : collector.getKeys()) {
>> + List<WeightedVectorWritable> wvList = collector.getValue(k);
>> + assertTrue("empty cluster!", wvList.size() != 0);
>> + if (wvList.get(0).getVector().get(0) <= 2.0) {
>> + for (WeightedVectorWritable wv : wvList) {
>> + Vector v = wv.getVector();
>> + int idx = v.maxValueIndex();
>> + assertTrue("bad cluster!", v.get(idx)<= 2.0);
>> + }
>> + assertEquals("Wrong size cluster", 4, wvList.size());
>> + gotLowClust= true;
>> + } else {
>> + for (WeightedVectorWritable wv : wvList) {
>> + Vector v = wv.getVector();
>> + int idx = v.minValueIndex();
>> + assertTrue("bad cluster!", v.get(idx)> 2.0);
>> + }
>> + assertEquals("Wrong size cluster", 5, wvList.size());
>> + gotHighClust= true;
>> + }
>> + }
>> }
>> }
>>
>> Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java
>> URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java?rev=1299770&r1=1299769&r2=1299770&view=diff
>> ==============================================================================
>> --- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java (original)
>> +++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java Mon Mar 12 18:25:45 2012
>> @@ -21,10 +21,12 @@ import java.util.Collection;
>> import java.util.Iterator;
>> import java.util.List;
>> import java.util.Map;
>> +import java.util.Random;
>>
>> import com.google.common.collect.Lists;
>> import com.google.common.collect.Maps;
>> import org.apache.hadoop.conf.Configuration;
>> +import org.apache.hadoop.fs.FileStatus;
>> import org.apache.hadoop.fs.FileSystem;
>> import org.apache.hadoop.fs.Path;
>> import org.apache.hadoop.io.Text;
>> @@ -350,7 +352,13 @@ public final class TestMeanShift extends
>> Configuration conf = new Configuration();
>> FileSystem fs = FileSystem.get(input.toUri(), conf);
>> Collection<VectorWritable> points = Lists.newArrayList();
>> - for (Vector v : raw) {
>> + Random r = new Random(123);
>> + Vector[] permutedRaw = new Vector[raw.length];
>> + for (int i = 0; i < raw.length; i++)
>> + permutedRaw[i] = raw[i];
>> + for (int i = 0; i < permutedRaw.length; i++)
>> + permutedRaw[i] = permutedRaw[i + r.nextInt(raw.length - i)];
>> + for (Vector v : permutedRaw) {
>> points.add(new VectorWritable(v));
>> }
>> ClusteringTestUtils.writePointsToFile(points,
>> @@ -376,10 +384,12 @@ public final class TestMeanShift extends
>> optKey(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION), "0.2",
>> optKey(DefaultOptionCreator.OVERWRITE_OPTION) };
>> ToolRunner.run(conf, new MeanShiftCanopyDriver(), args);
>> - Path outPart = new Path(output, "clusters-4-final/part-r-00000");
>> - long count = HadoopUtil.countRecords(outPart, conf);
>> - assertEquals("count", 3, count);
>> - outPart = new Path(output, "clusters-0/part-m-00000");
>> + FileStatus[] outParts = FileSystem.get(conf).globStatus(
>> + new Path(output, "clusters-?-final/part-r-*"));
>> + assertEquals("Wrong number of matching final parts", 1, outParts.length);
>> + long count = HadoopUtil.countRecords(outParts[0].getPath(), conf);
>> + assertEquals("count", 5, count);
>> + Path outPart = new Path(output, "clusters-0/part-m-00000");
>> Iterator<?> iterator = new SequenceFileValueIterator<Writable>(outPart,
>> true, conf);
>> // now test the initial clusters to ensure the type of their centers has
>>
>> Modified: mahout/trunk/core/src/test/java/org/apache/mahout/common/DummyCounter.java
>> URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/common/DummyCounter.java?rev=1299770&r1=1299769&r2=1299770&view=diff
>> ==============================================================================
>> --- mahout/trunk/core/src/test/java/org/apache/mahout/common/DummyCounter.java (original)
>> +++ mahout/trunk/core/src/test/java/org/apache/mahout/common/DummyCounter.java Mon Mar 12 18:25:45 2012
>> @@ -1,26 +0,0 @@
>> -/**
>> - * Licensed to the Apache Software Foundation (ASF) under one
>> - * or more contributor license agreements. See the NOTICE file
>> - * distributed with this work for additional information
>> - * regarding copyright ownership. The ASF licenses this file
>> - * to you under the Apache License, Version 2.0 (the
>> - * "License"); you may not use this file except in compliance
>> - * with the License. You may obtain a copy of the License at
>> - *
>> - * http://www.apache.org/licenses/LICENSE-2.0
>> - *
>> - * Unless required by applicable law or agreed to in writing,
>> - * software distributed under the License is distributed on an
>> - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
>> - * KIND, either express or implied. See the License for the
>> - * specific language governing permissions and limitations
>> - * under the License.
>> - */
>> -
>> -package org.apache.mahout.common;
>> -
>> -import org.apache.hadoop.mapreduce.Counter;
>> -
>> -final class DummyCounter extends Counter {
>> -
>> -}
>>
>> Modified: mahout/trunk/core/src/test/java/org/apache/mahout/common/DummyRecordWriter.java
>> URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/common/DummyRecordWriter.java?rev=1299770&r1=1299769&r2=1299770&view=diff
>> ==============================================================================
>> --- mahout/trunk/core/src/test/java/org/apache/mahout/common/DummyRecordWriter.java (original)
>> +++ mahout/trunk/core/src/test/java/org/apache/mahout/common/DummyRecordWriter.java Mon Mar 12 18:25:45 2012
>> @@ -17,16 +17,21 @@
>>
>> package org.apache.mahout.common;
>>
>> +import com.google.common.collect.Lists;
>> +
>> import java.io.IOException;
>> +import java.lang.reflect.Constructor;
>> +import java.lang.reflect.Method;
>> import java.util.List;
>> import java.util.Map;
>> import java.util.Set;
>> import java.util.TreeMap;
>>
>> -import com.google.common.collect.Lists;
>> import org.apache.hadoop.conf.Configuration;
>> +import org.apache.hadoop.mapreduce.MapContext;
>> import org.apache.hadoop.mapreduce.Mapper;
>> import org.apache.hadoop.mapreduce.RecordWriter;
>> +import org.apache.hadoop.mapreduce.ReduceContext;
>> import org.apache.hadoop.mapreduce.Reducer;
>> import org.apache.hadoop.mapreduce.TaskAttemptContext;
>> import org.apache.hadoop.mapreduce.TaskAttemptID;
>> @@ -65,7 +70,18 @@ public final class DummyRecordWriter<K,
>> Configuration configuration,
>> RecordWriter<K2, V2> output)
>> throws IOException, InterruptedException {
>> - return mapper.new Context(configuration, new TaskAttemptID(), null, output, null, new DummyStatusReporter(), null);
>> +
>> + // Use reflection since the context types changed incompatibly between 0.20
>> + // and 0.23.
>> + try {
>> + return buildNewMapperContext(configuration, output);
>> + } catch (Exception e) {
>> + try {
>> + return buildOldMapperContext(mapper, configuration, output);
>> + } catch (Exception ex) {
>> + throw new IllegalStateException(ex);
>> + }
>> + }
>> }
>>
>> public static <K1, V1, K2, V2> Reducer<K1, V1, K2, V2>.Context build(Reducer<K1, V1, K2, V2> reducer,
>> @@ -74,17 +90,96 @@ public final class DummyRecordWriter<K,
>> Class<K1> keyClass,
>> Class<V1> valueClass)
>> throws IOException, InterruptedException {
>> - return reducer.new Context(configuration,
>> - new TaskAttemptID(),
>> - new MockIterator(),
>> - null,
>> - null,
>> - output,
>> - null,
>> - new DummyStatusReporter(),
>> - null,
>> - keyClass,
>> - valueClass);
>> +
>> + // Use reflection since the context types changed incompatibly between 0.20
>> + // and 0.23.
>> + try {
>> + return buildNewReducerContext(configuration, output, keyClass, valueClass);
>> + } catch (Exception e) {
>> + try {
>> + return buildOldReducerContext(reducer, configuration, output, keyClass, valueClass);
>> + } catch (Exception ex) {
>> + throw new IllegalStateException(ex);
>> + }
>> + }
>> + }
>> +
>> + @SuppressWarnings({ "unchecked", "rawtypes" })
>> + private static <K1, V1, K2, V2> Mapper<K1, V1, K2, V2>.Context buildNewMapperContext(
>> + Configuration configuration, RecordWriter<K2, V2> output) throws Exception {
>> + Class<?> mapContextImplClass = Class.forName("org.apache.hadoop.mapreduce.task.MapContextImpl");
>> + Constructor<?> cons = mapContextImplClass.getConstructors()[0];
>> + Object mapContextImpl = cons.newInstance(configuration,
>> + new TaskAttemptID(), null, output, null, new DummyStatusReporter(), null);
>> +
>> + Class<?> wrappedMapperClass = Class.forName("org.apache.hadoop.mapreduce.lib.map.WrappedMapper");
>> + Object wrappedMapper = wrappedMapperClass.newInstance();
>> + Method getMapContext = wrappedMapperClass.getMethod("getMapContext", MapContext.class);
>> + return (Mapper.Context) getMapContext.invoke(wrappedMapper, mapContextImpl);
>> + }
>> +
>> + @SuppressWarnings({ "unchecked", "rawtypes" })
>> + private static <K1, V1, K2, V2> Mapper<K1, V1, K2, V2>.Context buildOldMapperContext(
>> + Mapper<K1, V1, K2, V2> mapper, Configuration configuration,
>> + RecordWriter<K2, V2> output) throws Exception {
>> + Constructor<?> cons = getNestedContextConstructor(mapper.getClass());
>> + // first argument to the constructor is the enclosing instance
>> + return (Mapper.Context) cons.newInstance(mapper, configuration,
>> + new TaskAttemptID(), null, output, null, new DummyStatusReporter(), null);
>> + }
>> +
>> + @SuppressWarnings({ "unchecked", "rawtypes" })
>> + private static <K1, V1, K2, V2> Reducer<K1, V1, K2, V2>.Context buildNewReducerContext(
>> + Configuration configuration, RecordWriter<K2, V2> output, Class<K1> keyClass,
>> + Class<V1> valueClass) throws Exception {
>> + Class<?> reduceContextImplClass = Class.forName("org.apache.hadoop.mapreduce.task.ReduceContextImpl");
>> + Constructor<?> cons = reduceContextImplClass.getConstructors()[0];
>> + Object reduceContextImpl = cons.newInstance(configuration,
>> + new TaskAttemptID(),
>> + new MockIterator(),
>> + null,
>> + null,
>> + output,
>> + null,
>> + new DummyStatusReporter(),
>> + null,
>> + keyClass,
>> + valueClass);
>> +
>> + Class<?> wrappedReducerClass = Class.forName("org.apache.hadoop.mapreduce.lib.reduce.WrappedReducer");
>> + Object wrappedReducer = wrappedReducerClass.newInstance();
>> + Method getReducerContext = wrappedReducerClass.getMethod("getReducerContext", ReduceContext.class);
>> + return (Reducer.Context) getReducerContext.invoke(wrappedReducer, reduceContextImpl);
>> + }
>> +
>> + @SuppressWarnings({ "unchecked", "rawtypes" })
>> + private static <K1, V1, K2, V2> Reducer<K1, V1, K2, V2>.Context buildOldReducerContext(
>> + Reducer<K1, V1, K2, V2> reducer, Configuration configuration,
>> + RecordWriter<K2, V2> output, Class<K1> keyClass,
>> + Class<V1> valueClass) throws Exception {
>> + Constructor<?> cons = getNestedContextConstructor(reducer.getClass());
>> + // first argument to the constructor is the enclosing instance
>> + return (Reducer.Context) cons.newInstance(reducer,
>> + configuration,
>> + new TaskAttemptID(),
>> + new MockIterator(),
>> + null,
>> + null,
>> + output,
>> + null,
>> + new DummyStatusReporter(),
>> + null,
>> + keyClass,
>> + valueClass);
>> + }
>> +
>> + private static Constructor<?> getNestedContextConstructor(Class<?> outerClass) {
>> + for (Class<?> nestedClass : outerClass.getClasses()) {
>> + if ("Context".equals(nestedClass.getSimpleName())) {
>> + return nestedClass.getConstructors()[0];
>> + }
>> + }
>> + throw new IllegalStateException("Cannot find context class for " + outerClass);
>> }
>>
>> }
>>
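The nested try/catch in DummyRecordWriter.build() is effectively a classpath
probe; written out on its own, the check would look something like this
(sketch only, not part of the patch):

    /** True when the 0.23-style mapreduce context classes are present. */
    private static boolean onHadoop023Classpath() {
      try {
        // 0.23: Context became an interface, implemented by
        // task.MapContextImpl and exposed via lib.map.WrappedMapper
        Class.forName("org.apache.hadoop.mapreduce.task.MapContextImpl");
        return true;
      } catch (ClassNotFoundException e) {
        // 0.20: Context is a concrete nested class of Mapper/Reducer
        return false;
      }
    }
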
>> Modified: mahout/trunk/core/src/test/java/org/apache/mahout/common/DummyStatusReporter.java
>> URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/common/DummyStatusReporter.java?rev=1299770&r1=1299769&r2=1299770&view=diff
>> ==============================================================================
>> --- mahout/trunk/core/src/test/java/org/apache/mahout/common/DummyStatusReporter.java (original)
>> +++ mahout/trunk/core/src/test/java/org/apache/mahout/common/DummyStatusReporter.java Mon Mar 12 18:25:45 2012
>> @@ -19,6 +19,8 @@
>>
>> package org.apache.mahout.common;
>>
>> +import static org.easymock.EasyMock.createMockBuilder;
>> +
>> import java.util.Map;
>>
>> import com.google.common.collect.Maps;
>> @@ -30,10 +32,21 @@ public final class DummyStatusReporter e
>> private final Map<Enum<?>, Counter> counters = Maps.newHashMap();
>> private final Map<String, Counter> counterGroups = Maps.newHashMap();
>>
>> + private Counter newCounter() {
>> + try {
>> + // 0.23 case
>> + String c = "org.apache.hadoop.mapreduce.counters.GenericCounter";
>> + return (Counter) createMockBuilder(Class.forName(c)).createMock();
>> + } catch (ClassNotFoundException e) {
>> + // 0.20 case
>> + return createMockBuilder(Counter.class).createMock();
>> + }
>> + }
>> +
>> @Override
>> public Counter getCounter(Enum<?> name) {
>> if (!counters.containsKey(name)) {
>> - counters.put(name, new DummyCounter());
>> + counters.put(name, newCounter());
>> }
>> return counters.get(name);
>> }
>> @@ -42,7 +55,7 @@ public final class DummyStatusReporter e
>> @Override
>> public Counter getCounter(String group, String name) {
>> if (!counterGroups.containsKey(group + name)) {
>> - counterGroups.put(group + name, new DummyCounter());
>> + counterGroups.put(group + name, newCounter());
>> }
>> return counterGroups.get(group+name);
>> }
>> @@ -55,4 +68,8 @@ public final class DummyStatusReporter e
>> public void setStatus(String status) {
>> }
>>
>> + public float getProgress() {
>> + return 0;
>> + }
>> +
>> }
>>
>> Modified: mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/TestDistributedRowMatrix.java
>> URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/TestDistributedRowMatrix.java?rev=1299770&r1=1299769&r2=1299770&view=diff
>> ==============================================================================
>> --- mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/TestDistributedRowMatrix.java (original)
>> +++ mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/TestDistributedRowMatrix.java Mon Mar 12 18:25:45 2012
>> @@ -26,6 +26,7 @@ import org.apache.hadoop.fs.FileStatus;
>> import org.apache.hadoop.fs.FileSystem;
>> import org.apache.hadoop.fs.Path;
>> import org.apache.mahout.clustering.ClusteringTestUtils;
>> +import org.apache.mahout.common.HadoopUtil;
>> import org.apache.mahout.common.MahoutTestCase;
>> import org.apache.mahout.common.iterator.sequencefile.PathFilters;
>> import org.apache.mahout.math.DenseVector;
>> @@ -254,14 +255,14 @@ public final class TestDistributedRowMat
>>
>> deleteContentsOfPath(conf, outputPath);
>>
>> - assertEquals(0, fs.listStatus(outputPath).length);
>> + assertEquals(0, HadoopUtil.listStatus(fs, outputPath).length);
>>
>> Vector result1 = dm.times(v);
>>
>> - assertEquals(0, fs.listStatus(outputPath).length);
>> + assertEquals(0, HadoopUtil.listStatus(fs, outputPath).length);
>>
>> deleteContentsOfPath(conf, outputPath);
>> - assertEquals(0, fs.listStatus(outputPath).length);
>> + assertEquals(0, HadoopUtil.listStatus(fs, outputPath).length);
>>
>> conf.setBoolean(DistributedRowMatrix.KEEP_TEMP_FILES, true);
>> dm.setConf(conf);
>> @@ -291,14 +292,14 @@ public final class TestDistributedRowMat
>>
>> deleteContentsOfPath(conf, outputPath);
>>
>> - assertEquals(0, fs.listStatus(outputPath).length);
>> + assertEquals(0, HadoopUtil.listStatus(fs, outputPath).length);
>>
>> Vector result1 = dm.timesSquared(v);
>>
>> - assertEquals(0, fs.listStatus(outputPath).length);
>> + assertEquals(0, HadoopUtil.listStatus(fs, outputPath).length);
>>
>> deleteContentsOfPath(conf, outputPath);
>> - assertEquals(0, fs.listStatus(outputPath).length);
>> + assertEquals(0, HadoopUtil.listStatus(fs, outputPath).length);
>>
>> conf.setBoolean(DistributedRowMatrix.KEEP_TEMP_FILES, true);
>> dm.setConf(conf);
>> @@ -325,7 +326,7 @@ public final class TestDistributedRowMat
>> private static void deleteContentsOfPath(Configuration conf, Path path) throws Exception {
>> FileSystem fs = path.getFileSystem(conf);
>>
>> - FileStatus[] statuses = fs.listStatus(path);
>> + FileStatus[] statuses = HadoopUtil.listStatus(fs, path);
>> for (FileStatus status : statuses) {
>> fs.delete(status.getPath(), true);
>> }
>>
>> Modified: mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
>> URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java?rev=1299770&r1=1299769&r2=1299770&view=diff
>> ==============================================================================
>> --- mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java (original)
>> +++ mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java Mon Mar 12 18:25:45 2012
>> @@ -193,7 +193,7 @@ public final class TestClusterDumper ext
>> output, measure, 8, 4, true, 0.0, true);
>> // run ClusterDumper
>> ClusterDumper clusterDumper = new ClusterDumper(new Path(output,
>> - "clusters-0"), new Path(output, "clusteredPoints"));
>> + "clusters-0-final"), new Path(output, "clusteredPoints"));
>> clusterDumper.printClusters(termDictionary);
>> }
>>
>>
>> Modified: mahout/trunk/pom.xml
>> URL: http://svn.apache.org/viewvc/mahout/trunk/pom.xml?rev=1299770&r1=1299769&r2=1299770&view=diff
>> ==============================================================================
>> --- mahout/trunk/pom.xml (original)
>> +++ mahout/trunk/pom.xml Mon Mar 12 18:25:45 2012
>> @@ -107,6 +107,17 @@
>> <url>https://issues.apache.org/jira/browse/MAHOUT</url>
>> </issueManagement>
>>
>> +<repositories>
>> +<repository>
>> +<id>apache.snapshots</id>
>> +<name>Apache Snapshot Repository</name>
>> +<url>http://repository.apache.org/snapshots</url>
>> +<releases>
>> +<enabled>false</enabled>
>> +</releases>
>> +</repository>
>> +</repositories>
>> +
>> <dependencyManagement>
>> <dependencies>
>>
>> @@ -264,6 +275,100 @@
>> </exclusions>
>> </dependency>
>> <dependency>
>> +<groupId>org.apache.hadoop</groupId>
>> +<artifactId>hadoop-common</artifactId>
>> +<version>${hadoop.version}</version>
>> +<exclusions>
>> +<exclusion>
>> +<groupId>net.sf.kosmosfs</groupId>
>> +<artifactId>kfs</artifactId>
>> +</exclusion>
>> +<exclusion>
>> +<groupId>org.mortbay.jetty</groupId>
>> +<artifactId>jetty</artifactId>
>> +</exclusion>
>> +<exclusion>
>> +<groupId>org.mortbay.jetty</groupId>
>> +<artifactId>jetty-util</artifactId>
>> +</exclusion>
>> +<exclusion>
>> +<groupId>hsqldb</groupId>
>> +<artifactId>hsqldb</artifactId>
>> +</exclusion>
>> +<exclusion>
>> +<groupId>commons-el</groupId>
>> +<artifactId>commons-el</artifactId>
>> +</exclusion>
>> +<exclusion>
>> +<groupId>junit</groupId>
>> +<artifactId>junit</artifactId>
>> +</exclusion>
>> +<exclusion>
>> +<groupId>oro</groupId>
>> +<artifactId>oro</artifactId>
>> +</exclusion>
>> +<exclusion>
>> +<groupId>org.mortbay.jetty</groupId>
>> +<artifactId>jsp-2.1</artifactId>
>> +</exclusion>
>> +<exclusion>
>> +<groupId>org.mortbay.jetty</groupId>
>> +<artifactId>jsp-api-2.1</artifactId>
>> +</exclusion>
>> +<exclusion>
>> +<groupId>org.mortbay.jetty</groupId>
>> +<artifactId>servlet-api-2.5</artifactId>
>> +</exclusion>
>> +<exclusion>
>> +<groupId>commons-net</groupId>
>> +<artifactId>commons-net</artifactId>
>> +</exclusion>
>> +<exclusion>
>> +<groupId>tomcat</groupId>
>> +<artifactId>jasper-runtime</artifactId>
>> +</exclusion>
>> +<exclusion>
>> +<groupId>tomcat</groupId>
>> +<artifactId>jasper-compiler</artifactId>
>> +</exclusion>
>> +<exclusion>
>> +<groupId>xmlenc</groupId>
>> +<artifactId>xmlenc</artifactId>
>> +</exclusion>
>> +<exclusion>
>> +<groupId>net.java.dev.jets3t</groupId>
>> +<artifactId>jets3t</artifactId>
>> +</exclusion>
>> +<exclusion>
>> +<groupId>org.eclipse.jdt</groupId>
>> +<artifactId>core</artifactId>
>> +</exclusion>
>> +<exclusion>
>> +<groupId>org.slf4j</groupId>
>> +<artifactId>slf4j-api</artifactId>
>> +</exclusion>
>> +<exclusion>
>> +<groupId>org.slf4j</groupId>
>> +<artifactId>slf4j-jcl</artifactId>
>> +</exclusion>
>> +<exclusion>
>> +<groupId>org.slf4j</groupId>
>> +<artifactId>slf4j-log4j12</artifactId>
>> +</exclusion>
>> +</exclusions>
>> +</dependency>
>> +<dependency>
>> +<groupId>org.apache.hadoop</groupId>
>> +<artifactId>hadoop-mapreduce-client-core</artifactId>
>> +<version>${hadoop.version}</version>
>> +</dependency>
>> +<dependency>
>> +<groupId>org.apache.hadoop</groupId>
>> +<artifactId>hadoop-mapreduce-client-common</artifactId>
>> +<version>${hadoop.version}</version>
>> +</dependency>
>> +
>> +<dependency>
>> <groupId>org.codehaus.jackson</groupId>
>> <artifactId>jackson-core-asl</artifactId>
>> <version>1.8.2</version>
>>
>>
>
--------------------------------------------
Grant Ingersoll
http://www.lucidimagination.com