Posted to dev@mahout.apache.org by Lance Norskog <go...@gmail.com> on 2012/03/13 04:56:30 UTC

Re: svn commit: r1299770 - in /mahout/trunk: ./ core/ core/src/main/java/org/apache/mahout/common/ core/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/ core/src/test/java/org/apache/mahout/clustering/canopy/ core/src/test/java/org/ap

Wow! This is a great leap forward in the Grand Hadoop API F-Up Saga.
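
If I'm reading the new profiles right, the old hadoop-core dependency stays the
default and the 0.23 artifacts (hadoop-common plus the two mapreduce-client jars)
only kick in when the hadoop.version property is defined, so something like

    mvn clean install -Dhadoop.version=0.23.1

should build against 0.23, while a plain 'mvn install' behaves as before.
(Corrections welcome if I've misread the activation blocks.) The
HadoopUtil.listStatus() wrappers are a nice touch too: catching the
FileNotFoundException that newer FileSystem.listStatus() throws for a missing
path and returning an empty array keeps the old "no files yet" behavior the
tests rely on.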

On Mon, Mar 12, 2012 at 12:12 PM, Grant Ingersoll <gs...@apache.org> wrote:
> Done (tcp@a.o)
>
> On Mar 12, 2012, at 2:30 PM, tom pierce wrote:
>
>> Can someone hook me up with JIRA privs so I can close tickets?
>>
>> (Or, if that isn't something all committers get, someone pls mark -822 and -980 closed)
>>
>> -tom
>>
>> On 03/12/2012 02:25 PM, tcp@apache.org wrote:
>>> Author: tcp
>>> Date: Mon Mar 12 18:25:45 2012
>>> New Revision: 1299770
>>>
>>> URL: http://svn.apache.org/viewvc?rev=1299770&view=rev
>>> Log:
>>> MAHOUT-822: Make Mahout compatible with Hadoop 0.23.1.
>>>
>>> Modified:
>>>     mahout/trunk/core/pom.xml
>>>     mahout/trunk/core/src/main/java/org/apache/mahout/common/HadoopUtil.java
>>>     mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/MockContext.java
>>>     mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/PartialSequentialBuilder.java
>>>     mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/Step1MapperTest.java
>>>     mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java
>>>     mahout/trunk/core/src/test/java/org/apache/mahout/clustering/classify/ClusterClassificationDriverTest.java
>>>     mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java
>>>     mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java
>>>     mahout/trunk/core/src/test/java/org/apache/mahout/common/DummyCounter.java
>>>     mahout/trunk/core/src/test/java/org/apache/mahout/common/DummyRecordWriter.java
>>>     mahout/trunk/core/src/test/java/org/apache/mahout/common/DummyStatusReporter.java
>>>     mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/TestDistributedRowMatrix.java
>>>     mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
>>>     mahout/trunk/pom.xml
>>>
>>> Modified: mahout/trunk/core/pom.xml
>>> URL: http://svn.apache.org/viewvc/mahout/trunk/core/pom.xml?rev=1299770&r1=1299769&r2=1299770&view=diff
>>> ==============================================================================
>>> --- mahout/trunk/core/pom.xml (original)
>>> +++ mahout/trunk/core/pom.xml Mon Mar 12 18:25:45 2012
>>> @@ -140,10 +140,6 @@
>>>
>>>      <!-- Third Party -->
>>>      <dependency>
>>> -      <groupId>org.apache.hadoop</groupId>
>>> -      <artifactId>hadoop-core</artifactId>
>>> -    </dependency>
>>> -    <dependency>
>>>        <groupId>org.codehaus.jackson</groupId>
>>>        <artifactId>jackson-core-asl</artifactId>
>>>      </dependency>
>>> @@ -211,4 +207,43 @@
>>>      </dependency>
>>>
>>>    </dependencies>
>>> +
>>> +  <profiles>
>>> +    <profile>
>>> +      <id>hadoop-0.20</id>
>>> +      <activation>
>>> +        <property>
>>> +          <name>!hadoop.version</name>
>>> +        </property>
>>> +      </activation>
>>> +      <dependencies>
>>> +        <dependency>
>>> +          <groupId>org.apache.hadoop</groupId>
>>> +          <artifactId>hadoop-core</artifactId>
>>> +        </dependency>
>>> +      </dependencies>
>>> +    </profile>
>>> +    <profile>
>>> +      <id>hadoop-0.23</id>
>>> +      <activation>
>>> +        <property>
>>> +          <name>hadoop.version</name>
>>> +        </property>
>>> +      </activation>
>>> +      <dependencies>
>>> +        <dependency>
>>> +          <groupId>org.apache.hadoop</groupId>
>>> +          <artifactId>hadoop-common</artifactId>
>>> +        </dependency>
>>> +        <dependency>
>>> +          <groupId>org.apache.hadoop</groupId>
>>> +          <artifactId>hadoop-mapreduce-client-common</artifactId>
>>> +        </dependency>
>>> +        <dependency>
>>> +          <groupId>org.apache.hadoop</groupId>
>>> +          <artifactId>hadoop-mapreduce-client-core</artifactId>
>>> +        </dependency>
>>> +      </dependencies>
>>> +    </profile>
>>> +  </profiles>
>>>  </project>
>>>
>>> Modified: mahout/trunk/core/src/main/java/org/apache/mahout/common/HadoopUtil.java
>>> URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/common/HadoopUtil.java?rev=1299770&r1=1299769&r2=1299770&view=diff
>>> ==============================================================================
>>> --- mahout/trunk/core/src/main/java/org/apache/mahout/common/HadoopUtil.java (original)
>>> +++ mahout/trunk/core/src/main/java/org/apache/mahout/common/HadoopUtil.java Mon Mar 12 18:25:45 2012
>>> @@ -17,6 +17,7 @@
>>>
>>>  package org.apache.mahout.common;
>>>
>>> +import java.io.FileNotFoundException;
>>>  import java.io.IOException;
>>>  import java.io.InputStream;
>>>  import java.net.URI;
>>> @@ -229,9 +230,9 @@ public final class HadoopUtil {
>>>      FileStatus[] statuses;
>>>      FileSystem fs = path.getFileSystem(conf);
>>>      if (filter == null) {
>>> -      statuses = pathType == PathType.GLOB ? fs.globStatus(path) : fs.listStatus(path);
>>> +      statuses = pathType == PathType.GLOB ? fs.globStatus(path) : listStatus(fs, path);
>>>      } else {
>>> -      statuses = pathType == PathType.GLOB ? fs.globStatus(path, filter) : fs.listStatus(path, filter);
>>> +      statuses = pathType == PathType.GLOB ? fs.globStatus(path, filter) : listStatus(fs, path, filter);
>>>      }
>>>      if (ordering != null) {
>>>        Arrays.sort(statuses, ordering);
>>> @@ -239,6 +240,22 @@ public final class HadoopUtil {
>>>      return statuses;
>>>    }
>>>
>>> +  public static FileStatus[] listStatus(FileSystem fs, Path path) throws IOException {
>>> +    try {
>>> +      return fs.listStatus(path);
>>> +    } catch (FileNotFoundException e) {
>>> +      return new FileStatus[0];
>>> +    }
>>> +  }
>>> +
>>> +  public static FileStatus[] listStatus(FileSystem fs, Path path, PathFilter filter) throws IOException {
>>> +    try {
>>> +      return fs.listStatus(path, filter);
>>> +    } catch (FileNotFoundException e) {
>>> +      return new FileStatus[0];
>>> +    }
>>> +  }
>>> +
>>>    public static void cacheFiles(Path fileToCache, Configuration conf) {
>>>      DistributedCache.setCacheFiles(new URI[]{fileToCache.toUri()}, conf);
>>>    }
>>>
>>> Modified: mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/MockContext.java
>>> URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/MockContext.java?rev=1299770&r1=1299769&r2=1299770&view=diff
>>> ==============================================================================
>>> --- mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/MockContext.java (original)
>>> +++ mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/MockContext.java Mon Mar 12 18:25:45 2012
>>> @@ -1,70 +0,0 @@
>>> -/**
>>> - * Licensed to the Apache Software Foundation (ASF) under one or more
>>> - * contributor license agreements.  See the NOTICE file distributed with
>>> - * this work for additional information regarding copyright ownership.
>>> - * The ASF licenses this file to You under the Apache License, Version 2.0
>>> - * (the "License"); you may not use this file except in compliance with
>>> - * the License.  You may obtain a copy of the License at
>>> - *
>>> - *     http://www.apache.org/licenses/LICENSE-2.0
>>> - *
>>> - * Unless required by applicable law or agreed to in writing, software
>>> - * distributed under the License is distributed on an "AS IS" BASIS,
>>> - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
>>> - * See the License for the specific language governing permissions and
>>> - * limitations under the License.
>>> - */
>>> -package org.apache.mahout.classifier.df.mapreduce.partial;
>>> -
>>> -import java.io.IOException;
>>> -
>>> -import org.apache.hadoop.conf.Configuration;
>>> -import org.apache.hadoop.mapreduce.Mapper;
>>> -import org.apache.hadoop.mapreduce.TaskAttemptID;
>>> -import org.apache.hadoop.mapreduce.Mapper.Context;
>>> -import org.apache.mahout.classifier.df.mapreduce.MapredOutput;
>>> -
>>> -/**
>>> - * Special implementation that collects the output of the mappers
>>> - */
>>> -final class MockContext extends Context {
>>> -
>>> -  private final TreeID[] keys;
>>> -  private final MapredOutput[] values;
>>> -  private int index;
>>> -
>>> -  MockContext(Mapper<?,?,?,?>  mapper, Configuration conf, TaskAttemptID taskid, int nbTrees)
>>> -    throws IOException, InterruptedException {
>>> -    mapper.super(conf, taskid, null, null, null, null, null);
>>> -
>>> -    keys = new TreeID[nbTrees];
>>> -    values = new MapredOutput[nbTrees];
>>> -  }
>>> -
>>> -  @Override
>>> -  public void write(Object key, Object value) throws IOException {
>>> -    if (index == keys.length) {
>>> -      throw new IOException("Received more output than expected : " + index);
>>> -    }
>>> -
>>> -    keys[index] = ((TreeID) key).clone();
>>> -    values[index] = ((MapredOutput) value).clone();
>>> -
>>> -    index++;
>>> -  }
>>> -
>>> -  /**
>>> -   * @return number of outputs collected
>>> -   */
>>> -  public int nbOutputs() {
>>> -    return index;
>>> -  }
>>> -
>>> -  public TreeID[] getKeys() {
>>> -    return keys;
>>> -  }
>>> -
>>> -  public MapredOutput[] getValues() {
>>> -    return values;
>>> -  }
>>> -}
>>>
>>> Modified: mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/PartialSequentialBuilder.java
>>> URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/PartialSequentialBuilder.java?rev=1299770&r1=1299769&r2=1299770&view=diff
>>> ==============================================================================
>>> --- mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/PartialSequentialBuilder.java (original)
>>> +++ mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/PartialSequentialBuilder.java Mon Mar 12 18:25:45 2012
>>> @@ -1,176 +0,0 @@
>>> -/**
>>> - * Licensed to the Apache Software Foundation (ASF) under one or more
>>> - * contributor license agreements.  See the NOTICE file distributed with
>>> - * this work for additional information regarding copyright ownership.
>>> - * The ASF licenses this file to You under the Apache License, Version 2.0
>>> - * (the "License"); you may not use this file except in compliance with
>>> - * the License.  You may obtain a copy of the License at
>>> - *
>>> - *     http://www.apache.org/licenses/LICENSE-2.0
>>> - *
>>> - * Unless required by applicable law or agreed to in writing, software
>>> - * distributed under the License is distributed on an "AS IS" BASIS,
>>> - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
>>> - * See the License for the specific language governing permissions and
>>> - * limitations under the License.
>>> - */
>>> -
>>> -package org.apache.mahout.classifier.df.mapreduce.partial;
>>> -
>>> -import java.io.IOException;
>>> -import java.util.List;
>>> -
>>> -import org.apache.commons.lang.ArrayUtils;
>>> -import org.apache.hadoop.conf.Configuration;
>>> -import org.apache.hadoop.fs.Path;
>>> -import org.apache.hadoop.io.LongWritable;
>>> -import org.apache.hadoop.io.Text;
>>> -import org.apache.hadoop.mapreduce.InputSplit;
>>> -import org.apache.hadoop.mapreduce.Job;
>>> -import org.apache.hadoop.mapreduce.RecordReader;
>>> -import org.apache.hadoop.mapreduce.TaskAttemptContext;
>>> -import org.apache.hadoop.mapreduce.TaskAttemptID;
>>> -import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
>>> -import org.apache.mahout.classifier.df.DFUtils;
>>> -import org.apache.mahout.classifier.df.DecisionForest;
>>> -import org.apache.mahout.classifier.df.builder.TreeBuilder;
>>> -import org.apache.mahout.classifier.df.data.Dataset;
>>> -import org.apache.mahout.classifier.df.mapreduce.Builder;
>>> -import org.apache.mahout.classifier.df.mapreduce.MapredOutput;
>>> -import org.apache.mahout.classifier.df.node.Node;
>>> -import org.slf4j.Logger;
>>> -import org.slf4j.LoggerFactory;
>>> -
>>> -import com.google.common.collect.Lists;
>>> -
>>> -/**
>>> - * Simulates the Partial mapreduce implementation in a sequential manner. Must
>>> - * receive a seed
>>> - */
>>> -public class PartialSequentialBuilder extends PartialBuilder {
>>> -
>>> -  private static final Logger log = LoggerFactory.getLogger(PartialSequentialBuilder.class);
>>> -
>>> -  private MockContext firstOutput;
>>> -
>>> -  private final Dataset dataset;
>>> -
>>> -  public PartialSequentialBuilder(TreeBuilder treeBuilder, Path dataPath,
>>> -      Dataset dataset, long seed, Configuration conf) {
>>> -    super(treeBuilder, dataPath, new Path("notUsed"), seed, conf);
>>> -    this.dataset = dataset;
>>> -  }
>>> -
>>> -  public PartialSequentialBuilder(TreeBuilder treeBuilder, Path dataPath,
>>> -      Dataset dataset, long seed) {
>>> -    this(treeBuilder, dataPath, dataset, seed, new Configuration());
>>> -  }
>>> -
>>> -  @Override
>>> -  protected void configureJob(Job job)
>>> -      throws IOException {
>>> -    Configuration conf = job.getConfiguration();
>>> -
>>> -    int num = conf.getInt("mapred.map.tasks", -1);
>>> -
>>> -    super.configureJob(job);
>>> -
>>> -    // PartialBuilder sets the number of maps to 1 if we are running in 'local'
>>> -    conf.setInt("mapred.map.tasks", num);
>>> -  }
>>> -
>>> -  @Override
>>> -  protected boolean runJob(Job job) throws IOException, InterruptedException {
>>> -    Configuration conf = job.getConfiguration();
>>> -
>>> -    // retrieve the splits
>>> -    TextInputFormat input = new TextInputFormat();
>>> -    List<InputSplit>  splits = input.getSplits(job);
>>> -
>>> -    int nbSplits = splits.size();
>>> -    log.debug("Nb splits : {}", nbSplits);
>>> -
>>> -    InputSplit[] sorted = new InputSplit[nbSplits];
>>> -    splits.toArray(sorted);
>>> -    Builder.sortSplits(sorted);
>>> -
>>> -    int numTrees = Builder.getNbTrees(conf); // total number of trees
>>> -
>>> -    TaskAttemptContext task = new TaskAttemptContext(conf, new TaskAttemptID());
>>> -
>>> -    firstOutput = new MockContext(new Step1Mapper(), conf, task.getTaskAttemptID(), numTrees);
>>> -
>>> -    /* first instance id in hadoop's order */
>>> -    //int[] firstIds = new int[nbSplits];
>>> -    /* partitions' sizes in hadoop order */
>>> -    int[] sizes = new int[nbSplits];
>>> -
>>> -    // to compute firstIds, process the splits in file order
>>> -    long slowest = 0; // duration of slowest map
>>> -    int firstId = 0;
>>> -    for (InputSplit split : splits) {
>>> -      int hp = ArrayUtils.indexOf(sorted, split); // hadoop's partition
>>> -
>>> -      RecordReader<LongWritable, Text>  reader = input.createRecordReader(split, task);
>>> -      reader.initialize(split, task);
>>> -
>>> -      Step1Mapper mapper = new MockStep1Mapper(getTreeBuilder(), dataset, getSeed(),
>>> -                                               hp, nbSplits, numTrees);
>>> -
>>> -      long time = System.currentTimeMillis();
>>> -
>>> -      //firstIds[hp] = firstId;
>>> -
>>> -      while (reader.nextKeyValue()) {
>>> -        mapper.map(reader.getCurrentKey(), reader.getCurrentValue(), firstOutput);
>>> -        firstId++;
>>> -        sizes[hp]++;
>>> -      }
>>> -
>>> -      mapper.cleanup(firstOutput);
>>> -
>>> -      time = System.currentTimeMillis() - time;
>>> -      log.info("Duration : {}", DFUtils.elapsedTime(time));
>>> -
>>> -      if (time > slowest) {
>>> -        slowest = time;
>>> -      }
>>> -    }
>>> -
>>> -    log.info("Longest duration : {}", DFUtils.elapsedTime(slowest));
>>> -    return true;
>>> -  }
>>> -
>>> -  @Override
>>> -  protected DecisionForest parseOutput(Job job) throws IOException {
>>> -    return processOutput(firstOutput.getKeys(), firstOutput.getValues());
>>> -  }
>>> -
>>> -  /**
>>> -   * extract the decision forest
>>> -   */
>>> -  protected static DecisionForest processOutput(TreeID[] keys, MapredOutput[] values) {
>>> -    List<Node>  trees = Lists.newArrayList();
>>> -
>>> -    for (int index = 0; index < keys.length; index++) {
>>> -      MapredOutput value = values[index];
>>> -      trees.add(value.getTree());
>>> -    }
>>> -
>>> -    return new DecisionForest(trees);
>>> -  }
>>> -
>>> -  /**
>>> -   * Special Step1Mapper that can be configured without using a Configuration
>>> -   *
>>> -   */
>>> -  private static class MockStep1Mapper extends Step1Mapper {
>>> -    protected MockStep1Mapper(TreeBuilder treeBuilder, Dataset dataset, Long seed,
>>> -        int partition, int numMapTasks, int numTrees) {
>>> -      configure(false, treeBuilder, dataset);
>>> -      configure(seed, partition, numMapTasks, numTrees);
>>> -    }
>>> -
>>> -  }
>>> -
>>> -}
>>>
>>> Modified: mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/Step1MapperTest.java
>>> URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/Step1MapperTest.java?rev=1299770&r1=1299769&r2=1299770&view=diff
>>> ==============================================================================
>>> --- mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/Step1MapperTest.java (original)
>>> +++ mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/mapreduce/partial/Step1MapperTest.java Mon Mar 12 18:25:45 2012
>>> @@ -17,21 +17,30 @@
>>>
>>>  package org.apache.mahout.classifier.df.mapreduce.partial;
>>>
>>> +import static org.easymock.EasyMock.anyObject;
>>> +import static org.easymock.EasyMock.capture;
>>> +import static org.easymock.EasyMock.createMock;
>>> +import static org.easymock.EasyMock.expectLastCall;
>>> +import static org.easymock.EasyMock.replay;
>>> +import static org.easymock.EasyMock.verify;
>>> +
>>>  import java.util.Random;
>>>
>>> -import org.apache.hadoop.conf.Configuration;
>>>  import org.apache.hadoop.io.LongWritable;
>>>  import org.apache.hadoop.io.Text;
>>> -import org.apache.hadoop.mapreduce.TaskAttemptID;
>>> -import org.apache.mahout.common.MahoutTestCase;
>>> +import org.apache.hadoop.mapreduce.Mapper;
>>>  import org.apache.mahout.common.RandomUtils;
>>>  import org.apache.mahout.classifier.df.builder.TreeBuilder;
>>>  import org.apache.mahout.classifier.df.data.Data;
>>>  import org.apache.mahout.classifier.df.data.DataLoader;
>>>  import org.apache.mahout.classifier.df.data.Dataset;
>>>  import org.apache.mahout.classifier.df.data.Utils;
>>> +import org.apache.mahout.classifier.df.mapreduce.MapredOutput;
>>>  import org.apache.mahout.classifier.df.node.Leaf;
>>>  import org.apache.mahout.classifier.df.node.Node;
>>> +import org.apache.mahout.common.MahoutTestCase;
>>> +import org.easymock.Capture;
>>> +import org.easymock.CaptureType;
>>>  import org.junit.Test;
>>>
>>>  public final class Step1MapperTest extends MahoutTestCase {
>>> @@ -71,6 +80,17 @@ public final class Step1MapperTest exten
>>>      }
>>>    }
>>>
>>> +  private static class TreeIDCapture extends Capture<TreeID>  {
>>> +
>>> +    public TreeIDCapture() {
>>> +      super(CaptureType.ALL);
>>> +    }
>>> +
>>> +    public void setValue(final TreeID value) {
>>> +      super.setValue(value.clone());
>>> +    }
>>> +  }
>>> +
>>>    /** nb attributes per generated data instance */
>>>    static final int NUM_ATTRIBUTES = 4;
>>>
>>> @@ -83,6 +103,7 @@ public final class Step1MapperTest exten
>>>    /** nb mappers to use */
>>>    static final int NUM_MAPPERS = 2;
>>>
>>> +  @SuppressWarnings({ "rawtypes", "unchecked" })
>>>    @Test
>>>    public void testMapper() throws Exception {
>>>      Long seed = null;
>>> @@ -109,8 +130,13 @@ public final class Step1MapperTest exten
>>>        // expected number of trees that this mapper will build
>>>        int mapNbTrees = Step1Mapper.nbTrees(NUM_MAPPERS, NUM_TREES, partition);
>>>
>>> -      MockContext context = new MockContext(new Step1Mapper(),
>>> -          new Configuration(), new TaskAttemptID(), mapNbTrees);
>>> +      Mapper.Context context =
>>> +        createMock(Mapper.Context.class);
>>> +      Capture<TreeID>  capturedKeys = new TreeIDCapture();
>>> +      context.write(capture(capturedKeys), anyObject());
>>> +      expectLastCall().anyTimes();
>>> +
>>> +      replay(context);
>>>
>>>        MockStep1Mapper mapper = new MockStep1Mapper(treeBuilder, dataset, seed,
>>>            partition, NUM_MAPPERS, NUM_TREES);
>>> @@ -125,12 +151,13 @@ public final class Step1MapperTest exten
>>>        }
>>>
>>>        mapper.cleanup(context);
>>> +      verify(context);
>>>
>>>        // make sure the mapper built all its trees
>>> -      assertEquals(mapNbTrees, context.nbOutputs());
>>> +      assertEquals(mapNbTrees, capturedKeys.getValues().size());
>>>
>>>        // check the returned keys
>>> -      for (TreeID k : context.getKeys()) {
>>> +      for (TreeID k : capturedKeys.getValues()) {
>>>          assertEquals(partition, k.partition());
>>>          assertEquals(treeIndex, k.treeId());
>>>
>>>
>>> Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java
>>> URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java?rev=1299770&r1=1299769&r2=1299770&view=diff
>>> ==============================================================================
>>> --- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java (original)
>>> +++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java Mon Mar 12 18:25:45 2012
>>> @@ -34,6 +34,7 @@ import org.apache.mahout.clustering.Clus
>>>  import org.apache.mahout.common.DummyRecordWriter;
>>>  import org.apache.mahout.common.HadoopUtil;
>>>  import org.apache.mahout.common.MahoutTestCase;
>>> +import org.apache.mahout.common.Pair;
>>>  import org.apache.mahout.common.commandline.DefaultOptionCreator;
>>>  import org.apache.mahout.common.distance.DistanceMeasure;
>>>  import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
>>> @@ -126,8 +127,8 @@ public final class TestCanopyCreation ex
>>>        int[] expectedNumPoints = { 4, 4, 3 };
>>>        double[][] expectedCentroids = { { 1.5, 1.5 }, { 4.0, 4.0 },
>>>            { 4.666666666666667, 4.6666666666666667 } };
>>> -      assertEquals("canopy points " + canopyIx, expectedNumPoints[canopyIx],
>>> -          testCanopy.getNumObservations());
>>> +      assertEquals("canopy points " + canopyIx, testCanopy.getNumObservations(),
>>> +                   expectedNumPoints[canopyIx]);
>>>        double[] refCentroid = expectedCentroids[canopyIx];
>>>        Vector testCentroid = testCanopy.computeCentroid();
>>>        for (int pointIx = 0; pointIx < refCentroid.length; pointIx++) {
>>> @@ -151,8 +152,8 @@ public final class TestCanopyCreation ex
>>>          { 4.666666666666667, 4.666666666666667 } };
>>>      for (int canopyIx = 0; canopyIx < referenceEuclidean.size(); canopyIx++) {
>>>        Canopy testCanopy = referenceEuclidean.get(canopyIx);
>>> -      assertEquals("canopy points " + canopyIx, expectedNumPoints[canopyIx],
>>> -          testCanopy.getNumObservations());
>>> +      assertEquals("canopy points " + canopyIx, testCanopy.getNumObservations(),
>>> +                   expectedNumPoints[canopyIx]);
>>>        double[] refCentroid = expectedCentroids[canopyIx];
>>>        Vector testCentroid = testCanopy.computeCentroid();
>>>        for (int pointIx = 0; pointIx < refCentroid.length; pointIx++) {
>>> @@ -328,20 +329,36 @@ public final class TestCanopyCreation ex
>>>        Canopy canopy = new Canopy();
>>>        assertTrue("more to come", reader.next(key, canopy));
>>>        assertEquals("1st key", "C-0", key.toString());
>>> -      assertEquals("1st x value", 1.5, canopy.getCenter().get(0), EPSILON);
>>> -      assertEquals("1st y value", 1.5, canopy.getCenter().get(1), EPSILON);
>>> +
>>> +      List<Pair<Double,Double>>  refCenters = Lists.newArrayList();
>>> +      refCenters.add(new Pair<Double,Double>(1.5,1.5));
>>> +      refCenters.add(new Pair<Double,Double>(4.333333333333334,4.333333333333334));
>>> +      Pair<Double,Double>  c = new Pair<Double,Double>(canopy.getCenter().get(0),
>>> +                                                      canopy.getCenter().get(1));
>>> +      assertTrue("center "+c+" not found", findAndRemove(c, refCenters, EPSILON));
>>>        assertTrue("more to come", reader.next(key, canopy));
>>>        assertEquals("2nd key", "C-1", key.toString());
>>> -      assertEquals("2nd x value", 4.333333333333334, canopy.getCenter().get(0),
>>> -          EPSILON);
>>> -      assertEquals("2nd y value", 4.333333333333334, canopy.getCenter().get(1),
>>> -          EPSILON);
>>> +      c = new Pair<Double,Double>(canopy.getCenter().get(0),
>>> +                                  canopy.getCenter().get(1));
>>> +      assertTrue("center "+c+" not found", findAndRemove(c, refCenters, EPSILON));
>>>        assertFalse("more to come", reader.next(key, canopy));
>>>      } finally {
>>>        Closeables.closeQuietly(reader);
>>>      }
>>>    }
>>>
>>> +  boolean findAndRemove(Pair<Double,Double>  target,
>>> +                        List<Pair<Double,Double>>  list, double epsilon) {
>>> +    for (Pair<Double,Double>  curr : list) {
>>> +      if ( (Math.abs(target.getFirst() - curr.getFirst()) < epsilon)
>>> +           && (Math.abs(target.getSecond() - curr.getSecond()) < epsilon) ) {
>>> +        list.remove(curr);
>>> +        return true;
>>> +      }
>>> +    }
>>> +    return false;
>>> +  }
>>> +
>>>    /**
>>>     * Story: User can produce final canopy centers using a Hadoop map/reduce job
>>>     * and a EuclideanDistanceMeasure.
>>> @@ -368,14 +385,18 @@ public final class TestCanopyCreation ex
>>>        Canopy value = new Canopy();
>>>        assertTrue("more to come", reader.next(key, value));
>>>        assertEquals("1st key", "C-0", key.toString());
>>> -      assertEquals("1st x value", 1.8, value.getCenter().get(0), EPSILON);
>>> -      assertEquals("1st y value", 1.8, value.getCenter().get(1), EPSILON);
>>> +
>>> +      List<Pair<Double,Double>>  refCenters = Lists.newArrayList();
>>> +      refCenters.add(new Pair<Double,Double>(1.8,1.8));
>>> +      refCenters.add(new Pair<Double,Double>(4.433333333333334, 4.433333333333334));
>>> +      Pair<Double,Double>  c = new Pair<Double,Double>(value.getCenter().get(0),
>>> +                                                      value.getCenter().get(1));
>>> +      assertTrue("center "+c+" not found", findAndRemove(c, refCenters, EPSILON));
>>>        assertTrue("more to come", reader.next(key, value));
>>>        assertEquals("2nd key", "C-1", key.toString());
>>> -      assertEquals("2nd x value", 4.433333333333334, value.getCenter().get(0),
>>> -          EPSILON);
>>> -      assertEquals("2nd y value", 4.433333333333334, value.getCenter().get(1),
>>> -          EPSILON);
>>> +      c = new Pair<Double,Double>(value.getCenter().get(0),
>>> +                                  value.getCenter().get(1));
>>> +      assertTrue("center "+c+" not found", findAndRemove(c, refCenters, EPSILON));
>>>        assertFalse("more to come", reader.next(key, value));
>>>      } finally {
>>>        Closeables.closeQuietly(reader);
>>>
>>> Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/classify/ClusterClassificationDriverTest.java
>>> URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/classify/ClusterClassificationDriverTest.java?rev=1299770&r1=1299769&r2=1299770&view=diff
>>> ==============================================================================
>>> --- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/classify/ClusterClassificationDriverTest.java (original)
>>> +++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/classify/ClusterClassificationDriverTest.java Mon Mar 12 18:25:45 2012
>>> @@ -20,6 +20,9 @@ package org.apache.mahout.clustering.cla
>>>  import java.io.IOException;
>>>  import java.util.ArrayList;
>>>  import java.util.List;
>>> +import java.util.Set;
>>> +
>>> +import com.google.common.collect.Sets;
>>>
>>>  import junit.framework.Assert;
>>>
>>> @@ -195,9 +198,7 @@ public class ClusterClassificationDriver
>>>    }
>>>
>>>    private void assertVectorsWithOutlierRemoval() {
>>> -    assertFirstClusterWithOutlierRemoval();
>>> -    assertSecondClusterWithOutlierRemoval();
>>> -    assertThirdClusterWithOutlierRemoval();
>>> +    checkClustersWithOutlierRemoval();
>>>    }
>>>
>>>    private void assertVectorsWithoutOutlierRemoval() {
>>> @@ -230,25 +231,33 @@ public class ClusterClassificationDriver
>>>            "{1:1.0,0:2.0}", "{1:2.0,0:1.0}"}, vector.asFormatString()));
>>>      }
>>>    }
>>> -
>>> -  private void assertThirdClusterWithOutlierRemoval() {
>>> -    Assert.assertEquals(1, thirdCluster.size());
>>> -    for (Vector vector : thirdCluster) {
>>> -      Assert.assertTrue(ArrayUtils.contains(new String[] {"{1:9.0,0:9.0}"},
>>> -          vector.asFormatString()));
>>> -    }
>>> -  }
>>> -
>>> -  private void assertSecondClusterWithOutlierRemoval() {
>>> -    Assert.assertEquals(0, secondCluster.size());
>>> -  }
>>> -
>>> -  private void assertFirstClusterWithOutlierRemoval() {
>>> -    Assert.assertEquals(1, firstCluster.size());
>>> -    for (Vector vector : firstCluster) {
>>> -      Assert.assertTrue(ArrayUtils.contains(new String[] {"{1:1.0,0:1.0}"},
>>> -          vector.asFormatString()));
>>> -    }
>>> +
>>> +  private void checkClustersWithOutlierRemoval() {
>>> +    Set<String>  reference = Sets.newHashSet(new String[] {"{1:9.0,0:9.0}",
>>> +                                                          "{1:1.0,0:1.0}"});
>>> +    int singletonCnt = 0;
>>> +    int emptyCnt = 0;
>>> +
>>> +    List<List<Vector>>  clusters = Lists.newArrayList();
>>> +    clusters.add(firstCluster);
>>> +    clusters.add(secondCluster);
>>> +    clusters.add(thirdCluster);
>>> +
>>> +    for (List<Vector>  vList : clusters) {
>>> +      if (vList.size() == 0) {
>>> +        emptyCnt++;
>>> +      } else {
>>> +        singletonCnt++;
>>> +        Assert.assertTrue("expecting only singleton clusters; got size=" + vList.size(),
>>> +                          vList.size() == 1);
>>> +        Assert.assertTrue("not expecting cluster:" + vList.get(0).asFormatString(),
>>> +                          reference.contains(vList.get(0).asFormatString()));
>>> +        reference.remove(vList.get(0).asFormatString());
>>> +      }
>>> +    }
>>> +    Assert.assertEquals("Different number of empty clusters than expected!", 1, emptyCnt);
>>> +    Assert.assertEquals("Different number of singletons than expected!", 2, singletonCnt);
>>> +    Assert.assertEquals("Didn't match all reference clusters!", 0, reference.size());
>>>    }
>>> -
>>> +
>>>  }
>>>
>>> Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java
>>> URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java?rev=1299770&r1=1299769&r2=1299770&view=diff
>>> ==============================================================================
>>> --- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java (original)
>>> +++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java Mon Mar 12 18:25:45 2012
>>> @@ -26,6 +26,7 @@ import com.google.common.collect.Lists;
>>>  import com.google.common.collect.Maps;
>>>  import com.google.common.io.Closeables;
>>>  import org.apache.hadoop.conf.Configuration;
>>> +import org.apache.hadoop.fs.FileStatus;
>>>  import org.apache.hadoop.fs.FileSystem;
>>>  import org.apache.hadoop.fs.Path;
>>>  import org.apache.hadoop.io.IntWritable;
>>> @@ -38,6 +39,7 @@ import org.apache.hadoop.util.ToolRunner
>>>  import org.apache.mahout.clustering.AbstractCluster;
>>>  import org.apache.mahout.clustering.ClusterObservations;
>>>  import org.apache.mahout.clustering.ClusteringTestUtils;
>>> +import org.apache.mahout.clustering.canopy.Canopy;
>>>  import org.apache.mahout.clustering.canopy.CanopyDriver;
>>>  import org.apache.mahout.clustering.classify.WeightedVectorWritable;
>>>  import org.apache.mahout.common.DummyOutputCollector;
>>> @@ -486,6 +488,42 @@ public final class TestKmeansClustering
>>>      // now run the Canopy job
>>>      CanopyDriver.run(conf, pointsPath, outputPath, new ManhattanDistanceMeasure(), 3.1, 2.1, false, 0.0, false);
>>>
>>> +    DummyOutputCollector<Text, Canopy>  collector1 =
>>> +        new DummyOutputCollector<Text, Canopy>();
>>> +
>>> +    FileStatus[] outParts = FileSystem.get(conf).globStatus(
>>> +                    new Path(outputPath, "clusters-0-final/*-0*"));
>>> +    for (FileStatus outPartStat : outParts) {
>>> +      for (Pair<Text,Canopy>  record :
>>> +               new SequenceFileIterable<Text,Canopy>(
>>> +                 outPartStat.getPath(), conf)) {
>>> +          collector1.collect(record.getFirst(), record.getSecond());
>>> +      }
>>> +    }
>>> +
>>> +    boolean got15 = false;
>>> +    boolean got43 = false;
>>> +    int count = 0;
>>> +    for (Text k : collector1.getKeys()) {
>>> +      count++;
>>> +      List<Canopy>  vl = collector1.getValue(k);
>>> +      assertEquals("non-singleton centroid!", 1, vl.size());
>>> +      Vector v = vl.get(0).getCenter();
>>> +      assertEquals("centroid vector is wrong length", 2, v.size());
>>> +      if ( (Math.abs(v.get(0) - 1.5) < EPSILON)
>>> +           && (Math.abs(v.get(1) - 1.5) < EPSILON)
>>> +           && !got15) {
>>> +        got15 = true;
>>> +      } else if ( (Math.abs(v.get(0) - 4.333333333333334) < EPSILON)
>>> +                  && (Math.abs(v.get(1) - 4.333333333333334) < EPSILON)
>>> +                  && !got43) {
>>> +        got43 = true;
>>> +      } else {
>>> +        assertTrue("got unexpected center: "+v+" ["+v.getClass().toString()+"]", false);
>>> +      }
>>> +    }
>>> +    assertEquals("got unexpected number of centers", 2, count);
>>> +
>>>      // now run the KMeans job
>>>      KMeansDriver.run(pointsPath, new Path(outputPath, "clusters-0-final"), outputPath, new EuclideanDistanceMeasure(),
>>>          0.001, 10, true, false);
>>> @@ -500,7 +538,28 @@ public final class TestKmeansClustering
>>>        collector.collect(record.getFirst(), record.getSecond());
>>>      }
>>>
>>> -    assertEquals("num points[0]", 4, collector.getValue(new IntWritable(0)).size());
>>> -    assertEquals("num points[1]", 5, collector.getValue(new IntWritable(1)).size());
>>> +    boolean gotLowClust = false;  // clusters should be [1, *] and [2, *]
>>> +    boolean gotHighClust = false; // vs [3 , *],  [4 , *] and [5, *]
>>> +    for (IntWritable k : collector.getKeys()) {
>>> +      List<WeightedVectorWritable>  wvList = collector.getValue(k);
>>> +      assertTrue("empty cluster!", wvList.size() != 0);
>>> +      if (wvList.get(0).getVector().get(0) <= 2.0) {
>>> +        for (WeightedVectorWritable wv : wvList) {
>>> +          Vector v = wv.getVector();
>>> +          int idx = v.maxValueIndex();
>>> +          assertTrue("bad cluster!", v.get(idx) <= 2.0);
>>> +        }
>>> +        assertEquals("Wrong size cluster", 4, wvList.size());
>>> +        gotLowClust = true;
>>> +      } else {
>>> +        for (WeightedVectorWritable wv : wvList) {
>>> +          Vector v = wv.getVector();
>>> +          int idx = v.minValueIndex();
>>> +          assertTrue("bad cluster!", v.get(idx) > 2.0);
>>> +        }
>>> +        assertEquals("Wrong size cluster", 5, wvList.size());
>>> +        gotHighClust = true;
>>> +      }
>>> +    }
>>>    }
>>>  }
>>>
>>> Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java
>>> URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java?rev=1299770&r1=1299769&r2=1299770&view=diff
>>> ==============================================================================
>>> --- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java (original)
>>> +++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java Mon Mar 12 18:25:45 2012
>>> @@ -21,10 +21,12 @@ import java.util.Collection;
>>>  import java.util.Iterator;
>>>  import java.util.List;
>>>  import java.util.Map;
>>> +import java.util.Random;
>>>
>>>  import com.google.common.collect.Lists;
>>>  import com.google.common.collect.Maps;
>>>  import org.apache.hadoop.conf.Configuration;
>>> +import org.apache.hadoop.fs.FileStatus;
>>>  import org.apache.hadoop.fs.FileSystem;
>>>  import org.apache.hadoop.fs.Path;
>>>  import org.apache.hadoop.io.Text;
>>> @@ -350,7 +352,13 @@ public final class TestMeanShift extends
>>>      Configuration conf = new Configuration();
>>>      FileSystem fs = FileSystem.get(input.toUri(), conf);
>>>      Collection<VectorWritable>  points = Lists.newArrayList();
>>> -    for (Vector v : raw) {
>>> +    Random r = new Random(123);
>>> +    Vector[] permutedRaw = new Vector[raw.length];
>>> +    for (int i = 0; i < raw.length; i++)
>>> +      permutedRaw[i] = raw[i];
>>> +    for (int i = 0; i < permutedRaw.length; i++)
>>> +      permutedRaw[i] = permutedRaw[i + r.nextInt(raw.length - i)];
>>> +    for (Vector v : permutedRaw) {
>>>        points.add(new VectorWritable(v));
>>>      }
>>>      ClusteringTestUtils.writePointsToFile(points,
>>> @@ -376,10 +384,12 @@ public final class TestMeanShift extends
>>>          optKey(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION), "0.2",
>>>          optKey(DefaultOptionCreator.OVERWRITE_OPTION) };
>>>      ToolRunner.run(conf, new MeanShiftCanopyDriver(), args);
>>> -    Path outPart = new Path(output, "clusters-4-final/part-r-00000");
>>> -    long count = HadoopUtil.countRecords(outPart, conf);
>>> -    assertEquals("count", 3, count);
>>> -    outPart = new Path(output, "clusters-0/part-m-00000");
>>> +    FileStatus[] outParts = FileSystem.get(conf).globStatus(
>>> +        new Path(output, "clusters-?-final/part-r-*"));
>>> +    assertEquals("Wrong number of matching final parts", 1, outParts.length);
>>> +    long count = HadoopUtil.countRecords(outParts[0].getPath(), conf);
>>> +    assertEquals("count", 5, count);
>>> +    Path outPart = new Path(output, "clusters-0/part-m-00000");
>>>      Iterator<?>  iterator = new SequenceFileValueIterator<Writable>(outPart,
>>>          true, conf);
>>>      // now test the initial clusters to ensure the type of their centers has
>>>
>>> Modified: mahout/trunk/core/src/test/java/org/apache/mahout/common/DummyCounter.java
>>> URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/common/DummyCounter.java?rev=1299770&r1=1299769&r2=1299770&view=diff
>>> ==============================================================================
>>> --- mahout/trunk/core/src/test/java/org/apache/mahout/common/DummyCounter.java (original)
>>> +++ mahout/trunk/core/src/test/java/org/apache/mahout/common/DummyCounter.java Mon Mar 12 18:25:45 2012
>>> @@ -1,26 +0,0 @@
>>> -/**
>>> - * Licensed to the Apache Software Foundation (ASF) under one
>>> - * or more contributor license agreements. See the NOTICE file
>>> - * distributed with this work for additional information
>>> - * regarding copyright ownership. The ASF licenses this file
>>> - * to you under the Apache License, Version 2.0 (the
>>> - * "License"); you may not use this file except in compliance
>>> - * with the License. You may obtain a copy of the License at
>>> - *
>>> - * http://www.apache.org/licenses/LICENSE-2.0
>>> - *
>>> - * Unless required by applicable law or agreed to in writing,
>>> - * software distributed under the License is distributed on an
>>> - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
>>> - * KIND, either express or implied. See the License for the
>>> - * specific language governing permissions and limitations
>>> - * under the License.
>>> - */
>>> -
>>> -package org.apache.mahout.common;
>>> -
>>> -import org.apache.hadoop.mapreduce.Counter;
>>> -
>>> -final class DummyCounter extends Counter {
>>> -
>>> -}
>>>
>>> Modified: mahout/trunk/core/src/test/java/org/apache/mahout/common/DummyRecordWriter.java
>>> URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/common/DummyRecordWriter.java?rev=1299770&r1=1299769&r2=1299770&view=diff
>>> ==============================================================================
>>> --- mahout/trunk/core/src/test/java/org/apache/mahout/common/DummyRecordWriter.java (original)
>>> +++ mahout/trunk/core/src/test/java/org/apache/mahout/common/DummyRecordWriter.java Mon Mar 12 18:25:45 2012
>>> @@ -17,16 +17,21 @@
>>>
>>>  package org.apache.mahout.common;
>>>
>>> +import com.google.common.collect.Lists;
>>> +
>>>  import java.io.IOException;
>>> +import java.lang.reflect.Constructor;
>>> +import java.lang.reflect.Method;
>>>  import java.util.List;
>>>  import java.util.Map;
>>>  import java.util.Set;
>>>  import java.util.TreeMap;
>>>
>>> -import com.google.common.collect.Lists;
>>>  import org.apache.hadoop.conf.Configuration;
>>> +import org.apache.hadoop.mapreduce.MapContext;
>>>  import org.apache.hadoop.mapreduce.Mapper;
>>>  import org.apache.hadoop.mapreduce.RecordWriter;
>>> +import org.apache.hadoop.mapreduce.ReduceContext;
>>>  import org.apache.hadoop.mapreduce.Reducer;
>>>  import org.apache.hadoop.mapreduce.TaskAttemptContext;
>>>  import org.apache.hadoop.mapreduce.TaskAttemptID;
>>> @@ -65,7 +70,18 @@ public final class DummyRecordWriter<K,
>>>                                                                        Configuration configuration,
>>>                                                                        RecordWriter<K2, V2>  output)
>>>      throws IOException, InterruptedException {
>>> -    return mapper.new Context(configuration, new TaskAttemptID(), null, output, null, new DummyStatusReporter(), null);
>>> +
>>> +    // Use reflection since the context types changed incompatibly between 0.20
>>> +    // and 0.23.
>>> +    try {
>>> +      return buildNewMapperContext(configuration, output);
>>> +    } catch (Exception e) {
>>> +      try {
>>> +        return buildOldMapperContext(mapper, configuration, output);
>>> +      } catch (Exception ex) {
>>> +        throw new IllegalStateException(ex);
>>> +      }
>>> +    }
>>>    }
>>>
>>>    public static<K1, V1, K2, V2>  Reducer<K1, V1, K2, V2>.Context build(Reducer<K1, V1, K2, V2>  reducer,
>>> @@ -74,17 +90,96 @@ public final class DummyRecordWriter<K,
>>>                                                                         Class<K1>  keyClass,
>>>                                                                         Class<V1>  valueClass)
>>>      throws IOException, InterruptedException {
>>> -    return reducer.new Context(configuration,
>>> -                               new TaskAttemptID(),
>>> -                               new MockIterator(),
>>> -                               null,
>>> -                               null,
>>> -                               output,
>>> -                               null,
>>> -                               new DummyStatusReporter(),
>>> -                               null,
>>> -                               keyClass,
>>> -                               valueClass);
>>> +
>>> +    // Use reflection since the context types changed incompatibly between 0.20
>>> +    // and 0.23.
>>> +    try {
>>> +      return buildNewReducerContext(configuration, output, keyClass, valueClass);
>>> +    } catch (Exception e) {
>>> +      try {
>>> +        return buildOldReducerContext(reducer, configuration, output, keyClass, valueClass);
>>> +      } catch (Exception ex) {
>>> +        throw new IllegalStateException(ex);
>>> +      }
>>> +    }
>>> +  }
>>> +
>>> +  @SuppressWarnings({ "unchecked", "rawtypes" })
>>> +  private static<K1, V1, K2, V2>  Mapper<K1, V1, K2, V2>.Context buildNewMapperContext(
>>> +      Configuration configuration, RecordWriter<K2, V2>  output) throws Exception {
>>> +    Class<?>  mapContextImplClass = Class.forName("org.apache.hadoop.mapreduce.task.MapContextImpl");
>>> +    Constructor<?>  cons = mapContextImplClass.getConstructors()[0];
>>> +    Object mapContextImpl = cons.newInstance(configuration,
>>> +        new TaskAttemptID(), null, output, null, new DummyStatusReporter(), null);
>>> +
>>> +    Class<?>  wrappedMapperClass = Class.forName("org.apache.hadoop.mapreduce.lib.map.WrappedMapper");
>>> +    Object wrappedMapper = wrappedMapperClass.newInstance();
>>> +    Method getMapContext = wrappedMapperClass.getMethod("getMapContext", MapContext.class);
>>> +    return (Mapper.Context) getMapContext.invoke(wrappedMapper, mapContextImpl);
>>> +  }
>>> +
>>> +  @SuppressWarnings({ "unchecked", "rawtypes" })
>>> +  private static<K1, V1, K2, V2>  Mapper<K1, V1, K2, V2>.Context buildOldMapperContext(
>>> +      Mapper<K1, V1, K2, V2>  mapper, Configuration configuration,
>>> +      RecordWriter<K2, V2>  output) throws Exception {
>>> +    Constructor<?>  cons = getNestedContextConstructor(mapper.getClass());
>>> +    // first argument to the constructor is the enclosing instance
>>> +    return (Mapper.Context) cons.newInstance(mapper, configuration,
>>> +        new TaskAttemptID(), null, output, null, new DummyStatusReporter(), null);
>>> +  }
>>> +
>>> +  @SuppressWarnings({ "unchecked", "rawtypes" })
>>> +  private static<K1, V1, K2, V2>  Reducer<K1, V1, K2, V2>.Context buildNewReducerContext(
>>> +      Configuration configuration, RecordWriter<K2, V2>  output, Class<K1>  keyClass,
>>> +      Class<V1>  valueClass) throws Exception {
>>> +    Class<?>  reduceContextImplClass = Class.forName("org.apache.hadoop.mapreduce.task.ReduceContextImpl");
>>> +    Constructor<?>  cons = reduceContextImplClass.getConstructors()[0];
>>> +    Object reduceContextImpl = cons.newInstance(configuration,
>>> +      new TaskAttemptID(),
>>> +      new MockIterator(),
>>> +      null,
>>> +      null,
>>> +      output,
>>> +      null,
>>> +      new DummyStatusReporter(),
>>> +      null,
>>> +      keyClass,
>>> +      valueClass);
>>> +
>>> +    Class<?>  wrappedReducerClass = Class.forName("org.apache.hadoop.mapreduce.lib.reduce.WrappedReducer");
>>> +    Object wrappedReducer = wrappedReducerClass.newInstance();
>>> +    Method getReducerContext = wrappedReducerClass.getMethod("getReducerContext", ReduceContext.class);
>>> +    return (Reducer.Context) getReducerContext.invoke(wrappedReducer, reduceContextImpl);
>>> +  }
>>> +
>>> +  @SuppressWarnings({ "unchecked", "rawtypes" })
>>> +  private static<K1, V1, K2, V2>  Reducer<K1, V1, K2, V2>.Context buildOldReducerContext(
>>> +      Reducer<K1, V1, K2, V2>  reducer, Configuration configuration,
>>> +      RecordWriter<K2, V2>  output, Class<K1>  keyClass,
>>> +      Class<V1>  valueClass) throws Exception {
>>> +    Constructor<?>  cons = getNestedContextConstructor(reducer.getClass());
>>> +    // first argument to the constructor is the enclosing instance
>>> +    return (Reducer.Context) cons.newInstance(reducer,
>>> +        configuration,
>>> +        new TaskAttemptID(),
>>> +        new MockIterator(),
>>> +        null,
>>> +        null,
>>> +        output,
>>> +        null,
>>> +        new DummyStatusReporter(),
>>> +        null,
>>> +        keyClass,
>>> +        valueClass);
>>> +  }
>>> +
>>> +  private static Constructor<?>  getNestedContextConstructor(Class<?>  outerClass) {
>>> +    for (Class<?>  nestedClass : outerClass.getClasses()) {
>>> +      if ("Context".equals(nestedClass.getSimpleName())) {
>>> +        return nestedClass.getConstructors()[0];
>>> +      }
>>> +    }
>>> +    throw new IllegalStateException("Cannot find context class for " + outerClass);
>>>    }
>>>
>>>  }
>>>
>>> Modified: mahout/trunk/core/src/test/java/org/apache/mahout/common/DummyStatusReporter.java
>>> URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/common/DummyStatusReporter.java?rev=1299770&r1=1299769&r2=1299770&view=diff
>>> ==============================================================================
>>> --- mahout/trunk/core/src/test/java/org/apache/mahout/common/DummyStatusReporter.java (original)
>>> +++ mahout/trunk/core/src/test/java/org/apache/mahout/common/DummyStatusReporter.java Mon Mar 12 18:25:45 2012
>>> @@ -19,6 +19,8 @@
>>>
>>>  package org.apache.mahout.common;
>>>
>>> +import static org.easymock.EasyMock.createMockBuilder;
>>> +
>>>  import java.util.Map;
>>>
>>>  import com.google.common.collect.Maps;
>>> @@ -30,10 +32,21 @@ public final class DummyStatusReporter e
>>>    private final Map<Enum<?>, Counter>  counters = Maps.newHashMap();
>>>    private final Map<String, Counter>  counterGroups = Maps.newHashMap();
>>>
>>> +  private Counter newCounter() {
>>> +    try {
>>> +      // 0.23 case
>>> +      String c = "org.apache.hadoop.mapreduce.counters.GenericCounter";
>>> +      return (Counter) createMockBuilder(Class.forName(c)).createMock();
>>> +    } catch (ClassNotFoundException e) {
>>> +      // 0.20 case
>>> +      return createMockBuilder(Counter.class).createMock();
>>> +    }
>>> +  }
>>> +
>>>    @Override
>>>    public Counter getCounter(Enum<?>  name) {
>>>      if (!counters.containsKey(name)) {
>>> -      counters.put(name, new DummyCounter());
>>> +      counters.put(name, newCounter());
>>>      }
>>>      return counters.get(name);
>>>    }
>>> @@ -42,7 +55,7 @@ public final class DummyStatusReporter e
>>>    @Override
>>>    public Counter getCounter(String group, String name) {
>>>      if (!counterGroups.containsKey(group + name)) {
>>> -      counterGroups.put(group + name, new DummyCounter());
>>> +      counterGroups.put(group + name, newCounter());
>>>      }
>>>      return counterGroups.get(group+name);
>>>    }
>>> @@ -55,4 +68,8 @@ public final class DummyStatusReporter e
>>>    public void setStatus(String status) {
>>>    }
>>>
>>> +  public float getProgress() {
>>> +    return 0;
>>> +  }
>>> +
>>>  }
>>>
>>> Modified: mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/TestDistributedRowMatrix.java
>>> URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/TestDistributedRowMatrix.java?rev=1299770&r1=1299769&r2=1299770&view=diff
>>> ==============================================================================
>>> --- mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/TestDistributedRowMatrix.java (original)
>>> +++ mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/TestDistributedRowMatrix.java Mon Mar 12 18:25:45 2012
>>> @@ -26,6 +26,7 @@ import org.apache.hadoop.fs.FileStatus;
>>>  import org.apache.hadoop.fs.FileSystem;
>>>  import org.apache.hadoop.fs.Path;
>>>  import org.apache.mahout.clustering.ClusteringTestUtils;
>>> +import org.apache.mahout.common.HadoopUtil;
>>>  import org.apache.mahout.common.MahoutTestCase;
>>>  import org.apache.mahout.common.iterator.sequencefile.PathFilters;
>>>  import org.apache.mahout.math.DenseVector;
>>> @@ -254,14 +255,14 @@ public final class TestDistributedRowMat
>>>
>>>      deleteContentsOfPath(conf, outputPath);
>>>
>>> -    assertEquals(0, fs.listStatus(outputPath).length);
>>> +    assertEquals(0, HadoopUtil.listStatus(fs, outputPath).length);
>>>
>>>      Vector result1 = dm.times(v);
>>>
>>> -    assertEquals(0, fs.listStatus(outputPath).length);
>>> +    assertEquals(0, HadoopUtil.listStatus(fs, outputPath).length);
>>>
>>>      deleteContentsOfPath(conf, outputPath);
>>> -    assertEquals(0, fs.listStatus(outputPath).length);
>>> +    assertEquals(0, HadoopUtil.listStatus(fs, outputPath).length);
>>>
>>>      conf.setBoolean(DistributedRowMatrix.KEEP_TEMP_FILES, true);
>>>      dm.setConf(conf);
>>> @@ -291,14 +292,14 @@ public final class TestDistributedRowMat
>>>
>>>      deleteContentsOfPath(conf, outputPath);
>>>
>>> -    assertEquals(0, fs.listStatus(outputPath).length);
>>> +    assertEquals(0, HadoopUtil.listStatus(fs, outputPath).length);
>>>
>>>      Vector result1 = dm.timesSquared(v);
>>>
>>> -    assertEquals(0, fs.listStatus(outputPath).length);
>>> +    assertEquals(0, HadoopUtil.listStatus(fs, outputPath).length);
>>>
>>>      deleteContentsOfPath(conf, outputPath);
>>> -    assertEquals(0, fs.listStatus(outputPath).length);
>>> +    assertEquals(0, HadoopUtil.listStatus(fs, outputPath).length);
>>>
>>>      conf.setBoolean(DistributedRowMatrix.KEEP_TEMP_FILES, true);
>>>      dm.setConf(conf);
>>> @@ -325,7 +326,7 @@ public final class TestDistributedRowMat
>>>    private static void deleteContentsOfPath(Configuration conf, Path path) throws Exception {
>>>      FileSystem fs = path.getFileSystem(conf);
>>>
>>> -    FileStatus[] statuses = fs.listStatus(path);
>>> +    FileStatus[] statuses = HadoopUtil.listStatus(fs, path);
>>>      for (FileStatus status : statuses) {
>>>        fs.delete(status.getPath(), true);
>>>      }
>>>
>>> Modified: mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
>>> URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java?rev=1299770&r1=1299769&r2=1299770&view=diff
>>> ==============================================================================
>>> --- mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java (original)
>>> +++ mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java Mon Mar 12 18:25:45 2012
>>> @@ -193,7 +193,7 @@ public final class TestClusterDumper ext
>>>          output, measure, 8, 4, true, 0.0, true);
>>>      // run ClusterDumper
>>>      ClusterDumper clusterDumper = new ClusterDumper(new Path(output,
>>> -        "clusters-0"), new Path(output, "clusteredPoints"));
>>> +        "clusters-0-final"), new Path(output, "clusteredPoints"));
>>>      clusterDumper.printClusters(termDictionary);
>>>    }
>>>
>>>
>>> Modified: mahout/trunk/pom.xml
>>> URL: http://svn.apache.org/viewvc/mahout/trunk/pom.xml?rev=1299770&r1=1299769&r2=1299770&view=diff
>>> ==============================================================================
>>> --- mahout/trunk/pom.xml (original)
>>> +++ mahout/trunk/pom.xml Mon Mar 12 18:25:45 2012
>>> @@ -107,6 +107,17 @@
>>>      <url>https://issues.apache.org/jira/browse/MAHOUT</url>
>>>    </issueManagement>
>>>
>>> +  <repositories>
>>> +    <repository>
>>> +      <id>apache.snapshots</id>
>>> +      <name>Apache Snapshot Repository</name>
>>> +      <url>http://repository.apache.org/snapshots</url>
>>> +      <releases>
>>> +        <enabled>false</enabled>
>>> +      </releases>
>>> +    </repository>
>>> +  </repositories>
>>> +
>>>    <dependencyManagement>
>>>      <dependencies>
>>>
>>> @@ -264,6 +275,100 @@
>>>          </exclusions>
>>>        </dependency>
>>>        <dependency>
>>> +        <groupId>org.apache.hadoop</groupId>
>>> +        <artifactId>hadoop-common</artifactId>
>>> +        <version>${hadoop.version}</version>
>>> +        <exclusions>
>>> +          <exclusion>
>>> +            <groupId>net.sf.kosmosfs</groupId>
>>> +            <artifactId>kfs</artifactId>
>>> +          </exclusion>
>>> +          <exclusion>
>>> +            <groupId>org.mortbay.jetty</groupId>
>>> +            <artifactId>jetty</artifactId>
>>> +          </exclusion>
>>> +          <exclusion>
>>> +            <groupId>org.mortbay.jetty</groupId>
>>> +            <artifactId>jetty-util</artifactId>
>>> +          </exclusion>
>>> +          <exclusion>
>>> +            <groupId>hsqldb</groupId>
>>> +            <artifactId>hsqldb</artifactId>
>>> +          </exclusion>
>>> +          <exclusion>
>>> +            <groupId>commons-el</groupId>
>>> +            <artifactId>commons-el</artifactId>
>>> +          </exclusion>
>>> +          <exclusion>
>>> +            <groupId>junit</groupId>
>>> +            <artifactId>junit</artifactId>
>>> +          </exclusion>
>>> +          <exclusion>
>>> +            <groupId>oro</groupId>
>>> +            <artifactId>oro</artifactId>
>>> +          </exclusion>
>>> +          <exclusion>
>>> +            <groupId>org.mortbay.jetty</groupId>
>>> +            <artifactId>jsp-2.1</artifactId>
>>> +          </exclusion>
>>> +          <exclusion>
>>> +            <groupId>org.mortbay.jetty</groupId>
>>> +            <artifactId>jsp-api-2.1</artifactId>
>>> +          </exclusion>
>>> +          <exclusion>
>>> +            <groupId>org.mortbay.jetty</groupId>
>>> +            <artifactId>servlet-api-2.5</artifactId>
>>> +          </exclusion>
>>> +          <exclusion>
>>> +            <groupId>commons-net</groupId>
>>> +            <artifactId>commons-net</artifactId>
>>> +          </exclusion>
>>> +          <exclusion>
>>> +            <groupId>tomcat</groupId>
>>> +            <artifactId>jasper-runtime</artifactId>
>>> +          </exclusion>
>>> +          <exclusion>
>>> +            <groupId>tomcat</groupId>
>>> +            <artifactId>jasper-compiler</artifactId>
>>> +          </exclusion>
>>> +          <exclusion>
>>> +            <groupId>xmlenc</groupId>
>>> +            <artifactId>xmlenc</artifactId>
>>> +          </exclusion>
>>> +          <exclusion>
>>> +            <groupId>net.java.dev.jets3t</groupId>
>>> +            <artifactId>jets3t</artifactId>
>>> +          </exclusion>
>>> +          <exclusion>
>>> +            <groupId>org.eclipse.jdt</groupId>
>>> +            <artifactId>core</artifactId>
>>> +          </exclusion>
>>> +          <exclusion>
>>> +            <groupId>org.slf4j</groupId>
>>> +            <artifactId>slf4j-api</artifactId>
>>> +          </exclusion>
>>> +          <exclusion>
>>> +            <groupId>org.slf4j</groupId>
>>> +            <artifactId>slf4j-jcl</artifactId>
>>> +          </exclusion>
>>> +          <exclusion>
>>> +            <groupId>org.slf4j</groupId>
>>> +            <artifactId>slf4j-log4j12</artifactId>
>>> +          </exclusion>
>>> +        </exclusions>
>>> +      </dependency>
>>> +      <dependency>
>>> +        <groupId>org.apache.hadoop</groupId>
>>> +        <artifactId>hadoop-mapreduce-client-core</artifactId>
>>> +        <version>${hadoop.version}</version>
>>> +      </dependency>
>>> +      <dependency>
>>> +        <groupId>org.apache.hadoop</groupId>
>>> +        <artifactId>hadoop-mapreduce-client-common</artifactId>
>>> +        <version>${hadoop.version}</version>
>>> +      </dependency>
>>> +
>>> +      <dependency>
>>>          <groupId>org.codehaus.jackson</groupId>
>>>          <artifactId>jackson-core-asl</artifactId>
>>>          <version>1.8.2</version>
>>>
>>>
>>
>
> --------------------------------------------
> Grant Ingersoll
> http://www.lucidimagination.com
>
>
>



-- 
Lance Norskog
goksron@gmail.com