You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by sm...@apache.org on 2015/11/04 02:15:36 UTC

mahout git commit: MAHOUT-1783: Remove code for ConcatVectors Job, this closes apache/mahout#169

Repository: mahout
Updated Branches:
  refs/heads/master 509c966f1 -> 708cc4f2b


MAHOUT-1783: Remove code for ConcatVectors Job, this closes apache/mahout#169


Project: http://git-wip-us.apache.org/repos/asf/mahout/repo
Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/708cc4f2
Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/708cc4f2
Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/708cc4f2

Branch: refs/heads/master
Commit: 708cc4f2b6c16ca76005d0a06b4dc8e3f0bacd3a
Parents: 509c966
Author: smarthi <sm...@apache.org>
Authored: Tue Nov 3 20:15:07 2015 -0500
Committer: smarthi <sm...@apache.org>
Committed: Tue Nov 3 20:15:07 2015 -0500

----------------------------------------------------------------------
 .../mahout/utils/ConcatenateVectorsJob.java     | 118 -------------------
 .../mahout/utils/ConcatenateVectorsReducer.java | 102 ----------------
 .../mahout/utils/TestConcatenateVectorsJob.java |  99 ----------------
 src/conf/driver.classes.default.props           |   1 -
 4 files changed, 320 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mahout/blob/708cc4f2/integration/src/main/java/org/apache/mahout/utils/ConcatenateVectorsJob.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/ConcatenateVectorsJob.java b/integration/src/main/java/org/apache/mahout/utils/ConcatenateVectorsJob.java
deleted file mode 100644
index 33d09a0..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/ConcatenateVectorsJob.java
+++ /dev/null
@@ -1,118 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils;
-
-import java.io.IOException;
-
-import com.google.common.base.Preconditions;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.SequenceFile;
-import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.Mapper;
-import org.apache.hadoop.util.ToolRunner;
-import org.apache.mahout.common.AbstractJob;
-import org.apache.mahout.common.commandline.DefaultOptionCreator;
-import org.apache.mahout.math.VectorWritable;
-
-/*
- * Map-reduce job to combine two matrices A and B to (a1,a2,...aN,b1,b2,...bN)
- * Technically works on Vector files, so will also concatenate two vectors.
- * If either input is a NamedVector, the output has the name: A.name has precedence over B.name.
- * Concatenation or per-member combinations given a function object.
- * 
- * Uses clever hack which requires different matrices to have a different number of columns.
- * Courtesy of Jake Mannix, https://issues.apache.org/jira/browse/MAHOUT-884
- * If vectors are same length, this will not concatenate them in the right order
- *
- * @deprecated as of 0.10.0
- *
- * TODO: generalize to multiple matrices, should the teeming masses so desire
- */
-@Deprecated
-public class ConcatenateVectorsJob extends AbstractJob {
-  
-  static final String MATRIXA_DIMS = "mahout.concatenatevectors.matrixA_dims";
-  static final String MATRIXB_DIMS = "mahout.concatenatevectors.matrixB_dims";
-  
-  private ConcatenateVectorsJob() {}
-  
-  public static void main(String[] args) throws Exception {
-    ToolRunner.run(new ConcatenateVectorsJob(), args);
-  }
-  
-  @Override
-  public int run(String[] args) throws Exception {
-    addOption("matrixA", "ma", "A (left) matrix directory", true);
-    addOption("matrixB", "mb", "B (right) matrix directory", true);
-    addOutputOption();
-    DefaultOptionCreator.overwriteOption().create();
-
-    if (parseArguments(args) == null) {
-      return -1;
-    }
-
-    Path pathA = new Path(getOption("matrixA"));
-    Path pathB = new Path(getOption("matrixB"));
-    Path pathOutput = getOutputPath();
-
-    Configuration configuration = getConf();
-    FileSystem fs = FileSystem.get(configuration);
-
-    Class<? extends Writable> keyClassA = getKeyClass(pathA, fs);
-    Class<? extends Writable> keyClassB = getKeyClass(pathB, fs);
-
-    Preconditions.checkArgument(keyClassA.equals(keyClassB), "All SequenceFiles must use same key class");
-
-    int dimA = getDimensions(pathA);
-    int dimB = getDimensions(pathB);
-    
-    String nameA = getOption("matrixA");
-    String nameB = getOption("matrixB");
-    
-    Job concatenate = prepareJob(
-      new Path(nameA + "," + nameB), pathOutput, Mapper.class, keyClassA, VectorWritable.class,
-      ConcatenateVectorsReducer.class, keyClassA, VectorWritable.class);
-
-    configuration = concatenate.getConfiguration();
-    configuration.set(MATRIXA_DIMS, Integer.toString(dimA));
-    configuration.set(MATRIXB_DIMS, Integer.toString(dimB));
-    // TODO: add reducer as combiner - need a system that can exercise combiners
-
-    boolean succeeded = concatenate.waitForCompletion(true);
-    if (!succeeded) {
-      return -1;
-    }
-    return 0;
-  }
-
-  private Class<? extends Writable> getKeyClass(Path path, FileSystem fs) throws IOException {
-    // this works for both part* and a directory/ with part*.
-    Path pathPattern = new Path(path, "part*");
-    FileStatus[] paths = fs.globStatus(pathPattern);
-    Preconditions.checkArgument(paths.length > 0, path.getName() + " is a file, should be a directory");
-
-    Path file = paths[0].getPath();
-    try (SequenceFile.Reader reader = new SequenceFile.Reader(fs, file, fs.getConf())){
-      return reader.getKeyClass().asSubclass(Writable.class);
-    }
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/708cc4f2/integration/src/main/java/org/apache/mahout/utils/ConcatenateVectorsReducer.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/ConcatenateVectorsReducer.java b/integration/src/main/java/org/apache/mahout/utils/ConcatenateVectorsReducer.java
deleted file mode 100644
index 0cf12ae..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/ConcatenateVectorsReducer.java
+++ /dev/null
@@ -1,102 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.mahout.utils;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.mapreduce.Reducer;
-import org.apache.mahout.math.NamedVector;
-import org.apache.mahout.math.SequentialAccessSparseVector;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.VectorWritable;
-
-import com.google.common.base.Preconditions;
-
-/*
- * Moded combiner/reducer. If vector comes in as length A or length B, concatenated.
- * If it is length A + B, combiner has already concatenated.
- *
- * @deprecated as of 0.10.0.
- *
- */
-@Deprecated
-public class ConcatenateVectorsReducer extends Reducer<IntWritable, VectorWritable, IntWritable, VectorWritable> {
-  
-  int dimsA = 0;
-  int dimsB = 0;
-  
-  public ConcatenateVectorsReducer() {
-    
-  }
-  
-  public void setup(Context context) throws java.io.IOException, InterruptedException {
-    Configuration configuration = context.getConfiguration();
-
-    dimsA = Integer.valueOf(configuration.getStrings(ConcatenateVectorsJob.MATRIXA_DIMS)[0]);
-    dimsB = Integer.valueOf(configuration.getStrings(ConcatenateVectorsJob.MATRIXB_DIMS)[0]);
-  }
-  
-  public void reduce(IntWritable row, Iterable<VectorWritable> vectorWritableIterable,
-                        Context ctx) throws java.io.IOException ,InterruptedException {
-    Vector vA = null;
-    Vector vB = null;
-    Vector vOut = null;
-    boolean isNamed = false;
-    String name = null;
-
-    for (VectorWritable vw: vectorWritableIterable) {
-      Vector v = vw.get();
-      if (v instanceof NamedVector) {
-        name = ((NamedVector) v).getName();
-        isNamed = true;
-      }
-
-      if (v.size() == dimsA) {
-        vA = v;
-      } else if (v.size() == dimsB) {
-        vB = v;
-      } else if (v.size() == dimsA + dimsB) {
-        vOut = v;
-        break;
-      }
-    }
-
-    Preconditions.checkArgument((vA != null || vB != null) || (vOut != null));
-
-    if (vOut == null) {
-      vOut = new SequentialAccessSparseVector(dimsA + dimsB);
-      if (isNamed) {
-        vOut = new NamedVector(vOut, name);
-      }
-    }
-
-    if (vA != null) {
-      appendVector(vOut, vA, 0);
-    }
-
-    if (vB != null) {
-      appendVector(vOut, vB, dimsA);
-    }
-    ctx.write(row, new VectorWritable(vOut));
-  }
-  
-  private void appendVector(Vector vOut, Vector vIn, int offset) {
-    for (Vector.Element element : vIn.nonZeroes()) {
-      vOut.set(element.index() + offset, element.get());
-    }
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/708cc4f2/integration/src/test/java/org/apache/mahout/utils/TestConcatenateVectorsJob.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/utils/TestConcatenateVectorsJob.java b/integration/src/test/java/org/apache/mahout/utils/TestConcatenateVectorsJob.java
deleted file mode 100644
index a4e2bfc..0000000
--- a/integration/src/test/java/org/apache/mahout/utils/TestConcatenateVectorsJob.java
+++ /dev/null
@@ -1,99 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils;
-
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.mapreduce.Reducer;
-import org.apache.mahout.common.DummyRecordWriter;
-import org.apache.mahout.common.MahoutTestCase;
-import org.apache.mahout.math.DenseVector;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.VectorWritable;
-import org.junit.Test;
-
-/**
- * Code stolen from TestAffinityMatrixJob. Like TAMJ, it tests the Mappers/Reducers but does not test the job
- */
-@Deprecated
-public class TestConcatenateVectorsJob extends MahoutTestCase {
-  
-  private static final double [][] DATA_A = {
-    {0,1,2,3,4},
-    {},
-    {0,1,2,3,4}
-  };
-  private static final double [][] DATA_B = {
-    {},
-    {5,6,7},
-    {5,6,7}
-  };
-  
-  @Test
-  public void testConcatenateVectorsReducer() throws Exception {
-    
-    Configuration configuration = getConfiguration();
-    configuration.set(ConcatenateVectorsJob.MATRIXA_DIMS, "5");
-    configuration.set(ConcatenateVectorsJob.MATRIXB_DIMS, "3");
-    
-    // Yes, all of this generic rigmarole is needed, and woe betide he who changes it
-    ConcatenateVectorsReducer reducer = new ConcatenateVectorsReducer();
-
-    DummyRecordWriter<IntWritable, VectorWritable> recordWriter = new DummyRecordWriter<>();
-
-    Reducer<IntWritable, VectorWritable, IntWritable, VectorWritable>.Context reduceContext =
-      DummyRecordWriter.build(reducer, configuration, recordWriter, IntWritable.class, VectorWritable.class);
-    
-    reducer.setup(reduceContext);
-    
-    for(int i = 0; i < 3; i++) {
-      double[] values = DATA_A[i];
-      List<VectorWritable> vwList = new ArrayList<>();
-      if (values.length > 0) {
-        Vector v = new DenseVector(values);
-        VectorWritable vw = new VectorWritable();
-        vw.set(v);
-        vwList.add(vw);
-      }
-      values = DATA_B[i];
-      if (values.length > 0) {
-        Vector v = new DenseVector(values);
-        VectorWritable vw = new VectorWritable();
-        vw.set(v);
-        vwList.add(vw);
-
-      }
-      IntWritable row = new IntWritable(i);
-      
-      reducer.reduce(row, vwList, reduceContext);
-    }
-    
-    for (IntWritable row : recordWriter.getKeys()) {
-      List<VectorWritable> list = recordWriter.getValue(row);
-      Vector v = list.get(0).get();
-      assertEquals(8, v.size());
-      for (Vector.Element element : v.nonZeroes()) {
-        assertEquals(element.index(), v.get(element.index()), 0.001);
-      }
-    }
-  }
-  
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/708cc4f2/src/conf/driver.classes.default.props
----------------------------------------------------------------------
diff --git a/src/conf/driver.classes.default.props b/src/conf/driver.classes.default.props
index d6a5ddb..69a9ba5 100644
--- a/src/conf/driver.classes.default.props
+++ b/src/conf/driver.classes.default.props
@@ -14,7 +14,6 @@ org.apache.mahout.vectorizer.EncodedVectorsFromSequenceFiles = seq2encoded: Enco
 org.apache.mahout.text.WikipediaToSequenceFile = seqwiki : Wikipedia xml dump to sequence file
 org.apache.mahout.text.SequenceFilesFromMailArchives = seqmailarchives : Creates SequenceFile from a directory containing gzipped mail archives
 org.apache.mahout.text.SequenceFilesFromLuceneStorageDriver = lucene2seq : Generate Text SequenceFiles from a Lucene index
-org.apache.mahout.utils.ConcatenateVectorsJob = concatmatrices : Concatenates 2 matrices of same cardinality into a single matrix
 org.apache.mahout.clustering.streaming.tools.ResplitSequenceFiles = resplit : Splits a set of SequenceFiles into a number of equal splits
 org.apache.mahout.clustering.streaming.tools.ClusterQualitySummarizer = qualcluster : Runs clustering experiments and summarizes results in a CSV
 org.apache.mahout.classifier.df.tools.Describe = describe : Describe the fields and target variable in a data set