Posted to commits@mahout.apache.org by sm...@apache.org on 2013/06/10 00:20:39 UTC

svn commit: r1491309 - in /mahout/trunk: ./ integration/src/main/java/org/apache/mahout/utils/ integration/src/test/java/org/apache/mahout/utils/ src/conf/

Author: smarthi
Date: Sun Jun  9 22:20:38 2013
New Revision: 1491309

URL: http://svn.apache.org/r1491309
Log:
MAHOUT-884: Matrix Concatenate Utility

Added:
    mahout/trunk/integration/src/main/java/org/apache/mahout/utils/ConcatenateVectorsJob.java
    mahout/trunk/integration/src/main/java/org/apache/mahout/utils/ConcatenateVectorsReducer.java
    mahout/trunk/integration/src/test/java/org/apache/mahout/utils/TestConcatenateVectorsJob.java
Modified:
    mahout/trunk/CHANGELOG
    mahout/trunk/src/conf/driver.classes.default.props

Modified: mahout/trunk/CHANGELOG
URL: http://svn.apache.org/viewvc/mahout/trunk/CHANGELOG?rev=1491309&r1=1491308&r2=1491309&view=diff
==============================================================================
--- mahout/trunk/CHANGELOG (original)
+++ mahout/trunk/CHANGELOG Sun Jun  9 22:20:38 2013
@@ -2,6 +2,8 @@ Mahout Change Log
 
 Release 0.8 - unreleased
 
+  MAHOUT-884: Matrix Concatenate Utility (Lance Norskog, smarthi)
+
   MAHOUT-1250: Deprecate unused algorithms (ssc)
 
   MAHOUT-1251: Optimize MinHashMapper (ssc)

Added: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/ConcatenateVectorsJob.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/ConcatenateVectorsJob.java?rev=1491309&view=auto
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/ConcatenateVectorsJob.java (added)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/ConcatenateVectorsJob.java Sun Jun  9 22:20:38 2013
@@ -0,0 +1,126 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils;
+
+
+import java.io.IOException;
+
+import com.google.common.base.Preconditions;
+import com.google.common.io.Closeables;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.math.VectorWritable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/*
+ * Map-reduce job that combines each row of two matrices A and B into a single row (a1,a2,...,aM,b1,b2,...,bN).
+ * Technically it works on Vector files, so it will also concatenate two vectors.
+ * If either input is a NamedVector, the output keeps the name; A's name takes precedence over B's.
+ * Only concatenation is implemented; per-member combination via a function object would be a generalization.
+ *
+ * Uses a clever hack that requires the two matrices to have different numbers of columns.
+ * Courtesy of Jake Mannix, https://issues.apache.org/jira/browse/MAHOUT-884
+ * If the vectors have the same length, this will not concatenate them in the right order.
+ *
+ * TODO: generalize to multiple matrices, should the teeming masses so desire
+ */
+
+public class ConcatenateVectorsJob extends AbstractJob {
+  
+  static final String MATRIXA_DIMS = "mahout.concatenatevectors.matrixA_dims";
+  static final String MATRIXB_DIMS = "mahout.concatenatevectors.matrixB_dims";
+
+  private static final Logger LOG = LoggerFactory.getLogger(ConcatenateVectorsJob.class);
+  
+  private ConcatenateVectorsJob() {}
+  
+  public static void main(String[] args) throws Exception {
+    ToolRunner.run(new ConcatenateVectorsJob(), args);
+  }
+  
+  @Override
+  public int run(String[] args) throws Exception {
+    addOption("matrixA", "ma", "A (left) matrix directory", true);
+    addOption("matrixB", "mb", "B (right) matrix directory", true);
+    addOutputOption();
+    DefaultOptionCreator.overwriteOption().create();
+
+    if (parseArguments(args) == null) {
+      return -1;
+    }
+
+    Path pathA = new Path(getOption("matrixA"));
+    Path pathB = new Path(getOption("matrixB"));
+    Path pathOutput = getOutputPath();
+
+    Configuration configuration = getConf();
+    FileSystem fs = FileSystem.get(configuration);
+
+    Class<? extends Writable> keyClassA = getKeyClass(pathA, fs);
+    Class<? extends Writable> keyClassB = getKeyClass(pathB, fs);
+
+    Preconditions.checkArgument(keyClassA.equals(keyClassB), "All SequenceFiles must use same key class");
+
+    int dimA = getDimensions(pathA);
+    int dimB = getDimensions(pathB);
+    
+    String nameA = getOption("matrixA");
+    String nameB = getOption("matrixB");
+    
+    Job concatenate = prepareJob(
+      new Path(nameA + "," + nameB), pathOutput, Mapper.class, keyClassA, VectorWritable.class,
+      ConcatenateVectorsReducer.class, keyClassA, VectorWritable.class);
+
+    configuration = concatenate.getConfiguration();
+    configuration.set(MATRIXA_DIMS, Integer.toString(dimA));
+    configuration.set(MATRIXB_DIMS, Integer.toString(dimB));
+    // TODO: add reducer as combiner - need a system that can exercise combiners
+
+    boolean succeeded = concatenate.waitForCompletion(true);
+    if (!succeeded) {
+      return -1;
+    }
+    return 0;
+  }
+
+  private Class<? extends Writable> getKeyClass(Path path, FileSystem fs) throws IOException {
+    // this works for both part* and a directory/ with part*.
+    Path pathPattern = new Path(path, "part*");
+    FileStatus[] paths = fs.globStatus(pathPattern);
+    Preconditions.checkArgument(paths != null && paths.length > 0,
+      path.getName() + " is a file or empty directory; it should be a directory containing part files");
+
+    Path file = paths[0].getPath();
+    SequenceFile.Reader reader = null;
+    try {
+      reader = new SequenceFile.Reader(fs, file, fs.getConf());
+      return reader.getKeyClass().asSubclass(Writable.class);
+    } finally {
+      Closeables.close(reader, true);
+    }
+  }
+}
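
For reference, the job registers --matrixA (-ma), --matrixB (-mb) and the standard --output option, so a programmatic launch might look roughly like the sketch below. This is only an illustration, not part of the commit: the class name and paths are placeholders, and it simply delegates to ConcatenateVectorsJob.main(), the same entry point the concatvectors driver alias uses.

import org.apache.mahout.utils.ConcatenateVectorsJob;

// Hypothetical launcher, not part of this commit. Paths are placeholders.
public class ConcatenateVectorsLauncher {
  public static void main(String[] args) throws Exception {
    ConcatenateVectorsJob.main(new String[] {
        "--matrixA", "/path/to/matrixA",   // directory of SequenceFiles holding matrix A
        "--matrixB", "/path/to/matrixB",   // directory of SequenceFiles holding matrix B
        "--output",  "/path/to/output"     // destination directory for the concatenated rows
    });
  }
}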

Added: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/ConcatenateVectorsReducer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/ConcatenateVectorsReducer.java?rev=1491309&view=auto
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/ConcatenateVectorsReducer.java (added)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/ConcatenateVectorsReducer.java Sun Jun  9 22:20:38 2013
@@ -0,0 +1,98 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.utils;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.mahout.math.NamedVector;
+import org.apache.mahout.math.SequentialAccessSparseVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+
+import com.google.common.base.Preconditions;
+
+/*
+ * Dual-mode combiner/reducer. If a vector arrives with length A or length B, it is concatenated here.
+ * If it arrives with length A + B, the combiner has already concatenated it.
+ */
+
+public class ConcatenateVectorsReducer extends Reducer<IntWritable, VectorWritable, IntWritable, VectorWritable> {
+  
+  int dimsA = 0;
+  int dimsB = 0;
+  
+  public ConcatenateVectorsReducer() {
+    
+  }
+  
+  public void setup(Context context) throws java.io.IOException, InterruptedException {
+    Configuration configuration = context.getConfiguration();
+
+    dimsA = Integer.valueOf(configuration.getStrings(ConcatenateVectorsJob.MATRIXA_DIMS)[0]);
+    dimsB = Integer.valueOf(configuration.getStrings(ConcatenateVectorsJob.MATRIXB_DIMS)[0]);
+  }
+  
+  public void reduce(IntWritable row, Iterable<VectorWritable> vectorWritableIterable,
+                     Context ctx) throws java.io.IOException, InterruptedException {
+    Vector vA = null;
+    Vector vB = null;
+    Vector vOut = null;
+    boolean isNamed = false;
+    String name = null;
+
+    for (VectorWritable vw: vectorWritableIterable) {
+      Vector v = vw.get();
+      if (v instanceof NamedVector) {
+        name = ((NamedVector) v).getName();
+        isNamed = true;
+      }
+
+      if (v.size() == dimsA) {
+        vA = v;
+      } else if (v.size() == dimsB) {
+        vB = v;
+      } else if (v.size() == dimsA + dimsB) {
+        vOut = v;
+        break;
+      }
+    }
+
+    Preconditions.checkArgument((vA != null || vB != null) || (vOut != null));
+
+    if (vOut == null) {
+      vOut = new SequentialAccessSparseVector(dimsA + dimsB);
+      if (isNamed) 
+        vOut = new NamedVector(vOut, name);
+    }
+
+    if (vA != null) {
+      appendVector(vOut, vA, 0);
+    }
+
+    if (vB != null) {
+      appendVector(vOut, vB, dimsA);
+    }
+    ctx.write(row, new VectorWritable(vOut));
+  }
+  
+  private void appendVector(Vector vOut, Vector vIn, int offset) {
+    for (Vector.Element element : vIn.nonZeroes())
+      vOut.set(element.index() + offset, element.get());
+  }
+}
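
To make the size-based dispatch concrete: with dimsA = 5 and dimsB = 3 the reducer treats a vector of size 5 as the A part, size 3 as the B part, and size 8 as a row that a combiner already concatenated. The offset arithmetic in appendVector() is the core of the trick; the standalone sketch below, which is not part of the commit and mirrors the test data, shows the effect.

import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.SequentialAccessSparseVector;
import org.apache.mahout.math.Vector;

// Illustration only, not part of this commit; the values mirror DATA_A/DATA_B in the test below.
public class ConcatenateOffsetExample {
  public static void main(String[] args) {
    int dimsA = 5;
    int dimsB = 3;
    Vector a = new DenseVector(new double[] {0, 1, 2, 3, 4}); // a row of matrix A
    Vector b = new DenseVector(new double[] {5, 6, 7});       // the same row of matrix B
    Vector out = new SequentialAccessSparseVector(dimsA + dimsB);
    for (Vector.Element e : a.nonZeroes()) {
      out.set(e.index(), e.get());           // A keeps its column indices 0..dimsA-1
    }
    for (Vector.Element e : b.nonZeroes()) {
      out.set(e.index() + dimsA, e.get());   // B is shifted right by dimsA
    }
    System.out.println(out);                 // 8-column row holding value i at column i for i = 1..7
  }
}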

Added: mahout/trunk/integration/src/test/java/org/apache/mahout/utils/TestConcatenateVectorsJob.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/utils/TestConcatenateVectorsJob.java?rev=1491309&view=auto
==============================================================================
--- mahout/trunk/integration/src/test/java/org/apache/mahout/utils/TestConcatenateVectorsJob.java (added)
+++ mahout/trunk/integration/src/test/java/org/apache/mahout/utils/TestConcatenateVectorsJob.java Sun Jun  9 22:20:38 2013
@@ -0,0 +1,99 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils;
+
+import java.util.List;
+
+import com.google.common.collect.Lists;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.mahout.common.DummyRecordWriter;
+import org.apache.mahout.common.MahoutTestCase;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.junit.Test;
+
+/**
+ * Code stolen from TestAffinityMatrixJob. Like TAMJ, it tests the Mappers/Reducers but does not test the job itself.
+ */
+
+public class TestConcatenateVectorsJob extends MahoutTestCase {
+  
+  private static final double [][] DATA_A = {
+    {0,1,2,3,4},
+    {},
+    {0,1,2,3,4}
+  };
+  private static final double [][] DATA_B = {
+    {},
+    {5,6,7},
+    {5,6,7}
+  };
+  
+  @Test
+  public void testConcatenateVectorsReducer() throws Exception {
+    
+    Configuration configuration = new Configuration();
+    configuration.set(ConcatenateVectorsJob.MATRIXA_DIMS, "5");
+    configuration.set(ConcatenateVectorsJob.MATRIXB_DIMS, "3");
+    
+    // Yes, all of this generic rigmarole is needed, and woe betide he who changes it
+    ConcatenateVectorsReducer reducer = new ConcatenateVectorsReducer();
+
+    DummyRecordWriter<IntWritable, VectorWritable> recordWriter = new DummyRecordWriter<IntWritable, VectorWritable>();
+
+    Reducer<IntWritable, VectorWritable, IntWritable, VectorWritable>.Context reduceContext =
+      DummyRecordWriter.build(reducer, configuration, recordWriter, IntWritable.class, VectorWritable.class);
+    
+    reducer.setup(reduceContext);
+    
+    for (int i = 0; i < 3; i++) {
+      double[] values = DATA_A[i];
+      List<VectorWritable> vwList = Lists.newArrayList();
+      if (values.length > 0) {
+        Vector v = new DenseVector(values);
+        VectorWritable vw = new VectorWritable();
+        vw.set(v);
+        vwList.add(vw);
+      }
+      values = DATA_B[i];
+      if (values.length > 0) {
+        Vector v = new DenseVector(values);
+        VectorWritable vw = new VectorWritable();
+        vw.set(v);
+        vwList.add(vw);
+
+      }
+      IntWritable row = new IntWritable(i);
+      
+      reducer.reduce(row, vwList, reduceContext);
+    }
+    
+    for (IntWritable row : recordWriter.getKeys()) {
+      List<VectorWritable> list = recordWriter.getValue(row);
+      Vector v = list.get(0).get();
+      assertEquals(8, v.size());
+      for (Vector.Element element : v.nonZeroes()) {
+        assertEquals(element.index(), v.get(element.index()), 0.001);
+      }
+    }
+  }
+  
+}
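
Outside the DummyRecordWriter harness, the concatenated rows land as IntWritable/VectorWritable pairs in part files under the job's --output directory. The sketch below, not part of the commit and using a placeholder output path, reads them back with the same SequenceFile.Reader API that getKeyClass() uses in ConcatenateVectorsJob.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.mahout.math.VectorWritable;

// Illustration only, not part of this commit. The output path is a placeholder.
public class DumpConcatenatedVectors {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    FileStatus[] parts = fs.globStatus(new Path("/path/to/output", "part-*"));
    if (parts == null) {
      return;  // no output directory found
    }
    for (FileStatus status : parts) {
      SequenceFile.Reader reader = new SequenceFile.Reader(fs, status.getPath(), conf);
      try {
        IntWritable row = new IntWritable();
        VectorWritable vector = new VectorWritable();
        while (reader.next(row, vector)) {
          // Each value is a concatenated row of cardinality dimsA + dimsB.
          System.out.println(row.get() + "\t" + vector.get());
        }
      } finally {
        reader.close();
      }
    }
  }
}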

Modified: mahout/trunk/src/conf/driver.classes.default.props
URL: http://svn.apache.org/viewvc/mahout/trunk/src/conf/driver.classes.default.props?rev=1491309&r1=1491308&r2=1491309&view=diff
==============================================================================
--- mahout/trunk/src/conf/driver.classes.default.props (original)
+++ mahout/trunk/src/conf/driver.classes.default.props Sun Jun  9 22:20:38 2013
@@ -14,6 +14,7 @@ org.apache.mahout.vectorizer.EncodedVect
 org.apache.mahout.text.WikipediaToSequenceFile = seqwiki : Wikipedia xml dump to sequence file
 org.apache.mahout.text.SequenceFilesFromMailArchives = seqmailarchives : Creates SequenceFile from a directory containing gzipped mail archives
 org.apache.mahout.text.SequenceFilesFromLuceneStorageDriver = lucene2seq : Generate Text SequenceFiles from a Lucene index
+org.apache.mahout.utils.ConcatenateVectorsJob = concatvectors : Concatenates two matrices with the same number of rows into a single matrix
 
 #Math
 org.apache.mahout.math.hadoop.TransposeJob = transpose : Take the transpose of a matrix