You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by sm...@apache.org on 2013/06/10 00:20:39 UTC
svn commit: r1491309 - in /mahout/trunk: ./
integration/src/main/java/org/apache/mahout/utils/
integration/src/test/java/org/apache/mahout/utils/ src/conf/
Author: smarthi
Date: Sun Jun 9 22:20:38 2013
New Revision: 1491309
URL: http://svn.apache.org/r1491309
Log:
MAHOUT-884: Matrix Concatenate Utility
Added:
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/ConcatenateVectorsJob.java
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/ConcatenateVectorsReducer.java
mahout/trunk/integration/src/test/java/org/apache/mahout/utils/TestConcatenateVectorsJob.java
Modified:
mahout/trunk/CHANGELOG
mahout/trunk/src/conf/driver.classes.default.props
Modified: mahout/trunk/CHANGELOG
URL: http://svn.apache.org/viewvc/mahout/trunk/CHANGELOG?rev=1491309&r1=1491308&r2=1491309&view=diff
==============================================================================
--- mahout/trunk/CHANGELOG (original)
+++ mahout/trunk/CHANGELOG Sun Jun 9 22:20:38 2013
@@ -2,6 +2,8 @@ Mahout Change Log
Release 0.8 - unreleased
+ MAHOUT-884: Matrix Concatenate Utility (Lance Norskog, smarthi)
+
MAHOUT-1250: Deprecate unused algorithms (ssc)
MAHOUT-1251: Optimize MinHashMapper (ssc)
Added: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/ConcatenateVectorsJob.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/ConcatenateVectorsJob.java?rev=1491309&view=auto
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/ConcatenateVectorsJob.java (added)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/ConcatenateVectorsJob.java Sun Jun 9 22:20:38 2013
@@ -0,0 +1,126 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils;
+
+
+import java.io.IOException;
+
+import com.google.common.base.Preconditions;
+import com.google.common.io.Closeables;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.math.VectorWritable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Map-reduce job to combine two matrices A and B to (a1,a2,...aN,b1,b2,...bN).
+ * Technically works on Vector files, so will also concatenate two vectors.
+ * If either input is a NamedVector, the output has the name: A.name has precedence over B.name.
+ *
+ * <p>Uses a clever hack which requires the two matrices to have a different number of columns:
+ * the reducer tells A rows from B rows purely by vector size.
+ * Courtesy of Jake Mannix, https://issues.apache.org/jira/browse/MAHOUT-884
+ * If vectors are the same length, this will not concatenate them in the right order.
+ *
+ * <p>TODO: generalize to multiple matrices, should the teeming masses so desire
+ */
+public class ConcatenateVectorsJob extends AbstractJob {
+
+  /** Configuration key under which the column count of matrix A is passed to the reducer. */
+  static final String MATRIXA_DIMS = "mahout.concatenatevectors.matrixA_dims";
+  /** Configuration key under which the column count of matrix B is passed to the reducer. */
+  static final String MATRIXB_DIMS = "mahout.concatenatevectors.matrixB_dims";
+
+  private static final Logger LOG = LoggerFactory.getLogger(ConcatenateVectorsJob.class);
+
+  private ConcatenateVectorsJob() {}
+
+  /** Command-line entry point; hands the arguments to {@link ToolRunner}. */
+  public static void main(String[] args) throws Exception {
+    ToolRunner.run(new ConcatenateVectorsJob(), args);
+  }
+
+  /**
+   * Parses the command line, checks that both inputs use the same key class, and runs the
+   * concatenation job over both input directories.
+   *
+   * @param args command-line arguments (matrixA, matrixB and the output option are registered here)
+   * @return 0 if the job completed successfully, -1 if argument parsing or the job failed
+   */
+  @Override
+  public int run(String[] args) throws Exception {
+    addOption("matrixA", "ma", "A (left) matrix directory", true);
+    addOption("matrixB", "mb", "B (right) matrix directory", true);
+    addOutputOption();
+    // NOTE(review): the overwrite option is built but never registered or acted upon, so this
+    // statement currently has no effect. Kept as-is to preserve behaviour; confirm the intent.
+    DefaultOptionCreator.overwriteOption().create();
+
+    if (parseArguments(args) == null) {
+      return -1;
+    }
+
+    String nameA = getOption("matrixA");
+    String nameB = getOption("matrixB");
+    Path pathA = new Path(nameA);
+    Path pathB = new Path(nameB);
+    Path pathOutput = getOutputPath();
+
+    Configuration configuration = getConf();
+    FileSystem fs = FileSystem.get(configuration);
+
+    Class<? extends Writable> keyClassA = getKeyClass(pathA, fs);
+    Class<? extends Writable> keyClassB = getKeyClass(pathB, fs);
+
+    Preconditions.checkArgument(keyClassA.equals(keyClassB), "All SequenceFiles must use same key class");
+
+    // getDimensions is not defined in this class; presumably inherited from AbstractJob.
+    int dimA = getDimensions(pathA);
+    int dimB = getDimensions(pathB);
+
+    if (dimA == dimB) {
+      // The reducer distinguishes A rows from B rows only by vector size.
+      LOG.warn("Both matrices have {} columns; rows cannot be told apart by size and will not be "
+          + "concatenated in the right order", dimA);
+    }
+
+    // Both inputs are handed to the job as a single comma-separated input path.
+    Job concatenate = prepareJob(
+        new Path(nameA + "," + nameB), pathOutput, Mapper.class, keyClassA, VectorWritable.class,
+        ConcatenateVectorsReducer.class, keyClassA, VectorWritable.class);
+
+    configuration = concatenate.getConfiguration();
+    configuration.set(MATRIXA_DIMS, Integer.toString(dimA));
+    configuration.set(MATRIXB_DIMS, Integer.toString(dimB));
+    // TODO: add reducer as combiner - need a system that can exercise combiners
+
+    return concatenate.waitForCompletion(true) ? 0 : -1;
+  }
+
+  /**
+   * Reads the key class from the first {@code part*} SequenceFile found under {@code path}.
+   *
+   * @param path directory expected to contain {@code part*} files
+   * @param fs file system used to glob and open the files
+   * @return the key class declared in the first matching SequenceFile
+   * @throws IllegalArgumentException if no {@code part*} file is found under {@code path}
+   * @throws IOException if the file cannot be listed or opened
+   */
+  private Class<? extends Writable> getKeyClass(Path path, FileSystem fs) throws IOException {
+    // this works for both part* and a directory/ with part*.
+    Path pathPattern = new Path(path, "part*");
+    FileStatus[] paths = fs.globStatus(pathPattern);
+    // At least one match is required; paths[0] is read below.
+    Preconditions.checkArgument(paths != null && paths.length > 0,
+        path.getName() + " is a file, should be a directory");
+
+    Path file = paths[0].getPath();
+    SequenceFile.Reader reader = null;
+    try {
+      reader = new SequenceFile.Reader(fs, file, fs.getConf());
+      return reader.getKeyClass().asSubclass(Writable.class);
+    } finally {
+      Closeables.close(reader, true);
+    }
+  }
+}
Added: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/ConcatenateVectorsReducer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/ConcatenateVectorsReducer.java?rev=1491309&view=auto
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/ConcatenateVectorsReducer.java (added)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/ConcatenateVectorsReducer.java Sun Jun 9 22:20:38 2013
@@ -0,0 +1,98 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.utils;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.mahout.math.NamedVector;
+import org.apache.mahout.math.SequentialAccessSparseVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * Reducer, written so that it could also serve as a combiner, that concatenates the rows of two
+ * matrices A and B. A vector of length A or length B is treated as one half of the output row.
+ * If it is length A + B, a combiner has already concatenated it.
+ */
+public class ConcatenateVectorsReducer extends Reducer<IntWritable, VectorWritable, IntWritable, VectorWritable> {
+
+  /** Column count of matrix A, read from the job configuration in {@link #setup}. */
+  int dimsA = 0;
+  /** Column count of matrix B, read from the job configuration in {@link #setup}. */
+  int dimsB = 0;
+
+  public ConcatenateVectorsReducer() {
+
+  }
+
+  /**
+   * Loads the column counts of both matrices from the job configuration.
+   *
+   * @throws IllegalStateException if either dimension property is missing
+   * @throws NumberFormatException if either dimension property is not an integer
+   */
+  @Override
+  public void setup(Context context) throws java.io.IOException, InterruptedException {
+    Configuration configuration = context.getConfiguration();
+
+    dimsA = readDimension(configuration, ConcatenateVectorsJob.MATRIXA_DIMS);
+    dimsB = readDimension(configuration, ConcatenateVectorsJob.MATRIXB_DIMS);
+  }
+
+  /**
+   * Builds the concatenated row for one key and writes it to the context.
+   *
+   * <p>Each incoming vector is classified by size only: {@code dimsA} means an A row,
+   * {@code dimsB} a B row, and {@code dimsA + dimsB} an already concatenated row. Vectors of any
+   * other size are ignored. If a half is missing, its positions in the output stay unset.
+   *
+   * @param row the row key
+   * @param vectorWritableIterable the vectors received for this row
+   * @param ctx context the concatenated row is written to
+   * @throws IllegalArgumentException if no vector of a recognized size was received
+   */
+  @Override
+  public void reduce(IntWritable row, Iterable<VectorWritable> vectorWritableIterable,
+                     Context ctx) throws java.io.IOException, InterruptedException {
+    Vector vA = null;
+    Vector vB = null;
+    Vector vOut = null;
+    NamedVector namedA = null;
+    NamedVector namedB = null;
+
+    for (VectorWritable vw : vectorWritableIterable) {
+      Vector v = vw.get();
+
+      if (v.size() == dimsA) {
+        vA = v;
+        if (v instanceof NamedVector) {
+          namedA = (NamedVector) v;
+        }
+      } else if (v.size() == dimsB) {
+        vB = v;
+        if (v instanceof NamedVector) {
+          namedB = (NamedVector) v;
+        }
+      } else if (v.size() == dimsA + dimsB) {
+        // Already concatenated upstream: use it as the output and stop scanning.
+        vOut = v;
+        break;
+      }
+    }
+
+    Preconditions.checkArgument(vA != null || vB != null || vOut != null,
+        "No vector of size %s, %s or their sum received for row %s", dimsA, dimsB, row);
+
+    if (vOut == null) {
+      vOut = new SequentialAccessSparseVector(dimsA + dimsB);
+      // A.name has precedence over B.name, independent of iteration order.
+      NamedVector nameSource = namedA != null ? namedA : namedB;
+      if (nameSource != null) {
+        vOut = new NamedVector(vOut, nameSource.getName());
+      }
+    }
+
+    if (vA != null) {
+      appendVector(vOut, vA, 0);
+    }
+
+    if (vB != null) {
+      // B's elements are placed after the dimsA columns of A.
+      appendVector(vOut, vB, dimsA);
+    }
+    ctx.write(row, new VectorWritable(vOut));
+  }
+
+  /** Reads the first value stored under {@code key} and parses it as an int. */
+  private static int readDimension(Configuration configuration, String key) {
+    String[] values = configuration.getStrings(key);
+    Preconditions.checkState(values != null && values.length > 0,
+        "Missing required configuration property %s", key);
+    return Integer.parseInt(values[0]);
+  }
+
+  /** Copies the non-zero elements of {@code vIn} into {@code vOut}, shifted by {@code offset}. */
+  private void appendVector(Vector vOut, Vector vIn, int offset) {
+    for (Vector.Element element : vIn.nonZeroes()) {
+      vOut.set(element.index() + offset, element.get());
+    }
+  }
+}
Added: mahout/trunk/integration/src/test/java/org/apache/mahout/utils/TestConcatenateVectorsJob.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/utils/TestConcatenateVectorsJob.java?rev=1491309&view=auto
==============================================================================
--- mahout/trunk/integration/src/test/java/org/apache/mahout/utils/TestConcatenateVectorsJob.java (added)
+++ mahout/trunk/integration/src/test/java/org/apache/mahout/utils/TestConcatenateVectorsJob.java Sun Jun 9 22:20:38 2013
@@ -0,0 +1,99 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils;
+
+import java.util.List;
+
+import com.google.common.collect.Lists;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.mahout.common.DummyRecordWriter;
+import org.apache.mahout.common.MahoutTestCase;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.junit.Test;
+
+/**
+ * Code borrowed from TestAffinityMatrixJob. Like that test, it exercises the reducer directly
+ * but does not run the job itself.
+ */
+public class TestConcatenateVectorsJob extends MahoutTestCase {
+
+  /** Rows of the left matrix (5 columns); an empty array means the row is absent. */
+  private static final double[][] DATA_A = {
+    {0, 1, 2, 3, 4},
+    {},
+    {0, 1, 2, 3, 4}
+  };
+  /** Rows of the right matrix (3 columns); an empty array means the row is absent. */
+  private static final double[][] DATA_B = {
+    {},
+    {5, 6, 7},
+    {5, 6, 7}
+  };
+
+  /**
+   * Feeds three rows (A only, B only, both) through the reducer and checks that every output
+   * vector has size 8 and that each non-zero element's value equals its index.
+   */
+  @Test
+  public void testConcatenateVectorsReducer() throws Exception {
+
+    Configuration conf = new Configuration();
+    conf.set(ConcatenateVectorsJob.MATRIXA_DIMS, "5");
+    conf.set(ConcatenateVectorsJob.MATRIXB_DIMS, "3");
+
+    // Yes, all of this generic rigmarole is needed, and woe betide he who changes it
+    ConcatenateVectorsReducer reducer = new ConcatenateVectorsReducer();
+
+    DummyRecordWriter<IntWritable, VectorWritable> recordWriter = new DummyRecordWriter<IntWritable, VectorWritable>();
+
+    Reducer<IntWritable, VectorWritable, IntWritable, VectorWritable>.Context reduceContext =
+        DummyRecordWriter.build(reducer, conf, recordWriter, IntWritable.class, VectorWritable.class);
+
+    reducer.setup(reduceContext);
+
+    for (int rowIndex = 0; rowIndex < DATA_A.length; rowIndex++) {
+      List<VectorWritable> rowVectors = Lists.newArrayList();
+      addIfNonEmpty(rowVectors, DATA_A[rowIndex]);
+      addIfNonEmpty(rowVectors, DATA_B[rowIndex]);
+
+      reducer.reduce(new IntWritable(rowIndex), rowVectors, reduceContext);
+    }
+
+    for (IntWritable key : recordWriter.getKeys()) {
+      List<VectorWritable> written = recordWriter.getValue(key);
+      Vector concatenated = written.get(0).get();
+      assertEquals(8, concatenated.size());
+      for (Vector.Element element : concatenated.nonZeroes()) {
+        assertEquals(element.index(), concatenated.get(element.index()), 0.001);
+      }
+    }
+  }
+
+  /** Wraps {@code values} in a dense vector and appends it to {@code target}, unless it is empty. */
+  private static void addIfNonEmpty(List<VectorWritable> target, double[] values) {
+    if (values.length == 0) {
+      return;
+    }
+    VectorWritable writable = new VectorWritable();
+    writable.set(new DenseVector(values));
+    target.add(writable);
+  }
+
+}
Modified: mahout/trunk/src/conf/driver.classes.default.props
URL: http://svn.apache.org/viewvc/mahout/trunk/src/conf/driver.classes.default.props?rev=1491309&r1=1491308&r2=1491309&view=diff
==============================================================================
--- mahout/trunk/src/conf/driver.classes.default.props (original)
+++ mahout/trunk/src/conf/driver.classes.default.props Sun Jun 9 22:20:38 2013
@@ -14,6 +14,7 @@ org.apache.mahout.vectorizer.EncodedVect
org.apache.mahout.text.WikipediaToSequenceFile = seqwiki : Wikipedia xml dump to sequence file
org.apache.mahout.text.SequenceFilesFromMailArchives = seqmailarchives : Creates SequenceFile from a directory containing gzipped mail archives
org.apache.mahout.text.SequenceFilesFromLuceneStorageDriver = lucene2seq : Generate Text SequenceFiles from a Lucene index
+org.apache.mahout.utils.ConcatenateVectorsJob = concatvectors : Concatenates 2 matrices of same cardinality into a single matrix
#Math
org.apache.mahout.math.hadoop.TransposeJob = transpose : Take the transpose of a matrix