You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by sm...@apache.org on 2015/11/04 02:15:36 UTC
mahout git commit: MAHOUT-1783: Remove code for ConcatVectors Job,
this closes apache/mahout#169
Repository: mahout
Updated Branches:
refs/heads/master 509c966f1 -> 708cc4f2b
MAHOUT-1783: Remove code for ConcatVectors Job, this closes apache/mahout#169
Project: http://git-wip-us.apache.org/repos/asf/mahout/repo
Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/708cc4f2
Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/708cc4f2
Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/708cc4f2
Branch: refs/heads/master
Commit: 708cc4f2b6c16ca76005d0a06b4dc8e3f0bacd3a
Parents: 509c966
Author: smarthi <sm...@apache.org>
Authored: Tue Nov 3 20:15:07 2015 -0500
Committer: smarthi <sm...@apache.org>
Committed: Tue Nov 3 20:15:07 2015 -0500
----------------------------------------------------------------------
.../mahout/utils/ConcatenateVectorsJob.java | 118 -------------------
.../mahout/utils/ConcatenateVectorsReducer.java | 102 ----------------
.../mahout/utils/TestConcatenateVectorsJob.java | 99 ----------------
src/conf/driver.classes.default.props | 1 -
4 files changed, 320 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mahout/blob/708cc4f2/integration/src/main/java/org/apache/mahout/utils/ConcatenateVectorsJob.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/ConcatenateVectorsJob.java b/integration/src/main/java/org/apache/mahout/utils/ConcatenateVectorsJob.java
deleted file mode 100644
index 33d09a0..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/ConcatenateVectorsJob.java
+++ /dev/null
@@ -1,118 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils;
-
-import java.io.IOException;
-
-import com.google.common.base.Preconditions;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.SequenceFile;
-import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.Mapper;
-import org.apache.hadoop.util.ToolRunner;
-import org.apache.mahout.common.AbstractJob;
-import org.apache.mahout.common.commandline.DefaultOptionCreator;
-import org.apache.mahout.math.VectorWritable;
-
-/*
- * Map-reduce job to combine two matrices A and B to (a1,a2,...aN,b1,b2,...bN)
- * Technically works on Vector files, so will also concatenate two vectors.
- * If either input is a NamedVector, the output has the name: A.name has precedence over B.name.
- * Concatenation or per-member combinations given a function object.
- *
- * Uses clever hack which requires different matrices to have a different number of columns.
- * Courtesy of Jake Mannix, https://issues.apache.org/jira/browse/MAHOUT-884
- * If vectors are same length, this will not concatenate them in the right order
- *
- * @deprecated as of 0.10.0
- *
- * TODO: generalize to multiple matrices, should the teeming masses so desire
- */
-@Deprecated
-public class ConcatenateVectorsJob extends AbstractJob {
-
- static final String MATRIXA_DIMS = "mahout.concatenatevectors.matrixA_dims";
- static final String MATRIXB_DIMS = "mahout.concatenatevectors.matrixB_dims";
-
- private ConcatenateVectorsJob() {}
-
- public static void main(String[] args) throws Exception {
- ToolRunner.run(new ConcatenateVectorsJob(), args);
- }
-
- @Override
- public int run(String[] args) throws Exception {
- addOption("matrixA", "ma", "A (left) matrix directory", true);
- addOption("matrixB", "mb", "B (right) matrix directory", true);
- addOutputOption();
- DefaultOptionCreator.overwriteOption().create();
-
- if (parseArguments(args) == null) {
- return -1;
- }
-
- Path pathA = new Path(getOption("matrixA"));
- Path pathB = new Path(getOption("matrixB"));
- Path pathOutput = getOutputPath();
-
- Configuration configuration = getConf();
- FileSystem fs = FileSystem.get(configuration);
-
- Class<? extends Writable> keyClassA = getKeyClass(pathA, fs);
- Class<? extends Writable> keyClassB = getKeyClass(pathB, fs);
-
- Preconditions.checkArgument(keyClassA.equals(keyClassB), "All SequenceFiles must use same key class");
-
- int dimA = getDimensions(pathA);
- int dimB = getDimensions(pathB);
-
- String nameA = getOption("matrixA");
- String nameB = getOption("matrixB");
-
- Job concatenate = prepareJob(
- new Path(nameA + "," + nameB), pathOutput, Mapper.class, keyClassA, VectorWritable.class,
- ConcatenateVectorsReducer.class, keyClassA, VectorWritable.class);
-
- configuration = concatenate.getConfiguration();
- configuration.set(MATRIXA_DIMS, Integer.toString(dimA));
- configuration.set(MATRIXB_DIMS, Integer.toString(dimB));
- // TODO: add reducer as combiner - need a system that can exercise combiners
-
- boolean succeeded = concatenate.waitForCompletion(true);
- if (!succeeded) {
- return -1;
- }
- return 0;
- }
-
- private Class<? extends Writable> getKeyClass(Path path, FileSystem fs) throws IOException {
- // this works for both part* and a directory/ with part*.
- Path pathPattern = new Path(path, "part*");
- FileStatus[] paths = fs.globStatus(pathPattern);
- Preconditions.checkArgument(paths.length > 0, path.getName() + " is a file, should be a directory");
-
- Path file = paths[0].getPath();
- try (SequenceFile.Reader reader = new SequenceFile.Reader(fs, file, fs.getConf())){
- return reader.getKeyClass().asSubclass(Writable.class);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/708cc4f2/integration/src/main/java/org/apache/mahout/utils/ConcatenateVectorsReducer.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/ConcatenateVectorsReducer.java b/integration/src/main/java/org/apache/mahout/utils/ConcatenateVectorsReducer.java
deleted file mode 100644
index 0cf12ae..0000000
--- a/integration/src/main/java/org/apache/mahout/utils/ConcatenateVectorsReducer.java
+++ /dev/null
@@ -1,102 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.mahout.utils;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.mapreduce.Reducer;
-import org.apache.mahout.math.NamedVector;
-import org.apache.mahout.math.SequentialAccessSparseVector;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.VectorWritable;
-
-import com.google.common.base.Preconditions;
-
-/*
- * Moded combiner/reducer. If vector comes in as length A or length B, concatenated.
- * If it is length A + B, combiner has already concatenated.
- *
- * @deprecated as of 0.10.0.
- *
- */
-@Deprecated
-public class ConcatenateVectorsReducer extends Reducer<IntWritable, VectorWritable, IntWritable, VectorWritable> {
-
- int dimsA = 0;
- int dimsB = 0;
-
- public ConcatenateVectorsReducer() {
-
- }
-
- public void setup(Context context) throws java.io.IOException, InterruptedException {
- Configuration configuration = context.getConfiguration();
-
- dimsA = Integer.valueOf(configuration.getStrings(ConcatenateVectorsJob.MATRIXA_DIMS)[0]);
- dimsB = Integer.valueOf(configuration.getStrings(ConcatenateVectorsJob.MATRIXB_DIMS)[0]);
- }
-
- public void reduce(IntWritable row, Iterable<VectorWritable> vectorWritableIterable,
- Context ctx) throws java.io.IOException ,InterruptedException {
- Vector vA = null;
- Vector vB = null;
- Vector vOut = null;
- boolean isNamed = false;
- String name = null;
-
- for (VectorWritable vw: vectorWritableIterable) {
- Vector v = vw.get();
- if (v instanceof NamedVector) {
- name = ((NamedVector) v).getName();
- isNamed = true;
- }
-
- if (v.size() == dimsA) {
- vA = v;
- } else if (v.size() == dimsB) {
- vB = v;
- } else if (v.size() == dimsA + dimsB) {
- vOut = v;
- break;
- }
- }
-
- Preconditions.checkArgument((vA != null || vB != null) || (vOut != null));
-
- if (vOut == null) {
- vOut = new SequentialAccessSparseVector(dimsA + dimsB);
- if (isNamed) {
- vOut = new NamedVector(vOut, name);
- }
- }
-
- if (vA != null) {
- appendVector(vOut, vA, 0);
- }
-
- if (vB != null) {
- appendVector(vOut, vB, dimsA);
- }
- ctx.write(row, new VectorWritable(vOut));
- }
-
- private void appendVector(Vector vOut, Vector vIn, int offset) {
- for (Vector.Element element : vIn.nonZeroes()) {
- vOut.set(element.index() + offset, element.get());
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/708cc4f2/integration/src/test/java/org/apache/mahout/utils/TestConcatenateVectorsJob.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/utils/TestConcatenateVectorsJob.java b/integration/src/test/java/org/apache/mahout/utils/TestConcatenateVectorsJob.java
deleted file mode 100644
index a4e2bfc..0000000
--- a/integration/src/test/java/org/apache/mahout/utils/TestConcatenateVectorsJob.java
+++ /dev/null
@@ -1,99 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.utils;
-
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.mapreduce.Reducer;
-import org.apache.mahout.common.DummyRecordWriter;
-import org.apache.mahout.common.MahoutTestCase;
-import org.apache.mahout.math.DenseVector;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.VectorWritable;
-import org.junit.Test;
-
-/**
- * Code stolen from TestAffinityMatrixJob. Like TAMJ, it tests the Mappers/Reducers but does not test the job
- */
-@Deprecated
-public class TestConcatenateVectorsJob extends MahoutTestCase {
-
- private static final double [][] DATA_A = {
- {0,1,2,3,4},
- {},
- {0,1,2,3,4}
- };
- private static final double [][] DATA_B = {
- {},
- {5,6,7},
- {5,6,7}
- };
-
- @Test
- public void testConcatenateVectorsReducer() throws Exception {
-
- Configuration configuration = getConfiguration();
- configuration.set(ConcatenateVectorsJob.MATRIXA_DIMS, "5");
- configuration.set(ConcatenateVectorsJob.MATRIXB_DIMS, "3");
-
- // Yes, all of this generic rigmarole is needed, and woe betide he who changes it
- ConcatenateVectorsReducer reducer = new ConcatenateVectorsReducer();
-
- DummyRecordWriter<IntWritable, VectorWritable> recordWriter = new DummyRecordWriter<>();
-
- Reducer<IntWritable, VectorWritable, IntWritable, VectorWritable>.Context reduceContext =
- DummyRecordWriter.build(reducer, configuration, recordWriter, IntWritable.class, VectorWritable.class);
-
- reducer.setup(reduceContext);
-
- for(int i = 0; i < 3; i++) {
- double[] values = DATA_A[i];
- List<VectorWritable> vwList = new ArrayList<>();
- if (values.length > 0) {
- Vector v = new DenseVector(values);
- VectorWritable vw = new VectorWritable();
- vw.set(v);
- vwList.add(vw);
- }
- values = DATA_B[i];
- if (values.length > 0) {
- Vector v = new DenseVector(values);
- VectorWritable vw = new VectorWritable();
- vw.set(v);
- vwList.add(vw);
-
- }
- IntWritable row = new IntWritable(i);
-
- reducer.reduce(row, vwList, reduceContext);
- }
-
- for (IntWritable row : recordWriter.getKeys()) {
- List<VectorWritable> list = recordWriter.getValue(row);
- Vector v = list.get(0).get();
- assertEquals(8, v.size());
- for (Vector.Element element : v.nonZeroes()) {
- assertEquals(element.index(), v.get(element.index()), 0.001);
- }
- }
- }
-
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/708cc4f2/src/conf/driver.classes.default.props
----------------------------------------------------------------------
diff --git a/src/conf/driver.classes.default.props b/src/conf/driver.classes.default.props
index d6a5ddb..69a9ba5 100644
--- a/src/conf/driver.classes.default.props
+++ b/src/conf/driver.classes.default.props
@@ -14,7 +14,6 @@ org.apache.mahout.vectorizer.EncodedVectorsFromSequenceFiles = seq2encoded: Enco
org.apache.mahout.text.WikipediaToSequenceFile = seqwiki : Wikipedia xml dump to sequence file
org.apache.mahout.text.SequenceFilesFromMailArchives = seqmailarchives : Creates SequenceFile from a directory containing gzipped mail archives
org.apache.mahout.text.SequenceFilesFromLuceneStorageDriver = lucene2seq : Generate Text SequenceFiles from a Lucene index
-org.apache.mahout.utils.ConcatenateVectorsJob = concatmatrices : Concatenates 2 matrices of same cardinality into a single matrix
org.apache.mahout.clustering.streaming.tools.ResplitSequenceFiles = resplit : Splits a set of SequenceFiles into a number of equal splits
org.apache.mahout.clustering.streaming.tools.ClusterQualitySummarizer = qualcluster : Runs clustering experiments and summarizes results in a CSV
org.apache.mahout.classifier.df.tools.Describe = describe : Describe the fields and target variable in a data set