You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by sm...@apache.org on 2013/12/09 00:35:02 UTC
svn commit: r1549353 - in /mahout/trunk: CHANGELOG
integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
integration/src/test/java/org/apache/mahout/utils/vectors/VectorHelperTest.java
Author: smarthi
Date: Sun Dec 8 23:35:02 2013
New Revision: 1549353
URL: http://svn.apache.org/r1549353
Log:
MAHOUT-1349: Clusterdumper/loadTermDictionary crashes when highest index in (sparse) dictionary vector is larger than dictionary vector size
Modified:
mahout/trunk/CHANGELOG
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/VectorHelperTest.java
Modified: mahout/trunk/CHANGELOG
URL: http://svn.apache.org/viewvc/mahout/trunk/CHANGELOG?rev=1549353&r1=1549352&r2=1549353&view=diff
==============================================================================
--- mahout/trunk/CHANGELOG (original)
+++ mahout/trunk/CHANGELOG Sun Dec 8 23:35:02 2013
@@ -64,6 +64,8 @@ Release 0.9 - unreleased
MAHOUT-1261: TasteHadoopUtils.idToIndex can return an int that has size Integer.MAX_VALUE (Carl Clark, smarthi)
+  MAHOUT-1349: Clusterdumper/loadTermDictionary crashes when highest index in (sparse) dictionary vector is larger than dictionary vector size (Andrew Musselman via smarthi)
+
MAHOUT-1242: No key redistribution function for associative maps (Tharindu Rusira via smarthi)
MAHOUT-1030: Regression: Clustered Points Should be WeightedPropertyVectorWritable not WeightedVectorWritable (Andrew Musselman, Pat Ferrel, Jeff Eastman, Lars Norskog, smarthi)
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java?rev=1549353&r1=1549352&r2=1549353&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java Sun Dec 8 23:35:02 2013
@@ -193,12 +193,18 @@ public final class VectorHelper {
*/
public static String[] loadTermDictionary(Configuration conf, String filePattern) {
OpenObjectIntHashMap<String> dict = new OpenObjectIntHashMap<String>();
+ int maxIndexValue = 0;
for (Pair<Text, IntWritable> record
: new SequenceFileDirIterable<Text, IntWritable>(new Path(filePattern), PathType.GLOB, null, null, true,
conf)) {
dict.put(record.getFirst().toString(), record.getSecond().get());
+ if (record.getSecond().get() > maxIndexValue) {
+ maxIndexValue = record.getSecond().get();
+ }
}
- String[] dictionary = new String[dict.size()];
+ // Set dictionary size to greater of (maxIndexValue + 1, dict.size())
+ int maxDictionarySize = maxIndexValue + 1 > dict.size() ? maxIndexValue + 1 : dict.size();
+ String[] dictionary = new String[maxDictionarySize];
for (String feature : dict.keys()) {
dictionary[dict.get(feature)] = feature;
}
Modified: mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/VectorHelperTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/VectorHelperTest.java?rev=1549353&r1=1549352&r2=1549353&view=diff
==============================================================================
--- mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/VectorHelperTest.java (original)
+++ mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/VectorHelperTest.java Sun Dec 8 23:35:02 2013
@@ -19,13 +19,64 @@ package org.apache.mahout.utils.vectors;
import com.google.common.collect.Iterables;
+import com.google.common.io.Closeables;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
import org.apache.mahout.common.MahoutTestCase;
+import org.apache.mahout.common.RandomUtils;
import org.apache.mahout.math.SequentialAccessSparseVector;
import org.apache.mahout.math.Vector;
+import org.junit.Before;
import org.junit.Test;
+import java.util.Random;
+
public final class VectorHelperTest extends MahoutTestCase {
+ private static final int NUM_DOCS = 100;
+
+ private Path inputPathOne;
+ private Path inputPathTwo;
+
+ private Configuration conf;
+
+ @Override
+ @Before
+ public void setUp() throws Exception {
+ super.setUp();
+ conf = getConfiguration();
+
+ inputPathOne = getTestTempFilePath("documents/docs-one.file");
+ FileSystem fs = FileSystem.get(inputPathOne.toUri(), conf);
+ SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, inputPathOne, Text.class, IntWritable.class);
+ try {
+ Random rd = RandomUtils.getRandom();
+ for (int i = 0; i < NUM_DOCS; i++) {
+ // Make all indices higher than dictionary size
+ writer.append(new Text("Document::ID::" + i), new IntWritable(NUM_DOCS + rd.nextInt(NUM_DOCS)));
+ }
+ } finally {
+ Closeables.close(writer, false);
+ }
+
+ inputPathTwo = getTestTempFilePath("documents/docs-two.file");
+ fs = FileSystem.get(inputPathTwo.toUri(), conf);
+ writer = new SequenceFile.Writer(fs, conf, inputPathTwo, Text.class, IntWritable.class);
+ try {
+ Random rd = RandomUtils.getRandom();
+ for (int i = 0; i < NUM_DOCS; i++) {
+ // Keep indices within number of documents
+ writer.append(new Text("Document::ID::" + i), new IntWritable(rd.nextInt(NUM_DOCS)));
+ }
+ } finally {
+ Closeables.close(writer, false);
+ }
+ }
+
@Test
public void testJsonFormatting() throws Exception {
Vector v = new SequentialAccessSparseVector(10);
@@ -85,4 +136,12 @@ public final class VectorHelperTest exte
v.set(8, 0.0);
assertEquals(0, VectorHelper.topEntries(v, 6).size());
}
+
+ @Test
+ public void testLoadTermDictionary() throws Exception {
+ // With indices higher than dictionary size
+ VectorHelper.loadTermDictionary(conf, inputPathOne.toString());
+ // With dictionary size higher than indices
+ VectorHelper.loadTermDictionary(conf, inputPathTwo.toString());
+ }
}