You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by sm...@apache.org on 2013/12/09 00:35:02 UTC

svn commit: r1549353 - in /mahout/trunk: CHANGELOG integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java integration/src/test/java/org/apache/mahout/utils/vectors/VectorHelperTest.java

Author: smarthi
Date: Sun Dec  8 23:35:02 2013
New Revision: 1549353

URL: http://svn.apache.org/r1549353
Log:
MAHOUT-1349: Clusterdumper/loadTermDictionary crashes when highest index in (sparse) dictionary vector is larger than dictionary vector size

Modified:
    mahout/trunk/CHANGELOG
    mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
    mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/VectorHelperTest.java

Modified: mahout/trunk/CHANGELOG
URL: http://svn.apache.org/viewvc/mahout/trunk/CHANGELOG?rev=1549353&r1=1549352&r2=1549353&view=diff
==============================================================================
--- mahout/trunk/CHANGELOG (original)
+++ mahout/trunk/CHANGELOG Sun Dec  8 23:35:02 2013
@@ -64,6 +64,8 @@ Release 0.9 - unreleased
 
   MAHOUT-1261: TasteHadoopUtils.idToIndex can return an int that has size Integer.MAX_VALUE (Carl Clark, smarthi)
 
+  MAHOUT-1349: Clusterdumper/loadTermDictionary crashes when highest index in (sparse) dictionary vector is larger than dictionary vector size (Andrew Musselman via smarthi)
+
   MAHOUT-1242: No key redistribution function for associative maps (Tharindu Rusira via smarthi)
 
   MAHOUT-1030: Regression: Clustered Points Should be WeightedPropertyVectorWritable not WeightedVectorWritable (Andrew Musselman, Pat Ferrel, Jeff Eastman, Lars Norskog, smarthi)

Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java?rev=1549353&r1=1549352&r2=1549353&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java Sun Dec  8 23:35:02 2013
@@ -193,12 +193,18 @@ public final class VectorHelper {
    */
   public static String[] loadTermDictionary(Configuration conf, String filePattern) {
     OpenObjectIntHashMap<String> dict = new OpenObjectIntHashMap<String>();
+    int maxIndexValue = 0;
     for (Pair<Text, IntWritable> record
         : new SequenceFileDirIterable<Text, IntWritable>(new Path(filePattern), PathType.GLOB, null, null, true,
                                                          conf)) {
       dict.put(record.getFirst().toString(), record.getSecond().get());
+      if (record.getSecond().get() > maxIndexValue) {
+        maxIndexValue = record.getSecond().get();
+      }
     }
-    String[] dictionary = new String[dict.size()];
+    // Set dictionary size to greater of (maxIndexValue + 1, dict.size())
+    int maxDictionarySize = maxIndexValue + 1 > dict.size() ? maxIndexValue + 1 : dict.size();
+    String[] dictionary = new String[maxDictionarySize];
     for (String feature : dict.keys()) {
       dictionary[dict.get(feature)] = feature;
     }

Modified: mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/VectorHelperTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/VectorHelperTest.java?rev=1549353&r1=1549352&r2=1549353&view=diff
==============================================================================
--- mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/VectorHelperTest.java (original)
+++ mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/VectorHelperTest.java Sun Dec  8 23:35:02 2013
@@ -19,13 +19,64 @@ package org.apache.mahout.utils.vectors;
 
 import com.google.common.collect.Iterables;
 
+import com.google.common.io.Closeables;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
 import org.apache.mahout.common.MahoutTestCase;
+import org.apache.mahout.common.RandomUtils;
 import org.apache.mahout.math.SequentialAccessSparseVector;
 import org.apache.mahout.math.Vector;
+import org.junit.Before;
 import org.junit.Test;
 
+import java.util.Random;
+
 public final class VectorHelperTest extends MahoutTestCase {
 
+  private static final int NUM_DOCS = 100;
+
+  private Path inputPathOne;
+  private Path inputPathTwo;
+
+  private Configuration conf;
+
+  @Override
+  @Before
+  public void setUp() throws Exception {
+    super.setUp();
+    conf = getConfiguration();
+
+    inputPathOne = getTestTempFilePath("documents/docs-one.file");
+    FileSystem fs = FileSystem.get(inputPathOne.toUri(), conf);
+    SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, inputPathOne, Text.class, IntWritable.class);
+    try {
+      Random rd = RandomUtils.getRandom();
+      for (int i = 0; i < NUM_DOCS; i++) {
+        // Make all indices higher than dictionary size
+        writer.append(new Text("Document::ID::" + i), new IntWritable(NUM_DOCS + rd.nextInt(NUM_DOCS)));
+      }
+    } finally {
+      Closeables.close(writer, false);
+    }
+
+    inputPathTwo = getTestTempFilePath("documents/docs-two.file");
+    fs = FileSystem.get(inputPathTwo.toUri(), conf);
+    writer = new SequenceFile.Writer(fs, conf, inputPathTwo, Text.class, IntWritable.class);
+    try {
+      Random rd = RandomUtils.getRandom();
+      for (int i = 0; i < NUM_DOCS; i++) {
+        // Keep indices within number of documents
+        writer.append(new Text("Document::ID::" + i), new IntWritable(rd.nextInt(NUM_DOCS)));
+      }
+    } finally {
+      Closeables.close(writer, false);
+    }
+  }
+
   @Test
   public void testJsonFormatting() throws Exception {
     Vector v = new SequentialAccessSparseVector(10);
@@ -85,4 +136,12 @@ public final class VectorHelperTest exte
     v.set(8, 0.0);
     assertEquals(0, VectorHelper.topEntries(v, 6).size());
   }
+
+  @Test
+  public void testLoadTermDictionary() throws Exception {
+    // With indices higher than dictionary size
+    VectorHelper.loadTermDictionary(conf, inputPathOne.toString());
+    // With dictionary size higher than indices
+    VectorHelper.loadTermDictionary(conf, inputPathTwo.toString());
+  }
 }