You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by gs...@apache.org on 2011/10/31 17:19:46 UTC

svn commit: r1195549 - in /mahout/trunk/core/src: main/java/org/apache/mahout/vectorizer/encoders/LuceneTextValueEncoder.java test/java/org/apache/mahout/vectorizer/encoders/TextValueEncoderTest.java

Author: gsingers
Date: Mon Oct 31 16:19:46 2011
New Revision: 1195549

URL: http://svn.apache.org/viewvc?rev=1195549&view=rev
Log:
MAHOUT-855: fix buffer underflow issue

Modified:
    mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/LuceneTextValueEncoder.java
    mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/encoders/TextValueEncoderTest.java

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/LuceneTextValueEncoder.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/LuceneTextValueEncoder.java?rev=1195549&r1=1195548&r2=1195549&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/LuceneTextValueEncoder.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/LuceneTextValueEncoder.java Mon Oct 31 16:19:46 2011
@@ -71,6 +71,7 @@ public class LuceneTextValueEncoder exte
       for (int i = 0; i < n; i++) {
         buf.put(input.charAt(i));
       }
+      buf.rewind();
     }
 
     /**
@@ -84,8 +85,13 @@ public class LuceneTextValueEncoder exte
      */
     @Override
     public int read(char[] cbuf, int off, int len) {
-      buf.get(cbuf, off, len);
-      return len;
+      int toRead = Math.min(len, buf.remaining());
+      if (toRead > 0){
+        buf.get(cbuf, off, toRead);
+        return toRead;
+      } else {
+        return -1;
+      }
     }
 
     @Override

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/encoders/TextValueEncoderTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/encoders/TextValueEncoderTest.java?rev=1195549&r1=1195548&r2=1195549&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/encoders/TextValueEncoderTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/encoders/TextValueEncoderTest.java Mon Oct 31 16:19:46 2011
@@ -18,6 +18,8 @@
 package org.apache.mahout.vectorizer.encoders;
 
 import com.google.common.collect.ImmutableMap;
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.util.Version;
 import org.apache.mahout.common.MahoutTestCase;
 import org.apache.mahout.math.DenseVector;
 import org.apache.mahout.math.Vector;
@@ -64,4 +66,36 @@ public final class TextValueEncoderTest 
     FeatureVectorEncoder enc = new TextValueEncoder("text");
     assertEquals("[text:test1:1.0000, text:and:1.0000, text:more:1.0000]", enc.asString("test1 and more"));
   }
+
+  @Test
+  public void testLuceneEncoding() throws Exception {
+    LuceneTextValueEncoder enc = new LuceneTextValueEncoder("text");
+    enc.setAnalyzer(new WhitespaceAnalyzer(Version.LUCENE_34));
+    Vector v1;
+    v1 = new DenseVector(200);
+    enc.addToVector("test1 and more", v1);
+    enc.flush(1, v1);
+
+    // Should behave the same as the text test above, since we are splitting on whitespace:
+    // 6 distinct locations should be set to 1.
+    assertEquals(6.0, v1.norm(1), 0);
+    assertEquals(1.0, v1.maxValue(), 0);
+
+    v1 = new DenseVector(200);
+    enc.addToVector("", v1);
+    enc.flush(1, v1);
+    assertEquals(0.0, v1.norm(1), 0);
+    assertEquals(0.0, v1.maxValue(), 0);
+
+    v1 = new DenseVector(200);
+    StringBuilder builder = new StringBuilder(5000);
+    for (int i = 0; i < 1000; i++){ // Lucene's internal buffer length request is 4096, so make sure we can handle input larger than that
+      builder.append("token_").append(i).append(" ");
+    }
+    enc.addToVector(builder.toString(), v1);
+    enc.flush(1, v1);
+    //System.out.println(v1);
+    assertEquals(2000.0, v1.norm(1), 0);
+    assertEquals(19.0, v1.maxValue(), 0);
+  }
 }