You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by gs...@apache.org on 2011/10/31 17:19:46 UTC
svn commit: r1195549 - in /mahout/trunk/core/src:
main/java/org/apache/mahout/vectorizer/encoders/LuceneTextValueEncoder.java
test/java/org/apache/mahout/vectorizer/encoders/TextValueEncoderTest.java
Author: gsingers
Date: Mon Oct 31 16:19:46 2011
New Revision: 1195549
URL: http://svn.apache.org/viewvc?rev=1195549&view=rev
Log:
MAHOUT-855: fix buffer underflow issue
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/LuceneTextValueEncoder.java
mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/encoders/TextValueEncoderTest.java
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/LuceneTextValueEncoder.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/LuceneTextValueEncoder.java?rev=1195549&r1=1195548&r2=1195549&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/LuceneTextValueEncoder.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/LuceneTextValueEncoder.java Mon Oct 31 16:19:46 2011
@@ -71,6 +71,7 @@ public class LuceneTextValueEncoder exte
for (int i = 0; i < n; i++) {
buf.put(input.charAt(i));
}
+ buf.rewind();
}
/**
@@ -84,8 +85,13 @@ public class LuceneTextValueEncoder exte
*/
@Override
public int read(char[] cbuf, int off, int len) {
- buf.get(cbuf, off, len);
- return len;
+ int toRead = Math.min(len, buf.remaining());
+ if (toRead > 0){
+ buf.get(cbuf, off, toRead);
+ return toRead;
+ } else {
+ return -1;
+ }
}
@Override
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/encoders/TextValueEncoderTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/encoders/TextValueEncoderTest.java?rev=1195549&r1=1195548&r2=1195549&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/encoders/TextValueEncoderTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/encoders/TextValueEncoderTest.java Mon Oct 31 16:19:46 2011
@@ -18,6 +18,8 @@
package org.apache.mahout.vectorizer.encoders;
import com.google.common.collect.ImmutableMap;
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.util.Version;
import org.apache.mahout.common.MahoutTestCase;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;
@@ -64,4 +66,36 @@ public final class TextValueEncoderTest
FeatureVectorEncoder enc = new TextValueEncoder("text");
assertEquals("[text:test1:1.0000, text:and:1.0000, text:more:1.0000]", enc.asString("test1 and more"));
}
+
+ @Test
+ public void testLuceneEncoding() throws Exception {
+ LuceneTextValueEncoder enc = new LuceneTextValueEncoder("text");
+ enc.setAnalyzer(new WhitespaceAnalyzer(Version.LUCENE_34));
+ Vector v1;
+ v1 = new DenseVector(200);
+ enc.addToVector("test1 and more", v1);
+ enc.flush(1, v1);
+
+ //should be the same as text test above, since we are splitting on whitespace
+ // should set 6 distinct locations to 1
+ assertEquals(6.0, v1.norm(1), 0);
+ assertEquals(1.0, v1.maxValue(), 0);
+
+ v1 = new DenseVector(200);
+ enc.addToVector("", v1);
+ enc.flush(1, v1);
+ assertEquals(0.0, v1.norm(1), 0);
+ assertEquals(0.0, v1.maxValue(), 0);
+
+ v1 = new DenseVector(200);
+ StringBuilder builder = new StringBuilder(5000);
+ for (int i = 0; i < 1000; i++){//lucene's internal buffer length request is 4096, so let's make sure we can handle larger size
+ builder.append("token_").append(i).append(" ");
+ }
+ enc.addToVector(builder.toString(), v1);
+ enc.flush(1, v1);
+ //System.out.println(v1);
+ assertEquals(2000.0, v1.norm(1), 0);
+ assertEquals(19.0, v1.maxValue(), 0);
+ }
}