You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2012/03/23 18:56:24 UTC

svn commit: r1304528 - in /lucene/dev/branches/branch_3x: ./ lucene/ lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/ lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/ lucene/contrib/facet/ lucene/cont...

Author: mikemccand
Date: Fri Mar 23 17:56:23 2012
New Revision: 1304528

URL: http://svn.apache.org/viewvc?rev=1304528&view=rev
Log:
LUCENE-3905: sometimes run real-ish content (from LineFileDocs) through the analyzers too; fix end() offset bugs in the ngram tokenizers/filters

Modified:
    lucene/dev/branches/branch_3x/   (props changed)
    lucene/dev/branches/branch_3x/lucene/   (props changed)
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java
    lucene/dev/branches/branch_3x/lucene/contrib/facet/   (props changed)
    lucene/dev/branches/branch_3x/lucene/contrib/spellchecker/   (props changed)
    lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
    lucene/dev/branches/branch_3x/solr/   (props changed)
    lucene/dev/branches/branch_3x/solr/core/   (props changed)

Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java?rev=1304528&r1=1304527&r2=1304528&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java Fri Mar 23 17:56:23 2012
@@ -73,7 +73,7 @@ public final class EdgeNGramTokenizer ex
   private int maxGram;
   private int gramSize;
   private Side side;
-  private boolean started = false;
+  private boolean started;
   private int inLen; // length of the input AFTER trim()
   private int charsRead; // length of the input
   private String inStr;
@@ -178,7 +178,7 @@ public final class EdgeNGramTokenizer ex
 
   /** Returns the next token in the stream, or null at EOS. */
   @Override
-  public final boolean incrementToken() throws IOException {
+  public boolean incrementToken() throws IOException {
     clearAttributes();
     // if we are just starting, read the whole input
     if (!started) {
@@ -188,13 +188,28 @@ public final class EdgeNGramTokenizer ex
       charsRead = 0;
       // TODO: refactor to a shared readFully somewhere:
       while (charsRead < chars.length) {
-        int inc = input.read(chars, charsRead, chars.length-charsRead);
+        final int inc = input.read(chars, charsRead, chars.length-charsRead);
         if (inc == -1) {
           break;
         }
         charsRead += inc;
       }
+
       inStr = new String(chars, 0, charsRead).trim();  // remove any trailing empty strings 
+
+      if (charsRead == chars.length) {
+        // Read extra throwaway chars so that on end() we
+        // report the correct offset:
+        char[] throwaway = new char[1024];
+        while(true) {
+          final int inc = input.read(throwaway, 0, throwaway.length);
+          if (inc == -1) {
+            break;
+          }
+          charsRead += inc;
+        }
+      }
+
       inLen = inStr.length();
       if (inLen == 0) {
         return false;
@@ -221,21 +236,15 @@ public final class EdgeNGramTokenizer ex
   }
   
   @Override
-  public final void end() {
+  public void end() {
     // set final offset
     final int finalOffset = correctOffset(charsRead);
     this.offsetAtt.setOffset(finalOffset, finalOffset);
   }    
 
   @Override
-  public void reset(Reader input) throws IOException {
-    super.reset(input);
-  }
-
-  @Override
   public void reset() throws IOException {
     super.reset();
     started = false;
-    charsRead = 0;
   }
 }

Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java?rev=1304528&r1=1304527&r2=1304528&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java Fri Mar 23 17:56:23 2012
@@ -34,11 +34,11 @@ public final class NGramTokenizer extend
 
   private int minGram, maxGram;
   private int gramSize;
-  private int pos = 0;
+  private int pos;
   private int inLen; // length of the input AFTER trim()
   private int charsRead; // length of the input
   private String inStr;
-  private boolean started = false;
+  private boolean started;
   
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
@@ -99,7 +99,7 @@ public final class NGramTokenizer extend
 
   /** Returns the next token in the stream, or null at EOS. */
   @Override
-  public final boolean incrementToken() throws IOException {
+  public boolean incrementToken() throws IOException {
     clearAttributes();
     if (!started) {
       started = true;
@@ -115,6 +115,20 @@ public final class NGramTokenizer extend
         charsRead += inc;
       }
       inStr = new String(chars, 0, charsRead).trim();  // remove any trailing empty strings 
+
+      if (charsRead == chars.length) {
+        // Read extra throwaway chars so that on end() we
+        // report the correct offset:
+        char[] throwaway = new char[1024];
+        while(true) {
+          final int inc = input.read(throwaway, 0, throwaway.length);
+          if (inc == -1) {
+            break;
+          }
+          charsRead += inc;
+        }
+      }
+
       inLen = inStr.length();
       if (inLen == 0) {
         return false;
@@ -138,22 +152,16 @@ public final class NGramTokenizer extend
   }
   
   @Override
-  public final void end() {
+  public void end() {
     // set final offset
     final int finalOffset = correctOffset(charsRead);
     this.offsetAtt.setOffset(finalOffset, finalOffset);
   }    
   
   @Override
-  public void reset(Reader input) throws IOException {
-    super.reset(input);
-  }
-
-  @Override
   public void reset() throws IOException {
     super.reset();
     started = false;
     pos = 0;
-    charsRead = 0;
   }
 }

Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java?rev=1304528&r1=1304527&r2=1304528&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java Fri Mar 23 17:56:23 2012
@@ -110,6 +110,7 @@ public class EdgeNGramTokenizerTest exte
       }    
     };
     checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random, a, 200*RANDOM_MULTIPLIER, 8192);
     
     Analyzer b = new ReusableAnalyzerBase() {
       @Override
@@ -119,5 +120,6 @@ public class EdgeNGramTokenizerTest exte
       }    
     };
     checkRandomData(random, b, 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random, b, 200*RANDOM_MULTIPLIER, 8192);
   }
 }

Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java?rev=1304528&r1=1304527&r2=1304528&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java Fri Mar 23 17:56:23 2012
@@ -100,5 +100,6 @@ public class NGramTokenizerTest extends 
       }    
     };
     checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random, a, 200*RANDOM_MULTIPLIER, 8192);
   }
 }

Modified: lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java?rev=1304528&r1=1304527&r2=1304528&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java (original)
+++ lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java Fri Mar 23 17:56:23 2012
@@ -33,6 +33,7 @@ import org.apache.lucene.analysis.tokena
 import org.apache.lucene.util.Attribute;
 import org.apache.lucene.util.AttributeImpl;
 import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.LineFileDocs;
 import org.apache.lucene.util._TestUtil;
 
 /** 
@@ -359,12 +360,22 @@ public abstract class BaseTokenStreamTes
   }
 
   private static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple) throws IOException {
+
+    final LineFileDocs docs = new LineFileDocs(random);
+
     for (int i = 0; i < iterations; i++) {
       String text;
-      if (simple) { 
-        text = random.nextBoolean() ? _TestUtil.randomSimpleString(random, maxWordLength) : _TestUtil.randomHtmlishString(random, maxWordLength);
+
+      if (random.nextInt(10) == 7) {
+        text = docs.nextDoc().get("body");
+        if (text.length() > maxWordLength) {
+          text = text.substring(0, maxWordLength);
+        }
       } else {
-        switch(_TestUtil.nextInt(random, 0, 4)) {
+        if (simple) { 
+          text = random.nextBoolean() ? _TestUtil.randomSimpleString(random, maxWordLength) : _TestUtil.randomHtmlishString(random, maxWordLength);
+        } else {
+          switch(_TestUtil.nextInt(random, 0, 4)) {
           case 0: 
             text = _TestUtil.randomSimpleString(random, maxWordLength);
             break;
@@ -376,6 +387,7 @@ public abstract class BaseTokenStreamTes
             break;
           default:
             text = _TestUtil.randomUnicodeString(random, maxWordLength);
+          }
         }
       }