Posted to commits@lucene.apache.org by rm...@apache.org on 2012/03/16 14:06:30 UTC

svn commit: r1301478 - in /lucene/dev/trunk: lucene/ lucene/test-framework/src/java/org/apache/lucene/analysis/ modules/analysis/common/src/java/org/apache/lucene/analysis/util/ modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/ mod...

Author: rmuir
Date: Fri Mar 16 13:06:30 2012
New Revision: 1301478

URL: http://svn.apache.org/viewvc?rev=1301478&view=rev
Log:
LUCENE-3848: don't produce tokenstreams that start with posinc=0
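
For context: position increments are relative, and consumers accumulate them starting from position -1, so a stream whose first token carries posinc=0 would place that token at position -1, overlapping a term that does not exist. A minimal sketch of that consumer-side accumulation, assuming the standard TokenStream consumption workflow; PositionWalker is a hypothetical name, not part of this commit:

import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

// Hypothetical illustration (not part of this commit): how consumers such
// as IndexWriter derive absolute positions from increments. Positions
// accumulate from -1, so a first token with posinc=0 lands at -1,
// "overlapping" a previous term that does not exist.
public class PositionWalker {
  public static void walk(TokenStream stream) throws IOException {
    PositionIncrementAttribute posIncrAtt =
        stream.addAttribute(PositionIncrementAttribute.class);
    stream.reset();
    int position = -1;
    while (stream.incrementToken()) {
      position += posIncrAtt.getPositionIncrement(); // posinc=0 leaves position at -1 for the first token
      System.out.println("position=" + position);
    }
    stream.end();
    stream.close();
  }
}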

Modified:
    lucene/dev/trunk/lucene/CHANGES.txt
    lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
    lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/util/FilteringTokenFilter.java
    lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java
    lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopFilter.java
    lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestSlowSynonymFilter.java

Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1301478&r1=1301477&r2=1301478&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Fri Mar 16 13:06:30 2012
@@ -780,6 +780,12 @@ Changes in Runtime Behavior
   boost on a field that omits norms. Because the index-time boost
   is multiplied into the norm, previously your boost would be 
   silently discarded.  (Tomás Fernández Löbbe, Hoss Man, Robert Muir)
+
+* LUCENE-3848: Fix tokenstreams to not produce a stream with an initial
+  position increment of 0, which is out of bounds (overlapping with a
+  non-existent previous term). Consumers such as IndexWriter and QueryParser
+  still check for and silently correct this situation today, but at some point
+  in the future they may throw an exception.  (Mike McCandless, Robert Muir)
   
 Security fixes
 

Modified: lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java?rev=1301478&r1=1301477&r2=1301478&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java (original)
+++ lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java Fri Mar 16 13:06:30 2012
@@ -157,7 +157,11 @@ public abstract class BaseTokenStreamTes
         }
       }
       if (posIncrAtt != null) {
-        assertTrue("posIncrement must be >= 0", posIncrAtt.getPositionIncrement() >= 0);
+        if (i == 0) {
+          assertTrue("first posIncrement must be >= 1", posIncrAtt.getPositionIncrement() >= 1);
+        } else {
+          assertTrue("posIncrement must be >= 0", posIncrAtt.getPositionIncrement() >= 0);
+        }
       }
       if (posLengthAtt != null) {
         assertTrue("posLength must be >= 1", posLengthAtt.getPositionLength() >= 1);

Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/util/FilteringTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/util/FilteringTokenFilter.java?rev=1301478&r1=1301477&r2=1301478&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/util/FilteringTokenFilter.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/util/FilteringTokenFilter.java Fri Mar 16 13:06:30 2012
@@ -33,6 +33,7 @@ public abstract class FilteringTokenFilt
 
   private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
   private boolean enablePositionIncrements; // no init needed, as ctor enforces setting value!
+  private boolean first = true; // only used when not preserving gaps
 
   public FilteringTokenFilter(boolean enablePositionIncrements, TokenStream input){
     super(input);
@@ -58,6 +59,13 @@ public abstract class FilteringTokenFilt
     } else {
       while (input.incrementToken()) {
         if (accept()) {
+          if (first) {
+            // first token having posinc=0 is illegal.
+            if (posIncrAtt.getPositionIncrement() == 0) {
+              posIncrAtt.setPositionIncrement(1);
+            }
+            first = false;
+          }
           return true;
         }
       }
@@ -66,6 +74,12 @@ public abstract class FilteringTokenFilt
     return false;
   }
 
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    first = true;
+  }
+
   /**
    * @see #setEnablePositionIncrements(boolean)
    */
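
The clamp above only matters for subclasses that can drop the leading token of a stream. A minimal hypothetical subclass, just to show where accept() plugs in; DropShortFilter is an invented example, not part of Lucene, but the constructor and accept() signatures match the ones in this file:

import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.FilteringTokenFilter;

// Hypothetical subclass for illustration: drops one-character tokens.
// With enablePositionIncrements=false, removing the leading token could
// previously promote a stacked (posinc=0) token to the head of the
// stream; the new `first` clamp bumps such a token to posinc=1.
public final class DropShortFilter extends FilteringTokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

  public DropShortFilter(boolean enablePositionIncrements, TokenStream in) {
    super(enablePositionIncrements, in);
  }

  @Override
  protected boolean accept() throws IOException {
    return termAtt.length() > 1; // keep only tokens longer than one char
  }
}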

Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java?rev=1301478&r1=1301477&r2=1301478&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java Fri Mar 16 13:06:30 2012
@@ -121,6 +121,8 @@ public final class WikipediaTokenizer ex
   private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
+  
+  private boolean first;
 
   /**
    * Creates a new instance of the {@link WikipediaTokenizer}. Attaches the
@@ -209,8 +211,13 @@ public final class WikipediaTokenizer ex
       //output the untokenized Token first
       collapseAndSaveTokens(tokenType, type);
     }
-    posIncrAtt.setPositionIncrement(scanner.getPositionIncrement());
+    int posinc = scanner.getPositionIncrement();
+    if (first && posinc == 0) {
+      posinc = 1; // don't emit posinc=0 for the first token!
+    }
+    posIncrAtt.setPositionIncrement(posinc);
     typeAtt.setType(type);
+    first = false;
     return true;
   }
 
@@ -308,6 +315,7 @@ public final class WikipediaTokenizer ex
     super.reset();
     tokens = null;
     scanner.reset();
+    first = true;
   }
 
   @Override

Modified: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopFilter.java?rev=1301478&r1=1301477&r2=1301478&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopFilter.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopFilter.java Fri Mar 16 13:06:30 2012
@@ -17,13 +17,17 @@ package org.apache.lucene.analysis.core;
  */
 
 import java.io.IOException;
+import java.io.Reader;
 import java.io.StringReader;
 import java.util.ArrayList;
 import java.util.Set;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.util.CharArraySet;
@@ -120,4 +124,56 @@ public class TestStopFilter extends Base
       System.out.println(s);
     }
   }
+  
+  // stupid filter that inserts the synonym 'hte' for 'the'
+  private class MockSynonymFilter extends TokenFilter {
+    State bufferedState;
+    CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+    PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+
+    MockSynonymFilter(TokenStream input) {
+      super(input);
+    }
+
+    @Override
+    public boolean incrementToken() throws IOException {
+      if (bufferedState != null) {
+        restoreState(bufferedState);
+        posIncAtt.setPositionIncrement(0);
+        termAtt.setEmpty().append("hte");
+        bufferedState = null;
+        return true;
+      } else if (input.incrementToken()) {
+        if (termAtt.toString().equals("the")) {
+          bufferedState = captureState();
+        }
+        return true;
+      } else {
+        return false;
+      }
+    }
+
+    @Override
+    public void reset() throws IOException {
+      super.reset();
+      bufferedState = null;
+    }
+  }
+  
+  public void testFirstPosInc() throws Exception {
+    Analyzer analyzer = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenFilter filter = new MockSynonymFilter(tokenizer);
+        StopFilter stopfilter = new StopFilter(TEST_VERSION_CURRENT, filter, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
+        stopfilter.setEnablePositionIncrements(false);
+        return new TokenStreamComponents(tokenizer, stopfilter);
+      }
+    };
+    
+    assertAnalyzesTo(analyzer, "the quick brown fox",
+        new String[] { "hte", "quick", "brown", "fox" },
+        new int[] { 1, 1, 1, 1} );
+  }
 }
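
In words, the new test exercises exactly the fixed case: MockTokenizer splits "the quick brown fox"; MockSynonymFilter stacks "hte" after "the" with posinc=0; StopFilter, with position increments disabled, removes the stopword "the", which would have left "hte" as the first token of the stream with posinc=0; the clamp in FilteringTokenFilter bumps it to 1, giving the expected increments { 1, 1, 1, 1 }.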

Modified: lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestSlowSynonymFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestSlowSynonymFilter.java?rev=1301478&r1=1301477&r2=1301478&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestSlowSynonymFilter.java (original)
+++ lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestSlowSynonymFilter.java Fri Mar 16 13:06:30 2012
@@ -240,27 +240,27 @@ public class TestSlowSynonymFilter exten
     assertTokenizesTo(map, tokens("a,5"), 
         new String[] { "aa" },
         new int[] { 5 });
-    assertTokenizesTo(map, tokens("a,0"),
-        new String[] { "aa" },
-        new int[] { 0 });
+    assertTokenizesTo(map, tokens("b,1 a,0"),
+        new String[] { "b", "aa" },
+        new int[] { 1, 0 });
 
     // test that offset of first replacement is ignored (always takes the orig offset)
     map.add(strings("b"), tokens("bb,100"), orig, merge);
     assertTokenizesTo(map, tokens("b,5"),
         new String[] { "bb" },
         new int[] { 5 });
-    assertTokenizesTo(map, tokens("b,0"),
-        new String[] { "bb" },
-        new int[] { 0 });
+    assertTokenizesTo(map, tokens("c,1 b,0"),
+        new String[] { "c", "bb" },
+        new int[] { 1, 0 });
 
     // test that subsequent tokens are adjusted accordingly
     map.add(strings("c"), tokens("cc,100 c2,2"), orig, merge);
     assertTokenizesTo(map, tokens("c,5"),
         new String[] { "cc", "c2" },
         new int[] { 5, 2 });
-    assertTokenizesTo(map, tokens("c,0"),
-        new String[] { "cc", "c2" },
-        new int[] { 0, 2 });
+    assertTokenizesTo(map, tokens("d,1 c,0"),
+        new String[] { "d", "cc", "c2" },
+        new int[] { 1, 0, 2 });
   }
 
 
@@ -275,27 +275,27 @@ public class TestSlowSynonymFilter exten
     assertTokenizesTo(map, tokens("a,5"),
         new String[] { "a", "aa" },
         new int[] { 5, 0 });
-    assertTokenizesTo(map, tokens("a,0"),
-        new String[] { "a", "aa" },
-        new int[] { 0, 0 });
+    assertTokenizesTo(map, tokens("b,1 a,0"),
+        new String[] { "b", "a", "aa" },
+        new int[] { 1, 0, 0 });
 
     // test that offset of first replacement is ignored (always takes the orig offset)
     map.add(strings("b"), tokens("bb,100"), orig, merge);
     assertTokenizesTo(map, tokens("b,5"),
         new String[] { "b", "bb" },
         new int[] { 5, 0 });
-    assertTokenizesTo(map, tokens("b,0"),
-        new String[] { "b", "bb" },
-        new int[] { 0, 0 });
+    assertTokenizesTo(map, tokens("c,1 b,0"),
+        new String[] { "c", "b", "bb" },
+        new int[] { 1, 0, 0 });
 
     // test that subsequent tokens are adjusted accordingly
     map.add(strings("c"), tokens("cc,100 c2,2"), orig, merge);
     assertTokenizesTo(map, tokens("c,5"),
         new String[] { "c", "cc", "c2" },
         new int[] { 5, 0, 2 });
-    assertTokenizesTo(map, tokens("c,0"),
-        new String[] { "c", "cc", "c2" },
-        new int[] { 0, 0, 2 });
+    assertTokenizesTo(map, tokens("d,1 c,0"),
+        new String[] { "d", "c", "cc", "c2" },
+        new int[] { 1, 0, 0, 2 });
   }
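
These fixtures previously started the stream with a posinc=0 token; each now leads with an ordinary token ("b,1", "c,1", "d,1" in the term,posinc notation of the tokens() helper), so the stacked token's posinc=0 falls at a legal, non-initial position.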