You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/03/16 14:06:30 UTC
svn commit: r1301478 - in /lucene/dev/trunk: lucene/
lucene/test-framework/src/java/org/apache/lucene/analysis/
modules/analysis/common/src/java/org/apache/lucene/analysis/util/
modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/ mod...
Author: rmuir
Date: Fri Mar 16 13:06:30 2012
New Revision: 1301478
URL: http://svn.apache.org/viewvc?rev=1301478&view=rev
Log:
LUCENE-3848: don't produce tokenstreams that start with posinc=0
Modified:
lucene/dev/trunk/lucene/CHANGES.txt
lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/util/FilteringTokenFilter.java
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java
lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopFilter.java
lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestSlowSynonymFilter.java
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1301478&r1=1301477&r2=1301478&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Fri Mar 16 13:06:30 2012
@@ -780,6 +780,12 @@ Changes in Runtime Behavior
boost on a field that omits norms. Because the index-time boost
is multiplied into the norm, previously your boost would be
silently discarded. (Tomás Fernández Löbbe, Hoss Man, Robert Muir)
+
+* LUCENE-3848: Fix tokenstreams to not produce a stream with an initial
+ position increment of 0, which is out of bounds (overlapping with a
+ non-existent previous term). Consumers such as IndexWriter and QueryParser
+ still check for and silently correct this situation today, but at some point
+ in the future they may throw an exception. (Mike McCandless, Robert Muir)
Security fixes
Modified: lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java?rev=1301478&r1=1301477&r2=1301478&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java (original)
+++ lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java Fri Mar 16 13:06:30 2012
@@ -157,7 +157,11 @@ public abstract class BaseTokenStreamTes
}
}
if (posIncrAtt != null) {
- assertTrue("posIncrement must be >= 0", posIncrAtt.getPositionIncrement() >= 0);
+ if (i == 0) {
+ assertTrue("first posIncrement must be >= 1", posIncrAtt.getPositionIncrement() >= 1);
+ } else {
+ assertTrue("posIncrement must be >= 0", posIncrAtt.getPositionIncrement() >= 0);
+ }
}
if (posLengthAtt != null) {
assertTrue("posLength must be >= 1", posLengthAtt.getPositionLength() >= 1);
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/util/FilteringTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/util/FilteringTokenFilter.java?rev=1301478&r1=1301477&r2=1301478&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/util/FilteringTokenFilter.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/util/FilteringTokenFilter.java Fri Mar 16 13:06:30 2012
@@ -33,6 +33,7 @@ public abstract class FilteringTokenFilt
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
private boolean enablePositionIncrements; // no init needed, as ctor enforces setting value!
+ private boolean first = true; // only used when not preserving gaps
public FilteringTokenFilter(boolean enablePositionIncrements, TokenStream input){
super(input);
@@ -58,6 +59,13 @@ public abstract class FilteringTokenFilt
} else {
while (input.incrementToken()) {
if (accept()) {
+ if (first) {
+ // first token having posinc=0 is illegal.
+ if (posIncrAtt.getPositionIncrement() == 0) {
+ posIncrAtt.setPositionIncrement(1);
+ }
+ first = false;
+ }
return true;
}
}
@@ -66,6 +74,12 @@ public abstract class FilteringTokenFilt
return false;
}
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ first = true;
+ }
+
/**
* @see #setEnablePositionIncrements(boolean)
*/
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java?rev=1301478&r1=1301477&r2=1301478&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java Fri Mar 16 13:06:30 2012
@@ -121,6 +121,8 @@ public final class WikipediaTokenizer ex
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
+
+ private boolean first;
/**
* Creates a new instance of the {@link WikipediaTokenizer}. Attaches the
@@ -209,8 +211,13 @@ public final class WikipediaTokenizer ex
//output the untokenized Token first
collapseAndSaveTokens(tokenType, type);
}
- posIncrAtt.setPositionIncrement(scanner.getPositionIncrement());
+ int posinc = scanner.getPositionIncrement();
+ if (first && posinc == 0) {
+ posinc = 1; // don't emit posinc=0 for the first token!
+ }
+ posIncrAtt.setPositionIncrement(posinc);
typeAtt.setType(type);
+ first = false;
return true;
}
@@ -308,6 +315,7 @@ public final class WikipediaTokenizer ex
super.reset();
tokens = null;
scanner.reset();
+ first = true;
}
@Override
Modified: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopFilter.java?rev=1301478&r1=1301477&r2=1301478&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopFilter.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopFilter.java Fri Mar 16 13:06:30 2012
@@ -17,13 +17,17 @@ package org.apache.lucene.analysis.core;
*/
import java.io.IOException;
+import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Set;
+import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
@@ -120,4 +124,56 @@ public class TestStopFilter extends Base
System.out.println(s);
}
}
+
+ // stupid filter that inserts synonym of 'hte' for 'the'
+ private class MockSynonymFilter extends TokenFilter {
+ State bufferedState;
+ CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+
+ MockSynonymFilter(TokenStream input) {
+ super(input);
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if (bufferedState != null) {
+ restoreState(bufferedState);
+ posIncAtt.setPositionIncrement(0);
+ termAtt.setEmpty().append("hte");
+ bufferedState = null;
+ return true;
+ } else if (input.incrementToken()) {
+ if (termAtt.toString().equals("the")) {
+ bufferedState = captureState();
+ }
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ bufferedState = null;
+ }
+ }
+
+ public void testFirstPosInc() throws Exception {
+ Analyzer analyzer = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ TokenFilter filter = new MockSynonymFilter(tokenizer);
+ StopFilter stopfilter = new StopFilter(TEST_VERSION_CURRENT, filter, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
+ stopfilter.setEnablePositionIncrements(false);
+ return new TokenStreamComponents(tokenizer, stopfilter);
+ }
+ };
+
+ assertAnalyzesTo(analyzer, "the quick brown fox",
+ new String[] { "hte", "quick", "brown", "fox" },
+ new int[] { 1, 1, 1, 1} );
+ }
}
Modified: lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestSlowSynonymFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestSlowSynonymFilter.java?rev=1301478&r1=1301477&r2=1301478&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestSlowSynonymFilter.java (original)
+++ lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestSlowSynonymFilter.java Fri Mar 16 13:06:30 2012
@@ -240,27 +240,27 @@ public class TestSlowSynonymFilter exten
assertTokenizesTo(map, tokens("a,5"),
new String[] { "aa" },
new int[] { 5 });
- assertTokenizesTo(map, tokens("a,0"),
- new String[] { "aa" },
- new int[] { 0 });
+ assertTokenizesTo(map, tokens("b,1 a,0"),
+ new String[] { "b", "aa" },
+ new int[] { 1, 0 });
// test that offset of first replacement is ignored (always takes the orig offset)
map.add(strings("b"), tokens("bb,100"), orig, merge);
assertTokenizesTo(map, tokens("b,5"),
new String[] { "bb" },
new int[] { 5 });
- assertTokenizesTo(map, tokens("b,0"),
- new String[] { "bb" },
- new int[] { 0 });
+ assertTokenizesTo(map, tokens("c,1 b,0"),
+ new String[] { "c", "bb" },
+ new int[] { 1, 0 });
// test that subsequent tokens are adjusted accordingly
map.add(strings("c"), tokens("cc,100 c2,2"), orig, merge);
assertTokenizesTo(map, tokens("c,5"),
new String[] { "cc", "c2" },
new int[] { 5, 2 });
- assertTokenizesTo(map, tokens("c,0"),
- new String[] { "cc", "c2" },
- new int[] { 0, 2 });
+ assertTokenizesTo(map, tokens("d,1 c,0"),
+ new String[] { "d", "cc", "c2" },
+ new int[] { 1, 0, 2 });
}
@@ -275,27 +275,27 @@ public class TestSlowSynonymFilter exten
assertTokenizesTo(map, tokens("a,5"),
new String[] { "a", "aa" },
new int[] { 5, 0 });
- assertTokenizesTo(map, tokens("a,0"),
- new String[] { "a", "aa" },
- new int[] { 0, 0 });
+ assertTokenizesTo(map, tokens("b,1 a,0"),
+ new String[] { "b", "a", "aa" },
+ new int[] { 1, 0, 0 });
// test that offset of first replacement is ignored (always takes the orig offset)
map.add(strings("b"), tokens("bb,100"), orig, merge);
assertTokenizesTo(map, tokens("b,5"),
new String[] { "b", "bb" },
new int[] { 5, 0 });
- assertTokenizesTo(map, tokens("b,0"),
- new String[] { "b", "bb" },
- new int[] { 0, 0 });
+ assertTokenizesTo(map, tokens("c,1 b,0"),
+ new String[] { "c", "b", "bb" },
+ new int[] { 1, 0, 0 });
// test that subsequent tokens are adjusted accordingly
map.add(strings("c"), tokens("cc,100 c2,2"), orig, merge);
assertTokenizesTo(map, tokens("c,5"),
new String[] { "c", "cc", "c2" },
new int[] { 5, 0, 2 });
- assertTokenizesTo(map, tokens("c,0"),
- new String[] { "c", "cc", "c2" },
- new int[] { 0, 0, 2 });
+ assertTokenizesTo(map, tokens("d,1 c,0"),
+ new String[] { "d", "c", "cc", "c2" },
+ new int[] { 1, 0, 0, 2 });
}