You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sa...@apache.org on 2013/04/22 15:26:03 UTC
svn commit: r1470496 - in /lucene/dev/trunk/lucene/analysis:
common/src/java/org/apache/lucene/analysis/ngram/
common/src/test/org/apache/lucene/analysis/ngram/
morfologik/src/test/org/apache/lucene/analysis/morfologik/
Author: sarowe
Date: Mon Apr 22 13:26:03 2013
New Revision: 1470496
URL: http://svn.apache.org/r1470496
Log:
LUCENE-4810: the position increment of the first output token from EdgeNGramTokenFilter must be > 0
Modified:
lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java
lucene/dev/trunk/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java
Modified: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java?rev=1470496&r1=1470495&r2=1470496&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java Mon Apr 22 13:26:03 2013
@@ -75,6 +75,7 @@ public final class EdgeNGramTokenFilter
private int tokEnd; // only used if the length changed before this filter
private boolean hasIllegalOffsets; // only if the length changed before this filter
private int savePosIncr;
+ private boolean isFirstToken = true;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
@@ -138,9 +139,8 @@ public final class EdgeNGramTokenFilter
savePosIncr = posIncrAtt.getPositionIncrement();
}
}
- if (curGramSize <= maxGram) {
- if (! (curGramSize > curTermLength // if the remaining input is too short, we can't generate any n-grams
- || curGramSize > maxGram)) { // if we have hit the end of our n-gram size range, quit
+ if (curGramSize <= maxGram) { // if we have hit the end of our n-gram size range, quit
+ if (curGramSize <= curTermLength) { // if the remaining input is too short, we can't generate any n-grams
// grab gramSize chars from front or back
int start = side == Side.FRONT ? 0 : curTermLength - curGramSize;
int end = start + curGramSize;
@@ -152,12 +152,16 @@ public final class EdgeNGramTokenFilter
}
// first ngram gets increment, others don't
if (curGramSize == minGram) {
- posIncrAtt.setPositionIncrement(savePosIncr);
+ // Leave the first token position increment at the cleared-attribute value of 1
+ if ( ! isFirstToken) {
+ posIncrAtt.setPositionIncrement(savePosIncr);
+ }
} else {
posIncrAtt.setPositionIncrement(0);
}
termAtt.copyBuffer(curTermBuffer, start, curGramSize);
curGramSize++;
+ isFirstToken = false;
return true;
}
}
@@ -169,5 +173,6 @@ public final class EdgeNGramTokenFilter
public void reset() throws IOException {
super.reset();
curTermBuffer = null;
+ isFirstToken = true;
}
}
Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java?rev=1470496&r1=1470495&r2=1470496&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java Mon Apr 22 13:26:03 2013
@@ -26,6 +26,7 @@ import org.apache.lucene.analysis.Tokeni
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
+import org.apache.lucene.analysis.position.PositionFilter;
import java.io.Reader;
import java.io.StringReader;
@@ -120,6 +121,21 @@ public class EdgeNGramTokenFilterTest ex
false);
}
+ public void testFirstTokenPositionIncrement() throws Exception {
+ TokenStream ts = new MockTokenizer(new StringReader("a abc"), MockTokenizer.WHITESPACE, false);
+ ts = new PositionFilter(ts, 0); // All but first token will get 0 position increment
+ EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(ts, EdgeNGramTokenFilter.Side.FRONT, 2, 3);
+ // The first token "a" will not be output, since it's smaller than the mingram size of 2.
+ // The second token on input to EdgeNGramTokenFilter will have position increment of 0,
+ // which should be increased to 1, since this is the first output token in the stream.
+ assertTokenStreamContents(filter,
+ new String[] { "ab", "abc" },
+ new int[] { 2, 2 },
+ new int[] { 4, 5 },
+ new int[] { 1, 0 }
+ );
+ }
+
public void testTokenizerPositions() throws Exception {
EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(new StringReader("abcde"), EdgeNGramTokenizer.Side.FRONT, 1, 3);
assertTokenStreamContents(tokenizer,
Modified: lucene/dev/trunk/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java?rev=1470496&r1=1470495&r2=1470496&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java (original)
+++ lucene/dev/trunk/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java Mon Apr 22 13:26:03 2013
@@ -52,6 +52,14 @@ public class TestMorfologikAnalyzer exte
new int[] { 0, 0, 0, 0, 7, 7, 7, 7 },
new int[] { 6, 6, 6, 6, 13, 13, 13, 13 },
new int[] { 1, 0, 0, 0, 1, 0, 0, 0 });
+
+ assertAnalyzesToReuse(
+ a,
+ "T. Gl\u00FCcksberg",
+ new String[] { "to", "tom", "tona", "Gl\u00FCcksberg" },
+ new int[] { 0, 0, 0, 3 },
+ new int[] { 1, 1, 1, 13 },
+ new int[] { 1, 0, 0, 1 });
}
/** Test reuse of MorfologikFilter with leftover stems. */