You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2013/05/18 11:31:31 UTC
svn commit: r1484078 - in /lucene/dev/branches/branch_4x: ./ lucene/
lucene/analysis/
lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/
lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/
Author: jpountz
Date: Sat May 18 09:31:31 2013
New Revision: 1484078
URL: http://svn.apache.org/r1484078
Log:
Fix EdgeNGramTokenFilter to correctly handle graph inputs.
Modified:
lucene/dev/branches/branch_4x/ (props changed)
lucene/dev/branches/branch_4x/lucene/ (props changed)
lucene/dev/branches/branch_4x/lucene/analysis/ (props changed)
lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java
Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java?rev=1484078&r1=1484077&r2=1484078&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java Sat May 18 09:31:31 2013
@@ -23,6 +23,7 @@ import org.apache.lucene.analysis.revers
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.util.Version;
import java.io.IOException;
@@ -81,11 +82,12 @@ public final class EdgeNGramTokenFilter
private int tokEnd; // only used if the length changed before this filter
private boolean updateOffsets; // never if the length changed before this filter
private int savePosIncr;
- private boolean isFirstToken = true;
+ private int savePosLen;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+ private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
/**
* Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
@@ -172,7 +174,8 @@ public final class EdgeNGramTokenFilter
// this is a synonym and don't adjust the offsets.
updateOffsets = (tokStart + curTermLength) == tokEnd;
}
- savePosIncr = posIncrAtt.getPositionIncrement();
+ savePosIncr += posIncrAtt.getPositionIncrement();
+ savePosLen = posLenAtt.getPositionLength();
}
}
if (curGramSize <= maxGram) { // if we have hit the end of our n-gram size range, quit
@@ -188,16 +191,14 @@ public final class EdgeNGramTokenFilter
}
// first ngram gets increment, others don't
if (curGramSize == minGram) {
- // Leave the first token position increment at the cleared-attribute value of 1
- if ( ! isFirstToken) {
- posIncrAtt.setPositionIncrement(savePosIncr);
- }
+ posIncrAtt.setPositionIncrement(savePosIncr);
+ savePosIncr = 0;
} else {
posIncrAtt.setPositionIncrement(0);
}
+ posLenAtt.setPositionLength(savePosLen);
termAtt.copyBuffer(curTermBuffer, start, curGramSize);
curGramSize++;
- isFirstToken = false;
return true;
}
}
@@ -209,6 +210,6 @@ public final class EdgeNGramTokenFilter
public void reset() throws IOException {
super.reset();
curTermBuffer = null;
- isFirstToken = true;
+ savePosIncr = 0;
}
}
Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java?rev=1484078&r1=1484077&r2=1484078&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java Sat May 18 09:31:31 2013
@@ -29,8 +29,10 @@ import org.apache.lucene.analysis.TokenF
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.core.LetterTokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
+import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.Version;
@@ -247,4 +249,19 @@ public class EdgeNGramTokenFilterTest ex
};
checkAnalysisConsistency(random, b, random.nextBoolean(), "");
}
+
+ public void testGraphs() throws IOException {
+ TokenStream tk = new LetterTokenizer(Version.LUCENE_44, new StringReader("abc d efgh ij klmno p q"));
+ tk = new ShingleFilter(tk);
+ tk = new EdgeNGramTokenFilter(Version.LUCENE_44, tk, 7, 10);
+ tk.reset();
+ assertTokenStreamContents(tk,
+ new String[] { "efgh ij", "ij klmn", "ij klmno", "klmno p" },
+ new int[] { 6,11,11,14 },
+ new int[] { 13,19,19,21 },
+ new int[] { 3,1,0,1 },
+ new int[] { 2,2,2,2 },
+ 23
+ );
+ }
}