You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by si...@apache.org on 2010/01/10 19:06:20 UTC
svn commit: r897672 - in /lucene/java/trunk/contrib: CHANGES.txt
analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java
Author: simonw
Date: Sun Jan 10 18:06:19 2010
New Revision: 897672
URL: http://svn.apache.org/viewvc?rev=897672&view=rev
Log:
LUCENE-2199: ShingleFilter skipped over tri-gram shingles if outputUnigrams was set to false
Modified:
lucene/java/trunk/contrib/CHANGES.txt
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java
Modified: lucene/java/trunk/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/CHANGES.txt?rev=897672&r1=897671&r2=897672&view=diff
==============================================================================
--- lucene/java/trunk/contrib/CHANGES.txt (original)
+++ lucene/java/trunk/contrib/CHANGES.txt Sun Jan 10 18:06:19 2010
@@ -18,6 +18,9 @@
Bug fixes
+ * LUCENE-2199: ShingleFilter skipped over tri-gram shingles if outputUnigrams
+ was set to false. (Simon Willnauer)
+
* LUCENE-2068: Fixed ReverseStringFilter which was not aware of supplementary
characters. During reverse the filter created unpaired surrogates, which
will be replaced by U+FFFD by the indexer, but not at query time. The filter
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java?rev=897672&r1=897671&r2=897672&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java Sun Jan 10 18:06:19 2010
@@ -182,7 +182,7 @@
shingleBufferPosition++;
return true;
}
- } else {
+ } else if (shingleBufferPosition % this.maxShingleSize == 0){
shingleBufferPosition++;
}
@@ -197,7 +197,7 @@
termBuffer = termAtt.resizeTermBuffer(termLength);
buf.getChars(0, termLength, termBuffer, 0);
termAtt.setTermLength(termLength);
- if ((! outputUnigrams) && shingleBufferPosition == 1) {
+ if ((! outputUnigrams) && shingleBufferPosition % this.maxShingleSize == 1) {
posIncrAtt.setPositionIncrement(1);
} else {
posIncrAtt.setPositionIncrement(0);
Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java?rev=897672&r1=897671&r2=897672&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java Sun Jan 10 18:06:19 2010
@@ -200,6 +200,93 @@
"word", "shingle",
"word"
};
+
+ public static final Token[] TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS = new Token[] {
+ createToken("please divide", 0, 13),
+ createToken("please divide this", 0, 18),
+ createToken("divide this", 7, 18),
+ createToken("divide this sentence", 7, 27),
+ createToken("this sentence", 14, 27),
+ createToken("this sentence into", 14, 32),
+ createToken("sentence into", 19, 32),
+ createToken("sentence into shingles", 19, 39),
+ createToken("into shingles", 28, 39),
+ };
+
+ public static final int[] TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS = new int[] {
+ 1, 0, 1, 0, 1, 0, 1, 0, 1
+ };
+
+ public static final String[] TRI_GRAM_TYPES_WITHOUT_UNIGRAMS = new String[] {
+ "shingle", "shingle",
+ "shingle", "shingle",
+ "shingle", "shingle",
+ "shingle", "shingle",
+ "shingle",
+ };
+
+ public static final Token[] FOUR_GRAM_TOKENS = new Token[] {
+ createToken("please", 0, 6),
+ createToken("please divide", 0, 13),
+ createToken("please divide this", 0, 18),
+ createToken("please divide this sentence", 0, 27),
+ createToken("divide", 7, 13),
+ createToken("divide this", 7, 18),
+ createToken("divide this sentence", 7, 27),
+ createToken("divide this sentence into", 7, 32),
+ createToken("this", 14, 18),
+ createToken("this sentence", 14, 27),
+ createToken("this sentence into", 14, 32),
+ createToken("this sentence into shingles", 14, 39),
+ createToken("sentence", 19, 27),
+ createToken("sentence into", 19, 32),
+ createToken("sentence into shingles", 19, 39),
+ createToken("into", 28, 32),
+ createToken("into shingles", 28, 39),
+ createToken("shingles", 33, 39)
+ };
+
+ public static final int[] FOUR_GRAM_POSITION_INCREMENTS = new int[] {
+ 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1
+ };
+
+ public static final String[] FOUR_GRAM_TYPES = new String[] {
+ "word", "shingle", "shingle", "shingle",
+ "word", "shingle", "shingle", "shingle",
+ "word", "shingle", "shingle", "shingle",
+ "word", "shingle", "shingle",
+ "word", "shingle",
+ "word"
+ };
+
+ public static final Token[] FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS = new Token[] {
+ createToken("please divide", 0, 13),
+ createToken("please divide this", 0, 18),
+ createToken("please divide this sentence", 0, 27),
+ createToken("divide this", 7, 18),
+ createToken("divide this sentence", 7, 27),
+ createToken("divide this sentence into", 7, 32),
+ createToken("this sentence", 14, 27),
+ createToken("this sentence into", 14, 32),
+ createToken("this sentence into shingles", 14, 39),
+ createToken("sentence into", 19, 32),
+ createToken("sentence into shingles", 19, 39),
+ createToken("into shingles", 28, 39),
+ };
+
+ public static final int[] FOUR_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS = new int[] {
+ 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1
+ };
+
+ public static final String[] FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS = new String[] {
+ "shingle", "shingle",
+ "shingle", "shingle",
+ "shingle", "shingle",
+ "shingle", "shingle",
+ "shingle", "shingle",
+ "shingle", "shingle",
+
+ };
@Override
@@ -272,8 +359,25 @@
TRI_GRAM_POSITION_INCREMENTS, TRI_GRAM_TYPES,
true);
}
-
-
+
+ public void testTriGramFilterWithoutUnigrams() throws IOException {
+ this.shingleFilterTest(3, TEST_TOKEN, TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS,
+ TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS, TRI_GRAM_TYPES_WITHOUT_UNIGRAMS,
+ false);
+ }
+
+ public void testFourGramFilter() throws IOException {
+ this.shingleFilterTest(4, TEST_TOKEN, FOUR_GRAM_TOKENS,
+ FOUR_GRAM_POSITION_INCREMENTS, FOUR_GRAM_TYPES,
+ true);
+ }
+
+ public void testFourGramFilterWithoutUnigrams() throws IOException {
+ this.shingleFilterTest(4, TEST_TOKEN, FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS,
+ FOUR_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS,
+ FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS, false);
+ }
+
public void testReset() throws Exception {
Tokenizer wsTokenizer = new WhitespaceTokenizer(new StringReader("please divide this sentence"));