You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by cp...@apache.org on 2017/02/28 17:26:53 UTC
[26/50] [abbrv] lucene-solr:jira/solr-9045: LUCENE-7708: Fix position
length attribute set by the ShingleFilter when outputUnigrams=false
LUCENE-7708: Fix position length attribute set by the ShingleFilter when outputUnigrams=false
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/57a42e4e
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/57a42e4e
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/57a42e4e
Branch: refs/heads/jira/solr-9045
Commit: 57a42e4ec54aebac40c1ef7dc93d933cd00dbe1e
Parents: cab3aae
Author: Jim Ferenczi <ji...@elastic.co>
Authored: Fri Feb 24 23:37:37 2017 +0100
Committer: Jim Ferenczi <ji...@elastic.co>
Committed: Fri Feb 24 23:37:37 2017 +0100
----------------------------------------------------------------------
lucene/CHANGES.txt | 4 +
.../lucene/analysis/shingle/ShingleFilter.java | 7 +-
.../analysis/shingle/ShingleFilterTest.java | 94 +++++++++++++++++++-
3 files changed, 102 insertions(+), 3 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/57a42e4e/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 1d45ab8..c119eaa 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -155,6 +155,10 @@ Bug Fixes
token graph, messing up phrase queries when it was used during query
parsing (Ere Maijala via Mike McCandless)
+* LUCENE-7708: ShingleFilter without unigram was producing a disconnected
+ token graph, messing up queries when it was used during query
+ parsing (Jim Ferenczi)
+
Improvements
* LUCENE-7055: Added Weight#scorerSupplier, which allows to estimate the cost
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/57a42e4e/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
index 5d99291..e3fa803 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
@@ -343,7 +343,12 @@ public final class ShingleFilter extends TokenFilter {
noShingleOutput = false;
}
offsetAtt.setOffset(offsetAtt.startOffset(), nextToken.offsetAtt.endOffset());
- posLenAtt.setPositionLength(builtGramSize);
+ if (outputUnigrams) {
+ posLenAtt.setPositionLength(builtGramSize);
+ } else {
+ // position length for this token is the number of positions created by shingles of smaller size.
+ posLenAtt.setPositionLength(Math.max(1, (builtGramSize - minShingleSize) + 1));
+ }
isOutputHere = true;
gramSize.advance();
tokenAvailable = true;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/57a42e4e/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java
index 192de38..5645900 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java
@@ -30,7 +30,7 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
-import org.apache.lucene.analysis.tokenattributes.*;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
public class ShingleFilterTest extends BaseTokenStreamTestCase {
@@ -1239,7 +1239,6 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
filter.setFillerToken(null);
filter.setTokenSeparator(null);
-
assertTokenStreamContents(filter,
new String[] {"purple", "purplewizard", "purplewizard", "wizard", "wizard", "wizard"},
new int[] {0, 0, 0, 7, 7, 7},
@@ -1247,4 +1246,95 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
new int[] {1, 0, 0, 1, 0, 0},
20);
}
+
+ public void testPositionLength() throws Exception {
+ Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ ShingleFilter filter = new ShingleFilter(tokenizer, 4, 4);
+ filter.setOutputUnigrams(false);
+ return new TokenStreamComponents(tokenizer, filter);
+ }
+ };
+ assertTokenStreamContents(a.tokenStream("", "to be or not to be"),
+ new String[] {"to be or not", "be or not to", "or not to be"},
+ new int[] {0, 3, 6},
+ new int[] {12, 15, 18},
+ null,
+ new int[] {1, 1, 1},
+ new int[] {1, 1, 1},
+ 18,
+ // offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets
+ // finishing at the same position
+ false);
+
+
+ a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ ShingleFilter filter = new ShingleFilter(tokenizer, 2, 4);
+ filter.setOutputUnigrams(false);
+ return new TokenStreamComponents(tokenizer, filter);
+ }
+ };
+ assertTokenStreamContents(a.tokenStream("", "to be or not to be"),
+ new String[] {"to be", "to be or", "to be or not", "be or", "be or not", "be or not to", "or not", "or not to",
+ "or not to be", "not to", "not to be", "to be"},
+ new int[] {0, 0, 0, 3, 3, 3, 6, 6, 6, 9, 9, 13},
+ new int[] {5, 8, 12, 8, 12, 15, 12, 15, 18, 15, 18, 18},
+ null,
+ new int[] {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1},
+ new int[] {1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 1},
+ 18,
+ // offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets
+ // finishing at the same position
+ false);
+
+ a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ ShingleFilter filter = new ShingleFilter(tokenizer, 3, 4);
+ filter.setOutputUnigrams(false);
+ return new TokenStreamComponents(tokenizer, filter);
+ }
+ };
+
+ assertTokenStreamContents(a.tokenStream("", "to be or not to be"),
+ new String[] {"to be or", "to be or not", "be or not", "be or not to", "or not to",
+ "or not to be", "not to be"},
+ new int[] {0, 0, 3, 3, 6, 6, 9},
+ new int[] {8, 12, 12, 15, 15, 18, 18},
+ null,
+ new int[] {1, 0, 1, 0, 1, 0, 1, 0},
+ new int[] {1, 2, 1, 2, 1, 2, 1, 2},
+ 18,
+ // offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets
+ // finishing at the same position
+ false);
+
+ a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ ShingleFilter filter = new ShingleFilter(tokenizer, 3, 5);
+ filter.setOutputUnigrams(false);
+ return new TokenStreamComponents(tokenizer, filter);
+ }
+ };
+ assertTokenStreamContents(a.tokenStream("", "to be or not to be"),
+ new String[] {"to be or", "to be or not", "to be or not to", "be or not", "be or not to",
+ "be or not to be", "or not to", "or not to be", "not to be"},
+ new int[] {0, 0, 0, 3, 3, 3, 6, 6, 9, 9},
+ new int[] {8, 12, 15, 12, 15, 18, 15, 18, 18},
+ null,
+ new int[] {1, 0, 0, 1, 0, 0, 1, 0, 1, 0},
+ new int[] {1, 2, 3, 1, 2, 3, 1, 2, 1},
+ 18,
+ // offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets
+ // finishing at the same position
+ false);
+ }
}