Posted to commits@lucene.apache.org by ji...@apache.org on 2017/02/24 22:47:32 UTC

lucene-solr:branch_6x: LUCENE-7708: Fix position length attribute set by the ShingleFilter when outputUnigrams=false

Repository: lucene-solr
Updated Branches:
  refs/heads/branch_6x e903f69ab -> 6c63df0b1


LUCENE-7708: Fix position length attribute set by the ShingleFilter when outputUnigrams=false


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/6c63df0b
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/6c63df0b
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/6c63df0b

Branch: refs/heads/branch_6x
Commit: 6c63df0b15f735907438514f3b4b702680d74588
Parents: e903f69
Author: Jim Ferenczi <ji...@elastic.co>
Authored: Fri Feb 24 23:37:37 2017 +0100
Committer: Jim Ferenczi <ji...@elastic.co>
Committed: Fri Feb 24 23:38:21 2017 +0100

----------------------------------------------------------------------
 lucene/CHANGES.txt                              |  4 +
 .../lucene/analysis/shingle/ShingleFilter.java  |  7 +-
 .../analysis/shingle/ShingleFilterTest.java     | 94 +++++++++++++++++++-
 3 files changed, 102 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c63df0b/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 9681b6f..1ce5fd9 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -95,6 +95,10 @@ Bug Fixes
   token graph, messing up phrase queries when it was used during query
   parsing (Ere Maijala via Mike McCandless)
 
+* LUCENE-7708: ShingleFilter with outputUnigrams=false was producing a
+  disconnected token graph, messing up queries when it was used during
+  query parsing (Jim Ferenczi)
+
 Improvements
 
 * LUCENE-7055: Added Weight#scorerSupplier, which allows to estimate the cost

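For context on the entry above: position length counts how many positions a
token spans in the output graph. With outputUnigrams=false the graph only
contains positions where shingles start, so an n-gram shingle spans
max(1, n - minShingleSize + 1) positions rather than n. Before this fix the
filter always wrote n; with minShingleSize=2, for example, the trigram
"to be or" claimed positionLength=3 and pointed one position past the end of
a graph that only has positions 0, 1, and 2, leaving it disconnected and
messing up the queries built from it. The corrected value, max(1, 3 - 2 + 1) = 2,
lands exactly on the final position.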
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c63df0b/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
index 5d99291..e3fa803 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
@@ -343,7 +343,12 @@ public final class ShingleFilter extends TokenFilter {
           noShingleOutput = false;
         }
         offsetAtt.setOffset(offsetAtt.startOffset(), nextToken.offsetAtt.endOffset());
-        posLenAtt.setPositionLength(builtGramSize);
+        if (outputUnigrams) {
+          posLenAtt.setPositionLength(builtGramSize);
+        } else {
+          // position length for this token is the number of positions created by the shingles of smaller size.
+          posLenAtt.setPositionLength(Math.max(1, (builtGramSize - minShingleSize) + 1));
+        }
         isOutputHere = true;
         gramSize.advance();
         tokenAvailable = true;

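To see the fix end to end, here is a minimal, hypothetical demo (not part of
this commit; it assumes a Lucene 6.x lucene-analyzers-common jar on the
classpath) that prints the corrected attributes for the same input the new
tests use:

import java.io.StringReader;

import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;

public class ShingleFilterPosLenDemo {
  public static void main(String[] args) throws Exception {
    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("to be or not to be"));
    ShingleFilter filter = new ShingleFilter(tokenizer, 2, 4); // shingle sizes 2..4
    filter.setOutputUnigrams(false);

    CharTermAttribute term = filter.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = filter.addAttribute(PositionIncrementAttribute.class);
    PositionLengthAttribute posLen = filter.addAttribute(PositionLengthAttribute.class);

    filter.reset();
    while (filter.incrementToken()) {
      // with the fix, an n-gram reports max(1, n - minShingleSize + 1),
      // e.g. "to be or not" (4-gram) -> posLen=3 instead of the old 4
      System.out.println(term + " posIncr=" + posIncr.getPositionIncrement()
          + " posLen=" + posLen.getPositionLength());
    }
    filter.end();
    filter.close();
  }
}

For this 2..4 configuration the output shows posLen 1/2/3 for the 2/3/4-grams,
matching the expectations in the new testPositionLength below.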
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c63df0b/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java
index 192de38..5645900 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java
@@ -30,7 +30,7 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
-import org.apache.lucene.analysis.tokenattributes.*;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 
 public class ShingleFilterTest extends BaseTokenStreamTestCase {
 
@@ -1239,7 +1239,6 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
     filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
     filter.setFillerToken(null);
     filter.setTokenSeparator(null);
-
     assertTokenStreamContents(filter,
         new String[] {"purple", "purplewizard", "purplewizard", "wizard", "wizard", "wizard"},
         new int[] {0, 0, 0, 7, 7, 7},
@@ -1247,4 +1246,95 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
         new int[] {1, 0, 0, 1, 0, 0},
         20);
   }
+
+  public void testPositionLength() throws Exception {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+        ShingleFilter filter = new ShingleFilter(tokenizer, 4, 4);
+        filter.setOutputUnigrams(false);
+        return new TokenStreamComponents(tokenizer, filter);
+      }
+    };
+    assertTokenStreamContents(a.tokenStream("", "to be or not to be"),
+        new String[] {"to be or not", "be or not to", "or not to be"},
+        new int[] {0, 3, 6},
+        new int[] {12, 15, 18},
+        null,
+        new int[] {1, 1, 1},
+        new int[] {1, 1, 1},
+        18,
+        // offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets
+        // finishing at the same position
+        false);
+
+
+    a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+        ShingleFilter filter = new ShingleFilter(tokenizer, 2, 4);
+        filter.setOutputUnigrams(false);
+        return new TokenStreamComponents(tokenizer, filter);
+      }
+    };
+    assertTokenStreamContents(a.tokenStream("", "to be or not to be"),
+        new String[] {"to be", "to be or", "to be or not", "be or", "be or not", "be or not to", "or not", "or not to",
+            "or not to be", "not to", "not to be", "to be"},
+        new int[] {0, 0, 0, 3, 3, 3, 6, 6, 6, 9, 9, 13},
+        new int[] {5, 8, 12, 8, 12, 15, 12, 15, 18, 15, 18, 18},
+        null,
+        new int[] {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1},
+        new int[] {1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 1},
+        18,
+        // offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets
+        // finishing at the same position
+        false);
+
+    a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+        ShingleFilter filter = new ShingleFilter(tokenizer, 3, 4);
+        filter.setOutputUnigrams(false);
+        return new TokenStreamComponents(tokenizer, filter);
+      }
+    };
+
+    assertTokenStreamContents(a.tokenStream("", "to be or not to be"),
+        new String[] {"to be or", "to be or not", "be or not", "be or not to", "or not to",
+            "or not to be", "not to be"},
+        new int[] {0, 0, 3, 3, 6, 6, 9},
+        new int[] {8, 12, 12, 15, 15, 18, 18},
+        null,
+        new int[] {1, 0, 1, 0, 1, 0, 1},
+        new int[] {1, 2, 1, 2, 1, 2, 1},
+        18,
+        // offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets
+        // finishing at the same position
+        false);
+
+    a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+        ShingleFilter filter = new ShingleFilter(tokenizer, 3, 5);
+        filter.setOutputUnigrams(false);
+        return new TokenStreamComponents(tokenizer, filter);
+      }
+    };
+    assertTokenStreamContents(a.tokenStream("", "to be or not to be"),
+        new String[] {"to be or", "to be or not", "to be or not to", "be or not", "be or not to",
+            "be or not to be", "or not to", "or not to be", "not to be"},
+        new int[] {0, 0, 0, 3, 3, 3, 6, 6, 9},
+        new int[] {8, 12, 15, 12, 15, 18, 15, 18, 18},
+        null,
+        new int[] {1, 0, 0, 1, 0, 0, 1, 0, 1},
+        new int[] {1, 2, 3, 1, 2, 3, 1, 2, 1},
+        18,
+        // offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets
+        // finishing at the same position
+        false);
+  }
 }