You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ro...@apache.org on 2019/02/19 15:25:45 UTC

[lucene-solr] branch branch_8_0 updated: LUCENE-8697: GraphTokenStreamFiniteStrings correctly handles side paths with gaps

This is an automated email from the ASF dual-hosted git repository.

romseygeek pushed a commit to branch branch_8_0
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/branch_8_0 by this push:
     new ee34c3e  LUCENE-8697: GraphTokenStreamFiniteStrings correctly handles side paths with gaps
ee34c3e is described below

commit ee34c3e18614acbace824531bf18332641f68919
Author: Alan Woodward <ro...@apache.org>
AuthorDate: Tue Feb 19 13:55:57 2019 +0000

    LUCENE-8697: GraphTokenStreamFiniteStrings correctly handles side paths with gaps
---
 lucene/CHANGES.txt                                 |  3 ++
 .../util/graph/GraphTokenStreamFiniteStrings.java  | 17 +++++---
 .../graph/TestGraphTokenStreamFiniteStrings.java   | 48 +++++++++++++++++++++-
 3 files changed, 61 insertions(+), 7 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index af0258d..a7954c9 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -198,6 +198,9 @@ New Features
 * LUCENE-8655: Add a getter in FunctionScoreQuery class in order to access to the 
   underlying DoubleValuesSource. (Gérald Quaire via Alan Woodward)
 
+* LUCENE-8697: GraphTokenStreamFiniteStrings correctly handles side paths 
+  containing gaps (Alan Woodward)
+
 Improvements
 
 * LUCENE-7997: Add BaseSimilarityTestCase to sanity check similarities.
diff --git a/lucene/core/src/java/org/apache/lucene/util/graph/GraphTokenStreamFiniteStrings.java b/lucene/core/src/java/org/apache/lucene/util/graph/GraphTokenStreamFiniteStrings.java
index a700501..b6a9995 100644
--- a/lucene/core/src/java/org/apache/lucene/util/graph/GraphTokenStreamFiniteStrings.java
+++ b/lucene/core/src/java/org/apache/lucene/util/graph/GraphTokenStreamFiniteStrings.java
@@ -211,26 +211,33 @@ public final class GraphTokenStreamFiniteStrings {
     int pos = -1;
     int prevIncr = 1;
     int state = -1;
+    int gap = 0;
     while (in.incrementToken()) {
       int currentIncr = posIncAtt.getPositionIncrement();
       if (pos == -1 && currentIncr < 1) {
         throw new IllegalStateException("Malformed TokenStream, start token can't have increment less than 1");
       }
 
-      // always use inc 1 while building, but save original increment
-      int incr = Math.min(1, currentIncr);
-      if (incr > 0) {
-        pos += incr;
+      if (currentIncr == 0) {
+        if (gap > 0) {
+          pos -= gap;
+        }
+      }
+      else {
+        pos++;
+        gap = currentIncr - 1;
       }
 
-      int endPos = pos + posLengthAtt.getPositionLength();
+      int endPos = pos + posLengthAtt.getPositionLength() + gap;
       while (state < endPos) {
         state = builder.createState();
       }
 
       BytesRef term = termBytesAtt.getBytesRef();
       int id = getTermID(currentIncr, prevIncr, term);
+      //System.out.println("Adding transition: " + term.utf8ToString() + "@" + pos + "->" + endPos);
       builder.addTransition(pos, endPos, id);
+      pos += gap;
 
       // only save last increment on non-zero increment in case we have multiple stacked tokens
       if (currentIncr > 0) {
diff --git a/lucene/core/src/test/org/apache/lucene/util/graph/TestGraphTokenStreamFiniteStrings.java b/lucene/core/src/test/org/apache/lucene/util/graph/TestGraphTokenStreamFiniteStrings.java
index 44b7b7c..1739fa0 100644
--- a/lucene/core/src/test/org/apache/lucene/util/graph/TestGraphTokenStreamFiniteStrings.java
+++ b/lucene/core/src/test/org/apache/lucene/util/graph/TestGraphTokenStreamFiniteStrings.java
@@ -539,14 +539,16 @@ public class TestGraphTokenStreamFiniteStrings extends LuceneTestCase {
   }
 
   public void testMultipleSidePaths() throws Exception {
+    // 0   1        2    3         4    5  6         7  8
+    // the ny:4/new york wifi:5/wi fi:4 [] wifi:2/wi fi network
     TokenStream ts = new CannedTokenStream(
         token("the", 1, 1),
         token("ny", 1, 4),
         token("new", 0, 1),
         token("york", 1, 1),
-        token("wifi", 1, 4),
+        token("wifi", 1, 5),
         token("wi", 0, 1),
-        token("fi", 1, 3),
+        token("fi", 1, 4),
         token("wifi", 2, 2),
         token("wi", 0, 1),
         token("fi", 1, 1),
@@ -596,4 +598,46 @@ public class TestGraphTokenStreamFiniteStrings extends LuceneTestCase {
     terms = graph.getTerms("field", 7);
     assertArrayEquals(terms, new Term[] {new Term("field", "network")});
   }
+
+  public void testSidePathWithGap() throws Exception {
+    // 0    1               2  3  4             5
+    // king alfred:4/alfred [] [] great/awesome ruled
+    CannedTokenStream cts = new CannedTokenStream(
+        token("king", 1, 1),
+        token("alfred", 1, 4),
+        token("alfred", 0, 1),
+        token("great", 3, 1),
+        token("awesome", 0, 1),
+        token("ruled", 1, 1)
+    );
+    GraphTokenStreamFiniteStrings graph = new GraphTokenStreamFiniteStrings(cts);
+    Iterator<TokenStream> it = graph.getFiniteStrings();
+    assertTrue(it.hasNext());
+    assertTokenStream(it.next(), new String[]{ "king", "alfred", "ruled" }, new int[]{ 1, 1, 1 });
+    assertTrue(it.hasNext());
+    assertTokenStream(it.next(), new String[]{ "king", "alfred", "great", "ruled"}, new int[]{ 1, 1, 3, 1 });
+    assertTrue(it.hasNext());
+    assertTokenStream(it.next(), new String[]{ "king", "alfred", "awesome", "ruled"}, new int[]{ 1, 1, 3, 1 });
+    assertFalse(it.hasNext());
+  }
+
+  public void testMultipleSidePathsWithGaps() throws Exception {
+    // king alfred:4/alfred [] [] saxons:3 [] wessex ruled
+    CannedTokenStream cts = new CannedTokenStream(
+        token("king", 1, 1),
+        token("alfred", 1, 4),
+        token("alfred", 0, 1),
+        token("saxons", 3, 3),
+        token("wessex", 2, 1),
+        token("ruled", 1, 1)
+    );
+    GraphTokenStreamFiniteStrings graph = new GraphTokenStreamFiniteStrings(cts);
+    Iterator<TokenStream> it = graph.getFiniteStrings();
+    assertTrue(it.hasNext());
+    assertTokenStream(it.next(), new String[]{ "king", "alfred", "wessex", "ruled" }, new int[]{ 1, 1, 2, 1 });
+    assertTrue(it.hasNext());
+    assertTokenStream(it.next(), new String[]{ "king", "alfred", "saxons", "ruled" }, new int[]{ 1, 1, 3, 1 });
+    assertFalse(it.hasNext());
+  }
+
 }