You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ro...@apache.org on 2019/02/19 15:25:47 UTC
[lucene-solr] branch master updated: LUCENE-8697:
GraphTokenStreamFiniteStrings correctly handles side paths with gaps
This is an automated email from the ASF dual-hosted git repository.
romseygeek pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/master by this push:
new 55b4d2d LUCENE-8697: GraphTokenStreamFiniteStrings correctly handles side paths with gaps
55b4d2d is described below
commit 55b4d2dcaa1dd713ffe861b5a19e7661cf5962bc
Author: Alan Woodward <ro...@apache.org>
AuthorDate: Tue Feb 19 13:55:57 2019 +0000
LUCENE-8697: GraphTokenStreamFiniteStrings correctly handles side paths with gaps
---
lucene/CHANGES.txt | 3 ++
.../util/graph/GraphTokenStreamFiniteStrings.java | 17 +++++---
.../graph/TestGraphTokenStreamFiniteStrings.java | 48 +++++++++++++++++++++-
3 files changed, 61 insertions(+), 7 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index f7f6009..130d796 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -223,6 +223,9 @@ New Features
* LUCENE-8655: Add a getter in FunctionScoreQuery class in order to access to the
underlying DoubleValuesSource. (Gérald Quaire via Alan Woodward)
+* LUCENE-8697: GraphTokenStreamFiniteStrings correctly handles side paths
+ containing gaps (Alan Woodward)
+
Improvements
* LUCENE-7997: Add BaseSimilarityTestCase to sanity check similarities.
diff --git a/lucene/core/src/java/org/apache/lucene/util/graph/GraphTokenStreamFiniteStrings.java b/lucene/core/src/java/org/apache/lucene/util/graph/GraphTokenStreamFiniteStrings.java
index a700501..b6a9995 100644
--- a/lucene/core/src/java/org/apache/lucene/util/graph/GraphTokenStreamFiniteStrings.java
+++ b/lucene/core/src/java/org/apache/lucene/util/graph/GraphTokenStreamFiniteStrings.java
@@ -211,26 +211,33 @@ public final class GraphTokenStreamFiniteStrings {
int pos = -1;
int prevIncr = 1;
int state = -1;
+ int gap = 0;
while (in.incrementToken()) {
int currentIncr = posIncAtt.getPositionIncrement();
if (pos == -1 && currentIncr < 1) {
throw new IllegalStateException("Malformed TokenStream, start token can't have increment less than 1");
}
- // always use inc 1 while building, but save original increment
- int incr = Math.min(1, currentIncr);
- if (incr > 0) {
- pos += incr;
+ if (currentIncr == 0) {
+ if (gap > 0) {
+ pos -= gap;
+ }
+ }
+ else {
+ pos++;
+ gap = currentIncr - 1;
}
- int endPos = pos + posLengthAtt.getPositionLength();
+ int endPos = pos + posLengthAtt.getPositionLength() + gap;
while (state < endPos) {
state = builder.createState();
}
BytesRef term = termBytesAtt.getBytesRef();
int id = getTermID(currentIncr, prevIncr, term);
+ //System.out.println("Adding transition: " + term.utf8ToString() + "@" + pos + "->" + endPos);
builder.addTransition(pos, endPos, id);
+ pos += gap;
// only save last increment on non-zero increment in case we have multiple stacked tokens
if (currentIncr > 0) {
diff --git a/lucene/core/src/test/org/apache/lucene/util/graph/TestGraphTokenStreamFiniteStrings.java b/lucene/core/src/test/org/apache/lucene/util/graph/TestGraphTokenStreamFiniteStrings.java
index 44b7b7c..1739fa0 100644
--- a/lucene/core/src/test/org/apache/lucene/util/graph/TestGraphTokenStreamFiniteStrings.java
+++ b/lucene/core/src/test/org/apache/lucene/util/graph/TestGraphTokenStreamFiniteStrings.java
@@ -539,14 +539,16 @@ public class TestGraphTokenStreamFiniteStrings extends LuceneTestCase {
}
public void testMultipleSidePaths() throws Exception {
+ // 0 1 2 3 4 5 6 7 8
+ // the ny:4/new york wifi:5/wi fi:4 [] wifi:2/wi fi network
TokenStream ts = new CannedTokenStream(
token("the", 1, 1),
token("ny", 1, 4),
token("new", 0, 1),
token("york", 1, 1),
- token("wifi", 1, 4),
+ token("wifi", 1, 5),
token("wi", 0, 1),
- token("fi", 1, 3),
+ token("fi", 1, 4),
token("wifi", 2, 2),
token("wi", 0, 1),
token("fi", 1, 1),
@@ -596,4 +598,46 @@ public class TestGraphTokenStreamFiniteStrings extends LuceneTestCase {
terms = graph.getTerms("field", 7);
assertArrayEquals(terms, new Term[] {new Term("field", "network")});
}
+
+ public void testSidePathWithGap() throws Exception {
+ // 0 1 2 3 4 5
+ // king alfred:4/alfred [] [] great/awesome ruled
+ CannedTokenStream cts = new CannedTokenStream(
+ token("king", 1, 1),
+ token("alfred", 1, 4),
+ token("alfred", 0, 1),
+ token("great", 3, 1),
+ token("awesome", 0, 1),
+ token("ruled", 1, 1)
+ );
+ GraphTokenStreamFiniteStrings graph = new GraphTokenStreamFiniteStrings(cts);
+ Iterator<TokenStream> it = graph.getFiniteStrings();
+ assertTrue(it.hasNext());
+ assertTokenStream(it.next(), new String[]{ "king", "alfred", "ruled" }, new int[]{ 1, 1, 1 });
+ assertTrue(it.hasNext());
+ assertTokenStream(it.next(), new String[]{ "king", "alfred", "great", "ruled"}, new int[]{ 1, 1, 3, 1 });
+ assertTrue(it.hasNext());
+ assertTokenStream(it.next(), new String[]{ "king", "alfred", "awesome", "ruled"}, new int[]{ 1, 1, 3, 1 });
+ assertFalse(it.hasNext());
+ }
+
+ public void testMultipleSidePathsWithGaps() throws Exception {
+ // king alfred:4/alfred [] [] saxons:3 [] wessex ruled
+ CannedTokenStream cts = new CannedTokenStream(
+ token("king", 1, 1),
+ token("alfred", 1, 4),
+ token("alfred", 0, 1),
+ token("saxons", 3, 3),
+ token("wessex", 2, 1),
+ token("ruled", 1, 1)
+ );
+ GraphTokenStreamFiniteStrings graph = new GraphTokenStreamFiniteStrings(cts);
+ Iterator<TokenStream> it = graph.getFiniteStrings();
+ assertTrue(it.hasNext());
+ assertTokenStream(it.next(), new String[]{ "king", "alfred", "wessex", "ruled" }, new int[]{ 1, 1, 2, 1 });
+ assertTrue(it.hasNext());
+ assertTokenStream(it.next(), new String[]{ "king", "alfred", "saxons", "ruled" }, new int[]{ 1, 1, 3, 1 });
+ assertFalse(it.hasNext());
+ }
+
}