Posted to commits@lucene.apache.org by ro...@apache.org on 2018/12/04 10:02:58 UTC
[1/5] lucene-solr:branch_7x: LUCENE-8586: Fix infinite loop in Intervals.or()
Repository: lucene-solr
Updated Branches:
refs/heads/branch_7_6 d30d6b89f -> 14bc64d51
refs/heads/branch_7x c2486254e -> 6ecd93c69
refs/heads/master 6c1116111 -> c78429a55
LUCENE-8586: Fix infinite loop in Intervals.or()
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/6ecd93c6
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/6ecd93c6
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/6ecd93c6
Branch: refs/heads/branch_7x
Commit: 6ecd93c6939de8ca29ee2543bcfb3f5c8051357a
Parents: c248625
Author: Alan Woodward <ro...@apache.org>
Authored: Mon Dec 3 16:04:54 2018 +0000
Committer: Alan Woodward <ro...@apache.org>
Committed: Tue Dec 4 09:47:36 2018 +0000
----------------------------------------------------------------------
lucene/CHANGES.txt | 3 ++
.../intervals/DisjunctionIntervalsSource.java | 47 +++++++++++++++++++-
.../lucene/search/intervals/TestIntervals.java | 14 ++++++
3 files changed, 62 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6ecd93c6/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 32f3251..1845686 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -92,6 +92,9 @@ Bug fixes
* LUCENE-8556: Use latitude and longitude instead of encoding values to check if triangle is ear
when using morton optimisation. (Ignacio Vera)
+* LUCENE-8586: Intervals.or() could get stuck in an infinite loop on certain indexes
+ (Alan Woodward)
+
New Features
* LUCENE-8496: Selective indexing - modify BKDReader/BKDWriter to allow users
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6ecd93c6/lucene/sandbox/src/java/org/apache/lucene/search/intervals/DisjunctionIntervalsSource.java
----------------------------------------------------------------------
diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/DisjunctionIntervalsSource.java b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/DisjunctionIntervalsSource.java
index 79c1bcf..c54f18b 100644
--- a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/DisjunctionIntervalsSource.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/DisjunctionIntervalsSource.java
@@ -146,7 +146,7 @@ class DisjunctionIntervalsSource extends IntervalsSource {
@Override
public int nextInterval() throws IOException {
- if (current == EMPTY) {
+ if (current == EMPTY || current == EXHAUSTED) {
if (intervalQueue.size() > 0) {
current = intervalQueue.top();
}
@@ -160,7 +160,7 @@ class DisjunctionIntervalsSource extends IntervalsSource {
}
}
if (intervalQueue.size() == 0) {
- current = EMPTY;
+ current = EXHAUSTED;
return NO_MORE_INTERVALS;
}
current = intervalQueue.top();
@@ -239,4 +239,47 @@ class DisjunctionIntervalsSource extends IntervalsSource {
}
};
+ private static final IntervalIterator EXHAUSTED = new IntervalIterator() {
+
+ @Override
+ public int docID() {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public int nextDoc() throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public int advance(int target) throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public long cost() {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public int start() {
+ return NO_MORE_INTERVALS;
+ }
+
+ @Override
+ public int end() {
+ return NO_MORE_INTERVALS;
+ }
+
+ @Override
+ public int nextInterval() {
+ return NO_MORE_INTERVALS;
+ }
+
+ @Override
+ public float matchCost() {
+ return 0;
+ }
+ };
+
}
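The fix hinges on separating two states that the old code folded into the single EMPTY sentinel: "not yet positioned on this document" and "all intervals on this document consumed". Resetting current to EMPTY after the queue drained meant a later nextInterval() call could re-enter the priming branch, which is how Intervals.or() could get stuck in an infinite loop on certain indexes. A compact sketch of the two-sentinel pattern (the Iter interface and names here are illustrative, not the Lucene API):

interface Iter {
  int NO_MORE = Integer.MAX_VALUE;

  int nextValue();

  int start();
}

final class Sentinels {

  // "Not positioned yet": a caller seeing this must prime from the queue.
  static final Iter EMPTY = new Iter() {
    @Override public int nextValue() { return NO_MORE; }
    @Override public int start() { return -1; }
  };

  // "Drained on this document": pinned to NO_MORE so it can never be
  // mistaken for an unpositioned iterator and trigger re-priming.
  static final Iter EXHAUSTED = new Iter() {
    @Override public int nextValue() { return NO_MORE; }
    @Override public int start() { return NO_MORE; }
  };

  private Sentinels() {}
}

In the patch itself, EXHAUSTED additionally throws UnsupportedOperationException from docID(), nextDoc(), advance() and cost(), so any accidental use of it as a live document-level iterator fails fast instead of looping.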
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6ecd93c6/lucene/sandbox/src/test/org/apache/lucene/search/intervals/TestIntervals.java
----------------------------------------------------------------------
diff --git a/lucene/sandbox/src/test/org/apache/lucene/search/intervals/TestIntervals.java b/lucene/sandbox/src/test/org/apache/lucene/search/intervals/TestIntervals.java
index d7754be..6002b3a 100644
--- a/lucene/sandbox/src/test/org/apache/lucene/search/intervals/TestIntervals.java
+++ b/lucene/sandbox/src/test/org/apache/lucene/search/intervals/TestIntervals.java
@@ -56,6 +56,8 @@ public class TestIntervals extends LuceneTestCase {
"Porridge is great"
};
+ // 0 1 2 3 4 5 6 7 8 9
+ // 012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789
private static String field2_docs[] = {
"In Xanadu did Kubla Khan a stately pleasure dome decree",
"Where Alph the sacred river ran through caverns measureless to man",
@@ -260,6 +262,18 @@ public class TestIntervals extends LuceneTestCase {
assertFalse(mi.next());
}
+ public void testCombinationDisjunction() throws IOException {
+ IntervalsSource source = Intervals.ordered(
+ Intervals.or(Intervals.term("alph"), Intervals.term("sacred")),
+ Intervals.term("measureless")
+ );
+ checkIntervals(source, "field2", 1, new int[][]{
+ {},
+ { 3, 8 },
+ {}, {}, {}, {}
+ });
+ }
+
public void testNesting() throws IOException {
IntervalsSource source = Intervals.unordered(
Intervals.term("pease"),
[2/5] lucene-solr:master: LUCENE-8586: Fix infinite loop in Intervals.or()
Posted by ro...@apache.org.
LUCENE-8586: Fix infinite loop in Intervals.or()
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/c78429a5
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/c78429a5
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/c78429a5
Branch: refs/heads/master
Commit: c78429a554d28611dacd90c388e6c34039b228d1
Parents: 75a053d
Author: Alan Woodward <ro...@apache.org>
Authored: Mon Dec 3 16:04:54 2018 +0000
Committer: Alan Woodward <ro...@apache.org>
Committed: Tue Dec 4 09:47:42 2018 +0000
----------------------------------------------------------------------
lucene/CHANGES.txt | 3 ++
.../intervals/DisjunctionIntervalsSource.java | 47 +++++++++++++++++++-
.../lucene/search/intervals/TestIntervals.java | 14 ++++++
3 files changed, 62 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
[5/5] lucene-solr:branch_7_6: LUCENE-8586: Fix infinite loop in Intervals.or()
Posted by ro...@apache.org.
LUCENE-8586: Fix infinite loop in Intervals.or()
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/14bc64d5
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/14bc64d5
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/14bc64d5
Branch: refs/heads/branch_7_6
Commit: 14bc64d517bfa012fa00fc17da1d7173acf92c6b
Parents: d30d6b8
Author: Alan Woodward <ro...@apache.org>
Authored: Mon Dec 3 16:04:54 2018 +0000
Committer: Alan Woodward <ro...@apache.org>
Committed: Tue Dec 4 09:51:45 2018 +0000
----------------------------------------------------------------------
lucene/CHANGES.txt | 3 ++
.../intervals/DisjunctionIntervalsSource.java | 47 +++++++++++++++++++-
.../lucene/search/intervals/TestIntervals.java | 14 ++++++
3 files changed, 62 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
[3/5] lucene-solr:master: LUCENE-8564: Add GraphTokenFilter
Posted by ro...@apache.org.
LUCENE-8564: Add GraphTokenFilter
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/f5867a14
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/f5867a14
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/f5867a14
Branch: refs/heads/master
Commit: f5867a1413189675cf69ea88412237e32429786e
Parents: 6c11161
Author: Alan Woodward <ro...@apache.org>
Authored: Mon Dec 3 12:17:58 2018 +0000
Committer: Alan Woodward <ro...@apache.org>
Committed: Tue Dec 4 09:47:42 2018 +0000
----------------------------------------------------------------------
lucene/CHANGES.txt | 4 +
.../analysis/shingle/FixedShingleFilter.java | 269 ++++--------------
.../shingle/FixedShingleFilterTest.java | 19 +-
.../lucene/analysis/GraphTokenFilter.java | 284 +++++++++++++++++++
.../lucene/analysis/TestGraphTokenFilter.java | 236 +++++++++++++++
.../lucene/analysis/TestGraphTokenizers.java | 2 +-
.../java/org/apache/lucene/analysis/Token.java | 7 +
7 files changed, 591 insertions(+), 230 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f5867a14/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 349d64d..fc609a5 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -155,6 +155,10 @@ New Features
* LUCENE-8216: Added a new BM25FQuery in sandbox to blend statistics across several fields
using the BM25F formula. (Adrien Grand, Jim Ferenczi)
+* LUCENE-8564: GraphTokenFilter is an abstract class useful for token filters that need
+ to read ahead in the token stream and take into account graph structures. This
+ also changes FixedShingleFilter to extend GraphTokenFilter (Alan Woodward)
+
Improvements
* LUCENE-7997: Add BaseSimilarityTestCase to sanity check similarities.
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f5867a14/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/FixedShingleFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/FixedShingleFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/FixedShingleFilter.java
index cf82363..8f7eb95 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/FixedShingleFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/FixedShingleFilter.java
@@ -18,16 +18,14 @@
package org.apache.lucene.analysis.shingle;
import java.io.IOException;
-import java.util.ArrayDeque;
-import java.util.Deque;
-import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.GraphTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttributeImpl;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
-import org.apache.lucene.util.AttributeSource;
/**
* A FixedShingleFilter constructs shingles (token n-grams) from a token stream.
@@ -47,27 +45,20 @@ import org.apache.lucene.util.AttributeSource;
*
* @lucene.experimental
*/
-public final class FixedShingleFilter extends TokenFilter {
+public final class FixedShingleFilter extends GraphTokenFilter {
- private final Deque<Token> tokenPool = new ArrayDeque<>();
-
- private static final int MAX_SHINGLE_STACK_SIZE = 1000;
private static final int MAX_SHINGLE_SIZE = 4;
private final int shingleSize;
private final String tokenSeparator;
-
- private final Token gapToken = new Token(new AttributeSource());
- private final Token endToken = new Token(new AttributeSource());
+ private final String fillerToken;
private final PositionIncrementAttribute incAtt = addAttribute(PositionIncrementAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
- private Token[] currentShingleTokens;
- private int currentShingleStackSize;
- private boolean inputStreamExhausted = false;
+ private final CharTermAttribute buffer = new CharTermAttributeImpl();
/**
* Creates a FixedShingleFilter over an input token stream
@@ -89,228 +80,82 @@ public final class FixedShingleFilter extends TokenFilter {
*/
public FixedShingleFilter(TokenStream input, int shingleSize, String tokenSeparator, String fillerToken) {
super(input);
+
if (shingleSize <= 1 || shingleSize > MAX_SHINGLE_SIZE) {
throw new IllegalArgumentException("Shingle size must be between 2 and " + MAX_SHINGLE_SIZE + ", got " + shingleSize);
}
this.shingleSize = shingleSize;
this.tokenSeparator = tokenSeparator;
- this.gapToken.termAtt.setEmpty().append(fillerToken);
- this.currentShingleTokens = new Token[shingleSize];
+ this.fillerToken = fillerToken;
}
@Override
public boolean incrementToken() throws IOException {
- int posInc = 0;
- if (nextShingle() == false) {
- Token nextRoot = nextTokenInStream(currentShingleTokens[0]);
- if (nextRoot == endToken)
- return false;
- recycleToken(currentShingleTokens[0]);
- if (resetShingleRoot(nextRoot) == false) {
+
+ int shinglePosInc;
+ if (incrementGraph() == false) {
+ if (incrementBaseToken() == false) {
return false;
}
- posInc = currentShingleTokens[0].posInc();
+ // starting a shingle at a new base position, use base position increment
+ shinglePosInc = incAtt.getPositionIncrement();
}
- clearAttributes();
- incAtt.setPositionIncrement(posInc);
- offsetAtt.setOffset(currentShingleTokens[0].startOffset(), lastTokenInShingle().endOffset());
- termAtt.setEmpty();
- termAtt.append(currentShingleTokens[0].term());
- typeAtt.setType("shingle");
- for (int i = 1; i < shingleSize; i++) {
- termAtt.append(tokenSeparator).append(currentShingleTokens[i].term());
- }
- return true;
- }
-
- @Override
- public void reset() throws IOException {
- super.reset();
- this.tokenPool.clear();
- this.currentShingleTokens[0] = null;
- this.inputStreamExhausted = false;
- this.currentShingleStackSize = 0;
- }
-
- @Override
- public void end() throws IOException {
- if (inputStreamExhausted == false) {
- finishInnerStream();
+ else {
+ // starting a new shingle at the same base with a different graph, use a 0
+ // position increment
+ shinglePosInc = 0;
}
- clearAttributes();
- this.offsetAtt.setOffset(0, endToken.endOffset());
- this.incAtt.setPositionIncrement(endToken.posInc());
- }
-
- private void finishInnerStream() throws IOException {
- input.end();
- inputStreamExhausted = true;
- // check for gaps at the end of the tokenstream
- endToken.posIncAtt.setPositionIncrement(this.incAtt.getPositionIncrement());
- OffsetAttribute inputOffsets = input.getAttribute(OffsetAttribute.class);
- endToken.offsetAtt.setOffset(inputOffsets.startOffset(), inputOffsets.endOffset());
- }
- private Token lastTokenInShingle() {
- int lastTokenIndex = shingleSize - 1;
- while (currentShingleTokens[lastTokenIndex] == gapToken) {
- lastTokenIndex--;
- }
- return currentShingleTokens[lastTokenIndex];
- }
+ final int startOffset = offsetAtt.startOffset();
+ int endOffset = offsetAtt.endOffset();
+ this.buffer.setEmpty();
+ this.buffer.append(termAtt);
- private boolean resetShingleRoot(Token token) throws IOException {
- this.currentShingleTokens[0] = token;
+ // build the shingle by iterating over the current graph, adding
+ // filler tokens if we encounter gaps
for (int i = 1; i < shingleSize; i++) {
- Token current = nextTokenInGraph(this.currentShingleTokens[i - 1]);
- if (current == endToken) {
- if (endToken.posInc() + i >= shingleSize) {
- // end tokens are a special case, because their posIncs are always
- // due to stopwords. Therefore, we can happily append gap tokens
- // to the end of the current shingle
- for (int j = i; j < shingleSize; j++) {
- this.currentShingleTokens[i] = gapToken;
- i++;
- }
- return true;
+ if (incrementGraphToken() == false) {
+ // we've reached the end of the token stream, check for trailing
+ // positions and add fillers if necessary
+ int trailingPositions = getTrailingPositions();
+ if (i + trailingPositions < shingleSize) {
+ // not enough trailing positions to make a full shingle
+ return false;
}
- return false;
- }
- if (current.posInc() > 1) {
- // insert gaps into the shingle list
- for (int j = 1; j < current.posInc(); j++) {
- this.currentShingleTokens[i] = gapToken;
+ while (i < shingleSize) {
+ this.buffer.append(tokenSeparator).append(fillerToken);
i++;
- if (i >= shingleSize)
- return true;
}
+ break;
}
- this.currentShingleTokens[i] = current;
- }
- return true;
- }
-
- private boolean nextShingle() throws IOException {
- return currentShingleTokens[0] != null && advanceStack();
- }
-
- // check if the next token in the tokenstream is at the same position as this one
- private boolean lastInStack(Token token) throws IOException {
- Token next = nextTokenInStream(token);
- return next == endToken || next.posInc() != 0;
- }
-
- private boolean advanceStack() throws IOException {
- for (int i = shingleSize - 1; i >= 1; i--) {
- if (currentShingleTokens[i] != gapToken && lastInStack(currentShingleTokens[i]) == false) {
- currentShingleTokens[i] = nextTokenInStream(currentShingleTokens[i]);
- for (int j = i + 1; j < shingleSize; j++) {
- currentShingleTokens[j] = nextTokenInGraph(currentShingleTokens[j - 1]);
+ int posInc = incAtt.getPositionIncrement();
+ if (posInc > 1) {
+ // if we have a posInc > 1, we need to fill in the gaps
+ if (i + posInc > shingleSize) {
+ // if the posInc is greater than the shingle size, we need to add fillers
+ // up to the shingle size but no further
+ while (i < shingleSize) {
+ this.buffer.append(tokenSeparator).append(fillerToken);
+ i++;
+ }
+ break;
}
- if (currentShingleStackSize++ > MAX_SHINGLE_STACK_SIZE) {
- throw new IllegalStateException("Too many shingles (> " + MAX_SHINGLE_STACK_SIZE + ") at term [" + currentShingleTokens[0].term() + "]");
+ // otherwise just add them in as far as we need
+ while (posInc > 1) {
+ this.buffer.append(tokenSeparator).append(fillerToken);
+ posInc--;
+ i++;
}
- return true;
}
+ this.buffer.append(tokenSeparator).append(termAtt);
+ endOffset = offsetAtt.endOffset();
}
- currentShingleStackSize = 0;
- return false;
- }
-
- private Token newToken() {
- Token token = tokenPool.size() == 0 ? new Token(this.cloneAttributes()) : tokenPool.removeFirst();
- token.reset(this);
- return token;
- }
-
- private void recycleToken(Token token) {
- if (token == null)
- return;
- token.nextToken = null;
- tokenPool.add(token);
- }
-
- // for testing
- int instantiatedTokenCount() {
- int tokenCount = tokenPool.size() + 1;
- if (currentShingleTokens[0] == endToken || currentShingleTokens[0] == null)
- return tokenCount;
- for (Token t = currentShingleTokens[0]; t != endToken && t != null; t = t.nextToken) {
- tokenCount++;
- }
- return tokenCount;
- }
-
- private Token nextTokenInGraph(Token token) throws IOException {
- do {
- token = nextTokenInStream(token);
- if (token == endToken) {
- return endToken;
- }
- } while (token.posInc() == 0);
- return token;
- }
-
- private Token nextTokenInStream(Token token) throws IOException {
- if (token != null && token.nextToken != null) {
- return token.nextToken;
- }
- if (input.incrementToken() == false) {
- finishInnerStream();
- if (token == null) {
- return endToken;
- } else {
- token.nextToken = endToken;
- return endToken;
- }
- }
- if (token == null) {
- return newToken();
- }
- token.nextToken = newToken();
- return token.nextToken;
- }
-
- private static class Token {
- final AttributeSource attSource;
- final PositionIncrementAttribute posIncAtt;
- final CharTermAttribute termAtt;
- final OffsetAttribute offsetAtt;
-
- Token nextToken;
-
- Token(AttributeSource attSource) {
- this.attSource = attSource;
- this.posIncAtt = attSource.addAttribute(PositionIncrementAttribute.class);
- this.termAtt = attSource.addAttribute(CharTermAttribute.class);
- this.offsetAtt = attSource.addAttribute(OffsetAttribute.class);
- }
-
- int posInc() {
- return this.posIncAtt.getPositionIncrement();
- }
-
- CharSequence term() {
- return this.termAtt;
- }
-
- int startOffset() {
- return this.offsetAtt.startOffset();
- }
-
- int endOffset() {
- return this.offsetAtt.endOffset();
- }
-
- void reset(AttributeSource attSource) {
- attSource.copyTo(this.attSource);
- this.nextToken = null;
- }
-
- @Override
- public String toString() {
- return term() + "(" + startOffset() + "," + endOffset() + ") " + posInc();
- }
+ clearAttributes();
+ this.offsetAtt.setOffset(startOffset, endOffset);
+ this.incAtt.setPositionIncrement(shinglePosInc);
+ this.termAtt.setEmpty().append(buffer);
+ this.typeAtt.setType("shingle");
+ return true;
}
}
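For reference, a minimal usage sketch of the rewritten filter via the four-argument constructor above; the tokenizer choice and sample text are ours, not part of the commit:

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.shingle.FixedShingleFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class ShingleDemo {
  public static void main(String[] args) throws IOException {
    WhitespaceTokenizer tok = new WhitespaceTokenizer();
    tok.setReader(new StringReader("the quick brown fox"));
    // Two-word shingles, joined with a space; "_" fills position gaps.
    TokenStream ts = new FixedShingleFilter(tok, 2, " ", "_");
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term); // "the quick", "quick brown", "brown fox"
    }
    ts.end();
    ts.close();
  }
}

Shingle sizes outside the range 2..MAX_SHINGLE_SIZE (4) are rejected by the constructor check above.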
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f5867a14/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/FixedShingleFilterTest.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/FixedShingleFilterTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/FixedShingleFilterTest.java
index f5031b3..85c7dc6 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/FixedShingleFilterTest.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/FixedShingleFilterTest.java
@@ -163,6 +163,8 @@ public class FixedShingleFilterTest extends BaseTokenStreamTestCase {
public void testIncomingGraphs() throws IOException {
+ // b/a c b/a d
+
TokenStream ts = new CannedTokenStream(
new Token("b", 0, 1),
new Token("a", 0, 0, 1),
@@ -208,21 +210,4 @@ public class FixedShingleFilterTest extends BaseTokenStreamTestCase {
assertEquals("Shingle size must be between 2 and 4, got 5", e2.getMessage());
}
- public void testShingleCountLimits() {
-
- Token[] tokens = new Token[5000];
- tokens[0] = new Token("term", 1, 0, 1);
- tokens[1] = new Token("term1", 1, 2, 3);
- for (int i = 2; i < 5000; i++) {
- tokens[i] = new Token("term" + i, 0, 2, 3);
- }
-
- Exception e = expectThrows(IllegalStateException.class, () -> {
- TokenStream ts = new FixedShingleFilter(new CannedTokenStream(tokens), 2);
- ts.reset();
- while (ts.incrementToken()) {}
- });
- assertEquals("Too many shingles (> 1000) at term [term]", e.getMessage());
- }
-
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f5867a14/lucene/core/src/java/org/apache/lucene/analysis/GraphTokenFilter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/GraphTokenFilter.java b/lucene/core/src/java/org/apache/lucene/analysis/GraphTokenFilter.java
new file mode 100644
index 0000000..9c1e02e
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/analysis/GraphTokenFilter.java
@@ -0,0 +1,284 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis;
+
+import java.io.IOException;
+import java.util.ArrayDeque;
+import java.util.ArrayList;
+import java.util.Deque;
+import java.util.List;
+
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.util.AttributeSource;
+
+/**
+ * An abstract TokenFilter that exposes its input stream as a graph
+ *
+ * Call {@link #incrementBaseToken()} to move the root of the graph to the next
+ * position in the TokenStream, {@link #incrementGraphToken()} to move along
+ * the current graph, and {@link #incrementGraph()} to reset to the next graph
+ * based at the current root.
+ *
+ * For example, given the stream 'a b/c:2 d e', then with the base token at
+ * 'a', incrementGraphToken() will produce the stream 'a b d e', and after
+ * calling incrementGraph() it will produce the stream 'a c e'.
+ */
+public abstract class GraphTokenFilter extends TokenFilter {
+
+ private final Deque<Token> tokenPool = new ArrayDeque<>();
+ private final List<Token> currentGraph = new ArrayList<>();
+
+ /**
+ * The maximum permitted number of routes through a graph
+ */
+ public static final int MAX_GRAPH_STACK_SIZE = 1000;
+
+ /**
+ * The maximum permitted read-ahead in the token stream
+ */
+ public static final int MAX_TOKEN_CACHE_SIZE = 100;
+
+ private Token baseToken;
+ private int graphDepth;
+ private int graphPos;
+ private int trailingPositions = -1;
+ private int finalOffsets = -1;
+
+ private int stackSize;
+ private int cacheSize;
+
+ private final PositionIncrementAttribute posIncAtt;
+ private final OffsetAttribute offsetAtt;
+
+ /**
+ * Create a new GraphTokenFilter
+ */
+ public GraphTokenFilter(TokenStream input) {
+ super(input);
+ this.posIncAtt = input.addAttribute(PositionIncrementAttribute.class);
+ this.offsetAtt = input.addAttribute(OffsetAttribute.class);
+ }
+
+ /**
+ * Move the root of the graph to the next token in the wrapped TokenStream
+ *
+ * @return {@code false} if the underlying stream is exhausted
+ */
+ protected final boolean incrementBaseToken() throws IOException {
+ stackSize = 0;
+ graphDepth = 0;
+ graphPos = 0;
+ Token oldBase = baseToken;
+ baseToken = nextTokenInStream(baseToken);
+ if (baseToken == null) {
+ return false;
+ }
+ currentGraph.clear();
+ currentGraph.add(baseToken);
+ baseToken.attSource.copyTo(this);
+ recycleToken(oldBase);
+ return true;
+ }
+
+ /**
+ * Move to the next token in the current route through the graph
+ *
+ * @return {@code false} if there are no more tokens in the current graph
+ */
+ protected final boolean incrementGraphToken() throws IOException {
+ if (graphPos < graphDepth) {
+ graphPos++;
+ currentGraph.get(graphPos).attSource.copyTo(this);
+ return true;
+ }
+ Token token = nextTokenInGraph(currentGraph.get(graphDepth));
+ if (token == null) {
+ return false;
+ }
+ graphDepth++;
+ graphPos++;
+ currentGraph.add(graphDepth, token);
+ token.attSource.copyTo(this);
+ return true;
+ }
+
+ /**
+ * Reset to the root token again, and move down the next route through the graph
+ *
+ * @return {@code false} if there are no more routes through the graph
+ */
+ protected final boolean incrementGraph() throws IOException {
+ if (baseToken == null) {
+ return false;
+ }
+ graphPos = 0;
+ for (int i = graphDepth; i >= 1; i--) {
+ if (lastInStack(currentGraph.get(i)) == false) {
+ currentGraph.set(i, nextTokenInStream(currentGraph.get(i)));
+ for (int j = i + 1; j < graphDepth; j++) {
+ currentGraph.set(j, nextTokenInGraph(currentGraph.get(j)));
+ }
+ if (stackSize++ > MAX_GRAPH_STACK_SIZE) {
+ throw new IllegalStateException("Too many graph paths (> " + MAX_GRAPH_STACK_SIZE + ")");
+ }
+ currentGraph.get(0).attSource.copyTo(this);
+ graphDepth = i;
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /**
+ * Return the number of trailing positions at the end of the graph
+ *
+ * NB this should only be called after {@link #incrementGraphToken()} has returned {@code false}
+ */
+ public int getTrailingPositions() {
+ return trailingPositions;
+ }
+
+ @Override
+ public void end() throws IOException {
+ if (trailingPositions == -1) {
+ input.end();
+ trailingPositions = posIncAtt.getPositionIncrement();
+ finalOffsets = offsetAtt.endOffset();
+ }
+ else {
+ endAttributes();
+ this.posIncAtt.setPositionIncrement(trailingPositions);
+ this.offsetAtt.setOffset(finalOffsets, finalOffsets);
+ }
+ }
+
+ @Override
+ public void reset() throws IOException {
+ input.reset();
+ // new attributes can be added between reset() calls, so we can't reuse
+ // token objects from a previous run
+ tokenPool.clear();
+ cacheSize = 0;
+ graphDepth = 0;
+ trailingPositions = -1;
+ finalOffsets = -1;
+ baseToken = null;
+ }
+
+ int cachedTokenCount() {
+ return cacheSize;
+ }
+
+ private Token newToken() {
+ if (tokenPool.size() == 0) {
+ cacheSize++;
+ if (cacheSize > MAX_TOKEN_CACHE_SIZE) {
+ throw new IllegalStateException("Too many cached tokens (> " + MAX_TOKEN_CACHE_SIZE + ")");
+ }
+ return new Token(this.cloneAttributes());
+ }
+ Token token = tokenPool.removeFirst();
+ token.reset(input);
+ return token;
+ }
+
+ private void recycleToken(Token token) {
+ if (token == null)
+ return;
+ token.nextToken = null;
+ tokenPool.add(token);
+ }
+
+ private Token nextTokenInGraph(Token token) throws IOException {
+ int remaining = token.length();
+ do {
+ token = nextTokenInStream(token);
+ if (token == null) {
+ return null;
+ }
+ remaining -= token.posInc();
+ } while (remaining > 0);
+ return token;
+ }
+
+ // check if the next token in the tokenstream is at the same position as this one
+ private boolean lastInStack(Token token) throws IOException {
+ Token next = nextTokenInStream(token);
+ return next == null || next.posInc() != 0;
+ }
+
+ private Token nextTokenInStream(Token token) throws IOException {
+ if (token != null && token.nextToken != null) {
+ return token.nextToken;
+ }
+ if (this.trailingPositions != -1) {
+ // already hit the end
+ return null;
+ }
+ if (input.incrementToken() == false) {
+ input.end();
+ trailingPositions = posIncAtt.getPositionIncrement();
+ finalOffsets = offsetAtt.endOffset();
+ return null;
+ }
+ if (token == null) {
+ return newToken();
+ }
+ token.nextToken = newToken();
+ return token.nextToken;
+ }
+
+ private static class Token {
+
+ final AttributeSource attSource;
+ final PositionIncrementAttribute posIncAtt;
+ final PositionLengthAttribute lengthAtt;
+ Token nextToken;
+
+ Token(AttributeSource attSource) {
+ this.attSource = attSource;
+ this.posIncAtt = attSource.addAttribute(PositionIncrementAttribute.class);
+ boolean hasLengthAtt = attSource.hasAttribute(PositionLengthAttribute.class);
+ this.lengthAtt = hasLengthAtt ? attSource.addAttribute(PositionLengthAttribute.class) : null;
+ }
+
+ int posInc() {
+ return this.posIncAtt.getPositionIncrement();
+ }
+
+ int length() {
+ if (this.lengthAtt == null) {
+ return 1;
+ }
+ return this.lengthAtt.getPositionLength();
+ }
+
+ void reset(AttributeSource attSource) {
+ attSource.copyTo(this.attSource);
+ this.nextToken = null;
+ }
+
+ @Override
+ public String toString() {
+ return attSource.toString();
+ }
+ }
+
+}
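To make the three-method contract in the javadoc concrete, here is a minimal subclass sketch (the class name is ours; the test file below does essentially the same thing with its TestFilter):

import java.io.IOException;

import org.apache.lucene.analysis.GraphTokenFilter;
import org.apache.lucene.analysis.TokenStream;

public final class PeekingFilter extends GraphTokenFilter {

  public PeekingFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    // Move the graph root to the next stream position; the filter's
    // attributes now reflect the new base token.
    if (incrementBaseToken() == false) {
      return false;
    }
    // A real filter could now call incrementGraphToken() to read ahead
    // along the current path, and incrementGraph() to restart at the base
    // and follow the next path, as FixedShingleFilter does above.
    return true;
  }
}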
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f5867a14/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenFilter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenFilter.java b/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenFilter.java
new file mode 100644
index 0000000..d3a476f
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenFilter.java
@@ -0,0 +1,236 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis;
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+
+public class TestGraphTokenFilter extends BaseTokenStreamTestCase {
+
+ static class TestFilter extends GraphTokenFilter {
+
+ public TestFilter(TokenStream input) {
+ super(input);
+ }
+
+ @Override
+ public final boolean incrementToken() throws IOException {
+ return incrementBaseToken();
+ }
+ }
+
+ public void testGraphTokenStream() throws IOException {
+
+ TestGraphTokenizers.GraphTokenizer tok = new TestGraphTokenizers.GraphTokenizer();
+ GraphTokenFilter graph = new TestFilter(tok);
+
+ CharTermAttribute termAtt = graph.addAttribute(CharTermAttribute.class);
+ PositionIncrementAttribute posIncAtt = graph.addAttribute(PositionIncrementAttribute.class);
+
+ tok.setReader(new StringReader("a b/c d e/f:3 g/h i j k"));
+ tok.reset();
+
+ assertFalse(graph.incrementGraph());
+ assertEquals(0, graph.cachedTokenCount());
+
+ assertTrue(graph.incrementBaseToken());
+ assertEquals("a", termAtt.toString());
+ assertEquals(1, posIncAtt.getPositionIncrement());
+ assertTrue(graph.incrementGraphToken());
+ assertEquals("b", termAtt.toString());
+ assertTrue(graph.incrementGraphToken());
+ assertEquals("d", termAtt.toString());
+ assertTrue(graph.incrementGraph());
+ assertEquals("a", termAtt.toString());
+ assertTrue(graph.incrementGraphToken());
+ assertEquals("c", termAtt.toString());
+ assertTrue(graph.incrementGraphToken());
+ assertEquals("d", termAtt.toString());
+ assertFalse(graph.incrementGraph());
+ assertEquals(5, graph.cachedTokenCount());
+
+ assertTrue(graph.incrementBaseToken());
+ assertEquals("b", termAtt.toString());
+ assertTrue(graph.incrementGraphToken());
+ assertEquals("d", termAtt.toString());
+ assertTrue(graph.incrementGraphToken());
+ assertEquals("e", termAtt.toString());
+ assertTrue(graph.incrementGraph());
+ assertEquals("b", termAtt.toString());
+ assertTrue(graph.incrementGraphToken());
+ assertEquals("d", termAtt.toString());
+ assertTrue(graph.incrementGraphToken());
+ assertEquals("f", termAtt.toString());
+ assertFalse(graph.incrementGraph());
+ assertEquals(6, graph.cachedTokenCount());
+
+ assertTrue(graph.incrementBaseToken());
+ assertEquals("c", termAtt.toString());
+ assertEquals(0, posIncAtt.getPositionIncrement());
+ assertTrue(graph.incrementGraphToken());
+ assertEquals("d", termAtt.toString());
+ assertFalse(graph.incrementGraph());
+ assertEquals(6, graph.cachedTokenCount());
+
+ assertTrue(graph.incrementBaseToken());
+ assertEquals("d", termAtt.toString());
+ assertTrue(graph.incrementGraphToken());
+ assertEquals("e", termAtt.toString());
+ assertTrue(graph.incrementGraphToken());
+ assertEquals("g", termAtt.toString());
+ assertTrue(graph.incrementGraph());
+ assertEquals("d", termAtt.toString());
+ assertTrue(graph.incrementGraphToken());
+ assertEquals("e", termAtt.toString());
+ assertTrue(graph.incrementGraphToken());
+ assertEquals("h", termAtt.toString());
+ assertTrue(graph.incrementGraph());
+ assertEquals("d", termAtt.toString());
+ assertTrue(graph.incrementGraphToken());
+ assertEquals("f", termAtt.toString());
+ assertTrue(graph.incrementGraphToken());
+ assertEquals("j", termAtt.toString());
+ assertFalse(graph.incrementGraph());
+ assertEquals(8, graph.cachedTokenCount());
+
+ //tok.setReader(new StringReader("a b/c d e/f:3 g/h i j k"));
+
+ assertTrue(graph.incrementBaseToken());
+ assertEquals("e", termAtt.toString());
+ assertTrue(graph.incrementGraphToken());
+ assertEquals("g", termAtt.toString());
+ assertTrue(graph.incrementGraphToken());
+ assertEquals("i", termAtt.toString());
+ assertTrue(graph.incrementGraphToken());
+ assertEquals("j", termAtt.toString());
+ assertTrue(graph.incrementGraph());
+ assertEquals("e", termAtt.toString());
+ assertTrue(graph.incrementGraphToken());
+ assertEquals("h", termAtt.toString());
+ assertFalse(graph.incrementGraph());
+ assertEquals(8, graph.cachedTokenCount());
+
+ assertTrue(graph.incrementBaseToken());
+ assertEquals("f", termAtt.toString());
+ assertTrue(graph.incrementGraphToken());
+ assertEquals("j", termAtt.toString());
+ assertTrue(graph.incrementGraphToken());
+ assertEquals("k", termAtt.toString());
+ assertFalse(graph.incrementGraphToken());
+ assertFalse(graph.incrementGraph());
+ assertEquals(8, graph.cachedTokenCount());
+
+ assertTrue(graph.incrementBaseToken());
+ assertEquals("g", termAtt.toString());
+ assertTrue(graph.incrementGraphToken());
+ assertEquals("i", termAtt.toString());
+ assertFalse(graph.incrementGraph());
+ assertEquals(8, graph.cachedTokenCount());
+
+ assertTrue(graph.incrementBaseToken());
+ assertEquals("h", termAtt.toString());
+ assertFalse(graph.incrementGraph());
+ assertEquals(8, graph.cachedTokenCount());
+
+ assertTrue(graph.incrementBaseToken());
+ assertTrue(graph.incrementBaseToken());
+ assertTrue(graph.incrementBaseToken());
+ assertEquals("k", termAtt.toString());
+ assertFalse(graph.incrementGraphToken());
+ assertEquals(0, graph.getTrailingPositions());
+ assertFalse(graph.incrementGraph());
+ assertFalse(graph.incrementBaseToken());
+ assertEquals(8, graph.cachedTokenCount());
+
+ }
+
+ public void testTrailingPositions() throws IOException {
+
+ // a/b:2 c _
+ CannedTokenStream cts = new CannedTokenStream(1, 5,
+ new Token("a", 0, 1),
+ new Token("b", 0, 0, 1, 2),
+ new Token("c", 1, 2, 3)
+ );
+
+ GraphTokenFilter gts = new TestFilter(cts);
+ assertFalse(gts.incrementGraph());
+ assertTrue(gts.incrementBaseToken());
+ assertTrue(gts.incrementGraphToken());
+ assertFalse(gts.incrementGraphToken());
+ assertEquals(1, gts.getTrailingPositions());
+ assertFalse(gts.incrementGraph());
+ assertTrue(gts.incrementBaseToken());
+ assertFalse(gts.incrementGraphToken());
+ assertEquals(1, gts.getTrailingPositions());
+ assertFalse(gts.incrementGraph());
+ }
+
+ public void testMaximumGraphCacheSize() throws IOException {
+
+ Token[] tokens = new Token[GraphTokenFilter.MAX_TOKEN_CACHE_SIZE + 5];
+ for (int i = 0; i < GraphTokenFilter.MAX_TOKEN_CACHE_SIZE + 5; i++) {
+ tokens[i] = new Token("a", 1, i * 2, i * 2 + 1);
+ }
+
+ GraphTokenFilter gts = new TestFilter(new CannedTokenStream(tokens));
+ Exception e = expectThrows(IllegalStateException.class, () -> {
+ gts.reset();
+ gts.incrementBaseToken();
+ while (true) {
+ gts.incrementGraphToken();
+ }
+ });
+ assertEquals("Too many cached tokens (> 100)", e.getMessage());
+
+ gts.reset();
+ // after reset, the cache should be cleared and so we can read ahead once more
+ gts.incrementBaseToken();
+ gts.incrementGraphToken();
+
+ }
+
+ public void testGraphPathCountLimits() {
+
+ Token[] tokens = new Token[50];
+ tokens[0] = new Token("term", 1, 0, 1);
+ tokens[1] = new Token("term1", 1, 2, 3);
+ for (int i = 2; i < 50; i++) {
+ tokens[i] = new Token("term" + i, i % 2, 2, 3);
+ }
+
+ Exception e = expectThrows(IllegalStateException.class, () -> {
+ GraphTokenFilter graph = new TestFilter(new CannedTokenStream(tokens));
+ graph.reset();
+ graph.incrementBaseToken();
+ for (int i = 0; i < 10; i++) {
+ graph.incrementGraphToken();
+ }
+ while (graph.incrementGraph()) {
+ for (int i = 0; i < 10; i++) {
+ graph.incrementGraphToken();
+ }
+ }
+ });
+ assertEquals("Too many graph paths (> 1000)", e.getMessage());
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f5867a14/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java b/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java
index 7e98662..8d3b8e1 100644
--- a/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java
+++ b/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java
@@ -56,7 +56,7 @@ public class TestGraphTokenizers extends BaseTokenStreamTestCase {
// you cannot turn on MockCharFilter when random
// testing...
- private static class GraphTokenizer extends Tokenizer {
+ public static final class GraphTokenizer extends Tokenizer {
private List<Token> tokens;
private int upto;
private int inputLength;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f5867a14/lucene/test-framework/src/java/org/apache/lucene/analysis/Token.java
----------------------------------------------------------------------
diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/Token.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/Token.java
index 9994175..18b6b8b 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/analysis/Token.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/Token.java
@@ -82,6 +82,13 @@ public class Token extends PackedTokenAttributeImpl implements FlagsAttribute, P
setPositionIncrement(posInc);
}
+ public Token(CharSequence text, int posInc, int start, int end, int posLength) {
+ append(text);
+ setOffset(start, end);
+ setPositionIncrement(posInc);
+ setPositionLength(posLength);
+ }
+
/**
* {@inheritDoc}
* @see FlagsAttribute
[4/5] lucene-solr:master: LUCENE-8509: WordDelimiterGraphFilter no longer adjusts offsets by default
Posted by ro...@apache.org.
LUCENE-8509: WordDelimiterGraphFilter no longer adjusts offsets by default
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/75a053dd
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/75a053dd
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/75a053dd
Branch: refs/heads/master
Commit: 75a053dd696d6e632755e613380450f22c78c91b
Parents: f5867a1
Author: Alan Woodward <ro...@apache.org>
Authored: Mon Dec 3 13:36:21 2018 +0000
Committer: Alan Woodward <ro...@apache.org>
Committed: Tue Dec 4 09:47:42 2018 +0000
----------------------------------------------------------------------
lucene/CHANGES.txt | 4 ++
.../miscellaneous/WordDelimiterGraphFilter.java | 17 ++++----
.../WordDelimiterGraphFilterFactory.java | 5 ++-
.../TestWordDelimiterGraphFilter.java | 42 +++++++++++---------
4 files changed, 41 insertions(+), 27 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/75a053dd/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index fc609a5..0a41d70 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -142,6 +142,10 @@ Changes in Runtime Behavior
anymore. This doesn't affect ordering as this is a constant factor which is
the same for every document. (Luca Cavanna via Adrien Grand)
+* LUCENE-8509: WordDelimiterGraphFilter will no longer set the offsets of internal
+ tokens by default, preventing a number of bugs when the filter is chained with
+ tokenfilters that change the length of their tokens (Alan Woodward)
+
New Features
* LUCENE-8340: LongPoint#newDistanceQuery may be used to boost scores based on
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/75a053dd/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java
index a438213..00ace5b 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java
@@ -191,6 +191,8 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
// used for concatenating runs of similar typed subwords (word,number)
private final WordDelimiterConcatenation concat = new WordDelimiterConcatenation();
+ private final boolean adjustInternalOffsets;
+
// number of subwords last output by concat.
private int lastConcatCount;
@@ -206,10 +208,7 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
private int savedEndOffset;
private AttributeSource.State savedState;
private int lastStartOffset;
-
- // if length by start + end offsets doesn't match the term text then assume
- // this is a synonym and don't adjust the offsets.
- private boolean hasIllegalOffsets;
+ private boolean adjustingOffsets;
private int wordPos;
@@ -217,11 +216,12 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
* Creates a new WordDelimiterGraphFilter
*
* @param in TokenStream to be filtered
+ * @param adjustInternalOffsets if the offsets of partial terms should be adjusted
* @param charTypeTable table containing character types
* @param configurationFlags Flags configuring the filter
* @param protWords If not null is the set of tokens to protect from being delimited
*/
- public WordDelimiterGraphFilter(TokenStream in, byte[] charTypeTable, int configurationFlags, CharArraySet protWords) {
+ public WordDelimiterGraphFilter(TokenStream in, boolean adjustInternalOffsets, byte[] charTypeTable, int configurationFlags, CharArraySet protWords) {
super(in);
if ((configurationFlags &
~(GENERATE_WORD_PARTS |
@@ -240,6 +240,7 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
this.protWords = protWords;
this.iterator = new WordDelimiterIterator(
charTypeTable, has(SPLIT_ON_CASE_CHANGE), has(SPLIT_ON_NUMERICS), has(STEM_ENGLISH_POSSESSIVE));
+ this.adjustInternalOffsets = adjustInternalOffsets;
}
/**
@@ -251,7 +252,7 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
* @param protWords If not null is the set of tokens to protect from being delimited
*/
public WordDelimiterGraphFilter(TokenStream in, int configurationFlags, CharArraySet protWords) {
- this(in, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, configurationFlags, protWords);
+ this(in, false, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, configurationFlags, protWords);
}
/** Iterates all words parts and concatenations, buffering up the term parts we should return. */
@@ -261,7 +262,7 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
// if length by start + end offsets doesn't match the term's text then set offsets for all our word parts/concats to the incoming
// offsets. this can happen if WDGF is applied to an injected synonym, or to a stem'd form, etc:
- hasIllegalOffsets = (savedEndOffset - savedStartOffset != savedTermLength);
+ adjustingOffsets = adjustInternalOffsets && savedEndOffset - savedStartOffset == savedTermLength;
bufferedLen = 0;
lastConcatCount = 0;
@@ -391,7 +392,7 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
int startOffset;
int endOffset;
- if (hasIllegalOffsets) {
+ if (adjustingOffsets == false) {
startOffset = savedStartOffset;
endOffset = savedEndOffset;
} else {
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/75a053dd/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilterFactory.java
index 613aedc..4666c7d 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilterFactory.java
@@ -53,12 +53,14 @@ import static org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator.*;
public class WordDelimiterGraphFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
public static final String PROTECTED_TOKENS = "protected";
public static final String TYPES = "types";
+ public static final String OFFSETS = "adjustOffsets";
private final String wordFiles;
private final String types;
private final int flags;
byte[] typeTable = null;
private CharArraySet protectedWords = null;
+ private boolean adjustOffsets = false;
/** Creates a new WordDelimiterGraphFilterFactory */
public WordDelimiterGraphFilterFactory(Map<String, String> args) {
@@ -94,6 +96,7 @@ public class WordDelimiterGraphFilterFactory extends TokenFilterFactory implemen
wordFiles = get(args, PROTECTED_TOKENS);
types = get(args, TYPES);
this.flags = flags;
+ this.adjustOffsets = getBoolean(args, OFFSETS, true);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
@@ -117,7 +120,7 @@ public class WordDelimiterGraphFilterFactory extends TokenFilterFactory implemen
@Override
public TokenFilter create(TokenStream input) {
- return new WordDelimiterGraphFilter(input, typeTable == null ? WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE : typeTable,
+ return new WordDelimiterGraphFilter(input, adjustOffsets, typeTable == null ? WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE : typeTable,
flags, protectedWords);
}
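The factory exposes the same switch as an adjustOffsets argument, defaulting to true so existing configurations keep their current behavior. A hypothetical programmatic use (the surrounding setup is illustrative; depending on the Lucene version a luceneMatchVersion entry may also be expected in the args map):

    import java.util.HashMap;
    import java.util.Map;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.core.WhitespaceTokenizer;
    import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilterFactory;

    public class WdgfFactorySketch {
      public static void main(String[] args) {
        Map<String, String> params = new HashMap<>();
        params.put("adjustOffsets", "false");  // the new OFFSETS key; true when absent
        params.put("generateWordParts", "1");  // pre-existing flag argument

        // The factory consumes the keys it recognises and rejects anything left over.
        WordDelimiterGraphFilterFactory factory = new WordDelimiterGraphFilterFactory(params);
        TokenStream ts = factory.create(new WhitespaceTokenizer());
      }
    }

In a Solr schema the same key would appear as a filter attribute, e.g. adjustOffsets="false" on solr.WordDelimiterGraphFilterFactory.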
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/75a053dd/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java
index 65d3b02..e3f3f65 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java
@@ -64,7 +64,8 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
// test that subwords and catenated subwords have
// the correct offsets.
- WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token("foo-bar", 5, 12)), DEFAULT_WORD_DELIM_TABLE, flags, null);
+ WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token("foo-bar", 5, 12)),
+ true, DEFAULT_WORD_DELIM_TABLE, flags, null);
assertTokenStreamContents(wdf,
new String[] { "foobar", "foo", "bar" },
@@ -72,7 +73,7 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
new int[] { 12, 8, 12 });
// with illegal offsets:
- wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token("foo-bar", 5, 6)), DEFAULT_WORD_DELIM_TABLE, flags, null);
+ wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token("foo-bar", 5, 6)), true, DEFAULT_WORD_DELIM_TABLE, flags, null);
assertTokenStreamContents(wdf,
new String[] { "foobar", "foo", "bar" },
new int[] { 5, 5, 5 },
@@ -81,7 +82,8 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
public void testOffsetChange() throws Exception {
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
- WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token("übelkeit)", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
+ WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token("übelkeit)", 7, 16)),
+ true, DEFAULT_WORD_DELIM_TABLE, flags, null);
assertTokenStreamContents(wdf,
new String[] { "übelkeit" },
@@ -91,7 +93,8 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
public void testOffsetChange2() throws Exception {
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
- WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token("(übelkeit", 7, 17)), DEFAULT_WORD_DELIM_TABLE, flags, null);
+ WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token("(übelkeit", 7, 17)),
+ true, DEFAULT_WORD_DELIM_TABLE, flags, null);
// illegal offsets:
assertTokenStreamContents(wdf,
new String[] { "übelkeit" },
@@ -101,7 +104,8 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
public void testOffsetChange3() throws Exception {
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
- WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token("(übelkeit", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
+ WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token("(übelkeit", 7, 16)),
+ true, DEFAULT_WORD_DELIM_TABLE, flags, null);
assertTokenStreamContents(wdf,
new String[] { "übelkeit" },
new int[] { 8 },
@@ -110,7 +114,8 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
public void testOffsetChange4() throws Exception {
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
- WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token("(foo,bar)", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
+ WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token("(foo,bar)", 7, 16)),
+ true, DEFAULT_WORD_DELIM_TABLE, flags, null);
assertTokenStreamContents(wdf,
new String[] { "foobar", "foo", "bar"},
@@ -120,7 +125,7 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
public void doSplit(final String input, String... output) throws Exception {
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
- WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(keywordMockTokenizer(input),
+ WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(keywordMockTokenizer(input), false,
WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, flags, null);
assertTokenStreamContents(wdf, output);
@@ -182,7 +187,7 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
// the correct offsets.
Token token = new Token("foo-bar", 5, 12);
token.setType("mytype");
- WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(token), DEFAULT_WORD_DELIM_TABLE, flags, null);
+ WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(token), flags, null);
assertTokenStreamContents(wdf,
new String[] {"foobar", "foo", "bar"},
@@ -235,7 +240,7 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
public TokenStreamComponents createComponents(String field) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(
- tokenizer,
+ tokenizer, true, DEFAULT_WORD_DELIM_TABLE,
flags, protWords));
}
};
@@ -272,7 +277,7 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
public TokenStreamComponents createComponents(String field) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(
- new LargePosIncTokenFilter(tokenizer),
+ new LargePosIncTokenFilter(tokenizer), true, DEFAULT_WORD_DELIM_TABLE,
flags, protWords));
}
};
@@ -317,7 +322,7 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
public TokenStreamComponents createComponents(String field) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
StopFilter filter = new StopFilter(tokenizer, EnglishAnalyzer.ENGLISH_STOP_WORDS_SET);
- return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(filter, flags, protWords));
+ return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(filter, true, DEFAULT_WORD_DELIM_TABLE, flags, protWords));
}
};
@@ -350,8 +355,8 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
assertAnalyzesTo(keywordTestAnalyzer(GENERATE_WORD_PARTS | IGNORE_KEYWORDS),
"abc-def klm-nop kpop",
new String[] {"abc", "def", "klm-nop", "kpop"},
- new int[]{0, 4, 8, 16},
- new int[]{3, 7, 15, 20},
+ new int[]{0, 0, 8, 16},
+ new int[]{7, 7, 15, 20},
null,
new int[]{1, 1, 1, 1},
null,
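The new expectations follow directly from leaving internal offsets alone: keywordTestAnalyzer (not shown in this hunk) presumably now builds the filter without offset adjustment, so both parts of "abc-def" report the whole parent span. Worked out against the input "abc-def klm-nop kpop":

    // "abc-def" occupies offsets 0..7, "klm-nop" 8..15, "kpop" 16..20.
    //                        start  end
    // "abc"      part:         0 ..  7   (parent span; internal offsets not adjusted)
    // "def"      part:         0 ..  7
    // "klm-nop"  keyword:      8 .. 15   (IGNORE_KEYWORDS: left intact)
    // "kpop"     token:       16 .. 20

The "abc-def-123-456" expectations in the later hunk change for the same reason: every part and concatenation inherits the full 0..15 span of its parent token.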
@@ -384,7 +389,7 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
@Override
public TokenStreamComponents createComponents(String field) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
- return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(tokenizer, flags, null));
+ return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(tokenizer, true, DEFAULT_WORD_DELIM_TABLE, flags, null));
}
};
@@ -414,8 +419,8 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
assertAnalyzesTo(a, "abc-def-123-456",
new String[] { "abcdef123456", "abc-def-123-456", "abcdef", "abc", "def", "123456", "123", "456" },
- new int[] { 0, 0, 0, 0, 4, 8, 8, 12 },
- new int[] { 15, 15, 7, 3, 7, 15, 11, 15 },
+ new int[] { 0, 0, 0, 0, 0, 0, 0, 0 },
+ new int[] { 15, 15, 15, 15, 15, 15, 15, 15 },
null,
new int[] { 1, 0, 0, 0, 1, 1, 0, 1 },
null,
@@ -954,7 +959,8 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
}
public void testEmptyString() throws Exception {
- WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token("", 0, 0)), DEFAULT_WORD_DELIM_TABLE, GENERATE_WORD_PARTS | CATENATE_ALL | PRESERVE_ORIGINAL, null);
+ WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token("", 0, 0)),
+ GENERATE_WORD_PARTS | CATENATE_ALL | PRESERVE_ORIGINAL, null);
wdf.reset();
assertTrue(wdf.incrementToken());
assertFalse(wdf.incrementToken());
@@ -967,7 +973,7 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
new Token("foo-bar", 0, 7));
CharArraySet protectedWords = new CharArraySet(new HashSet<>(Arrays.asList("foo17-BAR")), true);
- WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(tokens, DEFAULT_WORD_DELIM_TABLE, GENERATE_WORD_PARTS | PRESERVE_ORIGINAL | CATENATE_ALL, protectedWords);
+ WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(tokens, GENERATE_WORD_PARTS | PRESERVE_ORIGINAL | CATENATE_ALL, protectedWords);
assertGraphStrings(wdf,
"foo17-bar foo bar",
"foo17-bar foo-bar",