You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ro...@apache.org on 2018/05/29 15:11:38 UTC
lucene-solr:master: LUCENE-8273: Adjust position increments when
filtering stacked tokens
Repository: lucene-solr
Updated Branches:
refs/heads/master 34741a863 -> 4ea9d2ea8
LUCENE-8273: Adjust position increments when filtering stacked tokens
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/4ea9d2ea
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/4ea9d2ea
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/4ea9d2ea
Branch: refs/heads/master
Commit: 4ea9d2ea8cbb036bac6aa1e61161afc65d04a1be
Parents: 34741a8
Author: Alan Woodward <ro...@apache.org>
Authored: Tue May 29 15:57:03 2018 +0100
Committer: Alan Woodward <ro...@apache.org>
Committed: Tue May 29 15:59:36 2018 +0100
----------------------------------------------------------------------
.../miscellaneous/ConditionalTokenFilter.java | 14 +++++++
.../TestConditionalTokenFilter.java | 39 ++++++++++++++++++++
2 files changed, 53 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/4ea9d2ea/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConditionalTokenFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConditionalTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConditionalTokenFilter.java
index 7de4fbd..e41ce82 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConditionalTokenFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConditionalTokenFilter.java
@@ -168,6 +168,16 @@ public abstract class ConditionalTokenFilter extends TokenFilter {
return false;
}
if (shouldFilter()) {
+ // we're chopping the underlying Tokenstream up into fragments, and presenting
+ // only those parts of it that pass the filter to the delegate, so the delegate is
+ // in effect seeing multiple tokenstream snippets. Tokenstreams can't have an initial
+ // position increment of 0, so if the snippet starts on a stacked token we need to
+ // offset it here and then correct the increment back again after delegation
+ boolean adjustPosition = false;
+ if (posIncAtt.getPositionIncrement() == 0) {
+ posIncAtt.setPositionIncrement(1);
+ adjustPosition = true;
+ }
lastTokenFiltered = true;
state = TokenState.PREBUFFERING;
// we determine that the delegate has emitted all the tokens it can at the current
@@ -178,6 +188,10 @@ public abstract class ConditionalTokenFilter extends TokenFilter {
boolean more = delegate.incrementToken();
if (more) {
state = TokenState.DELEGATING;
+ if (adjustPosition) {
+ int posInc = posIncAtt.getPositionIncrement();
+ posIncAtt.setPositionIncrement(posInc - 1);
+ }
}
else {
lastTokenFiltered = false;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/4ea9d2ea/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConditionalTokenFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConditionalTokenFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConditionalTokenFilter.java
index 511c725..e0bbac4 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConditionalTokenFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConditionalTokenFilter.java
@@ -48,6 +48,7 @@ import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
public class TestConditionalTokenFilter extends BaseTokenStreamTestCase {
@@ -330,6 +331,44 @@ public class TestConditionalTokenFilter extends BaseTokenStreamTestCase {
assertTokenStreamContents(ts, new String[]{"jvboq"});
}
+ public void testInternalPositionAdjustment() throws IOException {
+ // check that the partial TokenStream sent to the condition filter begins with a posInc of 1,
+ // even if the input stream has a posInc of 0 at that position, and that the filtered stream
+ // has the correct posInc afterwards
+ TokenStream ts = whitespaceMockTokenizer("one two three");
+ ts = new KeywordRepeatFilter(ts); // per the expected output below, every term appears twice, the repeat stacked at posInc 0
+ ts = new NonRandomSkippingFilter(ts, PositionAssertingTokenFilter::new, false, true, true, true, true, false); // NOTE(review): flags presumably choose per token whether the delegate runs - confirm against NonRandomSkippingFilter
+
+ assertTokenStreamContents(ts,
+ new String[]{ "one", "one", "two", "two", "three", "three" }, // expected terms, in order
+ new int[]{ 1, 0, 1, 0, 1, 0}); // expected position increments: the stacked repeats must still come out as 0
+ }
+
+ private static final class PositionAssertingTokenFilter extends TokenFilter { // test helper: checks the posInc visible on the first incrementToken() after each reset()
+
+ boolean reset = false; // true between a reset() call and the next incrementToken() call
+ final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+
+ protected PositionAssertingTokenFilter(TokenStream input) {
+ super(input);
+ }
+
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ this.reset = true; // arm the check for the next incrementToken() call
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if (reset) {
+ assertEquals(1, posIncAtt.getPositionIncrement()); // read before pulling from input, so this is the attribute value already present when the call starts
+ }
+ reset = false; // only the first call after reset() is checked
+ return input.incrementToken();
+ }
+ }
+
private static class RandomSkippingFilter extends ConditionalTokenFilter {
Random random;