You are viewing a plain text version of this content. The canonical (HTML) version, which includes the original link, is available in the mailing-list archive.
Posted to commits@lucene.apache.org by dw...@apache.org on 2022/09/23 11:20:43 UTC
[lucene] branch branch_9x updated: Fix repeating token sentence boundary bug (#11734)
This is an automated email from the ASF dual-hosted git repository.
dweiss pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/lucene.git
The following commit(s) were added to refs/heads/branch_9x by this push:
new b79edbb54ac Fix repeating token sentence boundary bug (#11734)
b79edbb54ac is described below
commit b79edbb54acd3a956435c126fb35ed49c3a2e7e0
Author: Luke Kot-Zaniewski <lu...@gmail.com>
AuthorDate: Fri Sep 23 06:59:46 2022 -0400
Fix repeating token sentence boundary bug (#11734)
Signed-off-by: lkotzaniewsk <lk...@bloomberg.net>
Co-authored-by: Dawid Weiss <da...@gmail.com>
---
lucene/CHANGES.txt | 6 ++
.../analysis/opennlp/OpenNLPChunkerFilter.java | 63 +++++++----------
.../analysis/opennlp/OpenNLPLemmatizerFilter.java | 63 +++++++----------
.../lucene/analysis/opennlp/OpenNLPPOSFilter.java | 63 ++++++++---------
.../lucene/analysis/opennlp/OpenNLPTokenizer.java | 19 +++--
.../opennlp/SentenceAttributeExtractor.java | 81 +++++++++++++++++++++
.../analysis/opennlp/data/early-exit-bug-input.txt | 23 ++++++
.../opennlp/data/early-exit-bug-output.txt | 32 +++++++++
.../opennlp/TestOpenNLPChunkerFilterFactory.java | 12 ++++
.../TestOpenNLPLemmatizerFilterFactory.java | 82 ++++++++++++++++++++++
.../opennlp/TestOpenNLPPOSFilterFactory.java | 24 +++++++
.../tokenattributes/SentenceAttribute.java | 42 +++++++++++
.../tokenattributes/SentenceAttributeImpl.java | 80 +++++++++++++++++++++
13 files changed, 471 insertions(+), 119 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index c5ad8e05651..15e97e0afa8 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -24,6 +24,12 @@ Bug Fixes
trying to apply a dictionary whose size is greater than the maximum supported
window size for LZ4. (Adrien Grand)
+* GITHUB#11735: KeywordRepeatFilter + OpenNLPLemmatizer always drops last token of a stream.
+ (Luke Kot-Zaniewski)
+
+* GITHUB#11771: KeywordRepeatFilter + OpenNLPLemmatizer sometimes arbitrarily exits token stream.
+ (Luke Kot-Zaniewski)
+
Other
---------------------
* LUCENE-10423: Remove usages of System.currentTimeMillis() from tests. (Marios Trivyzas)
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPChunkerFilter.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPChunkerFilter.java
index 00932278337..6510bbce8d4 100644
--- a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPChunkerFilter.java
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPChunkerFilter.java
@@ -24,7 +24,7 @@ import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.opennlp.tools.NLPChunkerOp;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.SentenceAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.IgnoreRandomChains;
@@ -36,76 +36,65 @@ import org.apache.lucene.util.IgnoreRandomChains;
*/
@IgnoreRandomChains(reason = "other filters must precede this one (see docs)")
public final class OpenNLPChunkerFilter extends TokenFilter {
-
- private List<AttributeSource> sentenceTokenAttrs = new ArrayList<>();
private int tokenNum = 0;
- private boolean moreTokensAvailable = true;
- private String[] sentenceTerms = null;
- private String[] sentenceTermPOSTags = null;
-
private final NLPChunkerOp chunkerOp;
- private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
- private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
- private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ private final SentenceAttributeExtractor sentenceAttributeExtractor;
public OpenNLPChunkerFilter(TokenStream input, NLPChunkerOp chunkerOp) {
super(input);
this.chunkerOp = chunkerOp;
+ sentenceAttributeExtractor =
+ new SentenceAttributeExtractor(input, addAttribute(SentenceAttribute.class));
}
@Override
- public final boolean incrementToken() throws IOException {
- if (!moreTokensAvailable) {
- clear();
- return false;
- }
- if (tokenNum == sentenceTokenAttrs.size()) {
- nextSentence();
- if (sentenceTerms == null) {
- clear();
+ public boolean incrementToken() throws IOException {
+ List<AttributeSource> sentenceTokenAttrs = sentenceAttributeExtractor.getSentenceAttributes();
+ boolean isEndOfCurrentSentence = tokenNum >= sentenceTokenAttrs.size();
+ if (isEndOfCurrentSentence) {
+ boolean noSentencesLeft =
+ sentenceAttributeExtractor.allSentencesProcessed() || nextSentence().isEmpty();
+ if (noSentencesLeft) {
return false;
}
- assignTokenTypes(chunkerOp.getChunks(sentenceTerms, sentenceTermPOSTags, null));
- tokenNum = 0;
}
clearAttributes();
sentenceTokenAttrs.get(tokenNum++).copyTo(this);
return true;
}
- private void nextSentence() throws IOException {
+ private List<AttributeSource> nextSentence() throws IOException {
+ tokenNum = 0;
List<String> termList = new ArrayList<>();
List<String> posTagList = new ArrayList<>();
- sentenceTokenAttrs.clear();
- boolean endOfSentence = false;
- while (!endOfSentence && (moreTokensAvailable = input.incrementToken())) {
- termList.add(termAtt.toString());
- posTagList.add(typeAtt.type());
- endOfSentence = 0 != (flagsAtt.getFlags() & OpenNLPTokenizer.EOS_FLAG_BIT);
- sentenceTokenAttrs.add(input.cloneAttributes());
+ for (AttributeSource attributeSource : sentenceAttributeExtractor.extractSentenceAttributes()) {
+ termList.add(attributeSource.getAttribute(CharTermAttribute.class).toString());
+ posTagList.add(attributeSource.getAttribute(TypeAttribute.class).type());
}
- sentenceTerms = termList.size() > 0 ? termList.toArray(new String[termList.size()]) : null;
- sentenceTermPOSTags =
- posTagList.size() > 0 ? posTagList.toArray(new String[posTagList.size()]) : null;
+ String[] sentenceTerms = termList.toArray(new String[0]);
+ String[] sentenceTermPOSTags = posTagList.toArray(new String[0]);
+ assignTokenTypes(chunkerOp.getChunks(sentenceTerms, sentenceTermPOSTags, null));
+ return sentenceAttributeExtractor.getSentenceAttributes();
}
private void assignTokenTypes(String[] tags) {
for (int i = 0; i < tags.length; ++i) {
- sentenceTokenAttrs.get(i).getAttribute(TypeAttribute.class).setType(tags[i]);
+ sentenceAttributeExtractor
+ .getSentenceAttributes()
+ .get(i)
+ .getAttribute(TypeAttribute.class)
+ .setType(tags[i]);
}
}
@Override
public void reset() throws IOException {
super.reset();
- moreTokensAvailable = true;
+ sentenceAttributeExtractor.reset();
clear();
}
private void clear() {
- sentenceTokenAttrs.clear();
- sentenceTerms = null;
- sentenceTermPOSTags = null;
tokenNum = 0;
}
}
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPLemmatizerFilter.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPLemmatizerFilter.java
index af14f03cf21..53aab1e46c6 100644
--- a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPLemmatizerFilter.java
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPLemmatizerFilter.java
@@ -24,10 +24,7 @@ import java.util.List;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.opennlp.tools.NLPLemmatizerOp;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
-import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
-import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.analysis.tokenattributes.*;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.IgnoreRandomChains;
@@ -46,37 +43,28 @@ import org.apache.lucene.util.IgnoreRandomChains;
public class OpenNLPLemmatizerFilter extends TokenFilter {
private final NLPLemmatizerOp lemmatizerOp;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
- private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
- private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
- private List<AttributeSource> sentenceTokenAttrs = new ArrayList<>();
private Iterator<AttributeSource> sentenceTokenAttrsIter = null;
- private boolean moreTokensAvailable = true;
- private String[] sentenceTokens = null; // non-keyword tokens
- private String[] sentenceTokenTypes = null; // types for non-keyword tokens
- private String[] lemmas = null; // lemmas for non-keyword tokens
+ private final SentenceAttributeExtractor sentenceAttributeExtractor;
+ private String[] lemmas = new String[0]; // lemmas for non-keyword tokens
private int lemmaNum = 0; // lemma counter
public OpenNLPLemmatizerFilter(TokenStream input, NLPLemmatizerOp lemmatizerOp) {
super(input);
this.lemmatizerOp = lemmatizerOp;
+ sentenceAttributeExtractor =
+ new SentenceAttributeExtractor(input, addAttribute(SentenceAttribute.class));
}
@Override
public final boolean incrementToken() throws IOException {
- if (!moreTokensAvailable) {
- clear();
- return false;
- }
- if (sentenceTokenAttrsIter == null || !sentenceTokenAttrsIter.hasNext()) {
- nextSentence();
- if (sentenceTokens == null) { // zero non-keyword tokens
- clear();
+ boolean isEndOfCurrentSentence = lemmaNum >= lemmas.length;
+ if (isEndOfCurrentSentence) {
+ boolean noSentencesLeft =
+ sentenceAttributeExtractor.allSentencesProcessed() || nextSentence().isEmpty();
+ if (noSentencesLeft) {
return false;
}
- lemmas = lemmatizerOp.lemmatize(sentenceTokens, sentenceTokenTypes);
- lemmaNum = 0;
- sentenceTokenAttrsIter = sentenceTokenAttrs.iterator();
}
clearAttributes();
sentenceTokenAttrsIter.next().copyTo(this);
@@ -86,36 +74,35 @@ public class OpenNLPLemmatizerFilter extends TokenFilter {
return true;
}
- private void nextSentence() throws IOException {
+ private List<AttributeSource> nextSentence() throws IOException {
+ lemmaNum = 0;
List<String> tokenList = new ArrayList<>();
List<String> typeList = new ArrayList<>();
- sentenceTokenAttrs.clear();
- boolean endOfSentence = false;
- while (!endOfSentence && (moreTokensAvailable = input.incrementToken())) {
- if (!keywordAtt.isKeyword()) {
- tokenList.add(termAtt.toString());
- typeList.add(typeAtt.type());
+ List<AttributeSource> sentenceAttributes =
+ sentenceAttributeExtractor.extractSentenceAttributes();
+ for (AttributeSource attributeSource : sentenceAttributes) {
+ if (!attributeSource.getAttribute(KeywordAttribute.class).isKeyword()) {
+ tokenList.add(attributeSource.getAttribute(CharTermAttribute.class).toString());
+ typeList.add(attributeSource.getAttribute(TypeAttribute.class).type());
}
- endOfSentence = 0 != (flagsAtt.getFlags() & OpenNLPTokenizer.EOS_FLAG_BIT);
- sentenceTokenAttrs.add(input.cloneAttributes());
}
- sentenceTokens = tokenList.size() > 0 ? tokenList.toArray(new String[tokenList.size()]) : null;
- sentenceTokenTypes = typeList.size() > 0 ? typeList.toArray(new String[typeList.size()]) : null;
+ String[] sentenceTokens = tokenList.toArray(new String[0]);
+ String[] sentenceTokenTypes = typeList.toArray(new String[0]);
+ lemmas = lemmatizerOp.lemmatize(sentenceTokens, sentenceTokenTypes);
+ sentenceTokenAttrsIter = sentenceAttributes.iterator();
+ return sentenceAttributeExtractor.getSentenceAttributes();
}
@Override
public void reset() throws IOException {
super.reset();
- moreTokensAvailable = true;
+ sentenceAttributeExtractor.reset();
clear();
}
private void clear() {
- sentenceTokenAttrs.clear();
sentenceTokenAttrsIter = null;
- sentenceTokens = null;
- sentenceTokenTypes = null;
- lemmas = null;
+ lemmas = new String[0];
lemmaNum = 0;
}
}
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPPOSFilter.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPPOSFilter.java
index 2cb3ab595fc..d892c42f33b 100644
--- a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPPOSFilter.java
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPPOSFilter.java
@@ -24,7 +24,7 @@ import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.opennlp.tools.NLPPOSTaggerOp;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.SentenceAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.IgnoreRandomChains;
@@ -33,65 +33,62 @@ import org.apache.lucene.util.IgnoreRandomChains;
@IgnoreRandomChains(reason = "LUCENE-10352: add argument providers for this one")
public final class OpenNLPPOSFilter extends TokenFilter {
- private List<AttributeSource> sentenceTokenAttrs = new ArrayList<>();
- String[] tags = null;
private int tokenNum = 0;
- private boolean moreTokensAvailable = true;
-
private final NLPPOSTaggerOp posTaggerOp;
- private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
- private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
- private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ private final SentenceAttributeExtractor sentenceAttributeExtractor;
public OpenNLPPOSFilter(TokenStream input, NLPPOSTaggerOp posTaggerOp) {
super(input);
this.posTaggerOp = posTaggerOp;
+ sentenceAttributeExtractor =
+ new SentenceAttributeExtractor(input, addAttribute(SentenceAttribute.class));
}
@Override
- public final boolean incrementToken() throws IOException {
- if (!moreTokensAvailable) {
- clear();
- return false;
- }
- if (tokenNum
- == sentenceTokenAttrs.size()) { // beginning of stream, or previous sentence exhausted
- String[] sentenceTokens = nextSentence();
- if (sentenceTokens == null) {
- clear();
+ public boolean incrementToken() throws IOException {
+ List<AttributeSource> sentenceTokenAttrs = sentenceAttributeExtractor.getSentenceAttributes();
+ boolean isEndOfCurrentSentence = tokenNum >= sentenceTokenAttrs.size();
+ if (isEndOfCurrentSentence) {
+ boolean noSentencesLeft =
+ sentenceAttributeExtractor.allSentencesProcessed() || nextSentence().isEmpty();
+ if (noSentencesLeft) {
return false;
}
- tags = posTaggerOp.getPOSTags(sentenceTokens);
- tokenNum = 0;
}
clearAttributes();
- sentenceTokenAttrs.get(tokenNum).copyTo(this);
- typeAtt.setType(tags[tokenNum++]);
+ sentenceTokenAttrs.get(tokenNum++).copyTo(this);
return true;
}
- private String[] nextSentence() throws IOException {
+ private List<AttributeSource> nextSentence() throws IOException {
+ tokenNum = 0;
List<String> termList = new ArrayList<>();
- sentenceTokenAttrs.clear();
- boolean endOfSentence = false;
- while (!endOfSentence && (moreTokensAvailable = input.incrementToken())) {
- termList.add(termAtt.toString());
- endOfSentence = 0 != (flagsAtt.getFlags() & OpenNLPTokenizer.EOS_FLAG_BIT);
- sentenceTokenAttrs.add(input.cloneAttributes());
+ for (AttributeSource attributeSource : sentenceAttributeExtractor.extractSentenceAttributes()) {
+ termList.add(attributeSource.getAttribute(CharTermAttribute.class).toString());
+ }
+ String[] sentenceTerms = termList.toArray(new String[0]);
+ assignTokenTypes(posTaggerOp.getPOSTags(sentenceTerms));
+ return sentenceAttributeExtractor.getSentenceAttributes();
+ }
+
+ private void assignTokenTypes(String[] tags) {
+ for (int i = 0; i < tags.length; ++i) {
+ sentenceAttributeExtractor
+ .getSentenceAttributes()
+ .get(i)
+ .getAttribute(TypeAttribute.class)
+ .setType(tags[i]);
}
- return termList.size() > 0 ? termList.toArray(new String[termList.size()]) : null;
}
@Override
public void reset() throws IOException {
super.reset();
- moreTokensAvailable = true;
+ sentenceAttributeExtractor.reset();
clear();
}
private void clear() {
- sentenceTokenAttrs.clear();
- tags = null;
tokenNum = 0;
}
}
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPTokenizer.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPTokenizer.java
index c31f5c11ea0..b9903630b25 100644
--- a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPTokenizer.java
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPTokenizer.java
@@ -22,28 +22,27 @@ import opennlp.tools.util.Span;
import org.apache.lucene.analysis.opennlp.tools.NLPSentenceDetectorOp;
import org.apache.lucene.analysis.opennlp.tools.NLPTokenizerOp;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.SentenceAttribute;
import org.apache.lucene.analysis.util.SegmentingTokenizerBase;
import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.IgnoreRandomChains;
/**
- * Run OpenNLP SentenceDetector and Tokenizer. The last token in each sentence is marked by setting
- * the {@link #EOS_FLAG_BIT} in the FlagsAttribute; following filters can use this information to
- * apply operations to tokens one sentence at a time.
+ * Run OpenNLP SentenceDetector and Tokenizer. The index of each sentence is stored in
+ * SentenceAttribute.
*/
@IgnoreRandomChains(reason = "LUCENE-10352: add argument providers for this one")
public final class OpenNLPTokenizer extends SegmentingTokenizerBase {
- public static int EOS_FLAG_BIT = 1;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
- private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+ private final SentenceAttribute sentenceAtt = addAttribute(SentenceAttribute.class);
private Span[] termSpans = null;
private int termNum = 0;
private int sentenceStart = 0;
+ private int sentenceIndex = -1;
private NLPTokenizerOp tokenizerOp = null;
@@ -71,6 +70,7 @@ public final class OpenNLPTokenizer extends SegmentingTokenizerBase {
String sentenceText = new String(buffer, sentenceStart, sentenceEnd - sentenceStart);
termSpans = tokenizerOp.getTerms(sentenceText);
termNum = 0;
+ sentenceIndex++;
}
@Override
@@ -84,11 +84,7 @@ public final class OpenNLPTokenizer extends SegmentingTokenizerBase {
offsetAtt.setOffset(
correctOffset(offset + sentenceStart + term.getStart()),
correctOffset(offset + sentenceStart + term.getEnd()));
- if (termNum == termSpans.length - 1) {
- flagsAtt.setFlags(
- flagsAtt.getFlags()
- | EOS_FLAG_BIT); // mark the last token in the sentence with EOS_FLAG_BIT
- }
+ sentenceAtt.setSentenceIndex(sentenceIndex);
++termNum;
return true;
}
@@ -98,5 +94,6 @@ public final class OpenNLPTokenizer extends SegmentingTokenizerBase {
super.reset();
termSpans = null;
termNum = sentenceStart = 0;
+ sentenceIndex = -1;
}
}
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/SentenceAttributeExtractor.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/SentenceAttributeExtractor.java
new file mode 100644
index 00000000000..34c88e5cf62
--- /dev/null
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/SentenceAttributeExtractor.java
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.opennlp;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.SentenceAttribute;
+import org.apache.lucene.util.AttributeSource;
+
+/**
+ * Iterate through sentence tokens and cache their attributes. Could consider moving this to a more
+ * central location to be used by other sentence-aware components.
+ *
+ * <p>May want to consider making this its own Filter so that extracted sentence token attributes
+ * can be shared by downstream sentence-aware filters.
+ */
+public class SentenceAttributeExtractor {
+
+ private final TokenStream input;
+ private final SentenceAttribute sentenceAtt;
+ private final List<AttributeSource> sentenceTokenAttrs = new ArrayList<>();
+ private AttributeSource prevAttributeSource;
+ private int currSentence = 0;
+ private boolean hasNextToken = true;
+
+ public SentenceAttributeExtractor(TokenStream input, SentenceAttribute sentenceAtt) {
+ this.input = input;
+ this.sentenceAtt = sentenceAtt;
+ }
+
+ // If this class were a stand-alone filter it could conceivably extract the attributes once
+ // and cache a reference to those attributes in SentenceAttribute. That way downstream filters
+ // could read the full sentence without having to independently extract it.
+ public List<AttributeSource> extractSentenceAttributes() throws IOException {
+ sentenceTokenAttrs.clear();
+ boolean hasNext;
+ do {
+ hasNextToken = input.incrementToken();
+ int currSentenceTmp = sentenceAtt.getSentenceIndex();
+ hasNext = (currSentence == currSentenceTmp && hasNextToken);
+ currSentence = currSentenceTmp;
+ if (prevAttributeSource != null) {
+ sentenceTokenAttrs.add(prevAttributeSource);
+ }
+ prevAttributeSource = input.cloneAttributes();
+ } while (hasNext);
+ return sentenceTokenAttrs;
+ }
+
+ public List<AttributeSource> getSentenceAttributes() {
+ return sentenceTokenAttrs;
+ }
+
+ public boolean allSentencesProcessed() {
+ return !hasNextToken;
+ }
+
+ public void reset() {
+ hasNextToken = true;
+ sentenceTokenAttrs.clear();
+ currSentence = 0;
+ prevAttributeSource = null;
+ }
+}
diff --git a/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/data/early-exit-bug-input.txt b/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/data/early-exit-bug-input.txt
new file mode 100644
index 00000000000..1e202557df1
--- /dev/null
+++ b/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/data/early-exit-bug-input.txt
@@ -0,0 +1,23 @@
+Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
+Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
+Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
+Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
+Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
+Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
+Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
+Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
+Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
+x
+Quick brown fox jumped over the lazy dog.
+x
+Quick brown fox jumped over the lazy dog.
+x
+Quick brown fox jumped over the lazy dog.
+x
+Quick brown fox jumped over the lazy dog.
+x
+Quick brown fox jumped over the lazy dog.
+x
+This should hopefully get analyzed.
+x
+And so should this.
\ No newline at end of file
diff --git a/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/data/early-exit-bug-output.txt b/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/data/early-exit-bug-output.txt
new file mode 100644
index 00000000000..3678c205724
--- /dev/null
+++ b/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/data/early-exit-bug-output.txt
@@ -0,0 +1,32 @@
+Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
+Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
+Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
+Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
+Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
+Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
+Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
+Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
+Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
+Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
+Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
+Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
+Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
+Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
+Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
+Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
+Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
+Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
+x x
+Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
+x x
+Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
+x x
+Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
+x x
+Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
+x x
+Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
+x x
+This This should should hopefully hopefully get get analyzed analyzed . .
+x x
+And And so so should should this this . .
\ No newline at end of file
diff --git a/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPChunkerFilterFactory.java b/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPChunkerFilterFactory.java
index 1132be3ffe6..708f7bafc17 100644
--- a/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPChunkerFilterFactory.java
+++ b/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPChunkerFilterFactory.java
@@ -114,4 +114,16 @@ public class TestOpenNLPChunkerFilterFactory extends BaseTokenStreamTestCase {
true,
toPayloads(SENTENCES_chunks));
}
+
+ public void testEmptyField() throws Exception {
+ CustomAnalyzer analyzer =
+ CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
+ .withTokenizer(
+ "opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
+ .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
+ .addTokenFilter("opennlpChunker", "chunkerModel", chunkerModelFile)
+ .addTokenFilter(TypeAsPayloadTokenFilterFactory.class)
+ .build();
+ assertAnalyzesTo(analyzer, "", new String[0], null, null, null, null, null, true);
+ }
}
diff --git a/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPLemmatizerFilterFactory.java b/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPLemmatizerFilterFactory.java
index d74bc451d93..681ec91eb22 100644
--- a/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPLemmatizerFilterFactory.java
+++ b/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPLemmatizerFilterFactory.java
@@ -17,6 +17,11 @@
package org.apache.lucene.analysis.opennlp;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.stream.Collectors;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory;
import org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilterFactory;
@@ -108,6 +113,10 @@ public class TestOpenNLPLemmatizerFilterFactory extends BaseTokenStreamTestCase
"IN", "IN", "JJ", "JJ", "NN", "VBN", "VBN", ".", "NNP", "NNP", "VBN", "NN", ",", "NN", "."
};
+ private static final String NO_BREAK_SINGLE_TOKEN_REPEAT_KEYWORD = "period";
+
+ private static final String[] NO_BREAK_SINGLE_TOKEN_REPEAT_KEYWORD_terms = {"period", "period"};
+
private static final String tokenizerModelFile = "en-test-tokenizer.bin";
private static final String sentenceModelFile = "en-test-sent.bin";
private static final String posTaggerModelFile = "en-test-pos-maxent.bin";
@@ -290,4 +299,77 @@ public class TestOpenNLPLemmatizerFilterFactory extends BaseTokenStreamTestCase
null,
true);
}
+
+ public void testNoBreakWithRepeatKeywordFilter() throws Exception {
+ CustomAnalyzer analyzer =
+ CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
+ .withTokenizer(
+ "opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
+ .addTokenFilter("opennlpPOS", "posTaggerModel", "en-test-pos-maxent.bin")
+ .addTokenFilter(KeywordRepeatFilterFactory.class)
+ .addTokenFilter("opennlplemmatizer", "dictionary", "en-test-lemmas.dict")
+ .build();
+ assertAnalyzesTo(
+ analyzer,
+ NO_BREAK_SINGLE_TOKEN_REPEAT_KEYWORD,
+ NO_BREAK_SINGLE_TOKEN_REPEAT_KEYWORD_terms,
+ null,
+ null,
+ null,
+ null,
+ null,
+ true);
+ }
+
+ // checks for bug described in https://github.com/apache/lucene/issues/11771
+ public void testPreventEarlyExit() throws IOException {
+ InputStream earlyExitInput = null;
+ InputStream earlyExitOutput = null;
+ try {
+ ClasspathResourceLoader loader = new ClasspathResourceLoader(getClass());
+ earlyExitInput = loader.openResource("data/early-exit-bug-input.txt");
+ String earlyExitInputText = new String(earlyExitInput.readAllBytes(), StandardCharsets.UTF_8);
+ earlyExitOutput = loader.openResource("data/early-exit-bug-output.txt");
+ String earlyExitOutputText =
+ new String(earlyExitOutput.readAllBytes(), StandardCharsets.UTF_8);
+ String[] earlyExitOutputTexts =
+ Arrays.stream(earlyExitOutputText.split("\\s"))
+ .filter(text -> text != "")
+ .collect(Collectors.joining(" "))
+ .split(" ");
+ CustomAnalyzer analyzer =
+ CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
+ .withTokenizer(
+ "opennlp",
+ "tokenizerModel",
+ tokenizerModelFile,
+ "sentenceModel",
+ sentenceModelFile)
+ .addTokenFilter("opennlpPOS", "posTaggerModel", "en-test-pos-maxent.bin")
+ .addTokenFilter(KeywordRepeatFilterFactory.class)
+ .addTokenFilter("opennlplemmatizer", "dictionary", "en-test-lemmas.dict")
+ .build();
+ assertAnalyzesTo(
+ analyzer, earlyExitInputText, earlyExitOutputTexts, null, null, null, null, null, true);
+ } finally {
+ if (earlyExitInput != null) {
+ earlyExitInput.close();
+ }
+ if (earlyExitOutput != null) {
+ earlyExitOutput.close();
+ }
+ }
+ }
+
+ public void testEmptyField() throws Exception {
+ CustomAnalyzer analyzer =
+ CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
+ .withTokenizer(
+ "opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
+ .addTokenFilter("opennlpPOS", "posTaggerModel", "en-test-pos-maxent.bin")
+ .addTokenFilter(KeywordRepeatFilterFactory.class)
+ .addTokenFilter("opennlplemmatizer", "dictionary", "en-test-lemmas.dict")
+ .build();
+ assertAnalyzesTo(analyzer, "", new String[0], null, null, null, null, null, true);
+ }
}
diff --git a/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPPOSFilterFactory.java b/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPPOSFilterFactory.java
index e9ac0f796d6..bd3e649334f 100644
--- a/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPPOSFilterFactory.java
+++ b/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPPOSFilterFactory.java
@@ -21,6 +21,7 @@ import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
+import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory;
import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilterFactory;
import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.util.ClasspathResourceLoader;
@@ -66,6 +67,7 @@ public class TestOpenNLPPOSFilterFactory extends BaseTokenStreamTestCase {
private static final String[] NO_BREAK_terms = {"No", "period"};
private static final int[] NO_BREAK_startOffsets = {0, 3};
private static final int[] NO_BREAK_endOffsets = {2, 9};
+ private static final String[] NO_BREAK_KEYWORD_REPEAT_terms = {"No", "No", "period", "period"};
private static final String sentenceModelFile = "en-test-sent.bin";
private static final String tokenizerModelFile = "en-test-tokenizer.bin";
@@ -144,4 +146,26 @@ public class TestOpenNLPPOSFilterFactory extends BaseTokenStreamTestCase {
null,
true);
}
+
+  // KeywordRepeatFilter emits each token twice (see NO_BREAK_KEYWORD_REPEAT_terms);
+  // the POS filter must tag both copies rather than tripping the repeating-token
+  // sentence boundary bug this commit fixes (#11734).
+  public void testNoBreakWithRepeatKeywordFilter() throws Exception {
+    CustomAnalyzer analyzer =
+        CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
+            .withTokenizer(
+                "opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
+            .addTokenFilter(KeywordRepeatFilterFactory.class)
+            .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
+            .build();
+    assertAnalyzesTo(
+        analyzer, NO_BREAK, NO_BREAK_KEYWORD_REPEAT_terms, null, null, null, null, null, true);
+  }
+
+  // An empty field should produce no tokens from the tokenizer + POS pipeline
+  // instead of failing when no sentence is detected.
+  public void testEmptyField() throws Exception {
+    CustomAnalyzer analyzer =
+        CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
+            .withTokenizer(
+                "opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
+            .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
+            .build();
+    assertAnalyzesTo(analyzer, "", new String[0], null, null, null, null, null, true);
+  }
}
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/SentenceAttribute.java b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/SentenceAttribute.java
new file mode 100644
index 00000000000..cbae8a4f120
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/SentenceAttribute.java
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.tokenattributes;
+
+import org.apache.lucene.util.Attribute;
+
+/**
+ * This attribute tracks what sentence a given token belongs to as well as potentially other
+ * sentence specific attributes.
+ */
+public interface SentenceAttribute extends Attribute {
+
+  /**
+   * Get the sentence index for the current token
+   *
+   * @return The index of the sentence
+   * @see #setSentenceIndex(int)
+   */
+  int getSentenceIndex();
+
+  /**
+   * Set the sentence of the current token
+   *
+   * @param sentenceIndex The index of the sentence this token belongs to
+   * @see #getSentenceIndex()
+   */
+  void setSentenceIndex(int sentenceIndex);
+}
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/SentenceAttributeImpl.java b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/SentenceAttributeImpl.java
new file mode 100644
index 00000000000..9911222ca15
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/SentenceAttributeImpl.java
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.tokenattributes;
+
+import org.apache.lucene.util.AttributeImpl;
+import org.apache.lucene.util.AttributeReflector;
+
+/**
+ * Default implementation of {@link SentenceAttribute}.
+ *
+ * <p>The current implementation is coincidentally identical to {@link FlagsAttributeImpl}. It was
+ * decided to keep it separate because this attribute will NOT be an implied bitmap. Also, this
+ * class may hold other sentence specific data in the future.
+ */
+public class SentenceAttributeImpl extends AttributeImpl implements SentenceAttribute {
+
+  // Index of the sentence the current token belongs to; 0 is the default value.
+  private int index = 0;
+
+  /** Initialize this attribute to default */
+  public SentenceAttributeImpl() {}
+
+  @Override
+  public void clear() {
+    // Reset to the same value a freshly constructed attribute carries.
+    index = 0;
+  }
+
+  @Override
+  public boolean equals(Object other) {
+    if (this == other) {
+      return true;
+    }
+
+    // Two instances are equal iff they carry the same sentence index.
+    if (other instanceof SentenceAttributeImpl) {
+      return ((SentenceAttributeImpl) other).index == index;
+    }
+
+    return false;
+  }
+
+  @Override
+  public int hashCode() {
+    // Consistent with equals: the index is the entire state of this attribute.
+    return index;
+  }
+
+  @Override
+  public void copyTo(AttributeImpl target) {
+    // Cast through the interface so any SentenceAttribute implementation is a valid target.
+    SentenceAttribute t = (SentenceAttribute) target;
+    t.setSentenceIndex(index);
+  }
+
+  @Override
+  public void reflectWith(AttributeReflector reflector) {
+    // NOTE(review): reflection key is "sentences" although the value is a single
+    // sentence index — confirm the key name is intended.
+    reflector.reflect(SentenceAttribute.class, "sentences", index);
+  }
+
+  @Override
+  public int getSentenceIndex() {
+    return index;
+  }
+
+  @Override
+  public void setSentenceIndex(int sentence) {
+    this.index = sentence;
+  }
+}