You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by dw...@apache.org on 2022/09/23 11:20:43 UTC

[lucene] branch branch_9x updated: Fix repeating token sentence boundary bug (#11734)

This is an automated email from the ASF dual-hosted git repository.

dweiss pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/lucene.git


The following commit(s) were added to refs/heads/branch_9x by this push:
     new b79edbb54ac Fix repeating token sentence boundary bug (#11734)
b79edbb54ac is described below

commit b79edbb54acd3a956435c126fb35ed49c3a2e7e0
Author: Luke Kot-Zaniewski <lu...@gmail.com>
AuthorDate: Fri Sep 23 06:59:46 2022 -0400

    Fix repeating token sentence boundary bug (#11734)
    
    Signed-off-by: lkotzaniewsk <lk...@bloomberg.net>
    Co-authored-by: Dawid Weiss <da...@gmail.com>
---
 lucene/CHANGES.txt                                 |  6 ++
 .../analysis/opennlp/OpenNLPChunkerFilter.java     | 63 +++++++----------
 .../analysis/opennlp/OpenNLPLemmatizerFilter.java  | 63 +++++++----------
 .../lucene/analysis/opennlp/OpenNLPPOSFilter.java  | 63 ++++++++---------
 .../lucene/analysis/opennlp/OpenNLPTokenizer.java  | 19 +++--
 .../opennlp/SentenceAttributeExtractor.java        | 81 +++++++++++++++++++++
 .../analysis/opennlp/data/early-exit-bug-input.txt | 23 ++++++
 .../opennlp/data/early-exit-bug-output.txt         | 32 +++++++++
 .../opennlp/TestOpenNLPChunkerFilterFactory.java   | 12 ++++
 .../TestOpenNLPLemmatizerFilterFactory.java        | 82 ++++++++++++++++++++++
 .../opennlp/TestOpenNLPPOSFilterFactory.java       | 24 +++++++
 .../tokenattributes/SentenceAttribute.java         | 42 +++++++++++
 .../tokenattributes/SentenceAttributeImpl.java     | 80 +++++++++++++++++++++
 13 files changed, 471 insertions(+), 119 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index c5ad8e05651..15e97e0afa8 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -24,6 +24,12 @@ Bug Fixes
   trying to apply a dictionary whose size is greater than the maximum supported
   window size for LZ4. (Adrien Grand)
 
+* GITHUB#11735: KeywordRepeatFilter + OpenNLPLemmatizer always drops last token of a stream.
+  (Luke Kot-Zaniewski)
+
+* GITHUB#11771: KeywordRepeatFilter + OpenNLPLemmatizer sometimes arbitrarily exits token stream.
+  (Luke Kot-Zaniewski)
+
 Other
 ---------------------
 * LUCENE-10423: Remove usages of System.currentTimeMillis() from tests. (Marios Trivyzas)
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPChunkerFilter.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPChunkerFilter.java
index 00932278337..6510bbce8d4 100644
--- a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPChunkerFilter.java
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPChunkerFilter.java
@@ -24,7 +24,7 @@ import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.opennlp.tools.NLPChunkerOp;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.SentenceAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.IgnoreRandomChains;
@@ -36,76 +36,65 @@ import org.apache.lucene.util.IgnoreRandomChains;
  */
 @IgnoreRandomChains(reason = "other filters must precede this one (see docs)")
 public final class OpenNLPChunkerFilter extends TokenFilter {
-
-  private List<AttributeSource> sentenceTokenAttrs = new ArrayList<>();
   private int tokenNum = 0;
-  private boolean moreTokensAvailable = true;
-  private String[] sentenceTerms = null;
-  private String[] sentenceTermPOSTags = null;
-
   private final NLPChunkerOp chunkerOp;
-  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
-  private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
-  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final SentenceAttributeExtractor sentenceAttributeExtractor;
 
   public OpenNLPChunkerFilter(TokenStream input, NLPChunkerOp chunkerOp) {
     super(input);
     this.chunkerOp = chunkerOp;
+    sentenceAttributeExtractor =
+        new SentenceAttributeExtractor(input, addAttribute(SentenceAttribute.class));
   }
 
   @Override
-  public final boolean incrementToken() throws IOException {
-    if (!moreTokensAvailable) {
-      clear();
-      return false;
-    }
-    if (tokenNum == sentenceTokenAttrs.size()) {
-      nextSentence();
-      if (sentenceTerms == null) {
-        clear();
+  public boolean incrementToken() throws IOException {
+    List<AttributeSource> sentenceTokenAttrs = sentenceAttributeExtractor.getSentenceAttributes();
+    boolean isEndOfCurrentSentence = tokenNum >= sentenceTokenAttrs.size();
+    if (isEndOfCurrentSentence) {
+      boolean noSentencesLeft =
+          sentenceAttributeExtractor.allSentencesProcessed() || nextSentence().isEmpty();
+      if (noSentencesLeft) {
         return false;
       }
-      assignTokenTypes(chunkerOp.getChunks(sentenceTerms, sentenceTermPOSTags, null));
-      tokenNum = 0;
     }
     clearAttributes();
     sentenceTokenAttrs.get(tokenNum++).copyTo(this);
     return true;
   }
 
-  private void nextSentence() throws IOException {
+  private List<AttributeSource> nextSentence() throws IOException {
+    tokenNum = 0;
     List<String> termList = new ArrayList<>();
     List<String> posTagList = new ArrayList<>();
-    sentenceTokenAttrs.clear();
-    boolean endOfSentence = false;
-    while (!endOfSentence && (moreTokensAvailable = input.incrementToken())) {
-      termList.add(termAtt.toString());
-      posTagList.add(typeAtt.type());
-      endOfSentence = 0 != (flagsAtt.getFlags() & OpenNLPTokenizer.EOS_FLAG_BIT);
-      sentenceTokenAttrs.add(input.cloneAttributes());
+    for (AttributeSource attributeSource : sentenceAttributeExtractor.extractSentenceAttributes()) {
+      termList.add(attributeSource.getAttribute(CharTermAttribute.class).toString());
+      posTagList.add(attributeSource.getAttribute(TypeAttribute.class).type());
     }
-    sentenceTerms = termList.size() > 0 ? termList.toArray(new String[termList.size()]) : null;
-    sentenceTermPOSTags =
-        posTagList.size() > 0 ? posTagList.toArray(new String[posTagList.size()]) : null;
+    String[] sentenceTerms = termList.toArray(new String[0]);
+    String[] sentenceTermPOSTags = posTagList.toArray(new String[0]);
+    assignTokenTypes(chunkerOp.getChunks(sentenceTerms, sentenceTermPOSTags, null));
+    return sentenceAttributeExtractor.getSentenceAttributes();
   }
 
   private void assignTokenTypes(String[] tags) {
     for (int i = 0; i < tags.length; ++i) {
-      sentenceTokenAttrs.get(i).getAttribute(TypeAttribute.class).setType(tags[i]);
+      sentenceAttributeExtractor
+          .getSentenceAttributes()
+          .get(i)
+          .getAttribute(TypeAttribute.class)
+          .setType(tags[i]);
     }
   }
 
   @Override
   public void reset() throws IOException {
     super.reset();
-    moreTokensAvailable = true;
+    sentenceAttributeExtractor.reset();
     clear();
   }
 
   private void clear() {
-    sentenceTokenAttrs.clear();
-    sentenceTerms = null;
-    sentenceTermPOSTags = null;
     tokenNum = 0;
   }
 }
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPLemmatizerFilter.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPLemmatizerFilter.java
index af14f03cf21..53aab1e46c6 100644
--- a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPLemmatizerFilter.java
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPLemmatizerFilter.java
@@ -24,10 +24,7 @@ import java.util.List;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.opennlp.tools.NLPLemmatizerOp;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
-import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
-import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.analysis.tokenattributes.*;
 import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.IgnoreRandomChains;
 
@@ -46,37 +43,28 @@ import org.apache.lucene.util.IgnoreRandomChains;
 public class OpenNLPLemmatizerFilter extends TokenFilter {
   private final NLPLemmatizerOp lemmatizerOp;
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
-  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
   private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
-  private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
-  private List<AttributeSource> sentenceTokenAttrs = new ArrayList<>();
   private Iterator<AttributeSource> sentenceTokenAttrsIter = null;
-  private boolean moreTokensAvailable = true;
-  private String[] sentenceTokens = null; // non-keyword tokens
-  private String[] sentenceTokenTypes = null; // types for non-keyword tokens
-  private String[] lemmas = null; // lemmas for non-keyword tokens
+  private final SentenceAttributeExtractor sentenceAttributeExtractor;
+  private String[] lemmas = new String[0]; // lemmas for non-keyword tokens
   private int lemmaNum = 0; // lemma counter
 
   public OpenNLPLemmatizerFilter(TokenStream input, NLPLemmatizerOp lemmatizerOp) {
     super(input);
     this.lemmatizerOp = lemmatizerOp;
+    sentenceAttributeExtractor =
+        new SentenceAttributeExtractor(input, addAttribute(SentenceAttribute.class));
   }
 
   @Override
   public final boolean incrementToken() throws IOException {
-    if (!moreTokensAvailable) {
-      clear();
-      return false;
-    }
-    if (sentenceTokenAttrsIter == null || !sentenceTokenAttrsIter.hasNext()) {
-      nextSentence();
-      if (sentenceTokens == null) { // zero non-keyword tokens
-        clear();
+    boolean isEndOfCurrentSentence = lemmaNum >= lemmas.length;
+    if (isEndOfCurrentSentence) {
+      boolean noSentencesLeft =
+          sentenceAttributeExtractor.allSentencesProcessed() || nextSentence().isEmpty();
+      if (noSentencesLeft) {
         return false;
       }
-      lemmas = lemmatizerOp.lemmatize(sentenceTokens, sentenceTokenTypes);
-      lemmaNum = 0;
-      sentenceTokenAttrsIter = sentenceTokenAttrs.iterator();
     }
     clearAttributes();
     sentenceTokenAttrsIter.next().copyTo(this);
@@ -86,36 +74,35 @@ public class OpenNLPLemmatizerFilter extends TokenFilter {
     return true;
   }
 
-  private void nextSentence() throws IOException {
+  private List<AttributeSource> nextSentence() throws IOException {
+    lemmaNum = 0;
     List<String> tokenList = new ArrayList<>();
     List<String> typeList = new ArrayList<>();
-    sentenceTokenAttrs.clear();
-    boolean endOfSentence = false;
-    while (!endOfSentence && (moreTokensAvailable = input.incrementToken())) {
-      if (!keywordAtt.isKeyword()) {
-        tokenList.add(termAtt.toString());
-        typeList.add(typeAtt.type());
+    List<AttributeSource> sentenceAttributes =
+        sentenceAttributeExtractor.extractSentenceAttributes();
+    for (AttributeSource attributeSource : sentenceAttributes) {
+      if (!attributeSource.getAttribute(KeywordAttribute.class).isKeyword()) {
+        tokenList.add(attributeSource.getAttribute(CharTermAttribute.class).toString());
+        typeList.add(attributeSource.getAttribute(TypeAttribute.class).type());
       }
-      endOfSentence = 0 != (flagsAtt.getFlags() & OpenNLPTokenizer.EOS_FLAG_BIT);
-      sentenceTokenAttrs.add(input.cloneAttributes());
     }
-    sentenceTokens = tokenList.size() > 0 ? tokenList.toArray(new String[tokenList.size()]) : null;
-    sentenceTokenTypes = typeList.size() > 0 ? typeList.toArray(new String[typeList.size()]) : null;
+    String[] sentenceTokens = tokenList.toArray(new String[0]);
+    String[] sentenceTokenTypes = typeList.toArray(new String[0]);
+    lemmas = lemmatizerOp.lemmatize(sentenceTokens, sentenceTokenTypes);
+    sentenceTokenAttrsIter = sentenceAttributes.iterator();
+    return sentenceAttributeExtractor.getSentenceAttributes();
   }
 
   @Override
   public void reset() throws IOException {
     super.reset();
-    moreTokensAvailable = true;
+    sentenceAttributeExtractor.reset();
     clear();
   }
 
   private void clear() {
-    sentenceTokenAttrs.clear();
     sentenceTokenAttrsIter = null;
-    sentenceTokens = null;
-    sentenceTokenTypes = null;
-    lemmas = null;
+    lemmas = new String[0];
     lemmaNum = 0;
   }
 }
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPPOSFilter.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPPOSFilter.java
index 2cb3ab595fc..d892c42f33b 100644
--- a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPPOSFilter.java
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPPOSFilter.java
@@ -24,7 +24,7 @@ import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.opennlp.tools.NLPPOSTaggerOp;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.SentenceAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.IgnoreRandomChains;
@@ -33,65 +33,62 @@ import org.apache.lucene.util.IgnoreRandomChains;
 @IgnoreRandomChains(reason = "LUCENE-10352: add argument providers for this one")
 public final class OpenNLPPOSFilter extends TokenFilter {
 
-  private List<AttributeSource> sentenceTokenAttrs = new ArrayList<>();
-  String[] tags = null;
   private int tokenNum = 0;
-  private boolean moreTokensAvailable = true;
-
   private final NLPPOSTaggerOp posTaggerOp;
-  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
-  private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
-  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final SentenceAttributeExtractor sentenceAttributeExtractor;
 
   public OpenNLPPOSFilter(TokenStream input, NLPPOSTaggerOp posTaggerOp) {
     super(input);
     this.posTaggerOp = posTaggerOp;
+    sentenceAttributeExtractor =
+        new SentenceAttributeExtractor(input, addAttribute(SentenceAttribute.class));
   }
 
   @Override
-  public final boolean incrementToken() throws IOException {
-    if (!moreTokensAvailable) {
-      clear();
-      return false;
-    }
-    if (tokenNum
-        == sentenceTokenAttrs.size()) { // beginning of stream, or previous sentence exhausted
-      String[] sentenceTokens = nextSentence();
-      if (sentenceTokens == null) {
-        clear();
+  public boolean incrementToken() throws IOException {
+    List<AttributeSource> sentenceTokenAttrs = sentenceAttributeExtractor.getSentenceAttributes();
+    boolean isEndOfCurrentSentence = tokenNum >= sentenceTokenAttrs.size();
+    if (isEndOfCurrentSentence) {
+      boolean noSentencesLeft =
+          sentenceAttributeExtractor.allSentencesProcessed() || nextSentence().isEmpty();
+      if (noSentencesLeft) {
         return false;
       }
-      tags = posTaggerOp.getPOSTags(sentenceTokens);
-      tokenNum = 0;
     }
     clearAttributes();
-    sentenceTokenAttrs.get(tokenNum).copyTo(this);
-    typeAtt.setType(tags[tokenNum++]);
+    sentenceTokenAttrs.get(tokenNum++).copyTo(this);
     return true;
   }
 
-  private String[] nextSentence() throws IOException {
+  private List<AttributeSource> nextSentence() throws IOException {
+    tokenNum = 0;
     List<String> termList = new ArrayList<>();
-    sentenceTokenAttrs.clear();
-    boolean endOfSentence = false;
-    while (!endOfSentence && (moreTokensAvailable = input.incrementToken())) {
-      termList.add(termAtt.toString());
-      endOfSentence = 0 != (flagsAtt.getFlags() & OpenNLPTokenizer.EOS_FLAG_BIT);
-      sentenceTokenAttrs.add(input.cloneAttributes());
+    for (AttributeSource attributeSource : sentenceAttributeExtractor.extractSentenceAttributes()) {
+      termList.add(attributeSource.getAttribute(CharTermAttribute.class).toString());
+    }
+    String[] sentenceTerms = termList.toArray(new String[0]);
+    assignTokenTypes(posTaggerOp.getPOSTags(sentenceTerms));
+    return sentenceAttributeExtractor.getSentenceAttributes();
+  }
+
+  private void assignTokenTypes(String[] tags) {
+    for (int i = 0; i < tags.length; ++i) {
+      sentenceAttributeExtractor
+          .getSentenceAttributes()
+          .get(i)
+          .getAttribute(TypeAttribute.class)
+          .setType(tags[i]);
     }
-    return termList.size() > 0 ? termList.toArray(new String[termList.size()]) : null;
   }
 
   @Override
   public void reset() throws IOException {
     super.reset();
-    moreTokensAvailable = true;
+    sentenceAttributeExtractor.reset();
     clear();
   }
 
   private void clear() {
-    sentenceTokenAttrs.clear();
-    tags = null;
     tokenNum = 0;
   }
 }
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPTokenizer.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPTokenizer.java
index c31f5c11ea0..b9903630b25 100644
--- a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPTokenizer.java
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPTokenizer.java
@@ -22,28 +22,27 @@ import opennlp.tools.util.Span;
 import org.apache.lucene.analysis.opennlp.tools.NLPSentenceDetectorOp;
 import org.apache.lucene.analysis.opennlp.tools.NLPTokenizerOp;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.SentenceAttribute;
 import org.apache.lucene.analysis.util.SegmentingTokenizerBase;
 import org.apache.lucene.util.AttributeFactory;
 import org.apache.lucene.util.IgnoreRandomChains;
 
 /**
- * Run OpenNLP SentenceDetector and Tokenizer. The last token in each sentence is marked by setting
- * the {@link #EOS_FLAG_BIT} in the FlagsAttribute; following filters can use this information to
- * apply operations to tokens one sentence at a time.
+ * Run OpenNLP SentenceDetector and Tokenizer. The index of each sentence is stored in
+ * SentenceAttribute.
  */
 @IgnoreRandomChains(reason = "LUCENE-10352: add argument providers for this one")
 public final class OpenNLPTokenizer extends SegmentingTokenizerBase {
-  public static int EOS_FLAG_BIT = 1;
 
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
-  private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+  private final SentenceAttribute sentenceAtt = addAttribute(SentenceAttribute.class);
 
   private Span[] termSpans = null;
   private int termNum = 0;
   private int sentenceStart = 0;
+  private int sentenceIndex = -1;
 
   private NLPTokenizerOp tokenizerOp = null;
 
@@ -71,6 +70,7 @@ public final class OpenNLPTokenizer extends SegmentingTokenizerBase {
     String sentenceText = new String(buffer, sentenceStart, sentenceEnd - sentenceStart);
     termSpans = tokenizerOp.getTerms(sentenceText);
     termNum = 0;
+    sentenceIndex++;
   }
 
   @Override
@@ -84,11 +84,7 @@ public final class OpenNLPTokenizer extends SegmentingTokenizerBase {
     offsetAtt.setOffset(
         correctOffset(offset + sentenceStart + term.getStart()),
         correctOffset(offset + sentenceStart + term.getEnd()));
-    if (termNum == termSpans.length - 1) {
-      flagsAtt.setFlags(
-          flagsAtt.getFlags()
-              | EOS_FLAG_BIT); // mark the last token in the sentence with EOS_FLAG_BIT
-    }
+    sentenceAtt.setSentenceIndex(sentenceIndex);
     ++termNum;
     return true;
   }
@@ -98,5 +94,6 @@ public final class OpenNLPTokenizer extends SegmentingTokenizerBase {
     super.reset();
     termSpans = null;
     termNum = sentenceStart = 0;
+    sentenceIndex = -1;
   }
 }
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/SentenceAttributeExtractor.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/SentenceAttributeExtractor.java
new file mode 100644
index 00000000000..34c88e5cf62
--- /dev/null
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/SentenceAttributeExtractor.java
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.opennlp;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.SentenceAttribute;
+import org.apache.lucene.util.AttributeSource;
+
+/**
+ * Iterate through sentence tokens and cache their attributes. Could consider moving this to a more
+ * central location to be used by other sentence-aware components.
+ *
+ * <p>May want to consider making this its own Filter so that extracted sentence token attributes
+ * can be shared by downstream sentence-aware filters.
+ */
+public class SentenceAttributeExtractor {
+
+  private final TokenStream input;
+  private final SentenceAttribute sentenceAtt;
+  private final List<AttributeSource> sentenceTokenAttrs = new ArrayList<>();
+  private AttributeSource prevAttributeSource;
+  private int currSentence = 0;
+  private boolean hasNextToken = true;
+
+  public SentenceAttributeExtractor(TokenStream input, SentenceAttribute sentenceAtt) {
+    this.input = input;
+    this.sentenceAtt = sentenceAtt;
+  }
+
+  // If this class were a stand-alone filter it could conceivably extract the attributes once
+  // and cache a reference to those attributes in SentenceAttribute. That way downstream filters
+  // could read the full sentence without having to independently extract it.
+  public List<AttributeSource> extractSentenceAttributes() throws IOException {
+    sentenceTokenAttrs.clear();
+    boolean hasNext;
+    do {
+      hasNextToken = input.incrementToken();
+      int currSentenceTmp = sentenceAtt.getSentenceIndex();
+      hasNext = (currSentence == currSentenceTmp && hasNextToken);
+      currSentence = currSentenceTmp;
+      if (prevAttributeSource != null) {
+        sentenceTokenAttrs.add(prevAttributeSource);
+      }
+      prevAttributeSource = input.cloneAttributes();
+    } while (hasNext);
+    return sentenceTokenAttrs;
+  }
+
+  public List<AttributeSource> getSentenceAttributes() {
+    return sentenceTokenAttrs;
+  }
+
+  public boolean allSentencesProcessed() {
+    return !hasNextToken;
+  }
+
+  public void reset() {
+    hasNextToken = true;
+    sentenceTokenAttrs.clear();
+    currSentence = 0;
+    prevAttributeSource = null;
+  }
+}
diff --git a/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/data/early-exit-bug-input.txt b/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/data/early-exit-bug-input.txt
new file mode 100644
index 00000000000..1e202557df1
--- /dev/null
+++ b/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/data/early-exit-bug-input.txt
@@ -0,0 +1,23 @@
+Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
+Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
+Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
+Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
+Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
+Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
+Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
+Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
+Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
+x
+Quick brown fox jumped over the lazy dog.
+x
+Quick brown fox jumped over the lazy dog.
+x
+Quick brown fox jumped over the lazy dog.
+x
+Quick brown fox jumped over the lazy dog.
+x
+Quick brown fox jumped over the lazy dog.
+x
+This should hopefully get analyzed.
+x
+And so should this.
\ No newline at end of file
diff --git a/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/data/early-exit-bug-output.txt b/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/data/early-exit-bug-output.txt
new file mode 100644
index 00000000000..3678c205724
--- /dev/null
+++ b/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/data/early-exit-bug-output.txt
@@ -0,0 +1,32 @@
+Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
+Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
+Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
+Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
+Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
+Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
+Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
+Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
+Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
+Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
+Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
+Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
+Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
+Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
+Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
+Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
+Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
+Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
+x x
+Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
+x x
+Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
+x x
+Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
+x x
+Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
+x x
+Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
+x x
+This This should should hopefully hopefully get get analyzed analyzed . .
+x x
+And And so so should should this this . .
\ No newline at end of file
diff --git a/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPChunkerFilterFactory.java b/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPChunkerFilterFactory.java
index 1132be3ffe6..708f7bafc17 100644
--- a/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPChunkerFilterFactory.java
+++ b/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPChunkerFilterFactory.java
@@ -114,4 +114,16 @@ public class TestOpenNLPChunkerFilterFactory extends BaseTokenStreamTestCase {
         true,
         toPayloads(SENTENCES_chunks));
   }
+
+  public void testEmptyField() throws Exception {
+    CustomAnalyzer analyzer =
+        CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
+            .withTokenizer(
+                "opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
+            .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
+            .addTokenFilter("opennlpChunker", "chunkerModel", chunkerModelFile)
+            .addTokenFilter(TypeAsPayloadTokenFilterFactory.class)
+            .build();
+    assertAnalyzesTo(analyzer, "", new String[0], null, null, null, null, null, true);
+  }
 }
diff --git a/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPLemmatizerFilterFactory.java b/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPLemmatizerFilterFactory.java
index d74bc451d93..681ec91eb22 100644
--- a/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPLemmatizerFilterFactory.java
+++ b/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPLemmatizerFilterFactory.java
@@ -17,6 +17,11 @@
 
 package org.apache.lucene.analysis.opennlp;
 
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.stream.Collectors;
 import org.apache.lucene.analysis.custom.CustomAnalyzer;
 import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory;
 import org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilterFactory;
@@ -108,6 +113,10 @@ public class TestOpenNLPLemmatizerFilterFactory extends BaseTokenStreamTestCase
     "IN", "IN", "JJ", "JJ", "NN", "VBN", "VBN", ".", "NNP", "NNP", "VBN", "NN", ",", "NN", "."
   };
 
+  private static final String NO_BREAK_SINGLE_TOKEN_REPEAT_KEYWORD = "period";
+
+  private static final String[] NO_BREAK_SINGLE_TOKEN_REPEAT_KEYWORD_terms = {"period", "period"};
+
   private static final String tokenizerModelFile = "en-test-tokenizer.bin";
   private static final String sentenceModelFile = "en-test-sent.bin";
   private static final String posTaggerModelFile = "en-test-pos-maxent.bin";
@@ -290,4 +299,77 @@ public class TestOpenNLPLemmatizerFilterFactory extends BaseTokenStreamTestCase
         null,
         true);
   }
+
+  public void testNoBreakWithRepeatKeywordFilter() throws Exception {
+    CustomAnalyzer analyzer =
+        CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
+            .withTokenizer(
+                "opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
+            .addTokenFilter("opennlpPOS", "posTaggerModel", "en-test-pos-maxent.bin")
+            .addTokenFilter(KeywordRepeatFilterFactory.class)
+            .addTokenFilter("opennlplemmatizer", "dictionary", "en-test-lemmas.dict")
+            .build();
+    assertAnalyzesTo(
+        analyzer,
+        NO_BREAK_SINGLE_TOKEN_REPEAT_KEYWORD,
+        NO_BREAK_SINGLE_TOKEN_REPEAT_KEYWORD_terms,
+        null,
+        null,
+        null,
+        null,
+        null,
+        true);
+  }
+
+  // checks for bug described in https://github.com/apache/lucene/issues/11771
+  public void testPreventEarlyExit() throws IOException {
+    InputStream earlyExitInput = null;
+    InputStream earlyExitOutput = null;
+    try {
+      ClasspathResourceLoader loader = new ClasspathResourceLoader(getClass());
+      earlyExitInput = loader.openResource("data/early-exit-bug-input.txt");
+      String earlyExitInputText = new String(earlyExitInput.readAllBytes(), StandardCharsets.UTF_8);
+      earlyExitOutput = loader.openResource("data/early-exit-bug-output.txt");
+      String earlyExitOutputText =
+          new String(earlyExitOutput.readAllBytes(), StandardCharsets.UTF_8);
+      String[] earlyExitOutputTexts =
+          Arrays.stream(earlyExitOutputText.split("\\s"))
+              .filter(text -> !text.isEmpty())
+              .collect(Collectors.joining(" "))
+              .split(" ");
+      CustomAnalyzer analyzer =
+          CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
+              .withTokenizer(
+                  "opennlp",
+                  "tokenizerModel",
+                  tokenizerModelFile,
+                  "sentenceModel",
+                  sentenceModelFile)
+              .addTokenFilter("opennlpPOS", "posTaggerModel", "en-test-pos-maxent.bin")
+              .addTokenFilter(KeywordRepeatFilterFactory.class)
+              .addTokenFilter("opennlplemmatizer", "dictionary", "en-test-lemmas.dict")
+              .build();
+      assertAnalyzesTo(
+          analyzer, earlyExitInputText, earlyExitOutputTexts, null, null, null, null, null, true);
+    } finally {
+      if (earlyExitInput != null) {
+        earlyExitInput.close();
+      }
+      if (earlyExitOutput != null) {
+        earlyExitOutput.close();
+      }
+    }
+  }
+
+  public void testEmptyField() throws Exception {
+    CustomAnalyzer analyzer =
+        CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
+            .withTokenizer(
+                "opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
+            .addTokenFilter("opennlpPOS", "posTaggerModel", "en-test-pos-maxent.bin")
+            .addTokenFilter(KeywordRepeatFilterFactory.class)
+            .addTokenFilter("opennlplemmatizer", "dictionary", "en-test-lemmas.dict")
+            .build();
+    assertAnalyzesTo(analyzer, "", new String[0], null, null, null, null, null, true);
+  }
 }
diff --git a/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPPOSFilterFactory.java b/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPPOSFilterFactory.java
index e9ac0f796d6..bd3e649334f 100644
--- a/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPPOSFilterFactory.java
+++ b/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPPOSFilterFactory.java
@@ -21,6 +21,7 @@ import java.io.IOException;
 import java.nio.charset.StandardCharsets;
 import java.util.Arrays;
 import org.apache.lucene.analysis.custom.CustomAnalyzer;
+import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory;
 import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilterFactory;
 import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.util.ClasspathResourceLoader;
@@ -66,6 +67,7 @@ public class TestOpenNLPPOSFilterFactory extends BaseTokenStreamTestCase {
   private static final String[] NO_BREAK_terms = {"No", "period"};
   private static final int[] NO_BREAK_startOffsets = {0, 3};
   private static final int[] NO_BREAK_endOffsets = {2, 9};
+  private static final String[] NO_BREAK_KEYWORD_REPEAT_terms = {"No", "No", "period", "period"};
 
   private static final String sentenceModelFile = "en-test-sent.bin";
   private static final String tokenizerModelFile = "en-test-tokenizer.bin";
@@ -144,4 +146,26 @@ public class TestOpenNLPPOSFilterFactory extends BaseTokenStreamTestCase {
         null,
         true);
   }
+
+  public void testNoBreakWithRepeatKeywordFilter() throws Exception {
+    CustomAnalyzer analyzer =
+        CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
+            .withTokenizer(
+                "opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
+            .addTokenFilter(KeywordRepeatFilterFactory.class)
+            .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
+            .build();
+    assertAnalyzesTo(
+        analyzer, NO_BREAK, NO_BREAK_KEYWORD_REPEAT_terms, null, null, null, null, null, true);
+  }
+
+  public void testEmptyField() throws Exception {
+    CustomAnalyzer analyzer =
+        CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
+            .withTokenizer(
+                "opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
+            .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
+            .build();
+    assertAnalyzesTo(analyzer, "", new String[0], null, null, null, null, null, true);
+  }
 }
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/SentenceAttribute.java b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/SentenceAttribute.java
new file mode 100644
index 00000000000..cbae8a4f120
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/SentenceAttribute.java
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.tokenattributes;
+
+import org.apache.lucene.util.Attribute;
+
+/**
+ * This attribute tracks what sentence a given token belongs to as well as potentially other
+ * sentence specific attributes.
+ */
+public interface SentenceAttribute extends Attribute {
+
+  /**
+   * Get the sentence index for the current token
+   *
+   * @return The index of the sentence
+   * @see #setSentenceIndex(int)
+   */
+  int getSentenceIndex();
+
+  /**
+   * Set the sentence index of the current token
+   *
+   * @param sentenceIndex The index of the sentence this token belongs to
+   */
+  void setSentenceIndex(int sentenceIndex);
+}
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/SentenceAttributeImpl.java b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/SentenceAttributeImpl.java
new file mode 100644
index 00000000000..9911222ca15
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/SentenceAttributeImpl.java
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.tokenattributes;
+
+import org.apache.lucene.util.AttributeImpl;
+import org.apache.lucene.util.AttributeReflector;
+
+/**
+ * Default implementation of {@link SentenceAttribute}.
+ *
+ * <p>The current implementation is coincidentally identical to {@link FlagsAttributeImpl} It was
+ * decided to keep it separate because this attribute will NOT be an implied bitmap. Also, this
+ * class may hold other sentence specific data in the future.
+ */
+public class SentenceAttributeImpl extends AttributeImpl implements SentenceAttribute {
+
+  private int index = 0;
+
+  /** Initialize this attribute to default */
+  public SentenceAttributeImpl() {}
+
+  @Override
+  public void clear() {
+    index = 0;
+  }
+
+  @Override
+  public boolean equals(Object other) {
+    if (this == other) {
+      return true;
+    }
+
+    if (other instanceof SentenceAttributeImpl) {
+      return ((SentenceAttributeImpl) other).index == index;
+    }
+
+    return false;
+  }
+
+  @Override
+  public int hashCode() {
+    return index;
+  }
+
+  @Override
+  public void copyTo(AttributeImpl target) {
+    SentenceAttribute t = (SentenceAttribute) target;
+    t.setSentenceIndex(index);
+  }
+
+  @Override
+  public void reflectWith(AttributeReflector reflector) {
+    reflector.reflect(SentenceAttribute.class, "sentences", index);
+  }
+
+  @Override
+  public int getSentenceIndex() {
+    return index;
+  }
+
+  @Override
+  public void setSentenceIndex(int sentence) {
+    this.index = sentence;
+  }
+}