You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by dw...@apache.org on 2022/09/23 18:11:39 UTC

[lucene] branch branch_9x updated: Revert "Fix repeating token sentence boundary bug (#11734)"

This is an automated email from the ASF dual-hosted git repository.

dweiss pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/lucene.git


The following commit(s) were added to refs/heads/branch_9x by this push:
     new 64b124349ba Revert "Fix repeating token sentence boundary bug (#11734)"
64b124349ba is described below

commit 64b124349bac3cc764630e5e89de9574fc17dbe9
Author: Dawid Weiss <da...@carrotsearch.com>
AuthorDate: Fri Sep 23 20:11:26 2022 +0200

    Revert "Fix repeating token sentence boundary bug (#11734)"
    
    This reverts commit b79edbb54acd3a956435c126fb35ed49c3a2e7e0.
---
 lucene/CHANGES.txt                                 |  6 --
 .../analysis/opennlp/OpenNLPChunkerFilter.java     | 63 ++++++++++-------
 .../analysis/opennlp/OpenNLPLemmatizerFilter.java  | 63 ++++++++++-------
 .../lucene/analysis/opennlp/OpenNLPPOSFilter.java  | 63 +++++++++--------
 .../lucene/analysis/opennlp/OpenNLPTokenizer.java  | 19 ++---
 .../opennlp/SentenceAttributeExtractor.java        | 81 ---------------------
 .../analysis/opennlp/data/early-exit-bug-input.txt | 23 ------
 .../opennlp/data/early-exit-bug-output.txt         | 32 ---------
 .../opennlp/TestOpenNLPChunkerFilterFactory.java   | 12 ----
 .../TestOpenNLPLemmatizerFilterFactory.java        | 82 ----------------------
 .../opennlp/TestOpenNLPPOSFilterFactory.java       | 24 -------
 .../tokenattributes/SentenceAttribute.java         | 42 -----------
 .../tokenattributes/SentenceAttributeImpl.java     | 80 ---------------------
 13 files changed, 119 insertions(+), 471 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 15e97e0afa8..c5ad8e05651 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -24,12 +24,6 @@ Bug Fixes
   trying to apply a dictionary whose size is greater than the maximum supported
   window size for LZ4. (Adrien Grand)
 
-* GITHUB#11735: KeywordRepeatFilter + OpenNLPLemmatizer always drops last token of a stream.
-  (Luke Kot-Zaniewski)
-
-* GITHUB#11771: KeywordRepeatFilter + OpenNLPLemmatizer sometimes arbitrarily exits token stream.
-  (Luke Kot-Zaniewski)
-
 Other
 ---------------------
 * LUCENE-10423: Remove usages of System.currentTimeMillis() from tests. (Marios Trivyzas)
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPChunkerFilter.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPChunkerFilter.java
index 6510bbce8d4..00932278337 100644
--- a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPChunkerFilter.java
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPChunkerFilter.java
@@ -24,7 +24,7 @@ import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.opennlp.tools.NLPChunkerOp;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.SentenceAttribute;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.IgnoreRandomChains;
@@ -36,65 +36,76 @@ import org.apache.lucene.util.IgnoreRandomChains;
  */
 @IgnoreRandomChains(reason = "other filters must precede this one (see docs)")
 public final class OpenNLPChunkerFilter extends TokenFilter {
+
+  private List<AttributeSource> sentenceTokenAttrs = new ArrayList<>();
   private int tokenNum = 0;
+  private boolean moreTokensAvailable = true;
+  private String[] sentenceTerms = null;
+  private String[] sentenceTermPOSTags = null;
+
   private final NLPChunkerOp chunkerOp;
-  private final SentenceAttributeExtractor sentenceAttributeExtractor;
+  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+  private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
 
   public OpenNLPChunkerFilter(TokenStream input, NLPChunkerOp chunkerOp) {
     super(input);
     this.chunkerOp = chunkerOp;
-    sentenceAttributeExtractor =
-        new SentenceAttributeExtractor(input, addAttribute(SentenceAttribute.class));
   }
 
   @Override
-  public boolean incrementToken() throws IOException {
-    List<AttributeSource> sentenceTokenAttrs = sentenceAttributeExtractor.getSentenceAttributes();
-    boolean isEndOfCurrentSentence = tokenNum >= sentenceTokenAttrs.size();
-    if (isEndOfCurrentSentence) {
-      boolean noSentencesLeft =
-          sentenceAttributeExtractor.allSentencesProcessed() || nextSentence().isEmpty();
-      if (noSentencesLeft) {
+  public final boolean incrementToken() throws IOException {
+    if (!moreTokensAvailable) {
+      clear();
+      return false;
+    }
+    if (tokenNum == sentenceTokenAttrs.size()) {
+      nextSentence();
+      if (sentenceTerms == null) {
+        clear();
         return false;
       }
+      assignTokenTypes(chunkerOp.getChunks(sentenceTerms, sentenceTermPOSTags, null));
+      tokenNum = 0;
     }
     clearAttributes();
     sentenceTokenAttrs.get(tokenNum++).copyTo(this);
     return true;
   }
 
-  private List<AttributeSource> nextSentence() throws IOException {
-    tokenNum = 0;
+  private void nextSentence() throws IOException {
     List<String> termList = new ArrayList<>();
     List<String> posTagList = new ArrayList<>();
-    for (AttributeSource attributeSource : sentenceAttributeExtractor.extractSentenceAttributes()) {
-      termList.add(attributeSource.getAttribute(CharTermAttribute.class).toString());
-      posTagList.add(attributeSource.getAttribute(TypeAttribute.class).type());
+    sentenceTokenAttrs.clear();
+    boolean endOfSentence = false;
+    while (!endOfSentence && (moreTokensAvailable = input.incrementToken())) {
+      termList.add(termAtt.toString());
+      posTagList.add(typeAtt.type());
+      endOfSentence = 0 != (flagsAtt.getFlags() & OpenNLPTokenizer.EOS_FLAG_BIT);
+      sentenceTokenAttrs.add(input.cloneAttributes());
     }
-    String[] sentenceTerms = termList.toArray(new String[0]);
-    String[] sentenceTermPOSTags = posTagList.toArray(new String[0]);
-    assignTokenTypes(chunkerOp.getChunks(sentenceTerms, sentenceTermPOSTags, null));
-    return sentenceAttributeExtractor.getSentenceAttributes();
+    sentenceTerms = termList.size() > 0 ? termList.toArray(new String[termList.size()]) : null;
+    sentenceTermPOSTags =
+        posTagList.size() > 0 ? posTagList.toArray(new String[posTagList.size()]) : null;
   }
 
   private void assignTokenTypes(String[] tags) {
     for (int i = 0; i < tags.length; ++i) {
-      sentenceAttributeExtractor
-          .getSentenceAttributes()
-          .get(i)
-          .getAttribute(TypeAttribute.class)
-          .setType(tags[i]);
+      sentenceTokenAttrs.get(i).getAttribute(TypeAttribute.class).setType(tags[i]);
     }
   }
 
   @Override
   public void reset() throws IOException {
     super.reset();
-    sentenceAttributeExtractor.reset();
+    moreTokensAvailable = true;
     clear();
   }
 
   private void clear() {
+    sentenceTokenAttrs.clear();
+    sentenceTerms = null;
+    sentenceTermPOSTags = null;
     tokenNum = 0;
   }
 }
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPLemmatizerFilter.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPLemmatizerFilter.java
index 53aab1e46c6..af14f03cf21 100644
--- a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPLemmatizerFilter.java
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPLemmatizerFilter.java
@@ -24,7 +24,10 @@ import java.util.List;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.opennlp.tools.NLPLemmatizerOp;
-import org.apache.lucene.analysis.tokenattributes.*;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.IgnoreRandomChains;
 
@@ -43,28 +46,37 @@ import org.apache.lucene.util.IgnoreRandomChains;
 public class OpenNLPLemmatizerFilter extends TokenFilter {
   private final NLPLemmatizerOp lemmatizerOp;
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
   private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
+  private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
+  private List<AttributeSource> sentenceTokenAttrs = new ArrayList<>();
   private Iterator<AttributeSource> sentenceTokenAttrsIter = null;
-  private final SentenceAttributeExtractor sentenceAttributeExtractor;
-  private String[] lemmas = new String[0]; // lemmas for non-keyword tokens
+  private boolean moreTokensAvailable = true;
+  private String[] sentenceTokens = null; // non-keyword tokens
+  private String[] sentenceTokenTypes = null; // types for non-keyword tokens
+  private String[] lemmas = null; // lemmas for non-keyword tokens
   private int lemmaNum = 0; // lemma counter
 
   public OpenNLPLemmatizerFilter(TokenStream input, NLPLemmatizerOp lemmatizerOp) {
     super(input);
     this.lemmatizerOp = lemmatizerOp;
-    sentenceAttributeExtractor =
-        new SentenceAttributeExtractor(input, addAttribute(SentenceAttribute.class));
   }
 
   @Override
   public final boolean incrementToken() throws IOException {
-    boolean isEndOfCurrentSentence = lemmaNum >= lemmas.length;
-    if (isEndOfCurrentSentence) {
-      boolean noSentencesLeft =
-          sentenceAttributeExtractor.allSentencesProcessed() || nextSentence().isEmpty();
-      if (noSentencesLeft) {
+    if (!moreTokensAvailable) {
+      clear();
+      return false;
+    }
+    if (sentenceTokenAttrsIter == null || !sentenceTokenAttrsIter.hasNext()) {
+      nextSentence();
+      if (sentenceTokens == null) { // zero non-keyword tokens
+        clear();
         return false;
       }
+      lemmas = lemmatizerOp.lemmatize(sentenceTokens, sentenceTokenTypes);
+      lemmaNum = 0;
+      sentenceTokenAttrsIter = sentenceTokenAttrs.iterator();
     }
     clearAttributes();
     sentenceTokenAttrsIter.next().copyTo(this);
@@ -74,35 +86,36 @@ public class OpenNLPLemmatizerFilter extends TokenFilter {
     return true;
   }
 
-  private List<AttributeSource> nextSentence() throws IOException {
-    lemmaNum = 0;
+  private void nextSentence() throws IOException {
     List<String> tokenList = new ArrayList<>();
     List<String> typeList = new ArrayList<>();
-    List<AttributeSource> sentenceAttributes =
-        sentenceAttributeExtractor.extractSentenceAttributes();
-    for (AttributeSource attributeSource : sentenceAttributes) {
-      if (!attributeSource.getAttribute(KeywordAttribute.class).isKeyword()) {
-        tokenList.add(attributeSource.getAttribute(CharTermAttribute.class).toString());
-        typeList.add(attributeSource.getAttribute(TypeAttribute.class).type());
+    sentenceTokenAttrs.clear();
+    boolean endOfSentence = false;
+    while (!endOfSentence && (moreTokensAvailable = input.incrementToken())) {
+      if (!keywordAtt.isKeyword()) {
+        tokenList.add(termAtt.toString());
+        typeList.add(typeAtt.type());
       }
+      endOfSentence = 0 != (flagsAtt.getFlags() & OpenNLPTokenizer.EOS_FLAG_BIT);
+      sentenceTokenAttrs.add(input.cloneAttributes());
     }
-    String[] sentenceTokens = tokenList.toArray(new String[0]);
-    String[] sentenceTokenTypes = typeList.toArray(new String[0]);
-    lemmas = lemmatizerOp.lemmatize(sentenceTokens, sentenceTokenTypes);
-    sentenceTokenAttrsIter = sentenceAttributes.iterator();
-    return sentenceAttributeExtractor.getSentenceAttributes();
+    sentenceTokens = tokenList.size() > 0 ? tokenList.toArray(new String[tokenList.size()]) : null;
+    sentenceTokenTypes = typeList.size() > 0 ? typeList.toArray(new String[typeList.size()]) : null;
   }
 
   @Override
   public void reset() throws IOException {
     super.reset();
-    sentenceAttributeExtractor.reset();
+    moreTokensAvailable = true;
     clear();
   }
 
   private void clear() {
+    sentenceTokenAttrs.clear();
     sentenceTokenAttrsIter = null;
-    lemmas = new String[0];
+    sentenceTokens = null;
+    sentenceTokenTypes = null;
+    lemmas = null;
     lemmaNum = 0;
   }
 }
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPPOSFilter.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPPOSFilter.java
index d892c42f33b..2cb3ab595fc 100644
--- a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPPOSFilter.java
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPPOSFilter.java
@@ -24,7 +24,7 @@ import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.opennlp.tools.NLPPOSTaggerOp;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.SentenceAttribute;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.IgnoreRandomChains;
@@ -33,62 +33,65 @@ import org.apache.lucene.util.IgnoreRandomChains;
 @IgnoreRandomChains(reason = "LUCENE-10352: add argument providers for this one")
 public final class OpenNLPPOSFilter extends TokenFilter {
 
+  private List<AttributeSource> sentenceTokenAttrs = new ArrayList<>();
+  String[] tags = null;
   private int tokenNum = 0;
+  private boolean moreTokensAvailable = true;
+
   private final NLPPOSTaggerOp posTaggerOp;
-  private final SentenceAttributeExtractor sentenceAttributeExtractor;
+  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+  private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
 
   public OpenNLPPOSFilter(TokenStream input, NLPPOSTaggerOp posTaggerOp) {
     super(input);
     this.posTaggerOp = posTaggerOp;
-    sentenceAttributeExtractor =
-        new SentenceAttributeExtractor(input, addAttribute(SentenceAttribute.class));
   }
 
   @Override
-  public boolean incrementToken() throws IOException {
-    List<AttributeSource> sentenceTokenAttrs = sentenceAttributeExtractor.getSentenceAttributes();
-    boolean isEndOfCurrentSentence = tokenNum >= sentenceTokenAttrs.size();
-    if (isEndOfCurrentSentence) {
-      boolean noSentencesLeft =
-          sentenceAttributeExtractor.allSentencesProcessed() || nextSentence().isEmpty();
-      if (noSentencesLeft) {
+  public final boolean incrementToken() throws IOException {
+    if (!moreTokensAvailable) {
+      clear();
+      return false;
+    }
+    if (tokenNum
+        == sentenceTokenAttrs.size()) { // beginning of stream, or previous sentence exhausted
+      String[] sentenceTokens = nextSentence();
+      if (sentenceTokens == null) {
+        clear();
         return false;
       }
+      tags = posTaggerOp.getPOSTags(sentenceTokens);
+      tokenNum = 0;
     }
     clearAttributes();
-    sentenceTokenAttrs.get(tokenNum++).copyTo(this);
+    sentenceTokenAttrs.get(tokenNum).copyTo(this);
+    typeAtt.setType(tags[tokenNum++]);
     return true;
   }
 
-  private List<AttributeSource> nextSentence() throws IOException {
-    tokenNum = 0;
+  private String[] nextSentence() throws IOException {
     List<String> termList = new ArrayList<>();
-    for (AttributeSource attributeSource : sentenceAttributeExtractor.extractSentenceAttributes()) {
-      termList.add(attributeSource.getAttribute(CharTermAttribute.class).toString());
-    }
-    String[] sentenceTerms = termList.toArray(new String[0]);
-    assignTokenTypes(posTaggerOp.getPOSTags(sentenceTerms));
-    return sentenceAttributeExtractor.getSentenceAttributes();
-  }
-
-  private void assignTokenTypes(String[] tags) {
-    for (int i = 0; i < tags.length; ++i) {
-      sentenceAttributeExtractor
-          .getSentenceAttributes()
-          .get(i)
-          .getAttribute(TypeAttribute.class)
-          .setType(tags[i]);
+    sentenceTokenAttrs.clear();
+    boolean endOfSentence = false;
+    while (!endOfSentence && (moreTokensAvailable = input.incrementToken())) {
+      termList.add(termAtt.toString());
+      endOfSentence = 0 != (flagsAtt.getFlags() & OpenNLPTokenizer.EOS_FLAG_BIT);
+      sentenceTokenAttrs.add(input.cloneAttributes());
     }
+    return termList.size() > 0 ? termList.toArray(new String[termList.size()]) : null;
   }
 
   @Override
   public void reset() throws IOException {
     super.reset();
-    sentenceAttributeExtractor.reset();
+    moreTokensAvailable = true;
     clear();
   }
 
   private void clear() {
+    sentenceTokenAttrs.clear();
+    tags = null;
     tokenNum = 0;
   }
 }
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPTokenizer.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPTokenizer.java
index b9903630b25..c31f5c11ea0 100644
--- a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPTokenizer.java
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPTokenizer.java
@@ -22,27 +22,28 @@ import opennlp.tools.util.Span;
 import org.apache.lucene.analysis.opennlp.tools.NLPSentenceDetectorOp;
 import org.apache.lucene.analysis.opennlp.tools.NLPTokenizerOp;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.SentenceAttribute;
 import org.apache.lucene.analysis.util.SegmentingTokenizerBase;
 import org.apache.lucene.util.AttributeFactory;
 import org.apache.lucene.util.IgnoreRandomChains;
 
 /**
- * Run OpenNLP SentenceDetector and Tokenizer. The index of each sentence is stored in
- * SentenceAttribute.
+ * Run OpenNLP SentenceDetector and Tokenizer. The last token in each sentence is marked by setting
+ * the {@link #EOS_FLAG_BIT} in the FlagsAttribute; following filters can use this information to
+ * apply operations to tokens one sentence at a time.
  */
 @IgnoreRandomChains(reason = "LUCENE-10352: add argument providers for this one")
 public final class OpenNLPTokenizer extends SegmentingTokenizerBase {
+  public static int EOS_FLAG_BIT = 1;
 
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
-  private final SentenceAttribute sentenceAtt = addAttribute(SentenceAttribute.class);
 
   private Span[] termSpans = null;
   private int termNum = 0;
   private int sentenceStart = 0;
-  private int sentenceIndex = -1;
 
   private NLPTokenizerOp tokenizerOp = null;
 
@@ -70,7 +71,6 @@ public final class OpenNLPTokenizer extends SegmentingTokenizerBase {
     String sentenceText = new String(buffer, sentenceStart, sentenceEnd - sentenceStart);
     termSpans = tokenizerOp.getTerms(sentenceText);
     termNum = 0;
-    sentenceIndex++;
   }
 
   @Override
@@ -84,7 +84,11 @@ public final class OpenNLPTokenizer extends SegmentingTokenizerBase {
     offsetAtt.setOffset(
         correctOffset(offset + sentenceStart + term.getStart()),
         correctOffset(offset + sentenceStart + term.getEnd()));
-    sentenceAtt.setSentenceIndex(sentenceIndex);
+    if (termNum == termSpans.length - 1) {
+      flagsAtt.setFlags(
+          flagsAtt.getFlags()
+              | EOS_FLAG_BIT); // mark the last token in the sentence with EOS_FLAG_BIT
+    }
     ++termNum;
     return true;
   }
@@ -94,6 +98,5 @@ public final class OpenNLPTokenizer extends SegmentingTokenizerBase {
     super.reset();
     termSpans = null;
     termNum = sentenceStart = 0;
-    sentenceIndex = -1;
   }
 }
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/SentenceAttributeExtractor.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/SentenceAttributeExtractor.java
deleted file mode 100644
index 34c88e5cf62..00000000000
--- a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/SentenceAttributeExtractor.java
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.lucene.analysis.opennlp;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.SentenceAttribute;
-import org.apache.lucene.util.AttributeSource;
-
-/**
- * Iterate through sentence tokens and cache their attributes. Could consider moving this to a more
- * central location to be used by other sentence-aware components.
- *
- * <p>May want to consider making this its own Filter so that extracted sentence token attributes
- * can be shared by downstream sentence-aware filters.
- */
-public class SentenceAttributeExtractor {
-
-  private final TokenStream input;
-  private final SentenceAttribute sentenceAtt;
-  private final List<AttributeSource> sentenceTokenAttrs = new ArrayList<>();
-  private AttributeSource prevAttributeSource;
-  private int currSentence = 0;
-  private boolean hasNextToken = true;
-
-  public SentenceAttributeExtractor(TokenStream input, SentenceAttribute sentenceAtt) {
-    this.input = input;
-    this.sentenceAtt = sentenceAtt;
-  }
-
-  // If this class were a stand-alone filter it could conceivably extract the attributes once
-  // and cache a reference to those attributes in SentenceAttribute. That way downstream filters
-  // could read the full sentence without having to independently extract it.
-  public List<AttributeSource> extractSentenceAttributes() throws IOException {
-    sentenceTokenAttrs.clear();
-    boolean hasNext;
-    do {
-      hasNextToken = input.incrementToken();
-      int currSentenceTmp = sentenceAtt.getSentenceIndex();
-      hasNext = (currSentence == currSentenceTmp && hasNextToken);
-      currSentence = currSentenceTmp;
-      if (prevAttributeSource != null) {
-        sentenceTokenAttrs.add(prevAttributeSource);
-      }
-      prevAttributeSource = input.cloneAttributes();
-    } while (hasNext);
-    return sentenceTokenAttrs;
-  }
-
-  public List<AttributeSource> getSentenceAttributes() {
-    return sentenceTokenAttrs;
-  }
-
-  public boolean allSentencesProcessed() {
-    return !hasNextToken;
-  }
-
-  public void reset() {
-    hasNextToken = true;
-    sentenceTokenAttrs.clear();
-    currSentence = 0;
-    prevAttributeSource = null;
-  }
-}
diff --git a/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/data/early-exit-bug-input.txt b/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/data/early-exit-bug-input.txt
deleted file mode 100644
index 1e202557df1..00000000000
--- a/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/data/early-exit-bug-input.txt
+++ /dev/null
@@ -1,23 +0,0 @@
-Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
-Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
-Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
-Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
-Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
-Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
-Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
-Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
-Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
-x
-Quick brown fox jumped over the lazy dog.
-x
-Quick brown fox jumped over the lazy dog.
-x
-Quick brown fox jumped over the lazy dog.
-x
-Quick brown fox jumped over the lazy dog.
-x
-Quick brown fox jumped over the lazy dog.
-x
-This should hopefully get analyzed.
-x
-And so should this.
\ No newline at end of file
diff --git a/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/data/early-exit-bug-output.txt b/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/data/early-exit-bug-output.txt
deleted file mode 100644
index 3678c205724..00000000000
--- a/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/data/early-exit-bug-output.txt
+++ /dev/null
@@ -1,32 +0,0 @@
-Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
-Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
-Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
-Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
-Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
-Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
-Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
-Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
-Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
-Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
-Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
-Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
-Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
-Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
-Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
-Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
-Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
-Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
-x x
-Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
-x x
-Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
-x x
-Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
-x x
-Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
-x x
-Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
-x x
-This This should should hopefully hopefully get get analyzed analyzed . .
-x x
-And And so so should should this this . .
\ No newline at end of file
diff --git a/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPChunkerFilterFactory.java b/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPChunkerFilterFactory.java
index 708f7bafc17..1132be3ffe6 100644
--- a/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPChunkerFilterFactory.java
+++ b/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPChunkerFilterFactory.java
@@ -114,16 +114,4 @@ public class TestOpenNLPChunkerFilterFactory extends BaseTokenStreamTestCase {
         true,
         toPayloads(SENTENCES_chunks));
   }
-
-  public void testEmptyField() throws Exception {
-    CustomAnalyzer analyzer =
-        CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
-            .withTokenizer(
-                "opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
-            .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
-            .addTokenFilter("opennlpChunker", "chunkerModel", chunkerModelFile)
-            .addTokenFilter(TypeAsPayloadTokenFilterFactory.class)
-            .build();
-    assertAnalyzesTo(analyzer, "", new String[0], null, null, null, null, null, true);
-  }
 }
diff --git a/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPLemmatizerFilterFactory.java b/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPLemmatizerFilterFactory.java
index 681ec91eb22..d74bc451d93 100644
--- a/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPLemmatizerFilterFactory.java
+++ b/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPLemmatizerFilterFactory.java
@@ -17,11 +17,6 @@
 
 package org.apache.lucene.analysis.opennlp;
 
-import java.io.IOException;
-import java.io.InputStream;
-import java.nio.charset.StandardCharsets;
-import java.util.Arrays;
-import java.util.stream.Collectors;
 import org.apache.lucene.analysis.custom.CustomAnalyzer;
 import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory;
 import org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilterFactory;
@@ -113,10 +108,6 @@ public class TestOpenNLPLemmatizerFilterFactory extends BaseTokenStreamTestCase
     "IN", "IN", "JJ", "JJ", "NN", "VBN", "VBN", ".", "NNP", "NNP", "VBN", "NN", ",", "NN", "."
   };
 
-  private static final String NO_BREAK_SINGLE_TOKEN_REPEAT_KEYWORD = "period";
-
-  private static final String[] NO_BREAK_SINGLE_TOKEN_REPEAT_KEYWORD_terms = {"period", "period"};
-
   private static final String tokenizerModelFile = "en-test-tokenizer.bin";
   private static final String sentenceModelFile = "en-test-sent.bin";
   private static final String posTaggerModelFile = "en-test-pos-maxent.bin";
@@ -299,77 +290,4 @@ public class TestOpenNLPLemmatizerFilterFactory extends BaseTokenStreamTestCase
         null,
         true);
   }
-
-  public void testNoBreakWithRepeatKeywordFilter() throws Exception {
-    CustomAnalyzer analyzer =
-        CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
-            .withTokenizer(
-                "opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
-            .addTokenFilter("opennlpPOS", "posTaggerModel", "en-test-pos-maxent.bin")
-            .addTokenFilter(KeywordRepeatFilterFactory.class)
-            .addTokenFilter("opennlplemmatizer", "dictionary", "en-test-lemmas.dict")
-            .build();
-    assertAnalyzesTo(
-        analyzer,
-        NO_BREAK_SINGLE_TOKEN_REPEAT_KEYWORD,
-        NO_BREAK_SINGLE_TOKEN_REPEAT_KEYWORD_terms,
-        null,
-        null,
-        null,
-        null,
-        null,
-        true);
-  }
-
-  // checks for bug described in https://github.com/apache/lucene/issues/11771
-  public void testPreventEarlyExit() throws IOException {
-    InputStream earlyExitInput = null;
-    InputStream earlyExitOutput = null;
-    try {
-      ClasspathResourceLoader loader = new ClasspathResourceLoader(getClass());
-      earlyExitInput = loader.openResource("data/early-exit-bug-input.txt");
-      String earlyExitInputText = new String(earlyExitInput.readAllBytes(), StandardCharsets.UTF_8);
-      earlyExitOutput = loader.openResource("data/early-exit-bug-output.txt");
-      String earlyExitOutputText =
-          new String(earlyExitOutput.readAllBytes(), StandardCharsets.UTF_8);
-      String[] earlyExitOutputTexts =
-          Arrays.stream(earlyExitOutputText.split("\\s"))
-              .filter(text -> text != "")
-              .collect(Collectors.joining(" "))
-              .split(" ");
-      CustomAnalyzer analyzer =
-          CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
-              .withTokenizer(
-                  "opennlp",
-                  "tokenizerModel",
-                  tokenizerModelFile,
-                  "sentenceModel",
-                  sentenceModelFile)
-              .addTokenFilter("opennlpPOS", "posTaggerModel", "en-test-pos-maxent.bin")
-              .addTokenFilter(KeywordRepeatFilterFactory.class)
-              .addTokenFilter("opennlplemmatizer", "dictionary", "en-test-lemmas.dict")
-              .build();
-      assertAnalyzesTo(
-          analyzer, earlyExitInputText, earlyExitOutputTexts, null, null, null, null, null, true);
-    } finally {
-      if (earlyExitInput != null) {
-        earlyExitInput.close();
-      }
-      if (earlyExitOutput != null) {
-        earlyExitOutput.close();
-      }
-    }
-  }
-
-  public void testEmptyField() throws Exception {
-    CustomAnalyzer analyzer =
-        CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
-            .withTokenizer(
-                "opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
-            .addTokenFilter("opennlpPOS", "posTaggerModel", "en-test-pos-maxent.bin")
-            .addTokenFilter(KeywordRepeatFilterFactory.class)
-            .addTokenFilter("opennlplemmatizer", "dictionary", "en-test-lemmas.dict")
-            .build();
-    assertAnalyzesTo(analyzer, "", new String[0], null, null, null, null, null, true);
-  }
 }
diff --git a/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPPOSFilterFactory.java b/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPPOSFilterFactory.java
index bd3e649334f..e9ac0f796d6 100644
--- a/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPPOSFilterFactory.java
+++ b/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPPOSFilterFactory.java
@@ -21,7 +21,6 @@ import java.io.IOException;
 import java.nio.charset.StandardCharsets;
 import java.util.Arrays;
 import org.apache.lucene.analysis.custom.CustomAnalyzer;
-import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory;
 import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilterFactory;
 import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.util.ClasspathResourceLoader;
@@ -67,7 +66,6 @@ public class TestOpenNLPPOSFilterFactory extends BaseTokenStreamTestCase {
   private static final String[] NO_BREAK_terms = {"No", "period"};
   private static final int[] NO_BREAK_startOffsets = {0, 3};
   private static final int[] NO_BREAK_endOffsets = {2, 9};
-  private static final String[] NO_BREAK_KEYWORD_REPEAT_terms = {"No", "No", "period", "period"};
 
   private static final String sentenceModelFile = "en-test-sent.bin";
   private static final String tokenizerModelFile = "en-test-tokenizer.bin";
@@ -146,26 +144,4 @@ public class TestOpenNLPPOSFilterFactory extends BaseTokenStreamTestCase {
         null,
         true);
   }
-
-  public void testNoBreakWithRepeatKeywordFilter() throws Exception {
-    CustomAnalyzer analyzer =
-        CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
-            .withTokenizer(
-                "opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
-            .addTokenFilter(KeywordRepeatFilterFactory.class)
-            .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
-            .build();
-    assertAnalyzesTo(
-        analyzer, NO_BREAK, NO_BREAK_KEYWORD_REPEAT_terms, null, null, null, null, null, true);
-  }
-
-  public void testEmptyField() throws Exception {
-    CustomAnalyzer analyzer =
-        CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
-            .withTokenizer(
-                "opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
-            .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
-            .build();
-    assertAnalyzesTo(analyzer, "", new String[0], null, null, null, null, null, true);
-  }
 }
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/SentenceAttribute.java b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/SentenceAttribute.java
deleted file mode 100644
index cbae8a4f120..00000000000
--- a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/SentenceAttribute.java
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.lucene.analysis.tokenattributes;
-
-import org.apache.lucene.util.Attribute;
-
-/**
- * This attribute tracks what sentence a given token belongs to as well as potentially other
- * sentence specific attributes.
- */
-public interface SentenceAttribute extends Attribute {
-
-  /**
-   * Get the sentence index for the current token
-   *
-   * @return The index of the sentence
-   * @see #getSentenceIndex()
-   */
-  int getSentenceIndex();
-
-  /**
-   * Set the sentence of the current token
-   *
-   * @see #setSentenceIndex(int sentenceIndex)
-   */
-  void setSentenceIndex(int sentenceIndex);
-}
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/SentenceAttributeImpl.java b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/SentenceAttributeImpl.java
deleted file mode 100644
index 9911222ca15..00000000000
--- a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/SentenceAttributeImpl.java
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.lucene.analysis.tokenattributes;
-
-import org.apache.lucene.util.AttributeImpl;
-import org.apache.lucene.util.AttributeReflector;
-
-/**
- * Default implementation of {@link SentenceAttribute}.
- *
- * <p>The current implementation is coincidentally identical to {@link FlagsAttributeImpl} It was
- * decided to keep it separate because this attribute will NOT be an implied bitmap. Also, this
- * class may hold other sentence specific data in the future.
- */
-public class SentenceAttributeImpl extends AttributeImpl implements SentenceAttribute {
-
-  private int index = 0;
-
-  /** Initialize this attribute to default */
-  public SentenceAttributeImpl() {}
-
-  @Override
-  public void clear() {
-    index = 0;
-  }
-
-  @Override
-  public boolean equals(Object other) {
-    if (this == other) {
-      return true;
-    }
-
-    if (other instanceof SentenceAttributeImpl) {
-      return ((SentenceAttributeImpl) other).index == index;
-    }
-
-    return false;
-  }
-
-  @Override
-  public int hashCode() {
-    return index;
-  }
-
-  @Override
-  public void copyTo(AttributeImpl target) {
-    SentenceAttribute t = (SentenceAttribute) target;
-    t.setSentenceIndex(index);
-  }
-
-  @Override
-  public void reflectWith(AttributeReflector reflector) {
-    reflector.reflect(SentenceAttribute.class, "sentences", index);
-  }
-
-  @Override
-  public int getSentenceIndex() {
-    return index;
-  }
-
-  @Override
-  public void setSentenceIndex(int sentence) {
-    this.index = sentence;
-  }
-}