You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by dw...@apache.org on 2022/09/23 18:11:39 UTC
[lucene] branch branch_9x updated: Revert "Fix repeating token sentence boundary bug (#11734)"
This is an automated email from the ASF dual-hosted git repository.
dweiss pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/lucene.git
The following commit(s) were added to refs/heads/branch_9x by this push:
new 64b124349ba Revert "Fix repeating token sentence boundary bug (#11734)"
64b124349ba is described below
commit 64b124349bac3cc764630e5e89de9574fc17dbe9
Author: Dawid Weiss <da...@carrotsearch.com>
AuthorDate: Fri Sep 23 20:11:26 2022 +0200
Revert "Fix repeating token sentence boundary bug (#11734)"
This reverts commit b79edbb54acd3a956435c126fb35ed49c3a2e7e0.
---
lucene/CHANGES.txt | 6 --
.../analysis/opennlp/OpenNLPChunkerFilter.java | 63 ++++++++++-------
.../analysis/opennlp/OpenNLPLemmatizerFilter.java | 63 ++++++++++-------
.../lucene/analysis/opennlp/OpenNLPPOSFilter.java | 63 +++++++++--------
.../lucene/analysis/opennlp/OpenNLPTokenizer.java | 19 ++---
.../opennlp/SentenceAttributeExtractor.java | 81 ---------------------
.../analysis/opennlp/data/early-exit-bug-input.txt | 23 ------
.../opennlp/data/early-exit-bug-output.txt | 32 ---------
.../opennlp/TestOpenNLPChunkerFilterFactory.java | 12 ----
.../TestOpenNLPLemmatizerFilterFactory.java | 82 ----------------------
.../opennlp/TestOpenNLPPOSFilterFactory.java | 24 -------
.../tokenattributes/SentenceAttribute.java | 42 -----------
.../tokenattributes/SentenceAttributeImpl.java | 80 ---------------------
13 files changed, 119 insertions(+), 471 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 15e97e0afa8..c5ad8e05651 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -24,12 +24,6 @@ Bug Fixes
trying to apply a dictionary whose size is greater than the maximum supported
window size for LZ4. (Adrien Grand)
-* GITHUB#11735: KeywordRepeatFilter + OpenNLPLemmatizer always drops last token of a stream.
- (Luke Kot-Zaniewski)
-
-* GITHUB#11771: KeywordRepeatFilter + OpenNLPLemmatizer sometimes arbitrarily exits token stream.
- (Luke Kot-Zaniewski)
-
Other
---------------------
* LUCENE-10423: Remove usages of System.currentTimeMillis() from tests. (Marios Trivyzas)
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPChunkerFilter.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPChunkerFilter.java
index 6510bbce8d4..00932278337 100644
--- a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPChunkerFilter.java
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPChunkerFilter.java
@@ -24,7 +24,7 @@ import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.opennlp.tools.NLPChunkerOp;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.SentenceAttribute;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.IgnoreRandomChains;
@@ -36,65 +36,76 @@ import org.apache.lucene.util.IgnoreRandomChains;
*/
@IgnoreRandomChains(reason = "other filters must precede this one (see docs)")
public final class OpenNLPChunkerFilter extends TokenFilter {
+
+ private List<AttributeSource> sentenceTokenAttrs = new ArrayList<>();
private int tokenNum = 0;
+ private boolean moreTokensAvailable = true;
+ private String[] sentenceTerms = null;
+ private String[] sentenceTermPOSTags = null;
+
private final NLPChunkerOp chunkerOp;
- private final SentenceAttributeExtractor sentenceAttributeExtractor;
+ private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+ private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
public OpenNLPChunkerFilter(TokenStream input, NLPChunkerOp chunkerOp) {
super(input);
this.chunkerOp = chunkerOp;
- sentenceAttributeExtractor =
- new SentenceAttributeExtractor(input, addAttribute(SentenceAttribute.class));
}
@Override
- public boolean incrementToken() throws IOException {
- List<AttributeSource> sentenceTokenAttrs = sentenceAttributeExtractor.getSentenceAttributes();
- boolean isEndOfCurrentSentence = tokenNum >= sentenceTokenAttrs.size();
- if (isEndOfCurrentSentence) {
- boolean noSentencesLeft =
- sentenceAttributeExtractor.allSentencesProcessed() || nextSentence().isEmpty();
- if (noSentencesLeft) {
+ public final boolean incrementToken() throws IOException {
+ if (!moreTokensAvailable) {
+ clear();
+ return false;
+ }
+ if (tokenNum == sentenceTokenAttrs.size()) {
+ nextSentence();
+ if (sentenceTerms == null) {
+ clear();
return false;
}
+ assignTokenTypes(chunkerOp.getChunks(sentenceTerms, sentenceTermPOSTags, null));
+ tokenNum = 0;
}
clearAttributes();
sentenceTokenAttrs.get(tokenNum++).copyTo(this);
return true;
}
- private List<AttributeSource> nextSentence() throws IOException {
- tokenNum = 0;
+ private void nextSentence() throws IOException {
List<String> termList = new ArrayList<>();
List<String> posTagList = new ArrayList<>();
- for (AttributeSource attributeSource : sentenceAttributeExtractor.extractSentenceAttributes()) {
- termList.add(attributeSource.getAttribute(CharTermAttribute.class).toString());
- posTagList.add(attributeSource.getAttribute(TypeAttribute.class).type());
+ sentenceTokenAttrs.clear();
+ boolean endOfSentence = false;
+ while (!endOfSentence && (moreTokensAvailable = input.incrementToken())) {
+ termList.add(termAtt.toString());
+ posTagList.add(typeAtt.type());
+ endOfSentence = 0 != (flagsAtt.getFlags() & OpenNLPTokenizer.EOS_FLAG_BIT);
+ sentenceTokenAttrs.add(input.cloneAttributes());
}
- String[] sentenceTerms = termList.toArray(new String[0]);
- String[] sentenceTermPOSTags = posTagList.toArray(new String[0]);
- assignTokenTypes(chunkerOp.getChunks(sentenceTerms, sentenceTermPOSTags, null));
- return sentenceAttributeExtractor.getSentenceAttributes();
+ sentenceTerms = termList.size() > 0 ? termList.toArray(new String[termList.size()]) : null;
+ sentenceTermPOSTags =
+ posTagList.size() > 0 ? posTagList.toArray(new String[posTagList.size()]) : null;
}
private void assignTokenTypes(String[] tags) {
for (int i = 0; i < tags.length; ++i) {
- sentenceAttributeExtractor
- .getSentenceAttributes()
- .get(i)
- .getAttribute(TypeAttribute.class)
- .setType(tags[i]);
+ sentenceTokenAttrs.get(i).getAttribute(TypeAttribute.class).setType(tags[i]);
}
}
@Override
public void reset() throws IOException {
super.reset();
- sentenceAttributeExtractor.reset();
+ moreTokensAvailable = true;
clear();
}
private void clear() {
+ sentenceTokenAttrs.clear();
+ sentenceTerms = null;
+ sentenceTermPOSTags = null;
tokenNum = 0;
}
}
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPLemmatizerFilter.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPLemmatizerFilter.java
index 53aab1e46c6..af14f03cf21 100644
--- a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPLemmatizerFilter.java
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPLemmatizerFilter.java
@@ -24,7 +24,10 @@ import java.util.List;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.opennlp.tools.NLPLemmatizerOp;
-import org.apache.lucene.analysis.tokenattributes.*;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.IgnoreRandomChains;
@@ -43,28 +46,37 @@ import org.apache.lucene.util.IgnoreRandomChains;
public class OpenNLPLemmatizerFilter extends TokenFilter {
private final NLPLemmatizerOp lemmatizerOp;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
+ private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
+ private List<AttributeSource> sentenceTokenAttrs = new ArrayList<>();
private Iterator<AttributeSource> sentenceTokenAttrsIter = null;
- private final SentenceAttributeExtractor sentenceAttributeExtractor;
- private String[] lemmas = new String[0]; // lemmas for non-keyword tokens
+ private boolean moreTokensAvailable = true;
+ private String[] sentenceTokens = null; // non-keyword tokens
+ private String[] sentenceTokenTypes = null; // types for non-keyword tokens
+ private String[] lemmas = null; // lemmas for non-keyword tokens
private int lemmaNum = 0; // lemma counter
public OpenNLPLemmatizerFilter(TokenStream input, NLPLemmatizerOp lemmatizerOp) {
super(input);
this.lemmatizerOp = lemmatizerOp;
- sentenceAttributeExtractor =
- new SentenceAttributeExtractor(input, addAttribute(SentenceAttribute.class));
}
@Override
public final boolean incrementToken() throws IOException {
- boolean isEndOfCurrentSentence = lemmaNum >= lemmas.length;
- if (isEndOfCurrentSentence) {
- boolean noSentencesLeft =
- sentenceAttributeExtractor.allSentencesProcessed() || nextSentence().isEmpty();
- if (noSentencesLeft) {
+ if (!moreTokensAvailable) {
+ clear();
+ return false;
+ }
+ if (sentenceTokenAttrsIter == null || !sentenceTokenAttrsIter.hasNext()) {
+ nextSentence();
+ if (sentenceTokens == null) { // zero non-keyword tokens
+ clear();
return false;
}
+ lemmas = lemmatizerOp.lemmatize(sentenceTokens, sentenceTokenTypes);
+ lemmaNum = 0;
+ sentenceTokenAttrsIter = sentenceTokenAttrs.iterator();
}
clearAttributes();
sentenceTokenAttrsIter.next().copyTo(this);
@@ -74,35 +86,36 @@ public class OpenNLPLemmatizerFilter extends TokenFilter {
return true;
}
- private List<AttributeSource> nextSentence() throws IOException {
- lemmaNum = 0;
+ private void nextSentence() throws IOException {
List<String> tokenList = new ArrayList<>();
List<String> typeList = new ArrayList<>();
- List<AttributeSource> sentenceAttributes =
- sentenceAttributeExtractor.extractSentenceAttributes();
- for (AttributeSource attributeSource : sentenceAttributes) {
- if (!attributeSource.getAttribute(KeywordAttribute.class).isKeyword()) {
- tokenList.add(attributeSource.getAttribute(CharTermAttribute.class).toString());
- typeList.add(attributeSource.getAttribute(TypeAttribute.class).type());
+ sentenceTokenAttrs.clear();
+ boolean endOfSentence = false;
+ while (!endOfSentence && (moreTokensAvailable = input.incrementToken())) {
+ if (!keywordAtt.isKeyword()) {
+ tokenList.add(termAtt.toString());
+ typeList.add(typeAtt.type());
}
+ endOfSentence = 0 != (flagsAtt.getFlags() & OpenNLPTokenizer.EOS_FLAG_BIT);
+ sentenceTokenAttrs.add(input.cloneAttributes());
}
- String[] sentenceTokens = tokenList.toArray(new String[0]);
- String[] sentenceTokenTypes = typeList.toArray(new String[0]);
- lemmas = lemmatizerOp.lemmatize(sentenceTokens, sentenceTokenTypes);
- sentenceTokenAttrsIter = sentenceAttributes.iterator();
- return sentenceAttributeExtractor.getSentenceAttributes();
+ sentenceTokens = tokenList.size() > 0 ? tokenList.toArray(new String[tokenList.size()]) : null;
+ sentenceTokenTypes = typeList.size() > 0 ? typeList.toArray(new String[typeList.size()]) : null;
}
@Override
public void reset() throws IOException {
super.reset();
- sentenceAttributeExtractor.reset();
+ moreTokensAvailable = true;
clear();
}
private void clear() {
+ sentenceTokenAttrs.clear();
sentenceTokenAttrsIter = null;
- lemmas = new String[0];
+ sentenceTokens = null;
+ sentenceTokenTypes = null;
+ lemmas = null;
lemmaNum = 0;
}
}
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPPOSFilter.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPPOSFilter.java
index d892c42f33b..2cb3ab595fc 100644
--- a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPPOSFilter.java
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPPOSFilter.java
@@ -24,7 +24,7 @@ import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.opennlp.tools.NLPPOSTaggerOp;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.SentenceAttribute;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.IgnoreRandomChains;
@@ -33,62 +33,65 @@ import org.apache.lucene.util.IgnoreRandomChains;
@IgnoreRandomChains(reason = "LUCENE-10352: add argument providers for this one")
public final class OpenNLPPOSFilter extends TokenFilter {
+ private List<AttributeSource> sentenceTokenAttrs = new ArrayList<>();
+ String[] tags = null;
private int tokenNum = 0;
+ private boolean moreTokensAvailable = true;
+
private final NLPPOSTaggerOp posTaggerOp;
- private final SentenceAttributeExtractor sentenceAttributeExtractor;
+ private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+ private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
public OpenNLPPOSFilter(TokenStream input, NLPPOSTaggerOp posTaggerOp) {
super(input);
this.posTaggerOp = posTaggerOp;
- sentenceAttributeExtractor =
- new SentenceAttributeExtractor(input, addAttribute(SentenceAttribute.class));
}
@Override
- public boolean incrementToken() throws IOException {
- List<AttributeSource> sentenceTokenAttrs = sentenceAttributeExtractor.getSentenceAttributes();
- boolean isEndOfCurrentSentence = tokenNum >= sentenceTokenAttrs.size();
- if (isEndOfCurrentSentence) {
- boolean noSentencesLeft =
- sentenceAttributeExtractor.allSentencesProcessed() || nextSentence().isEmpty();
- if (noSentencesLeft) {
+ public final boolean incrementToken() throws IOException {
+ if (!moreTokensAvailable) {
+ clear();
+ return false;
+ }
+ if (tokenNum
+ == sentenceTokenAttrs.size()) { // beginning of stream, or previous sentence exhausted
+ String[] sentenceTokens = nextSentence();
+ if (sentenceTokens == null) {
+ clear();
return false;
}
+ tags = posTaggerOp.getPOSTags(sentenceTokens);
+ tokenNum = 0;
}
clearAttributes();
- sentenceTokenAttrs.get(tokenNum++).copyTo(this);
+ sentenceTokenAttrs.get(tokenNum).copyTo(this);
+ typeAtt.setType(tags[tokenNum++]);
return true;
}
- private List<AttributeSource> nextSentence() throws IOException {
- tokenNum = 0;
+ private String[] nextSentence() throws IOException {
List<String> termList = new ArrayList<>();
- for (AttributeSource attributeSource : sentenceAttributeExtractor.extractSentenceAttributes()) {
- termList.add(attributeSource.getAttribute(CharTermAttribute.class).toString());
- }
- String[] sentenceTerms = termList.toArray(new String[0]);
- assignTokenTypes(posTaggerOp.getPOSTags(sentenceTerms));
- return sentenceAttributeExtractor.getSentenceAttributes();
- }
-
- private void assignTokenTypes(String[] tags) {
- for (int i = 0; i < tags.length; ++i) {
- sentenceAttributeExtractor
- .getSentenceAttributes()
- .get(i)
- .getAttribute(TypeAttribute.class)
- .setType(tags[i]);
+ sentenceTokenAttrs.clear();
+ boolean endOfSentence = false;
+ while (!endOfSentence && (moreTokensAvailable = input.incrementToken())) {
+ termList.add(termAtt.toString());
+ endOfSentence = 0 != (flagsAtt.getFlags() & OpenNLPTokenizer.EOS_FLAG_BIT);
+ sentenceTokenAttrs.add(input.cloneAttributes());
}
+ return termList.size() > 0 ? termList.toArray(new String[termList.size()]) : null;
}
@Override
public void reset() throws IOException {
super.reset();
- sentenceAttributeExtractor.reset();
+ moreTokensAvailable = true;
clear();
}
private void clear() {
+ sentenceTokenAttrs.clear();
+ tags = null;
tokenNum = 0;
}
}
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPTokenizer.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPTokenizer.java
index b9903630b25..c31f5c11ea0 100644
--- a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPTokenizer.java
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPTokenizer.java
@@ -22,27 +22,28 @@ import opennlp.tools.util.Span;
import org.apache.lucene.analysis.opennlp.tools.NLPSentenceDetectorOp;
import org.apache.lucene.analysis.opennlp.tools.NLPTokenizerOp;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.SentenceAttribute;
import org.apache.lucene.analysis.util.SegmentingTokenizerBase;
import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.IgnoreRandomChains;
/**
- * Run OpenNLP SentenceDetector and Tokenizer. The index of each sentence is stored in
- * SentenceAttribute.
+ * Run OpenNLP SentenceDetector and Tokenizer. The last token in each sentence is marked by setting
+ * the {@link #EOS_FLAG_BIT} in the FlagsAttribute; following filters can use this information to
+ * apply operations to tokens one sentence at a time.
*/
@IgnoreRandomChains(reason = "LUCENE-10352: add argument providers for this one")
public final class OpenNLPTokenizer extends SegmentingTokenizerBase {
+ public static int EOS_FLAG_BIT = 1;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
- private final SentenceAttribute sentenceAtt = addAttribute(SentenceAttribute.class);
private Span[] termSpans = null;
private int termNum = 0;
private int sentenceStart = 0;
- private int sentenceIndex = -1;
private NLPTokenizerOp tokenizerOp = null;
@@ -70,7 +71,6 @@ public final class OpenNLPTokenizer extends SegmentingTokenizerBase {
String sentenceText = new String(buffer, sentenceStart, sentenceEnd - sentenceStart);
termSpans = tokenizerOp.getTerms(sentenceText);
termNum = 0;
- sentenceIndex++;
}
@Override
@@ -84,7 +84,11 @@ public final class OpenNLPTokenizer extends SegmentingTokenizerBase {
offsetAtt.setOffset(
correctOffset(offset + sentenceStart + term.getStart()),
correctOffset(offset + sentenceStart + term.getEnd()));
- sentenceAtt.setSentenceIndex(sentenceIndex);
+ if (termNum == termSpans.length - 1) {
+ flagsAtt.setFlags(
+ flagsAtt.getFlags()
+ | EOS_FLAG_BIT); // mark the last token in the sentence with EOS_FLAG_BIT
+ }
++termNum;
return true;
}
@@ -94,6 +98,5 @@ public final class OpenNLPTokenizer extends SegmentingTokenizerBase {
super.reset();
termSpans = null;
termNum = sentenceStart = 0;
- sentenceIndex = -1;
}
}
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/SentenceAttributeExtractor.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/SentenceAttributeExtractor.java
deleted file mode 100644
index 34c88e5cf62..00000000000
--- a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/SentenceAttributeExtractor.java
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.lucene.analysis.opennlp;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.SentenceAttribute;
-import org.apache.lucene.util.AttributeSource;
-
-/**
- * Iterate through sentence tokens and cache their attributes. Could consider moving this to a more
- * central location to be used by other sentence-aware components.
- *
- * <p>May want to consider making this its own Filter so that extracted sentence token attributes
- * can be shared by downstream sentence-aware filters.
- */
-public class SentenceAttributeExtractor {
-
- private final TokenStream input;
- private final SentenceAttribute sentenceAtt;
- private final List<AttributeSource> sentenceTokenAttrs = new ArrayList<>();
- private AttributeSource prevAttributeSource;
- private int currSentence = 0;
- private boolean hasNextToken = true;
-
- public SentenceAttributeExtractor(TokenStream input, SentenceAttribute sentenceAtt) {
- this.input = input;
- this.sentenceAtt = sentenceAtt;
- }
-
- // If this class were a stand-alone filter it could conceivably extract the attributes once
- // and cache a reference to those attributes in SentenceAttribute. That way downstream filters
- // could read the full sentence without having to independently extract it.
- public List<AttributeSource> extractSentenceAttributes() throws IOException {
- sentenceTokenAttrs.clear();
- boolean hasNext;
- do {
- hasNextToken = input.incrementToken();
- int currSentenceTmp = sentenceAtt.getSentenceIndex();
- hasNext = (currSentence == currSentenceTmp && hasNextToken);
- currSentence = currSentenceTmp;
- if (prevAttributeSource != null) {
- sentenceTokenAttrs.add(prevAttributeSource);
- }
- prevAttributeSource = input.cloneAttributes();
- } while (hasNext);
- return sentenceTokenAttrs;
- }
-
- public List<AttributeSource> getSentenceAttributes() {
- return sentenceTokenAttrs;
- }
-
- public boolean allSentencesProcessed() {
- return !hasNextToken;
- }
-
- public void reset() {
- hasNextToken = true;
- sentenceTokenAttrs.clear();
- currSentence = 0;
- prevAttributeSource = null;
- }
-}
diff --git a/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/data/early-exit-bug-input.txt b/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/data/early-exit-bug-input.txt
deleted file mode 100644
index 1e202557df1..00000000000
--- a/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/data/early-exit-bug-input.txt
+++ /dev/null
@@ -1,23 +0,0 @@
-Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
-Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
-Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
-Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
-Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
-Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
-Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
-Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
-Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
-x
-Quick brown fox jumped over the lazy dog.
-x
-Quick brown fox jumped over the lazy dog.
-x
-Quick brown fox jumped over the lazy dog.
-x
-Quick brown fox jumped over the lazy dog.
-x
-Quick brown fox jumped over the lazy dog.
-x
-This should hopefully get analyzed.
-x
-And so should this.
\ No newline at end of file
diff --git a/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/data/early-exit-bug-output.txt b/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/data/early-exit-bug-output.txt
deleted file mode 100644
index 3678c205724..00000000000
--- a/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/data/early-exit-bug-output.txt
+++ /dev/null
@@ -1,32 +0,0 @@
-Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
-Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
-Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
-Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
-Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
-Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
-Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
-Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
-Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
-Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
-Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
-Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
-Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
-Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
-Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
-Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
-Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
-Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
-x x
-Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
-x x
-Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
-x x
-Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
-x x
-Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
-x x
-Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
-x x
-This This should should hopefully hopefully get get analyzed analyzed . .
-x x
-And And so so should should this this . .
\ No newline at end of file
diff --git a/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPChunkerFilterFactory.java b/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPChunkerFilterFactory.java
index 708f7bafc17..1132be3ffe6 100644
--- a/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPChunkerFilterFactory.java
+++ b/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPChunkerFilterFactory.java
@@ -114,16 +114,4 @@ public class TestOpenNLPChunkerFilterFactory extends BaseTokenStreamTestCase {
true,
toPayloads(SENTENCES_chunks));
}
-
- public void testEmptyField() throws Exception {
- CustomAnalyzer analyzer =
- CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
- .withTokenizer(
- "opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
- .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
- .addTokenFilter("opennlpChunker", "chunkerModel", chunkerModelFile)
- .addTokenFilter(TypeAsPayloadTokenFilterFactory.class)
- .build();
- assertAnalyzesTo(analyzer, "", new String[0], null, null, null, null, null, true);
- }
}
diff --git a/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPLemmatizerFilterFactory.java b/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPLemmatizerFilterFactory.java
index 681ec91eb22..d74bc451d93 100644
--- a/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPLemmatizerFilterFactory.java
+++ b/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPLemmatizerFilterFactory.java
@@ -17,11 +17,6 @@
package org.apache.lucene.analysis.opennlp;
-import java.io.IOException;
-import java.io.InputStream;
-import java.nio.charset.StandardCharsets;
-import java.util.Arrays;
-import java.util.stream.Collectors;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory;
import org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilterFactory;
@@ -113,10 +108,6 @@ public class TestOpenNLPLemmatizerFilterFactory extends BaseTokenStreamTestCase
"IN", "IN", "JJ", "JJ", "NN", "VBN", "VBN", ".", "NNP", "NNP", "VBN", "NN", ",", "NN", "."
};
- private static final String NO_BREAK_SINGLE_TOKEN_REPEAT_KEYWORD = "period";
-
- private static final String[] NO_BREAK_SINGLE_TOKEN_REPEAT_KEYWORD_terms = {"period", "period"};
-
private static final String tokenizerModelFile = "en-test-tokenizer.bin";
private static final String sentenceModelFile = "en-test-sent.bin";
private static final String posTaggerModelFile = "en-test-pos-maxent.bin";
@@ -299,77 +290,4 @@ public class TestOpenNLPLemmatizerFilterFactory extends BaseTokenStreamTestCase
null,
true);
}
-
- public void testNoBreakWithRepeatKeywordFilter() throws Exception {
- CustomAnalyzer analyzer =
- CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
- .withTokenizer(
- "opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
- .addTokenFilter("opennlpPOS", "posTaggerModel", "en-test-pos-maxent.bin")
- .addTokenFilter(KeywordRepeatFilterFactory.class)
- .addTokenFilter("opennlplemmatizer", "dictionary", "en-test-lemmas.dict")
- .build();
- assertAnalyzesTo(
- analyzer,
- NO_BREAK_SINGLE_TOKEN_REPEAT_KEYWORD,
- NO_BREAK_SINGLE_TOKEN_REPEAT_KEYWORD_terms,
- null,
- null,
- null,
- null,
- null,
- true);
- }
-
- // checks for bug described in https://github.com/apache/lucene/issues/11771
- public void testPreventEarlyExit() throws IOException {
- InputStream earlyExitInput = null;
- InputStream earlyExitOutput = null;
- try {
- ClasspathResourceLoader loader = new ClasspathResourceLoader(getClass());
- earlyExitInput = loader.openResource("data/early-exit-bug-input.txt");
- String earlyExitInputText = new String(earlyExitInput.readAllBytes(), StandardCharsets.UTF_8);
- earlyExitOutput = loader.openResource("data/early-exit-bug-output.txt");
- String earlyExitOutputText =
- new String(earlyExitOutput.readAllBytes(), StandardCharsets.UTF_8);
- String[] earlyExitOutputTexts =
- Arrays.stream(earlyExitOutputText.split("\\s"))
- .filter(text -> text != "")
- .collect(Collectors.joining(" "))
- .split(" ");
- CustomAnalyzer analyzer =
- CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
- .withTokenizer(
- "opennlp",
- "tokenizerModel",
- tokenizerModelFile,
- "sentenceModel",
- sentenceModelFile)
- .addTokenFilter("opennlpPOS", "posTaggerModel", "en-test-pos-maxent.bin")
- .addTokenFilter(KeywordRepeatFilterFactory.class)
- .addTokenFilter("opennlplemmatizer", "dictionary", "en-test-lemmas.dict")
- .build();
- assertAnalyzesTo(
- analyzer, earlyExitInputText, earlyExitOutputTexts, null, null, null, null, null, true);
- } finally {
- if (earlyExitInput != null) {
- earlyExitInput.close();
- }
- if (earlyExitOutput != null) {
- earlyExitOutput.close();
- }
- }
- }
-
- public void testEmptyField() throws Exception {
- CustomAnalyzer analyzer =
- CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
- .withTokenizer(
- "opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
- .addTokenFilter("opennlpPOS", "posTaggerModel", "en-test-pos-maxent.bin")
- .addTokenFilter(KeywordRepeatFilterFactory.class)
- .addTokenFilter("opennlplemmatizer", "dictionary", "en-test-lemmas.dict")
- .build();
- assertAnalyzesTo(analyzer, "", new String[0], null, null, null, null, null, true);
- }
}
diff --git a/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPPOSFilterFactory.java b/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPPOSFilterFactory.java
index bd3e649334f..e9ac0f796d6 100644
--- a/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPPOSFilterFactory.java
+++ b/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPPOSFilterFactory.java
@@ -21,7 +21,6 @@ import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
-import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory;
import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilterFactory;
import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.util.ClasspathResourceLoader;
@@ -67,7 +66,6 @@ public class TestOpenNLPPOSFilterFactory extends BaseTokenStreamTestCase {
private static final String[] NO_BREAK_terms = {"No", "period"};
private static final int[] NO_BREAK_startOffsets = {0, 3};
private static final int[] NO_BREAK_endOffsets = {2, 9};
- private static final String[] NO_BREAK_KEYWORD_REPEAT_terms = {"No", "No", "period", "period"};
private static final String sentenceModelFile = "en-test-sent.bin";
private static final String tokenizerModelFile = "en-test-tokenizer.bin";
@@ -146,26 +144,4 @@ public class TestOpenNLPPOSFilterFactory extends BaseTokenStreamTestCase {
null,
true);
}
-
- public void testNoBreakWithRepeatKeywordFilter() throws Exception {
- CustomAnalyzer analyzer =
- CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
- .withTokenizer(
- "opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
- .addTokenFilter(KeywordRepeatFilterFactory.class)
- .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
- .build();
- assertAnalyzesTo(
- analyzer, NO_BREAK, NO_BREAK_KEYWORD_REPEAT_terms, null, null, null, null, null, true);
- }
-
- public void testEmptyField() throws Exception {
- CustomAnalyzer analyzer =
- CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
- .withTokenizer(
- "opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
- .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
- .build();
- assertAnalyzesTo(analyzer, "", new String[0], null, null, null, null, null, true);
- }
}
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/SentenceAttribute.java b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/SentenceAttribute.java
deleted file mode 100644
index cbae8a4f120..00000000000
--- a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/SentenceAttribute.java
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.lucene.analysis.tokenattributes;
-
-import org.apache.lucene.util.Attribute;
-
-/**
- * This attribute tracks what sentence a given token belongs to as well as potentially other
- * sentence specific attributes.
- */
-public interface SentenceAttribute extends Attribute {
-
- /**
- * Get the sentence index for the current token
- *
- * @return The index of the sentence
- * @see #getSentenceIndex()
- */
- int getSentenceIndex();
-
- /**
- * Set the sentence of the current token
- *
- * @see #setSentenceIndex(int sentenceIndex)
- */
- void setSentenceIndex(int sentenceIndex);
-}
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/SentenceAttributeImpl.java b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/SentenceAttributeImpl.java
deleted file mode 100644
index 9911222ca15..00000000000
--- a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/SentenceAttributeImpl.java
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.lucene.analysis.tokenattributes;
-
-import org.apache.lucene.util.AttributeImpl;
-import org.apache.lucene.util.AttributeReflector;
-
-/**
- * Default implementation of {@link SentenceAttribute}.
- *
- * <p>The current implementation is coincidentally identical to {@link FlagsAttributeImpl} It was
- * decided to keep it separate because this attribute will NOT be an implied bitmap. Also, this
- * class may hold other sentence specific data in the future.
- */
-public class SentenceAttributeImpl extends AttributeImpl implements SentenceAttribute {
-
- private int index = 0;
-
- /** Initialize this attribute to default */
- public SentenceAttributeImpl() {}
-
- @Override
- public void clear() {
- index = 0;
- }
-
- @Override
- public boolean equals(Object other) {
- if (this == other) {
- return true;
- }
-
- if (other instanceof SentenceAttributeImpl) {
- return ((SentenceAttributeImpl) other).index == index;
- }
-
- return false;
- }
-
- @Override
- public int hashCode() {
- return index;
- }
-
- @Override
- public void copyTo(AttributeImpl target) {
- SentenceAttribute t = (SentenceAttribute) target;
- t.setSentenceIndex(index);
- }
-
- @Override
- public void reflectWith(AttributeReflector reflector) {
- reflector.reflect(SentenceAttribute.class, "sentences", index);
- }
-
- @Override
- public int getSentenceIndex() {
- return index;
- }
-
- @Override
- public void setSentenceIndex(int sentence) {
- this.index = sentence;
- }
-}