You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2021/08/23 12:20:02 UTC
[lucene-solr] branch branch_8x updated: LUCENE-10059: Fix an
AssertionError when JapaneseTokenizer tries to backtrace from and to the
same position (#254) (#2557)
This is an automated email from the ASF dual-hosted git repository.
mikemccand pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/branch_8x by this push:
new 0fc8784 LUCENE-10059: Fix an AssertionError when JapaneseTokenizer tries to backtrace from and to the same position (#254) (#2557)
0fc8784 is described below
commit 0fc8784ff7bfea8b2c3f2c0fa8a99826bfa40c6d
Author: Dzung Bui <du...@gmail.com>
AuthorDate: Mon Aug 23 21:19:42 2021 +0900
LUCENE-10059: Fix an AssertionError when JapaneseTokenizer tries to backtrace from and to the same position (#254) (#2557)
---
.../lucene/analysis/ja/JapaneseTokenizer.java | 9 ++++++++
.../lucene/analysis/ja/TestJapaneseTokenizer.java | 25 ++++++++++++++++++++++
2 files changed, 34 insertions(+)
diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizer.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizer.java
index 96f64e4..7bc77db 100644
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizer.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizer.java
@@ -1630,6 +1630,15 @@ public final class JapaneseTokenizer extends Tokenizer {
private void backtrace(final Position endPosData, final int fromIDX) throws IOException {
final int endPos = endPosData.pos;
+ /**
+ * LUCENE-10059: If endPos is the same as lastBackTracePos, skip the backtrace to avoid an
+ * assertion error in {@link RollingCharBuffer#get(int)} when it tries to generate an empty
+ * buffer.
+ */
+ if (endPos == lastBackTracePos) {
+ return;
+ }
+
if (VERBOSE) {
System.out.println("\n backtrace: endPos=" + endPos + " pos=" + pos + "; " + (pos - lastBackTracePos) + " characters; last=" + lastBackTracePos + " cost=" + endPosData.costs[fromIDX]);
}
diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java
index 30836af..0ca846f 100644
--- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java
+++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java
@@ -25,6 +25,7 @@ import java.io.Reader;
import java.io.StringReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
+import java.util.List;
import java.util.Random;
import org.apache.lucene.analysis.Analyzer;
@@ -932,4 +933,28 @@ public class
assertAnalyzesTo(analyzerNoCompound, "北海道日本ハムファイターズ",
new String[]{"北海道", "日本", "ハムファイターズ"});
}
+
+ public void testEmptyBacktrace() throws IOException {
+ String text = "";
+
+ // since the max backtrace gap ({@link JapaneseTokenizer#MAX_BACKTRACE_GAP})
+ // is set to 1024, we want the first 1023 characters to generate multiple paths
+ // so that the regular backtrace is not executed.
+ for (int i = 0; i < 1023; i++) {
+ text += "あ";
+ }
+
+ // and the last 2 characters to be a valid word so that they
+ // will end up together
+ text += "手紙";
+
+ List<String> outputs = new ArrayList<>();
+ for (int i = 0; i < 511; i++) {
+ outputs.add("ああ");
+ }
+ outputs.add("あ");
+ outputs.add("手紙");
+
+ assertAnalyzesTo(analyzer, text, outputs.toArray(new String[0]));
+ }
}