You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2021/08/23 12:20:02 UTC
[lucene-solr] branch branch_8x updated: LUCENE-10059: Fix an AssertionError when JapaneseTokenizer tries to backtrace from and to the same position (#254) (#2557)

This is an automated email from the ASF dual-hosted git repository.

mikemccand pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/branch_8x by this push:
     new 0fc8784  LUCENE-10059: Fix an AssertionError when JapaneseTokenizer tries to backtrace from and to the same position (#254) (#2557)
0fc8784 is described below

commit 0fc8784ff7bfea8b2c3f2c0fa8a99826bfa40c6d
Author: Dzung Bui <du...@gmail.com>
AuthorDate: Mon Aug 23 21:19:42 2021 +0900

    LUCENE-10059: Fix an AssertionError when JapaneseTokenizer tries to backtrace from and to the same position (#254) (#2557)
---
 .../lucene/analysis/ja/JapaneseTokenizer.java      |  9 ++++++++
 .../lucene/analysis/ja/TestJapaneseTokenizer.java  | 25 ++++++++++++++++++++++
 2 files changed, 34 insertions(+)

diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizer.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizer.java
index 96f64e4..7bc77db 100644
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizer.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizer.java
@@ -1630,6 +1630,15 @@ public final class JapaneseTokenizer extends Tokenizer {
   private void backtrace(final Position endPosData, final int fromIDX) throws IOException {
     final int endPos = endPosData.pos;
 
+    /**
+     * LUCENE-10059: If endPos is the same as lastBackTracePos, skip the backtrace to avoid
+     * an assertion error in {@link RollingCharBuffer#get(int)} when it tries to generate an
+     * empty buffer.
+     */
+    if (endPos == lastBackTracePos) {
+      return;
+    }
+
     if (VERBOSE) {
       System.out.println("\n  backtrace: endPos=" + endPos + " pos=" + pos + "; " + (pos - lastBackTracePos) + " characters; last=" + lastBackTracePos + " cost=" + endPosData.costs[fromIDX]);
     }
diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java
index 30836af..0ca846f 100644
--- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java
+++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java
@@ -25,6 +25,7 @@ import java.io.Reader;
 import java.io.StringReader;
 import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
+import java.util.List;
 import java.util.Random;
 
 import org.apache.lucene.analysis.Analyzer;
@@ -932,4 +933,28 @@ public class
     assertAnalyzesTo(analyzerNoCompound, "北海道日本ハムファイターズ",
             new String[]{"北海道", "日本", "ハムファイターズ"});
   }
+
+  public void testEmptyBacktrace() throws IOException {
+    String text = "";
+
+    // Since the max backtrace gap ({@link JapaneseTokenizer#MAX_BACKTRACE_GAP})
+    // is set to 1024, we want the first 1023 characters to generate multiple paths
+    // so that the regular backtrace is not executed.
+    for (int i = 0; i < 1023; i++) {
+      text += "あ";
+    }
+
+    // We also want the last 2 characters to be a valid word so that they
+    // end up together in a single token.
+    text += "手紙";
+
+    List<String> outputs = new ArrayList<>();
+    for (int i = 0; i < 511; i++) {
+      outputs.add("ああ");
+    }
+    outputs.add("あ");
+    outputs.add("手紙");
+
+    assertAnalyzesTo(analyzer, text, outputs.toArray(new String[0]));
+  }
 }