You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by ni...@apache.org on 2023/04/11 18:13:06 UTC
[lucenenet] branch master updated: Fix AssertionError in JapaneseTokenizer backtrace LUCENE-10059 (#777)
This is an automated email from the ASF dual-hosted git repository.
nightowl888 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucenenet.git
The following commit(s) were added to refs/heads/master by this push:
new 5ebc7e207 Fix AssertionError in JapaneseTokenizer backtrace LUCENE-10059 (#777)
5ebc7e207 is described below
commit 5ebc7e207e38f74574eecadcad5873c2cce0a055
Author: H.H.Chen <ch...@126.com>
AuthorDate: Wed Apr 12 02:13:00 2023 +0800
Fix AssertionError in JapaneseTokenizer backtrace LUCENE-10059 (#777)
* fix japaneseTokenizer assertionError when backtrace
* Lucene.Net.Analysis.Kuromoji.(JapaneseTokenizer + TestJapaneseTokenizer): Added original Lucene comments from LUCENE-10059. Fixed bracket style.
* Lucene.Net.Analysis.Kuromoji:TestJapaneseTokenizer: comments to indicate where this test came from, since it is not from Lucene 4.8.0
* Lucene.Net.Analysis.Kuromoji.TestJapaneseAnalyzer: Added more comments because we don't have the entire fix - some of it belongs to the NBest feature which doesn't exist in Lucene 4.8.0.
---------
Co-authored-by: Shad Storhaug <sh...@shadstorhaug.com>
---
.../JapaneseTokenizer.cs | 20 ++++++++--
.../TestJapaneseTokenizer.cs | 46 +++++++++++++++++++---
2 files changed, 57 insertions(+), 9 deletions(-)
diff --git a/src/Lucene.Net.Analysis.Kuromoji/JapaneseTokenizer.cs b/src/Lucene.Net.Analysis.Kuromoji/JapaneseTokenizer.cs
index b4a68db39..edf76facd 100644
--- a/src/Lucene.Net.Analysis.Kuromoji/JapaneseTokenizer.cs
+++ b/src/Lucene.Net.Analysis.Kuromoji/JapaneseTokenizer.cs
@@ -1,4 +1,6 @@
-using J2N;
+// Lucene version compatibility level 4.8.1 + LUCENE-10059 (https://github.com/apache/lucene/pull/254 only)
+
+using J2N;
using Lucene.Net.Analysis.Ja.Dict;
using Lucene.Net.Analysis.Ja.TokenAttributes;
using Lucene.Net.Analysis.TokenAttributes;
@@ -46,13 +48,13 @@ namespace Lucene.Net.Analysis.Ja
/// <item><description><see cref="IInflectionAttribute"/> containing additional part-of-speech information for inflected forms.</description></item>
/// </list>
/// <para/>
- /// This tokenizer uses a rolling Viterbi search to find the
+ /// This tokenizer uses a rolling Viterbi search to find the
/// least cost segmentation (path) of the incoming characters.
/// For tokens that appear to be compound (> length 2 for all
/// Kanji, or > length 7 for non-Kanji), we see if there is a
/// 2nd best segmentation of that token after applying
/// penalties to the long tokens. If so, and the Mode is
- /// <see cref="JapaneseTokenizerMode.SEARCH"/>, we output the alternate segmentation
+ /// <see cref="JapaneseTokenizerMode.SEARCH"/>, we output the alternate segmentation
/// as well.
/// </remarks>
public sealed class JapaneseTokenizer : Tokenizer
@@ -883,6 +885,16 @@ namespace Lucene.Net.Analysis.Ja
{
int endPos = endPosData.pos;
+ /*
+ * LUCENE-10059: If the endPos is the same as lastBackTracePos, we skip the backtrace to
+ * avoid an assertion error in {@link RollingCharBuffer#get(int)} when it tries to generate an
+ * empty buffer
+ */
+ if (endPos == lastBackTracePos)
+ {
+ return;
+ }
+
if (VERBOSE)
{
Console.WriteLine("\n backtrace: endPos=" + endPos + " pos=" + this.pos + "; " + (this.pos - lastBackTracePos) + " characters; last=" + lastBackTracePos + " cost=" + endPosData.costs[fromIDX]);
@@ -1227,7 +1239,7 @@ namespace Lucene.Net.Analysis.Ja
NORMAL,
/// <summary>
- /// Segmentation geared towards search: this includes a
+ /// Segmentation geared towards search: this includes a
/// decompounding process for long nouns, also including
/// the full compound token as a synonym.
/// </summary>
diff --git a/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseTokenizer.cs b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseTokenizer.cs
index c6233a435..f649a03d1 100644
--- a/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseTokenizer.cs
+++ b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseTokenizer.cs
@@ -1,4 +1,7 @@
-using J2N;
+// Lucene version compatibility level 4.8.1 + LUCENE-10059 (https://github.com/apache/lucene/pull/254 only)
+
+using J2N;
+using J2N.Collections.Generic.Extensions;
using J2N.Text;
using Lucene.Net.Analysis.Ja.Dict;
using Lucene.Net.Analysis.Ja.TokenAttributes;
@@ -6,6 +9,7 @@ using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Util;
using NUnit.Framework;
using System;
+using System.Collections.Generic;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
@@ -58,27 +62,27 @@ namespace Lucene.Net.Analysis.Ja
}
}
- private Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+ private readonly Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
{
Tokenizer tokenizer = new JapaneseTokenizer(reader, ReadDict(), false, JapaneseTokenizerMode.SEARCH);
return new TokenStreamComponents(tokenizer, tokenizer);
});
- private Analyzer analyzerNormal = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+ private readonly Analyzer analyzerNormal = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
{
Tokenizer tokenizer = new JapaneseTokenizer(reader, ReadDict(), false, JapaneseTokenizerMode.NORMAL);
return new TokenStreamComponents(tokenizer, tokenizer);
});
- private Analyzer analyzerNoPunct = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+ private readonly Analyzer analyzerNoPunct = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
{
Tokenizer tokenizer = new JapaneseTokenizer(reader, ReadDict(), true, JapaneseTokenizerMode.SEARCH);
return new TokenStreamComponents(tokenizer, tokenizer);
});
- private Analyzer extendedModeAnalyzerNoPunct = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+ private readonly Analyzer extendedModeAnalyzerNoPunct = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
{
Tokenizer tokenizer = new JapaneseTokenizer(reader, ReadDict(), true, JapaneseTokenizerMode.EXTENDED);
return new TokenStreamComponents(tokenizer, tokenizer);
@@ -846,5 +850,37 @@ namespace Lucene.Net.Analysis.Ja
new int[] { 1, 1, 1, 1, 1 },
new int[] { 1, 1, 1, 1, 1 });
}
+
+ // LUCENENET: ported from LUCENE-10059
+ // Note that these are only the changes from https://github.com/apache/lucene/pull/254.
+ // The NBest feature doesn't yet exist in Lucene 4.8.0, so the changes from
+ // https://github.com/apache/lucene/pull/284 will need to be added here when that feature is ported.
+ [Test]
+ public void TestEmptyBacktrace()
+ {
+ String text = "";
+
+ // since the max backtrace gap ({@link JapaneseTokenizer#MAX_BACKTRACE_GAP})
+ // is set to 1024, we want the first 1023 characters to generate multiple paths
+ // so that the regular backtrace is not executed.
+ for (int i = 0; i < 1023; i++)
+ {
+ text += "あ";
+ }
+
+ // and the last 2 characters to be a valid word so that they
+ // will end up together
+ text += "手紙";
+
+ IList<String> outputs = new List<String>();
+ for (int i = 0; i < 511; i++) // 511 two-character tokens cover 1022 of the 1023 leading characters
+ {
+ outputs.Add("ああ");
+ }
+ outputs.Add("あ"); // the remaining 1023rd leading character is expected as a token of its own
+ outputs.Add("手紙");
+
+ AssertAnalyzesTo(analyzer, text, outputs.ToArray());
+ }
}
}