Posted to commits@lucenenet.apache.org by ni...@apache.org on 2023/04/11 18:13:06 UTC

[lucenenet] branch master updated: Fix AssertionError in JapaneseTokenizer backtrace LUCENE-10059 (#777)

This is an automated email from the ASF dual-hosted git repository.

nightowl888 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucenenet.git


The following commit(s) were added to refs/heads/master by this push:
     new 5ebc7e207 Fix AssertionError in JapaneseTokenizer backtrace LUCENE-10059 (#777)
5ebc7e207 is described below

commit 5ebc7e207e38f74574eecadcad5873c2cce0a055
Author: H.H.Chen <ch...@126.com>
AuthorDate: Wed Apr 12 02:13:00 2023 +0800

    Fix AssertionError in JapaneseTokenizer backtrace LUCENE-10059 (#777)
    
    * Fix JapaneseTokenizer AssertionError when backtracing
    
    * Lucene.Net.Analysis.Kuromoji.(JapaneseTokenizer + TestJapaneseTokenizer): Added original Lucene comments from LUCENE-10059. Fixed bracket style.
    
    * Lucene.Net.Analysis.Kuromoji.TestJapaneseTokenizer: Added comments to indicate where this test came from, since it is not from Lucene 4.8.0
    
    * Lucene.Net.Analysis.Kuromoji.TestJapaneseTokenizer: Added more comments because we don't have the entire fix; some of it belongs to the NBest feature, which doesn't exist in Lucene 4.8.0.
    
    ---------
    
    Co-authored-by: Shad Storhaug <sh...@shadstorhaug.com>
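
The heart of the change is an early-return guard at the top of the tokenizer's backtrace routine. Below is a minimal sketch of the failure mode and the guard, with hypothetical names (RollingBufferSketch, BacktraceSketch) standing in for the real RollingCharBuffer and tokenizer internals; the actual fix is in the first hunk of the diff that follows.

    using System;
    using System.Diagnostics;

    // A rolling buffer that, like RollingCharBuffer, rejects a request
    // for an empty span of characters.
    internal sealed class RollingBufferSketch
    {
        private readonly string data;
        public RollingBufferSketch(string data) { this.data = data; }

        public string GetRange(int start, int end)
        {
            // Stands in for the assertion that fired inside the real
            // buffer when asked to materialize zero characters.
            Debug.Assert(end > start, "empty span requested");
            return data.Substring(start, end - start);
        }
    }

    internal sealed class BacktraceSketch
    {
        private readonly RollingBufferSketch buffer;
        private int lastBackTracePos;

        public BacktraceSketch(RollingBufferSketch buffer) { this.buffer = buffer; }

        public void Backtrace(int endPos)
        {
            // The LUCENE-10059 guard: nothing has been consumed since the
            // last backtrace, so tracing again would request an empty span.
            if (endPos == lastBackTracePos)
            {
                return;
            }
            Console.WriteLine(buffer.GetRange(lastBackTracePos, endPos));
            lastBackTracePos = endPos;
        }
    }

    internal static class Demo
    {
        public static void Main()
        {
            var bt = new BacktraceSketch(new RollingBufferSketch("手紙"));
            bt.Backtrace(2); // prints "手紙"
            bt.Backtrace(2); // no-op: without the guard this would assert
        }
    }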
---
 .../JapaneseTokenizer.cs                           | 20 ++++++++--
 .../TestJapaneseTokenizer.cs                       | 46 +++++++++++++++++++---
 2 files changed, 57 insertions(+), 9 deletions(-)

diff --git a/src/Lucene.Net.Analysis.Kuromoji/JapaneseTokenizer.cs b/src/Lucene.Net.Analysis.Kuromoji/JapaneseTokenizer.cs
index b4a68db39..edf76facd 100644
--- a/src/Lucene.Net.Analysis.Kuromoji/JapaneseTokenizer.cs
+++ b/src/Lucene.Net.Analysis.Kuromoji/JapaneseTokenizer.cs
@@ -1,4 +1,6 @@
-using J2N;
+// Lucene version compatibility level 4.8.1 + LUCENE-10059 (https://github.com/apache/lucene/pull/254 only)
+
+using J2N;
 using Lucene.Net.Analysis.Ja.Dict;
 using Lucene.Net.Analysis.Ja.TokenAttributes;
 using Lucene.Net.Analysis.TokenAttributes;
@@ -46,13 +48,13 @@ namespace Lucene.Net.Analysis.Ja
     ///     <item><description><see cref="IInflectionAttribute"/> containing additional part-of-speech information for inflected forms.</description></item>
     /// </list>
     /// <para/>
-    /// This tokenizer uses a rolling Viterbi search to find the 
+    /// This tokenizer uses a rolling Viterbi search to find the
     /// least cost segmentation (path) of the incoming characters.
     /// For tokens that appear to be compound (> length 2 for all
     /// Kanji, or > length 7 for non-Kanji), we see if there is a
     /// 2nd best segmentation of that token after applying
     /// penalties to the long tokens.  If so, and the Mode is
-    /// <see cref="JapaneseTokenizerMode.SEARCH"/>, we output the alternate segmentation 
+    /// <see cref="JapaneseTokenizerMode.SEARCH"/>, we output the alternate segmentation
     /// as well.
     /// </remarks>
     public sealed class JapaneseTokenizer : Tokenizer
@@ -883,6 +885,16 @@ namespace Lucene.Net.Analysis.Ja
         {
             int endPos = endPosData.pos;
 
+            /*
+             * LUCENE-10059: If the endPos is the same as lastBackTracePos, we don't want to backtrace,
+             * to avoid an assertion error in {@link RollingCharBuffer#get(int)} when it tries to
+             * generate an empty buffer
+             */
+            if (endPos == lastBackTracePos)
+            {
+                return;
+            }
+
             if (VERBOSE)
             {
                 Console.WriteLine("\n  backtrace: endPos=" + endPos + " pos=" + this.pos + "; " + (this.pos - lastBackTracePos) + " characters; last=" + lastBackTracePos + " cost=" + endPosData.costs[fromIDX]);
@@ -1227,7 +1239,7 @@ namespace Lucene.Net.Analysis.Ja
         NORMAL,
 
         /// <summary>
-        /// Segmentation geared towards search: this includes a 
+        /// Segmentation geared towards search: this includes a
         /// decompounding process for long nouns, also including
         /// the full compound token as a synonym.
         /// </summary>
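
The SEARCH-mode decompounding described in the remarks above can be observed by driving the tokenizer directly. This is a hedged sketch, assuming that a null user dictionary (falling back to the bundled system dictionary) is acceptable in your setup:

    using System;
    using System.IO;
    using Lucene.Net.Analysis.Ja;
    using Lucene.Net.Analysis.Ja.Dict;
    using Lucene.Net.Analysis.TokenAttributes;

    public static class SearchModeDemo
    {
        public static void Main()
        {
            // A long compound noun; SEARCH mode should emit its parts and
            // the full compound as a synonym, per the remarks above.
            using TextReader reader = new StringReader("関西国際空港");
            using var tokenizer = new JapaneseTokenizer(
                reader, (UserDictionary)null, true /*discardPunctuation*/,
                JapaneseTokenizerMode.SEARCH);

            var term = tokenizer.AddAttribute<ICharTermAttribute>();
            tokenizer.Reset();
            while (tokenizer.IncrementToken())
            {
                Console.WriteLine(term.ToString());
            }
            tokenizer.End();
        }
    }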
diff --git a/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseTokenizer.cs b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseTokenizer.cs
index c6233a435..f649a03d1 100644
--- a/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseTokenizer.cs
+++ b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseTokenizer.cs
@@ -1,4 +1,7 @@
-using J2N;
+// Lucene version compatibility level 4.8.1 + LUCENE-10059 (https://github.com/apache/lucene/pull/254 only)
+
+using J2N;
+using J2N.Collections.Generic.Extensions;
 using J2N.Text;
 using Lucene.Net.Analysis.Ja.Dict;
 using Lucene.Net.Analysis.Ja.TokenAttributes;
@@ -6,6 +9,7 @@ using Lucene.Net.Analysis.TokenAttributes;
 using Lucene.Net.Util;
 using NUnit.Framework;
 using System;
+using System.Collections.Generic;
 using System.IO;
 using System.Text;
 using System.Text.RegularExpressions;
@@ -58,27 +62,27 @@ namespace Lucene.Net.Analysis.Ja
             }
         }
 
-        private Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+        private readonly Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
         {
             Tokenizer tokenizer = new JapaneseTokenizer(reader, ReadDict(), false, JapaneseTokenizerMode.SEARCH);
             return new TokenStreamComponents(tokenizer, tokenizer);
         });
 
 
-        private Analyzer analyzerNormal = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+        private readonly Analyzer analyzerNormal = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
         {
             Tokenizer tokenizer = new JapaneseTokenizer(reader, ReadDict(), false, JapaneseTokenizerMode.NORMAL);
             return new TokenStreamComponents(tokenizer, tokenizer);
         });
 
-        private Analyzer analyzerNoPunct = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+        private readonly Analyzer analyzerNoPunct = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
         {
             Tokenizer tokenizer = new JapaneseTokenizer(reader, ReadDict(), true, JapaneseTokenizerMode.SEARCH);
             return new TokenStreamComponents(tokenizer, tokenizer);
         });
 
 
-        private Analyzer extendedModeAnalyzerNoPunct = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+        private readonly Analyzer extendedModeAnalyzerNoPunct = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
         {
             Tokenizer tokenizer = new JapaneseTokenizer(reader, ReadDict(), true, JapaneseTokenizerMode.EXTENDED);
             return new TokenStreamComponents(tokenizer, tokenizer);
@@ -846,5 +850,37 @@ namespace Lucene.Net.Analysis.Ja
                                       new int[] { 1, 1, 1, 1, 1 },
                                       new int[] { 1, 1, 1, 1, 1 });
         }
+
+        // LUCENENET: ported from LUCENE-10059
+        // Note that these are only the changes from https://github.com/apache/lucene/pull/254.
+        // The NBest feature doesn't yet exist in Lucene 4.8.0, so the changes from
+        // https://github.com/apache/lucene/pull/284 will need to be added here when that feature is ported.
+        [Test]
+        public void TestEmptyBacktrace()
+        {
+            String text = "";
+
+            // since the max backtrace gap ({@link JapaneseTokenizer#MAX_BACKTRACE_GAP})
+            // is set to 1024, we want the first 1023 characters to generate multiple paths
+            // so that the regular backtrace is not executed.
+            for (int i = 0; i < 1023; i++)
+            {
+                text += "あ";
+            }
+
+            // and the last 2 characters to be a valid word so that they
+            // will end-up together
+            text += "手紙";
+
+            IList<String> outputs = new List<String>();
+            for (int i = 0; i < 511; i++)
+            {
+                outputs.Add("ああ");
+            }
+            outputs.Add("あ");
+            outputs.Add("手紙");
+
+            AssertAnalyzesTo(analyzer, text, outputs.ToArray());
+        }
     }
 }
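
Outside the NUnit harness, the same input shape can be pushed through a stock analyzer to confirm that the backtrace no longer asserts. A sketch, assuming JapaneseAnalyzer and LuceneVersion.LUCENE_48 as shipped in Lucene.NET 4.8 and an arbitrary field name:

    using System;
    using System.Text;
    using Lucene.Net.Analysis;
    using Lucene.Net.Analysis.Ja;
    using Lucene.Net.Analysis.TokenAttributes;
    using Lucene.Net.Util;

    public static class EmptyBacktraceRepro
    {
        public static void Main()
        {
            // 1023 copies of "あ" followed by the word "手紙", mirroring
            // TestEmptyBacktrace: the input crosses the 1024-character
            // backtrace gap in the way that used to trip the assertion.
            var sb = new StringBuilder();
            for (int i = 0; i < 1023; i++) sb.Append('あ');
            sb.Append("手紙");

            Analyzer analyzer = new JapaneseAnalyzer(LuceneVersion.LUCENE_48);
            using TokenStream ts = analyzer.GetTokenStream("field", sb.ToString());
            var term = ts.AddAttribute<ICharTermAttribute>();
            ts.Reset();
            int count = 0;
            while (ts.IncrementToken()) count++;
            ts.End();
            Console.WriteLine(count + " tokens, no assertion error");
        }
    }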