You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by ni...@apache.org on 2022/10/31 06:19:13 UTC
[lucenenet] 05/14: PERFORMANCE: Lucene.Net.Analysis.Miscellaneous.StemmerOverrideFilter: Added overloads to Add for ICharSequence and char[] to reduce allocations. Added guard clauses.

This is an automated email from the ASF dual-hosted git repository.

nightowl888 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucenenet.git

commit 3abf2dbfefa23fa97269ee8695a52c387ffcc353
Author: Shad Storhaug <sh...@shadstorhaug.com>
AuthorDate: Wed Oct 26 17:51:24 2022 +0700

    PERFORMANCE: Lucene.Net.Analysis.Miscellaneous.StemmerOverrideFilter: Added overloads to Add for ICharSequence and char[] to reduce allocations. Added guard clauses.
---
 .../Miscellaneous/StemmerOverrideFilter.cs         | 116 +++++++++++++++-
 .../Analysis/Nl/DutchAnalyzer.cs                   |   2 +-
 .../Miscellaneous/TestStemmerOverrideFilter.cs     |  27 ++++
 src/Lucene.Net/Lucene.Net.csproj                   |   4 +-
 src/Lucene.Net/Util/UnicodeUtil.cs                 | 149 ++++++++++++++++++++-
 5 files changed, 284 insertions(+), 14 deletions(-)

diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/StemmerOverrideFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/StemmerOverrideFilter.cs
index b14af8ecb..eb8772c48 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/StemmerOverrideFilter.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/StemmerOverrideFilter.cs
@@ -1,8 +1,10 @@
 // Lucene version compatibility level 4.8.1
 using J2N;
+using J2N.Text;
 using Lucene.Net.Analysis.TokenAttributes;
 using Lucene.Net.Util;
 using Lucene.Net.Util.Fst;
+using System;
 using System.Collections.Generic;
 using System.Globalization;
 using System.IO;
@@ -185,23 +187,101 @@ namespace Lucene.Net.Analysis.Miscellaneous
             /// <param name="input"> the input char sequence </param>
             /// <param name="output"> the stemmer override output char sequence </param>
             /// <returns> <c>false</c> if the input has already been added to this builder otherwise <c>true</c>. </returns>
+            /// <exception cref="ArgumentNullException"><paramref name="input"/> or <paramref name="output"/> is <c>null</c>.</exception>
+            // LUCENENET specific overload of ICharSequence
             public virtual bool Add(string input, string output)
             {
+                // LUCENENET: Added guard clauses
+                if (input is null)
+                    throw new ArgumentNullException(nameof(input));
+                if (output is null)
+                    throw new ArgumentNullException(nameof(output));
+
+                int length = input.Length;
+                if (ignoreCase)
+                {
+                    // convert on the fly to lowercase
+
+                    // LUCENENET: Reduce allocations/improve throughput by using stack and spans
+                    var source = input.AsSpan();
+                    if (length * sizeof(char) <= Constants.MaxStackByteLimit)
+                    {
+                        // Fast path - use the stack
+                        Span<char> buffer = stackalloc char[length];
+                        source.ToLowerInvariant(buffer);
+
+                        UnicodeUtil.UTF16toUTF8(buffer, spare);
+                    }
+                    else
+                    {
+                        // Slow path - use the heap
+                        charsSpare.Grow(length);
+                        char[] buffer = charsSpare.Chars;
+
+                        var destination = buffer.AsSpan(0, length);
+                        source.ToLowerInvariant(destination);
+
+                        UnicodeUtil.UTF16toUTF8(buffer, 0, length, spare);
+                    }
+                }
+                else
+                {
+                    UnicodeUtil.UTF16toUTF8(input, 0, length, spare);
+                }
+                if (hash.Add(spare) >= 0)
+                {
+                    outputValues.Add(output);
+                    return true;
+                }
+                return false;
+            }
+
+            /// <summary>
+            /// Adds an input character array and its stemmer override output to this builder.
+            /// </summary>
+            /// <param name="input"> the input char sequence </param>
+            /// <param name="output"> the stemmer override output char sequence </param>
+            /// <returns> <c>false</c> if the input has already been added to this builder otherwise <c>true</c>. </returns>
+            /// <exception cref="ArgumentNullException"><paramref name="input"/> or <paramref name="output"/> is <c>null</c>.</exception>
+            // LUCENENET specific overload for char[] (added to reduce allocations)
+            public virtual bool Add(char[] input, string output)
+            {
+                // LUCENENET: Added guard clauses
+                if (input is null)
+                    throw new ArgumentNullException(nameof(input));
+                if (output is null)
+                    throw new ArgumentNullException(nameof(output));
+
                 int length = input.Length;
                 if (ignoreCase)
                 {
                     // convert on the fly to lowercase
-                    charsSpare.Grow(length);
-                    char[] buffer = charsSpare.Chars;
-                    for (int i = 0; i < length;)
+
+                    // LUCENENET: Reduce allocations/improve throughput by using stack and spans
+                    var source = new ReadOnlySpan<char>(input);
+                    if (length * sizeof(char) <= Constants.MaxStackByteLimit)
                     {
-                        i += Character.ToChars(Character.ToLower(Character.CodePointAt(input, i), CultureInfo.InvariantCulture), buffer, i);
+                        // Fast path - use the stack
+                        Span<char> buffer = stackalloc char[length];
+                        source.ToLowerInvariant(buffer);
+
+                        UnicodeUtil.UTF16toUTF8(buffer, spare);
+                    }
+                    else
+                    {
+                        // Slow path - use the heap
+                        charsSpare.Grow(length);
+                        char[] buffer = charsSpare.Chars;
+
+                        var destination = buffer.AsSpan(0, length);
+                        source.ToLowerInvariant(destination);
+
+                        UnicodeUtil.UTF16toUTF8(buffer, 0, length, spare);
                     }
-                    UnicodeUtil.UTF16toUTF8(buffer, 0, length, spare);
                 }
                 else
                 {
-                    UnicodeUtil.UTF16toUTF8(input.ToCharArray(), 0, length, spare);
+                    UnicodeUtil.UTF16toUTF8(input, 0, length, spare);
                 }
                 if (hash.Add(spare) >= 0)
                 {
@@ -211,6 +291,30 @@ namespace Lucene.Net.Analysis.Miscellaneous
                 return false;
             }
 
+            /// <summary>
+            /// Adds an input char sequence and its stemmer override output to this builder.
+            /// </summary>
+            /// <param name="input"> the input char sequence </param>
+            /// <param name="output"> the stemmer override output char sequence </param>
+            /// <returns> <c>false</c> if the input has already been added to this builder otherwise <c>true</c>. </returns>
+            /// <exception cref="ArgumentNullException"><paramref name="input"/> or <paramref name="output"/> is <c>null</c>.</exception>
+            // LUCENENET specific overload of ICharSequence
+            public virtual bool Add(ICharSequence input, string output)
+            {
+                // LUCENENET: Added guard clauses
+                if (input is null)
+                    throw new ArgumentNullException(nameof(input));
+                if (output is null)
+                    throw new ArgumentNullException(nameof(output));
+
+                if (input is CharArrayCharSequence charArrayCharSequence && charArrayCharSequence.HasValue)
+                    return Add(charArrayCharSequence.Value, output);
+
+                // LUCENENET: In .NET, the indexer for StringBuilder is slow, so we are better off
+                // converting to a string in all other cases.
+                return Add(input.ToString(), output);
+            }
+
             /// <summary>
             /// Returns a <see cref="StemmerOverrideMap"/> to be used with the <see cref="StemmerOverrideFilter"/> </summary>
             /// <returns> a <see cref="StemmerOverrideMap"/> to be used with the <see cref="StemmerOverrideFilter"/> </returns>
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Nl/DutchAnalyzer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Nl/DutchAnalyzer.cs
index 6200e49ba..08579e941 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Nl/DutchAnalyzer.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Nl/DutchAnalyzer.cs
@@ -176,7 +176,7 @@ namespace Lucene.Net.Analysis.Nl
                     {
                         char[] nextKey = iter.NextKey();
                         spare.CopyChars(nextKey, 0, nextKey.Length);
-                        builder.Add(new string(spare.Chars), iter.CurrentValue);
+                        builder.Add(spare.Chars, iter.CurrentValue);
                     }
                 }
                 try
diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Miscellaneous/TestStemmerOverrideFilter.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Miscellaneous/TestStemmerOverrideFilter.cs
index fe045a64d..699d86fa8 100644
--- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Miscellaneous/TestStemmerOverrideFilter.cs
+++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Miscellaneous/TestStemmerOverrideFilter.cs
@@ -4,6 +4,7 @@ using J2N.Collections.Generic.Extensions;
 using J2N.Text;
 using Lucene.Net.Analysis.Core;
 using Lucene.Net.Analysis.En;
+using Lucene.Net.Attributes;
 using Lucene.Net.Util;
 using NUnit.Framework;
 using System.Collections.Generic;
@@ -59,6 +60,32 @@ namespace Lucene.Net.Analysis.Miscellaneous
             AssertTokenStreamContents(stream, new string[] { "books" });
         }
 
+        [Test, LuceneNetSpecific]
+        public virtual void TestIgnoreCase_CharArray()
+        {
+            // let's make booked stem to books
+            // the override filter will convert "booked" to "books",
+            // but also mark it with KeywordAttribute so Porter will not change it.
+            StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(true);
+            builder.Add("boOkEd".ToCharArray(), "books");
+            Tokenizer tokenizer = new KeywordTokenizer(new StringReader("BooKeD"));
+            TokenStream stream = new PorterStemFilter(new StemmerOverrideFilter(tokenizer, builder.Build()));
+            AssertTokenStreamContents(stream, new string[] { "books" });
+        }
+
+        [Test, LuceneNetSpecific]
+        public virtual void TestIgnoreCase_CharSequence()
+        {
+            // let's make booked stem to books
+            // the override filter will convert "booked" to "books",
+            // but also mark it with KeywordAttribute so Porter will not change it.
+            StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(true);
+            builder.Add("boOkEd".AsCharSequence(), "books");
+            Tokenizer tokenizer = new KeywordTokenizer(new StringReader("BooKeD"));
+            TokenStream stream = new PorterStemFilter(new StemmerOverrideFilter(tokenizer, builder.Build()));
+            AssertTokenStreamContents(stream, new string[] { "books" });
+        }
+
         [Test]
         public virtual void TestNoOverrides()
         {
diff --git a/src/Lucene.Net/Lucene.Net.csproj b/src/Lucene.Net/Lucene.Net.csproj
index 3e70f21c6..5c7e3802a 100644
--- a/src/Lucene.Net/Lucene.Net.csproj
+++ b/src/Lucene.Net/Lucene.Net.csproj
@@ -66,13 +66,15 @@
   <ItemGroup Condition=" '$(TargetFramework)' == 'netstandard2.0' ">
     <PackageReference Include="Microsoft.Extensions.Configuration.Abstractions" Version="$(MicrosoftExtensionsConfigurationAbstractionsPackageVersion)" />
     <PackageReference Include="Prism.Core" Version="$(PrismCorePackageVersion)" />
+    <PackageReference Include="System.Memory" Version="$(SystemMemoryPackageVersion)" />
   </ItemGroup>
 
   <ItemGroup Condition=" '$(TargetFramework)' == 'net462' ">
     <PackageReference Include="Microsoft.Extensions.Configuration.Abstractions" Version="$(MicrosoftExtensionsConfigurationAbstractionsPackageVersion)" />
     <PackageReference Include="Prism.Core" Version="$(PrismCorePackageVersion)" />
+    <PackageReference Include="System.Memory" Version="$(SystemMemoryPackageVersion)" />
   </ItemGroup>
-  
+
   <ItemGroup>
     <InternalsVisibleTo Include="Lucene.Net.Analysis.Common" />
     <InternalsVisibleTo Include="Lucene.Net.Analysis.Kuromoji" />
diff --git a/src/Lucene.Net/Util/UnicodeUtil.cs b/src/Lucene.Net/Util/UnicodeUtil.cs
index 20dd3ea95..b6a6b8d30 100644
--- a/src/Lucene.Net/Util/UnicodeUtil.cs
+++ b/src/Lucene.Net/Util/UnicodeUtil.cs
@@ -121,13 +121,110 @@ namespace Lucene.Net.Util
 
         private const int SURROGATE_OFFSET = Character.MinSupplementaryCodePoint - (UNI_SUR_HIGH_START << (int)HALF_SHIFT) - UNI_SUR_LOW_START;
 
+        /// <summary>
+        /// Encode all characters from a <see cref="Span{T}"/> of <see cref="char"/> <paramref name="source"/>
+        /// into <paramref name="result"/>. After encoding, <c>result.Offset</c> will always be 0.
+        /// </summary>
+        /// <exception cref="ArgumentNullException"><paramref name="result"/> is <c>null</c>.</exception>
+        // TODO: broken if incoming result.offset != 0
+        // LUCENENET specific overload
+        public static void UTF16toUTF8(Span<char> source, BytesRef result)
+        {
+            // LUCENENET: Added guard clause
+            if (result is null)
+                throw new ArgumentNullException(nameof(result));
+
+            int length = source.Length;
+
+            int upto = 0;
+            int i = 0;
+            int end = source.Length;
+            var @out = result.Bytes;
+
+            // Pre-allocate for worst case 4-for-1
+            int maxLen = length * 4;
+            if (@out.Length < maxLen)
+            {
+                @out = result.Bytes = new byte[maxLen];
+            }
+            result.Offset = 0;
+
+            while (i < end)
+            {
+                int code = (int)source[i++];
+
+                if (code < 0x80)
+                {
+                    @out[upto++] = (byte)code;
+                }
+                else if (code < 0x800)
+                {
+                    @out[upto++] = (byte)(0xC0 | (code >> 6));
+                    @out[upto++] = (byte)(0x80 | (code & 0x3F));
+                }
+                else if (code < 0xD800 || code > 0xDFFF)
+                {
+                    @out[upto++] = (byte)(0xE0 | (code >> 12));
+                    @out[upto++] = (byte)(0x80 | ((code >> 6) & 0x3F));
+                    @out[upto++] = (byte)(0x80 | (code & 0x3F));
+                }
+                else
+                {
+                    // surrogate pair
+                    // confirm valid high surrogate
+                    if (code < 0xDC00 && i < end)
+                    {
+                        var utf32 = (int)source[i];
+                        // confirm valid low surrogate and write pair
+                        if (utf32 >= 0xDC00 && utf32 <= 0xDFFF)
+                        {
+                            utf32 = (code << 10) + utf32 + SURROGATE_OFFSET;
+                            i++;
+                            @out[upto++] = (byte)(0xF0 | (utf32 >> 18));
+                            @out[upto++] = (byte)(0x80 | ((utf32 >> 12) & 0x3F));
+                            @out[upto++] = (byte)(0x80 | ((utf32 >> 6) & 0x3F));
+                            @out[upto++] = (byte)(0x80 | (utf32 & 0x3F));
+                            continue;
+                        }
+                    }
+                    // replace unpaired surrogate or out-of-order low surrogate
+                    // with substitution character
+                    @out[upto++] = 0xEF;
+                    @out[upto++] = 0xBF;
+                    @out[upto++] = 0xBD;
+                }
+            }
+            //assert matches(source, offset, length, out, upto);
+            result.Length = upto;
+        }
+
         /// <summary>
         /// Encode characters from a <see cref="T:char[]"/> <paramref name="source"/>, starting at
         /// <paramref name="offset"/> for <paramref name="length"/> chars. After encoding, <c>result.Offset</c> will always be 0.
         /// </summary>
+        /// <exception cref="ArgumentNullException"><paramref name="source"/> or <paramref name="result"/> is <c>null</c>.</exception>
+        /// <exception cref="ArgumentOutOfRangeException">
+        /// <paramref name="offset"/> or <paramref name="length"/> is less than zero.
+        /// <para/>
+        /// -or-
+        /// <para/>
+        /// <paramref name="offset"/> and <paramref name="length"/> refer to a location outside of <paramref name="source"/>.
+        /// </exception>
         // TODO: broken if incoming result.offset != 0
         public static void UTF16toUTF8(char[] source, int offset, int length, BytesRef result)
         {
+            // LUCENENET: Added guard clauses
+            if (source is null)
+                throw new ArgumentNullException(nameof(source));
+            if (result is null)
+                throw new ArgumentNullException(nameof(result));
+            if (offset < 0)
+                throw new ArgumentOutOfRangeException(nameof(offset), $"{nameof(offset)} must not be negative.");
+            if (length < 0)
+                throw new ArgumentOutOfRangeException(nameof(length), $"{nameof(length)} must not be negative.");
+            if (offset > source.Length - length) // Checks for int overflow
+                throw new ArgumentOutOfRangeException(nameof(length), $"Index and length must refer to a location within the string. For example {nameof(offset)} + {nameof(length)} <= source.{nameof(source.Length)}.");
+
             int upto = 0;
             int i = offset;
             int end = offset + length;
@@ -193,9 +290,29 @@ namespace Lucene.Net.Util
         /// Encode characters from this <see cref="ICharSequence"/>, starting at <paramref name="offset"/>
         /// for <paramref name="length"/> characters. After encoding, <c>result.Offset</c> will always be 0.
         /// </summary>
+        /// <exception cref="ArgumentNullException"><paramref name="source"/> or <paramref name="result"/> is <c>null</c>.</exception>
+        /// <exception cref="ArgumentOutOfRangeException">
+        /// <paramref name="offset"/> or <paramref name="length"/> is less than zero.
+        /// <para/>
+        /// -or-
+        /// <para/>
+        /// <paramref name="offset"/> and <paramref name="length"/> refer to a location outside of <paramref name="source"/>.
+        /// </exception>
         // TODO: broken if incoming result.offset != 0
-        public static void UTF16toUTF8(ICharSequence s, int offset, int length, BytesRef result)
+        public static void UTF16toUTF8(ICharSequence source, int offset, int length, BytesRef result)
         {
+            // LUCENENET: Added guard clauses
+            if (source is null)
+                throw new ArgumentNullException(nameof(source));
+            if (result is null)
+                throw new ArgumentNullException(nameof(result));
+            if (offset < 0)
+                throw new ArgumentOutOfRangeException(nameof(offset), $"{nameof(offset)} must not be negative.");
+            if (length < 0)
+                throw new ArgumentOutOfRangeException(nameof(length), $"{nameof(length)} must not be negative.");
+            if (offset > source.Length - length) // Checks for int overflow
+                throw new ArgumentOutOfRangeException(nameof(length), $"Index and length must refer to a location within the string. For example {nameof(offset)} + {nameof(length)} <= source.{nameof(source.Length)}.");
+
             int end = offset + length;
 
             var @out = result.Bytes;
@@ -210,7 +327,7 @@ namespace Lucene.Net.Util
             int upto = 0;
             for (int i = offset; i < end; i++)
             {
-                var code = (int)s[i];
+                var code = (int)source[i];
                 if (code < 0x80)
                 {
                     @out[upto++] = (byte)code;
@@ -232,7 +349,7 @@ namespace Lucene.Net.Util
                     // confirm valid high surrogate
                     if (code < 0xDC00 && (i < end - 1))
                     {
-                        int utf32 = (int)s[i + 1];
+                        int utf32 = (int)source[i + 1];
                         // confirm valid low surrogate and write pair
                         if (utf32 >= 0xDC00 && utf32 <= 0xDFFF)
                         {
@@ -262,9 +379,29 @@ namespace Lucene.Net.Util
         /// <para/>
         /// LUCENENET specific.
         /// </summary>
+        /// <exception cref="ArgumentNullException"><paramref name="source"/> or <paramref name="result"/> is <c>null</c>.</exception>
+        /// <exception cref="ArgumentOutOfRangeException">
+        /// <paramref name="offset"/> or <paramref name="length"/> is less than zero.
+        /// <para/>
+        /// -or-
+        /// <para/>
+        /// <paramref name="offset"/> and <paramref name="length"/> refer to a location outside of <paramref name="source"/>.
+        /// </exception>
         // TODO: broken if incoming result.offset != 0
-        public static void UTF16toUTF8(string s, int offset, int length, BytesRef result)
+        public static void UTF16toUTF8(string source, int offset, int length, BytesRef result)
         {
+            // LUCENENET: Added guard clauses
+            if (source is null)
+                throw new ArgumentNullException(nameof(source));
+            if (result is null)
+                throw new ArgumentNullException(nameof(result));
+            if (offset < 0)
+                throw new ArgumentOutOfRangeException(nameof(offset), $"{nameof(offset)} must not be negative.");
+            if (length < 0)
+                throw new ArgumentOutOfRangeException(nameof(length), $"{nameof(length)} must not be negative.");
+            if (offset > source.Length - length) // Checks for int overflow
+                throw new ArgumentOutOfRangeException(nameof(length), $"Index and length must refer to a location within the string. For example {nameof(offset)} + {nameof(length)} <= source.{nameof(source.Length)}.");
+
             int end = offset + length;
 
             var @out = result.Bytes;
@@ -279,7 +416,7 @@ namespace Lucene.Net.Util
             int upto = 0;
             for (int i = offset; i < end; i++)
             {
-                var code = (int)s[i];
+                var code = (int)source[i];
                 if (code < 0x80)
                 {
                     @out[upto++] = (byte)code;
@@ -301,7 +438,7 @@ namespace Lucene.Net.Util
                     // confirm valid high surrogate
                     if (code < 0xDC00 && (i < end - 1))
                     {
-                        int utf32 = (int)s[i + 1];
+                        int utf32 = (int)source[i + 1];
                         // confirm valid low surrogate and write pair
                         if (utf32 >= 0xDC00 && utf32 <= 0xDFFF)
                         {