You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by ni...@apache.org on 2022/10/31 06:19:13 UTC
[lucenenet] 05/14: PERFORMANCE: Lucene.Net.Analysis.Miscellaneous.StemmerOverrideFilter: Added overloads to Add for ICharSequence and char[] to reduce allocations. Added guard clauses.
This is an automated email from the ASF dual-hosted git repository.
nightowl888 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucenenet.git
commit 3abf2dbfefa23fa97269ee8695a52c387ffcc353
Author: Shad Storhaug <sh...@shadstorhaug.com>
AuthorDate: Wed Oct 26 17:51:24 2022 +0700
PERFORMANCE: Lucene.Net.Analysis.Miscellaneous.StemmerOverrideFilter: Added overloads to Add for ICharSequence and char[] to reduce allocations. Added guard clauses.
---
.../Miscellaneous/StemmerOverrideFilter.cs | 116 +++++++++++++++-
.../Analysis/Nl/DutchAnalyzer.cs | 2 +-
.../Miscellaneous/TestStemmerOverrideFilter.cs | 27 ++++
src/Lucene.Net/Lucene.Net.csproj | 4 +-
src/Lucene.Net/Util/UnicodeUtil.cs | 149 ++++++++++++++++++++-
5 files changed, 284 insertions(+), 14 deletions(-)
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/StemmerOverrideFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/StemmerOverrideFilter.cs
index b14af8ecb..eb8772c48 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/StemmerOverrideFilter.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/StemmerOverrideFilter.cs
@@ -1,8 +1,10 @@
// Lucene version compatibility level 4.8.1
using J2N;
+using J2N.Text;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Util;
using Lucene.Net.Util.Fst;
+using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
@@ -185,23 +187,101 @@ namespace Lucene.Net.Analysis.Miscellaneous
/// <param name="input"> the input char sequence </param>
/// <param name="output"> the stemmer override output char sequence </param>
/// <returns> <c>false</c> if the input has already been added to this builder otherwise <c>true</c>. </returns>
+ /// <exception cref="ArgumentNullException"><paramref name="input"/> or <paramref name="output"/> is <c>null</c>.</exception>
+ // LUCENENET specific overload of ICharSequence
public virtual bool Add(string input, string output)
{
+ // LUCENENET: Added guard clauses
+ if (input is null)
+ throw new ArgumentNullException(nameof(input));
+ if (output is null)
+ throw new ArgumentNullException(nameof(output));
+
+ int length = input.Length;
+ if (ignoreCase)
+ {
+ // convert on the fly to lowercase
+
+ // LUCENENET: Reduce allocations/improve throughput by using stack and spans
+ var source = input.AsSpan();
+ if (length * sizeof(char) <= Constants.MaxStackByteLimit)
+ {
+ // Fast path - use the stack
+ Span<char> buffer = stackalloc char[length];
+ source.ToLowerInvariant(buffer);
+
+ UnicodeUtil.UTF16toUTF8(buffer, spare);
+ }
+ else
+ {
+ // Slow path - use the heap
+ charsSpare.Grow(length);
+ char[] buffer = charsSpare.Chars;
+
+ var destination = buffer.AsSpan(0, length);
+ source.ToLowerInvariant(destination);
+
+ UnicodeUtil.UTF16toUTF8(buffer, 0, length, spare);
+ }
+ }
+ else
+ {
+ UnicodeUtil.UTF16toUTF8(input, 0, length, spare);
+ }
+ if (hash.Add(spare) >= 0)
+ {
+ outputValues.Add(output);
+ return true;
+ }
+ return false;
+ }
+
+ /// <summary>
+ /// Adds an input string and its stemmer override output to this builder.
+ /// </summary>
+ /// <param name="input"> the input char sequence </param>
+ /// <param name="output"> the stemmer override output char sequence </param>
+ /// <returns> <c>false</c> if the input has already been added to this builder otherwise <c>true</c>. </returns>
+ /// <exception cref="ArgumentNullException"><paramref name="input"/> or <paramref name="output"/> is <c>null</c>.</exception>
+ // LUCENENET specific overload of ICharSequence
+ public virtual bool Add(char[] input, string output)
+ {
+ // LUCENENET: Added guard clauses
+ if (input is null)
+ throw new ArgumentNullException(nameof(input));
+ if (output is null)
+ throw new ArgumentNullException(nameof(output));
+
int length = input.Length;
if (ignoreCase)
{
// convert on the fly to lowercase
- charsSpare.Grow(length);
- char[] buffer = charsSpare.Chars;
- for (int i = 0; i < length;)
+
+ // LUCENENET: Reduce allocations/improve throughput by using stack and spans
+ var source = new ReadOnlySpan<char>(input);
+ if (length * sizeof(char) <= Constants.MaxStackByteLimit)
{
- i += Character.ToChars(Character.ToLower(Character.CodePointAt(input, i), CultureInfo.InvariantCulture), buffer, i);
+ // Fast path - use the stack
+ Span<char> buffer = stackalloc char[length];
+ source.ToLowerInvariant(buffer);
+
+ UnicodeUtil.UTF16toUTF8(buffer, spare);
+ }
+ else
+ {
+ // Slow path - use the heap
+ charsSpare.Grow(length);
+ char[] buffer = charsSpare.Chars;
+
+ var destination = buffer.AsSpan(0, length);
+ source.ToLowerInvariant(destination);
+
+ UnicodeUtil.UTF16toUTF8(buffer, 0, length, spare);
}
- UnicodeUtil.UTF16toUTF8(buffer, 0, length, spare);
}
else
{
- UnicodeUtil.UTF16toUTF8(input.ToCharArray(), 0, length, spare);
+ UnicodeUtil.UTF16toUTF8(input, 0, length, spare);
}
if (hash.Add(spare) >= 0)
{
@@ -211,6 +291,30 @@ namespace Lucene.Net.Analysis.Miscellaneous
return false;
}
+ /// <summary>
+ /// Adds an input string and its stemmer override output to this builder.
+ /// </summary>
+ /// <param name="input"> the input char sequence </param>
+ /// <param name="output"> the stemmer override output char sequence </param>
+ /// <returns> <c>false</c> if the input has already been added to this builder otherwise <c>true</c>. </returns>
+ /// <exception cref="ArgumentNullException"><paramref name="input"/> or <paramref name="output"/> is <c>null</c>.</exception>
+ // LUCENENET specific overload of ICharSequence
+ public virtual bool Add(ICharSequence input, string output)
+ {
+ // LUCENENET: Added guard clauses
+ if (input is null)
+ throw new ArgumentNullException(nameof(input));
+ if (output is null)
+ throw new ArgumentNullException(nameof(output));
+
+ if (input is CharArrayCharSequence charArrayCharSequence && charArrayCharSequence.HasValue)
+ return Add(charArrayCharSequence.Value, output);
+
+ // LUCENENET: In .NET, the indexer for StringBuilder is slow, so we are better off
+ // converting to a string in all other cases.
+ return Add(input.ToString(), output);
+ }
+
/// <summary>
/// Returns a <see cref="StemmerOverrideMap"/> to be used with the <see cref="StemmerOverrideFilter"/> </summary>
/// <returns> a <see cref="StemmerOverrideMap"/> to be used with the <see cref="StemmerOverrideFilter"/> </returns>
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Nl/DutchAnalyzer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Nl/DutchAnalyzer.cs
index 6200e49ba..08579e941 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Nl/DutchAnalyzer.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Nl/DutchAnalyzer.cs
@@ -176,7 +176,7 @@ namespace Lucene.Net.Analysis.Nl
{
char[] nextKey = iter.NextKey();
spare.CopyChars(nextKey, 0, nextKey.Length);
- builder.Add(new string(spare.Chars), iter.CurrentValue);
+ builder.Add(spare.Chars, iter.CurrentValue);
}
}
try
diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Miscellaneous/TestStemmerOverrideFilter.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Miscellaneous/TestStemmerOverrideFilter.cs
index fe045a64d..699d86fa8 100644
--- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Miscellaneous/TestStemmerOverrideFilter.cs
+++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Miscellaneous/TestStemmerOverrideFilter.cs
@@ -4,6 +4,7 @@ using J2N.Collections.Generic.Extensions;
using J2N.Text;
using Lucene.Net.Analysis.Core;
using Lucene.Net.Analysis.En;
+using Lucene.Net.Attributes;
using Lucene.Net.Util;
using NUnit.Framework;
using System.Collections.Generic;
@@ -59,6 +60,32 @@ namespace Lucene.Net.Analysis.Miscellaneous
AssertTokenStreamContents(stream, new string[] { "books" });
}
+ [Test, LuceneNetSpecific]
+ public virtual void TestIgnoreCase_CharArray()
+ {
+ // let's make booked stem to books
+ // the override filter will convert "booked" to "books",
+ // but also mark it with KeywordAttribute so Porter will not change it.
+ StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(true);
+ builder.Add("boOkEd".ToCharArray(), "books");
+ Tokenizer tokenizer = new KeywordTokenizer(new StringReader("BooKeD"));
+ TokenStream stream = new PorterStemFilter(new StemmerOverrideFilter(tokenizer, builder.Build()));
+ AssertTokenStreamContents(stream, new string[] { "books" });
+ }
+
+ [Test, LuceneNetSpecific]
+ public virtual void TestIgnoreCase_CharSequence()
+ {
+ // let's make booked stem to books
+ // the override filter will convert "booked" to "books",
+ // but also mark it with KeywordAttribute so Porter will not change it.
+ StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(true);
+ builder.Add("boOkEd".AsCharSequence(), "books");
+ Tokenizer tokenizer = new KeywordTokenizer(new StringReader("BooKeD"));
+ TokenStream stream = new PorterStemFilter(new StemmerOverrideFilter(tokenizer, builder.Build()));
+ AssertTokenStreamContents(stream, new string[] { "books" });
+ }
+
[Test]
public virtual void TestNoOverrides()
{
diff --git a/src/Lucene.Net/Lucene.Net.csproj b/src/Lucene.Net/Lucene.Net.csproj
index 3e70f21c6..5c7e3802a 100644
--- a/src/Lucene.Net/Lucene.Net.csproj
+++ b/src/Lucene.Net/Lucene.Net.csproj
@@ -66,13 +66,15 @@
<ItemGroup Condition=" '$(TargetFramework)' == 'netstandard2.0' ">
<PackageReference Include="Microsoft.Extensions.Configuration.Abstractions" Version="$(MicrosoftExtensionsConfigurationAbstractionsPackageVersion)" />
<PackageReference Include="Prism.Core" Version="$(PrismCorePackageVersion)" />
+ <PackageReference Include="System.Memory" Version="$(SystemMemoryPackageVersion)" />
</ItemGroup>
<ItemGroup Condition=" '$(TargetFramework)' == 'net462' ">
<PackageReference Include="Microsoft.Extensions.Configuration.Abstractions" Version="$(MicrosoftExtensionsConfigurationAbstractionsPackageVersion)" />
<PackageReference Include="Prism.Core" Version="$(PrismCorePackageVersion)" />
+ <PackageReference Include="System.Memory" Version="$(SystemMemoryPackageVersion)" />
</ItemGroup>
-
+
<ItemGroup>
<InternalsVisibleTo Include="Lucene.Net.Analysis.Common" />
<InternalsVisibleTo Include="Lucene.Net.Analysis.Kuromoji" />
diff --git a/src/Lucene.Net/Util/UnicodeUtil.cs b/src/Lucene.Net/Util/UnicodeUtil.cs
index 20dd3ea95..b6a6b8d30 100644
--- a/src/Lucene.Net/Util/UnicodeUtil.cs
+++ b/src/Lucene.Net/Util/UnicodeUtil.cs
@@ -121,13 +121,110 @@ namespace Lucene.Net.Util
private const int SURROGATE_OFFSET = Character.MinSupplementaryCodePoint - (UNI_SUR_HIGH_START << (int)HALF_SHIFT) - UNI_SUR_LOW_START;
+ /// <summary>
+ /// Encode all characters from the <see cref="Span{T}"/> <paramref name="source"/> as UTF-8
+ /// into <paramref name="result"/>. After encoding, <c>result.Offset</c> will always be 0.
+ /// </summary>
+ /// <exception cref="ArgumentNullException"><paramref name="result"/> is <c>null</c>.</exception>
+ // TODO: broken if incoming result.offset != 0
+ // LUCENENET specific overload
+ public static void UTF16toUTF8(Span<char> source, BytesRef result)
+ {
+ // LUCENENET: Added guard clause
+ if (result is null)
+ throw new ArgumentNullException(nameof(result));
+
+ int length = source.Length;
+
+ int upto = 0;
+ int i = 0;
+ int end = source.Length;
+ var @out = result.Bytes;
+
+ // Pre-allocate for worst case 4-for-1
+ int maxLen = length * 4;
+ if (@out.Length < maxLen)
+ {
+ @out = result.Bytes = new byte[maxLen];
+ }
+ result.Offset = 0;
+
+ while (i < end)
+ {
+ int code = (int)source[i++];
+
+ if (code < 0x80)
+ {
+ @out[upto++] = (byte)code;
+ }
+ else if (code < 0x800)
+ {
+ @out[upto++] = (byte)(0xC0 | (code >> 6));
+ @out[upto++] = (byte)(0x80 | (code & 0x3F));
+ }
+ else if (code < 0xD800 || code > 0xDFFF)
+ {
+ @out[upto++] = (byte)(0xE0 | (code >> 12));
+ @out[upto++] = (byte)(0x80 | ((code >> 6) & 0x3F));
+ @out[upto++] = (byte)(0x80 | (code & 0x3F));
+ }
+ else
+ {
+ // surrogate pair
+ // confirm valid high surrogate
+ if (code < 0xDC00 && i < end)
+ {
+ var utf32 = (int)source[i];
+ // confirm valid low surrogate and write pair
+ if (utf32 >= 0xDC00 && utf32 <= 0xDFFF)
+ {
+ utf32 = (code << 10) + utf32 + SURROGATE_OFFSET;
+ i++;
+ @out[upto++] = (byte)(0xF0 | (utf32 >> 18));
+ @out[upto++] = (byte)(0x80 | ((utf32 >> 12) & 0x3F));
+ @out[upto++] = (byte)(0x80 | ((utf32 >> 6) & 0x3F));
+ @out[upto++] = (byte)(0x80 | (utf32 & 0x3F));
+ continue;
+ }
+ }
+ // replace unpaired surrogate or out-of-order low surrogate
+ // with substitution character
+ @out[upto++] = 0xEF;
+ @out[upto++] = 0xBF;
+ @out[upto++] = 0xBD;
+ }
+ }
+ //assert matches(source, offset, length, out, upto);
+ result.Length = upto;
+ }
+
/// <summary>
/// Encode characters from a <see cref="T:char[]"/> <paramref name="source"/>, starting at
/// <paramref name="offset"/> for <paramref name="length"/> chars. After encoding, <c>result.Offset</c> will always be 0.
/// </summary>
+ /// <exception cref="ArgumentNullException"><paramref name="source"/> or <paramref name="result"/> is <c>null</c>.</exception>
+ /// <exception cref="ArgumentOutOfRangeException">
+ /// <paramref name="offset"/> or <paramref name="length"/> is less than zero.
+ /// <para/>
+ /// -or-
+ /// <para/>
+ /// <paramref name="offset"/> and <paramref name="length"/> refer to a location outside of <paramref name="source"/>.
+ /// </exception>
// TODO: broken if incoming result.offset != 0
public static void UTF16toUTF8(char[] source, int offset, int length, BytesRef result)
{
+ // LUCENENET: Added guard clauses
+ if (source is null)
+ throw new ArgumentNullException(nameof(source));
+ if (result is null)
+ throw new ArgumentNullException(nameof(result));
+ if (offset < 0)
+ throw new ArgumentOutOfRangeException(nameof(offset), $"{nameof(offset)} must not be negative.");
+ if (length < 0)
+ throw new ArgumentOutOfRangeException(nameof(length), $"{nameof(length)} must not be negative.");
+ if (offset > source.Length - length) // Checks for int overflow
+ throw new ArgumentOutOfRangeException(nameof(length), $"Index and length must refer to a location within the string. For example {nameof(offset)} + {nameof(length)} <= source.{nameof(source.Length)}.");
+
int upto = 0;
int i = offset;
int end = offset + length;
@@ -193,9 +290,29 @@ namespace Lucene.Net.Util
/// Encode characters from this <see cref="ICharSequence"/>, starting at <paramref name="offset"/>
/// for <paramref name="length"/> characters. After encoding, <c>result.Offset</c> will always be 0.
/// </summary>
+ /// <exception cref="ArgumentNullException"><paramref name="source"/> or <paramref name="result"/> is <c>null</c>.</exception>
+ /// <exception cref="ArgumentOutOfRangeException">
+ /// <paramref name="offset"/> or <paramref name="length"/> is less than zero.
+ /// <para/>
+ /// -or-
+ /// <para/>
+ /// <paramref name="offset"/> and <paramref name="length"/> refer to a location outside of <paramref name="source"/>.
+ /// </exception>
// TODO: broken if incoming result.offset != 0
- public static void UTF16toUTF8(ICharSequence s, int offset, int length, BytesRef result)
+ public static void UTF16toUTF8(ICharSequence source, int offset, int length, BytesRef result)
{
+ // LUCENENET: Added guard clauses
+ if (source is null)
+ throw new ArgumentNullException(nameof(source));
+ if (result is null)
+ throw new ArgumentNullException(nameof(result));
+ if (offset < 0)
+ throw new ArgumentOutOfRangeException(nameof(offset), $"{nameof(offset)} must not be negative.");
+ if (length < 0)
+ throw new ArgumentOutOfRangeException(nameof(length), $"{nameof(length)} must not be negative.");
+ if (offset > source.Length - length) // Checks for int overflow
+ throw new ArgumentOutOfRangeException(nameof(length), $"Index and length must refer to a location within the string. For example {nameof(offset)} + {nameof(length)} <= source.{nameof(source.Length)}.");
+
int end = offset + length;
var @out = result.Bytes;
@@ -210,7 +327,7 @@ namespace Lucene.Net.Util
int upto = 0;
for (int i = offset; i < end; i++)
{
- var code = (int)s[i];
+ var code = (int)source[i];
if (code < 0x80)
{
@out[upto++] = (byte)code;
@@ -232,7 +349,7 @@ namespace Lucene.Net.Util
// confirm valid high surrogate
if (code < 0xDC00 && (i < end - 1))
{
- int utf32 = (int)s[i + 1];
+ int utf32 = (int)source[i + 1];
// confirm valid low surrogate and write pair
if (utf32 >= 0xDC00 && utf32 <= 0xDFFF)
{
@@ -262,9 +379,29 @@ namespace Lucene.Net.Util
/// <para/>
/// LUCENENET specific.
/// </summary>
+ /// <exception cref="ArgumentNullException"><paramref name="source"/> or <paramref name="result"/> is <c>null</c>.</exception>
+ /// <exception cref="ArgumentOutOfRangeException">
+ /// <paramref name="offset"/> or <paramref name="length"/> is less than zero.
+ /// <para/>
+ /// -or-
+ /// <para/>
+ /// <paramref name="offset"/> and <paramref name="length"/> refer to a location outside of <paramref name="source"/>.
+ /// </exception>
// TODO: broken if incoming result.offset != 0
- public static void UTF16toUTF8(string s, int offset, int length, BytesRef result)
+ public static void UTF16toUTF8(string source, int offset, int length, BytesRef result)
{
+ // LUCENENET: Added guard clauses
+ if (source is null)
+ throw new ArgumentNullException(nameof(source));
+ if (result is null)
+ throw new ArgumentNullException(nameof(result));
+ if (offset < 0)
+ throw new ArgumentOutOfRangeException(nameof(offset), $"{nameof(offset)} must not be negative.");
+ if (length < 0)
+ throw new ArgumentOutOfRangeException(nameof(length), $"{nameof(length)} must not be negative.");
+ if (offset > source.Length - length) // Checks for int overflow
+ throw new ArgumentOutOfRangeException(nameof(length), $"Index and length must refer to a location within the string. For example {nameof(offset)} + {nameof(length)} <= source.{nameof(source.Length)}.");
+
int end = offset + length;
var @out = result.Bytes;
@@ -279,7 +416,7 @@ namespace Lucene.Net.Util
int upto = 0;
for (int i = offset; i < end; i++)
{
- var code = (int)s[i];
+ var code = (int)source[i];
if (code < 0x80)
{
@out[upto++] = (byte)code;
@@ -301,7 +438,7 @@ namespace Lucene.Net.Util
// confirm valid high surrogate
if (code < 0xDC00 && (i < end - 1))
{
- int utf32 = (int)s[i + 1];
+ int utf32 = (int)source[i + 1];
// confirm valid low surrogate and write pair
if (utf32 >= 0xDC00 && utf32 <= 0xDFFF)
{