You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by ni...@apache.org on 2022/10/31 06:19:19 UTC
[lucenenet] 11/14: PERFORMANCE: Lucene.Net.Analysis.Th.ThaiWordBreaker: Removed unnecessary string allocations and concatenation. Use CharsRef to reuse the same memory. Removed Regex and replaced with UnicodeSet to detect Thai code points.

This is an automated email from the ASF dual-hosted git repository.

nightowl888 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucenenet.git

commit e72315a75009854483c979462eb2406f41311796
Author: Shad Storhaug <sh...@shadstorhaug.com>
AuthorDate: Thu Oct 27 08:10:47 2022 +0700

    PERFORMANCE: Lucene.Net.Analysis.Th.ThaiWordBreaker: Removed unnecessary string allocations and concatenation. Use CharsRef to reuse the same memory. Removed Regex and replaced with UnicodeSet to detect Thai code points.
---
 .../Analysis/Th/ThaiTokenizer.cs                   | 24 ++++++++++++----------
 .../Analysis/Th/ThaiWordFilter.cs                  |  4 ++--
 2 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs
index 3bc1a8314..aba8a4328 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs
@@ -1,10 +1,12 @@
 // Lucene version compatibility level 4.8.1
 #if FEATURE_BREAKITERATOR
+using ICU4N.Support.Text;
 using ICU4N.Text;
 using J2N;
 using Lucene.Net.Analysis.TokenAttributes;
 using Lucene.Net.Analysis.Util;
 using Lucene.Net.Support.Threading;
+using Lucene.Net.Util;
 using System;
 using System.Collections.Generic;
 using System.Globalization;
@@ -80,7 +82,7 @@ namespace Lucene.Net.Analysis.Th
         }
 
         private readonly ThaiWordBreaker wordBreaker;
-        private readonly CharArrayIterator wrapper = Analysis.Util.CharArrayIterator.NewWordInstance();
+        private readonly CharArrayIterator wrapper = CharArrayIterator.NewWordInstance();
 
         private int sentenceStart;
         private int sentenceEnd;
@@ -162,7 +164,7 @@ namespace Lucene.Net.Analysis.Th
                 this.sentenceStart = sentenceStart;
                 this.sentenceEnd = sentenceEnd;
                 wrapper.SetText(m_buffer, sentenceStart, sentenceEnd - sentenceStart);
-                wordBreaker.SetText(new string(wrapper.Text, wrapper.Start, wrapper.Length));
+                wordBreaker.SetText(wrapper);
             }
             finally
             {
@@ -215,18 +217,18 @@ namespace Lucene.Net.Analysis.Th
     internal class ThaiWordBreaker
     {
         private readonly BreakIterator wordBreaker;
-        private string text;
+        private readonly CharsRef text = new CharsRef();
         private readonly Queue<int> transitions = new Queue<int>();
-        private static readonly Regex thaiPattern = new Regex(@"\p{IsThai}+", RegexOptions.Compiled | RegexOptions.CultureInvariant);
+        private static readonly UnicodeSet thai = new UnicodeSet("[:Thai:]").Freeze();
 
         public ThaiWordBreaker(BreakIterator wordBreaker)
         {
             this.wordBreaker = wordBreaker ?? throw new ArgumentNullException(nameof(wordBreaker));
         }
 
-        public void SetText(string text)
+        public void SetText(CharArrayIterator text)
         {
-            this.text = text;
+            this.text.CopyChars(text.Text, text.Start, text.Length);
             wordBreaker.SetText(text);
         }
 
@@ -262,20 +264,20 @@ namespace Lucene.Net.Analysis.Th
             if (current != BreakIterator.Done && current - prev > 0)
             {
                 int length = text.Length;
-                string toMatch;
+                int codePoint;
                 // Find all of the transitions between Thai and non-Thai characters and digits
                 for (int i = prev; i < current; i++)
                 {
                     char high = text[i];
                     // Account for surrogate pairs
                     if (char.IsHighSurrogate(high) && i < length && i + 1 < current && char.IsLowSurrogate(text[i + 1]))
-                        toMatch = string.Empty + high + text[++i];
+                        codePoint = Character.ToCodePoint(high, text[++i]);
                     else
-                        toMatch = string.Empty + high;
+                        codePoint = high;
 
-                    if (char.IsLetter(toMatch, 0)) // Always break letters apart from digits to match the JDK
+                    if (Character.IsLetter(codePoint)) // Always break letters apart from digits to match the JDK
                     {
-                        isThai = thaiPattern.IsMatch(toMatch);
+                        isThai = thai.Contains(codePoint);
                         isNonThai = !isThai;
                     }
                     else
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiWordFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiWordFilter.cs
index 0c060cfd7..ea90dbd5d 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiWordFilter.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiWordFilter.cs
@@ -1,4 +1,4 @@
-// Lucene version compatibility level 4.8.1
+// Lucene version compatibility level 4.8.1
 #if FEATURE_BREAKITERATOR
 using ICU4N.Text;
 using Lucene.Net.Analysis.Core;
@@ -138,7 +138,7 @@ namespace Lucene.Net.Analysis.Th
 
             // reinit CharacterIterator
             charIterator.SetText(clonedTermAtt.Buffer, 0, clonedTermAtt.Length);
-            breaker.SetText(new string(charIterator.Text, charIterator.Start, charIterator.Length));
+            breaker.SetText(charIterator);
             int end2 = breaker.Next();
             if (end2 != BreakIterator.Done)
             {