You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by ni...@apache.org on 2020/08/24 21:19:40 UTC

[lucenenet] 08/09: Lucene.Net.ICU: Added locking to ThaiTokenizer to only allow a single thread to manipulate the BreakIterator at a time. This helps, but is only a partial fix.

This is an automated email from the ASF dual-hosted git repository.

nightowl888 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucenenet.git

commit 5ff92583f219fa851375c9be12ae1b6bf52383c1
Author: Shad Storhaug <sh...@shadstorhaug.com>
AuthorDate: Mon Aug 24 15:09:37 2020 +0700

    Lucene.Net.ICU: Added locking to ThaiTokenizer to only allow a single thread to manipulate the BreakIterator at a time. This helps, but is only a partial fix.
---
 .../Analysis/Th/ThaiTokenizer.cs                   | 92 ++++++++++++++++------
 1 file changed, 66 insertions(+), 26 deletions(-)

diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs
index 7e0754c..d1f80f1 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs
@@ -41,13 +41,27 @@ namespace Lucene.Net.Analysis.Th
     /// </summary>
     public class ThaiTokenizer : SegmentingTokenizerBase
     {
+        private static readonly object syncLock = new object(); // LUCENENET specific - workaround until BreakIterator is made thread safe  (LUCENENET TODO: TO REVERT)
+
         // LUCENENET specific - DBBI_AVAILABLE removed because ICU always has a dictionary-based BreakIterator
-        private static readonly BreakIterator proto = BreakIterator.GetWordInstance(new CultureInfo("th"));
+        private static readonly BreakIterator proto = LoadProto();
 
         /// <summary>
         /// used for breaking the text into sentences
         /// </summary>
-        private static readonly BreakIterator sentenceProto = BreakIterator.GetSentenceInstance(CultureInfo.InvariantCulture);
+        private static readonly BreakIterator sentenceProto = LoadSentenceProto();
+
+        private static BreakIterator LoadProto()
+        {
+            lock (syncLock)
+                return BreakIterator.GetWordInstance(new CultureInfo("th"));
+        }
+
+        private static BreakIterator LoadSentenceProto()
+        {
+            lock (syncLock)
+                return BreakIterator.GetSentenceInstance(CultureInfo.InvariantCulture);
+        }
 
         private readonly ThaiWordBreaker wordBreaker;
         private readonly CharArrayIterator wrapper = Analysis.Util.CharArrayIterator.NewWordInstance();
@@ -68,48 +82,74 @@ namespace Lucene.Net.Analysis.Th
         /// <summary>
         /// Creates a new <see cref="ThaiTokenizer"/>, supplying the <see cref="Lucene.Net.Util.AttributeSource.AttributeFactory"/> </summary>
         public ThaiTokenizer(AttributeFactory factory, TextReader reader)
-            : base(factory, reader, (BreakIterator)sentenceProto.Clone())
+            : base(factory, reader, CreateSentenceClone())
         {
             // LUCENENET specific - DBBI_AVAILABLE removed because ICU always has a dictionary-based BreakIterator
 
-            wordBreaker = new ThaiWordBreaker((BreakIterator)proto.Clone());
+            lock (syncLock)
+                wordBreaker = new ThaiWordBreaker((BreakIterator)proto.Clone());
             termAtt = AddAttribute<ICharTermAttribute>();
             offsetAtt = AddAttribute<IOffsetAttribute>();
         }
 
-        protected override void SetNextSentence(int sentenceStart, int sentenceEnd)
+        private static BreakIterator CreateSentenceClone()
         {
-            this.sentenceStart = sentenceStart;
-            this.sentenceEnd = sentenceEnd;
-            wrapper.SetText(m_buffer, sentenceStart, sentenceEnd - sentenceStart);
-            wordBreaker.SetText(new string(wrapper.Text, wrapper.Start, wrapper.Length));
+            lock (syncLock)
+                return (BreakIterator)sentenceProto.Clone();
         }
 
-        protected override bool IncrementWord()
+        public override void Reset()
+        {
+            lock (syncLock)
+                base.Reset();
+        }
+
+        public override State CaptureState()
+        {
+            lock (syncLock)
+                return base.CaptureState();
+        }
+
+        protected override void SetNextSentence(int sentenceStart, int sentenceEnd)
         {
-            int start = wordBreaker.Current;
-            if (start == BreakIterator.Done)
+            lock (syncLock)
             {
-                return false; // BreakIterator exhausted
+                this.sentenceStart = sentenceStart;
+                this.sentenceEnd = sentenceEnd;
+                wrapper.SetText(m_buffer, sentenceStart, sentenceEnd - sentenceStart);
+                wordBreaker.SetText(new string(wrapper.Text, wrapper.Start, wrapper.Length));
             }
+        }
 
-            // find the next set of boundaries, skipping over non-tokens
-            int end = wordBreaker.Next();
-            while (end != BreakIterator.Done && !Character.IsLetterOrDigit(Character.CodePointAt(m_buffer, sentenceStart + start, sentenceEnd)))
+        protected override bool IncrementWord()
+        {
+            int start, end;
+            lock (syncLock)
             {
-                start = end;
+                start = wordBreaker.Current;
+                if (start == BreakIterator.Done)
+                {
+                    return false; // BreakIterator exhausted
+                }
+
+                // find the next set of boundaries, skipping over non-tokens
                 end = wordBreaker.Next();
-            }
+                while (end != BreakIterator.Done && !Character.IsLetterOrDigit(Character.CodePointAt(m_buffer, sentenceStart + start, sentenceEnd)))
+                {
+                    start = end;
+                    end = wordBreaker.Next();
+                }
 
-            if (end == BreakIterator.Done)
-            {
-                return false; // BreakIterator exhausted
-            }
+                if (end == BreakIterator.Done)
+                {
+                    return false; // BreakIterator exhausted
+                }
 
-            ClearAttributes();
-            termAtt.CopyBuffer(m_buffer, sentenceStart + start, end - start);
-            offsetAtt.SetOffset(CorrectOffset(m_offset + sentenceStart + start), CorrectOffset(m_offset + sentenceStart + end));
-            return true;
+                ClearAttributes();
+                termAtt.CopyBuffer(m_buffer, sentenceStart + start, end - start);
+                offsetAtt.SetOffset(CorrectOffset(m_offset + sentenceStart + start), CorrectOffset(m_offset + sentenceStart + end));
+                return true;
+            }
         }
     }