You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by ni...@apache.org on 2020/08/24 21:19:32 UTC

[lucenenet] branch master updated (e8bc15f -> 6eba1e0)

This is an automated email from the ASF dual-hosted git repository.

nightowl888 pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/lucenenet.git.


    from e8bc15f  azure-pipelines.yml: Added option to disable asserts when running tests
     new 672c5a9  Lucene.Net.Analysis.ICU: Updated Segmentation files to Lucene 8.6.1 to account for the latest features of ICU
     new 6161f4f  Lucene.Net.ICU: Reverted extra locking/cloning for ThaiTokenizer
     new f6addeb  PERFORMANCE: Lucene.Net.Util.AttributeSource: Eliminated unnecessary try catch and replaced ContainsKey with TryGetValue
     new a250321  PERFORMANCE: Lucene.Net.Util.AttributeSource::GetAttribute<T>(): Removed extra lookup by using TryGetValue
     new 990f929  PERFORMANCE: Lucene.Net.Util: Streamlined DefaultAttributeFactory to make the get/update process of creating an attribute WeakReference atomic
     new 57ed84a  PERFORMANCE: Lucene.Net.Util.AttributeSource.DefaultAttributeFactory: Use external lock for better performance and removed redundant GetOrAdd() call
     new d6dcab3  Lucene.Net.Tests.ICU: Added concurrency test for ThaiAnalyzer
     new 5ff9258  Lucene.Net.ICU: Added locking to ThaiTokenizer to only allow a single thread to manipulate the BreakIterator at a time. This helps, but is only a partial fix.
     new 6eba1e0  TO REVERT: Lucene.Net.ICU: Added locking to ICUTokenizer to only allow a single thread to manipulate the BreakIterator at a time. This can be reverted when the BreakIterator issue is fixed.

The 9 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .../Analysis/Th/ThaiTokenizer.cs                   |  54 +++++--
 .../Icu/Segmentation/BreakIteratorWrapper.cs       | 179 +++++++-------------
 .../Analysis/Icu/Segmentation/CharArrayIterator.cs |   2 +-
 .../Icu/Segmentation/CompositeBreakIterator.cs     |   6 +-
 .../Icu/Segmentation/DefaultICUTokenizerConfig.cs  |  16 +-
 .../Analysis/Icu/Segmentation/ICUTokenizer.cs      |  48 +++---
 .../Icu/Segmentation/ICUTokenizerConfig.cs         |   8 +-
 .../Icu/Segmentation/ICUTokenizerFactory.cs        |  11 +-
 .../Analysis/Icu/Segmentation/ScriptIterator.cs    |   2 +-
 .../Analysis/Th/TestThaiAnalyzer.cs                | 180 +++++++++++++++++++++
 src/Lucene.Net/Util/AttributeSource.cs             |  44 ++---
 11 files changed, 344 insertions(+), 206 deletions(-)


[lucenenet] 03/09: PERFORMANCE: Lucene.Net.Util.AttributeSource: Eliminated unnecessary try catch and replaced ContainsKey with TryGetValue

Posted by ni...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

nightowl888 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucenenet.git

commit f6addebe128fcfe288e6ce848fa0eed418562468
Author: Shad Storhaug <sh...@shadstorhaug.com>
AuthorDate: Sun Aug 23 23:40:08 2020 +0700

    PERFORMANCE: Lucene.Net.Util.AttributeSource: Eliminated unnecessary try catch and replaced ContainsKey with TryGetValue
---
 src/Lucene.Net/Util/AttributeSource.cs | 19 +++++--------------
 1 file changed, 5 insertions(+), 14 deletions(-)

diff --git a/src/Lucene.Net/Util/AttributeSource.cs b/src/Lucene.Net/Util/AttributeSource.cs
index e4cf9b1..74fc911 100644
--- a/src/Lucene.Net/Util/AttributeSource.cs
+++ b/src/Lucene.Net/Util/AttributeSource.cs
@@ -395,28 +395,19 @@ namespace Lucene.Net.Util
             where T : IAttribute
         {
             var attClass = typeof(T);
-            if (!attributes.ContainsKey(attClass))
+            // LUCENENET: Eliminated exception and used TryGetValue
+            if (!attributes.TryGetValue(attClass, out var result))
             {
                 if (!(attClass.IsInterface && typeof(IAttribute).IsAssignableFrom(attClass)))
                 {
                     throw new ArgumentException("AddAttribute() only accepts an interface that extends IAttribute, but " + attClass.FullName + " does not fulfil this contract.");
                 }
 
-                AddAttributeImpl(this.factory.CreateAttributeInstance<T>());
+                result = this.factory.CreateAttributeInstance<T>();
+                AddAttributeImpl(result);
             }
 
-            T returnAttr;
-            try
-            {
-                returnAttr = (T)(IAttribute)attributes[attClass];
-            }
-#pragma warning disable 168
-            catch (KeyNotFoundException knf)
-#pragma warning restore 168
-            {
-                return default(T);
-            }
-            return returnAttr;
+            return (T)(IAttribute)result;
         }
 
         /// <summary>


[lucenenet] 04/09: PERFORMANCE: Lucene.Net.Util.AttributeSource::GetAttribute<T>(): Removed extra lookup by using TryGetValue

Posted by ni...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

nightowl888 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucenenet.git

commit a25032127bee8173aa7db3a3018f42c3ed35f277
Author: Shad Storhaug <sh...@shadstorhaug.com>
AuthorDate: Mon Aug 24 20:05:02 2020 +0700

    PERFORMANCE: Lucene.Net.Util.AttributeSource::GetAttribute<T>(): Removed extra lookup by using TryGetValue
---
 src/Lucene.Net/Util/AttributeSource.cs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/Lucene.Net/Util/AttributeSource.cs b/src/Lucene.Net/Util/AttributeSource.cs
index 74fc911..0f0ed10 100644
--- a/src/Lucene.Net/Util/AttributeSource.cs
+++ b/src/Lucene.Net/Util/AttributeSource.cs
@@ -437,11 +437,11 @@ namespace Lucene.Net.Util
         public virtual T GetAttribute<T>() where T : IAttribute
         {
             var attClass = typeof(T);
-            if (!attributes.ContainsKey(attClass))
+            if (!attributes.TryGetValue(attClass, out var result))
             {
-                throw new ArgumentException("this AttributeSource does not have the attribute '" + attClass.Name + "'.");
+                throw new ArgumentException($"this AttributeSource does not have the attribute '{attClass.Name}'.");
             }
-            return (T)(IAttribute)this.attributes[attClass];
+            return (T)(IAttribute)result;
         }
 
         private State GetCurrentState()


[lucenenet] 07/09: Lucene.Net.Tests.ICU: Added concurrency test for ThaiAnalyzer

Posted by ni...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

nightowl888 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucenenet.git

commit d6dcab39ead9160c4a0baa3a00b28b30e2827fb0
Author: Shad Storhaug <sh...@shadstorhaug.com>
AuthorDate: Sun Aug 23 23:43:08 2020 +0700

    Lucene.Net.Tests.ICU: Added concurrency test for ThaiAnalyzer
---
 .../Analysis/Th/TestThaiAnalyzer.cs                | 180 +++++++++++++++++++++
 1 file changed, 180 insertions(+)

diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Th/TestThaiAnalyzer.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Th/TestThaiAnalyzer.cs
index e8ceb75..113bbc7 100644
--- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Th/TestThaiAnalyzer.cs
+++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Th/TestThaiAnalyzer.cs
@@ -1,4 +1,5 @@
 #if FEATURE_BREAKITERATOR
+using J2N.Threading;
 using Lucene.Net.Analysis.Core;
 using Lucene.Net.Analysis.TokenAttributes;
 using Lucene.Net.Analysis.Util;
@@ -6,6 +7,9 @@ using Lucene.Net.Attributes;
 using Lucene.Net.Util;
 using NUnit.Framework;
 using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Threading;
 
 namespace Lucene.Net.Analysis.Th
 {
@@ -172,6 +176,182 @@ namespace Lucene.Net.Analysis.Th
             AssertAnalyzesTo(analyzer, "บริษัทชื่อ XY&Z - คุยกับ xyz@demo.com", new string[] { "บริษัท", "ชื่อ", "xy&z", "คุย", "กับ", "xyz@demo.com" });
         }
 
+        [Test]
+        [LuceneNetSpecific]
+        public virtual void TestConcurrency()
+        {
+            ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT);
+
+            char[] chars = new char[] {
+                (char)4160,
+                (char)4124,
+                (char)4097,
+                (char)4177,
+                (char)4113,
+                (char)32,
+                (char)10671,
+            };
+            string contents = new string(chars);
+            AssertAnalyzer(analyzer, contents);
+
+            int numThreads = 4;
+            var startingGun = new CountdownEvent(1);
+            var threads = new ThaiAnalyzerThread[numThreads];
+            for (int i = 0; i < threads.Length; i++)
+            {
+                threads[i] = new ThaiAnalyzerThread(startingGun, analyzer, contents);
+            }
+
+            foreach (var thread in threads)
+            {
+                thread.Start();
+            }
+
+            startingGun.Signal();
+            foreach (var t in threads)
+            {
+                try
+                {
+                    t.Join();
+                }
+#pragma warning disable 168
+                catch (ThreadInterruptedException e)
+#pragma warning restore 168
+                {
+                    fail("Thread interrupted");
+                }
+            }
+        }
+
+        private class ThaiAnalyzerThread : ThreadJob
+        {
+            private readonly CountdownEvent latch;
+            private readonly Analyzer analyzer;
+            private readonly string text;
+
+            public ThaiAnalyzerThread(CountdownEvent latch, Analyzer analyzer, string text)
+            {
+                this.latch = latch;
+                this.analyzer = analyzer;
+                this.text = text;
+            }
+
+            public override void Run()
+            {
+                latch.Wait();
+
+                for (int i = 0; i < 1000; i++)
+                {
+                    AssertAnalyzer(analyzer, text);
+                }
+            }
+        }
+
+        private static void AssertAnalyzer(Analyzer analyzer, string text)
+        {
+            ICharTermAttribute termAtt;
+            IOffsetAttribute offsetAtt;
+            IPositionIncrementAttribute posIncAtt;
+
+            List<string> tokens = new List<string>();
+            List<int> positions = new List<int>();
+            List<int> startOffsets = new List<int>();
+            List<int> endOffsets = new List<int>();
+
+            TokenStream ts;
+            TextReader reader = new StringReader(text);
+
+            using (ts = analyzer.GetTokenStream("dummy", reader))
+            {
+                bool isReset = false;
+                try
+                {
+
+                    termAtt = ts.GetAttribute<ICharTermAttribute>();
+                    offsetAtt = ts.GetAttribute<IOffsetAttribute>();
+                    posIncAtt = ts.GetAttribute<IPositionIncrementAttribute>();
+
+                    ts.Reset();
+                    isReset = true;
+
+                    while (ts.IncrementToken())
+                    {
+                        Assert.IsNotNull(termAtt, "has no CharTermAttribute");
+                        tokens.Add(termAtt.ToString());
+                        positions.Add(posIncAtt.PositionIncrement);
+                        startOffsets.Add(offsetAtt.StartOffset);
+                        endOffsets.Add(offsetAtt.EndOffset);
+                    }
+                }
+                finally
+                {
+                    if (!isReset)
+                    {
+                        try
+                        {
+                            // consume correctly
+                            ts.Reset();
+                            while (ts.IncrementToken()) ;
+                            //ts.End();
+                            //ts.Dispose();
+                        }
+#pragma warning disable 168
+                        catch (Exception ex)
+#pragma warning restore 168
+                        {
+                            // ignore
+                        }
+                    }
+                    ts.End(); // ts.end();
+                }
+            } // ts.Dispose()
+
+            reader = new StringReader(text);
+            using (ts = analyzer.GetTokenStream("dummy", reader))
+            {
+                bool isReset = false;
+                try
+                {
+
+                    // offset + pos
+                    AssertTokenStreamContents(ts,
+                        output: tokens.ToArray(),
+                        startOffsets: ToIntArray(startOffsets),
+                        endOffsets: ToIntArray(endOffsets),
+                        types: null,
+                        posIncrements: ToIntArray(positions),
+                        posLengths: null,
+                        finalOffset: text.Length,
+                        offsetsAreCorrect: true);
+
+                    isReset = true;
+                }
+                finally
+                {
+                    if (!isReset)
+                    {
+                        try
+                        {
+                            // consume correctly
+                            ts.Reset();
+                            while (ts.IncrementToken()) ;
+                            //ts.End();
+                            //ts.Dispose();
+                        }
+#pragma warning disable 168
+                        catch (Exception ex)
+#pragma warning restore 168
+                        {
+                            // ignore
+                        }
+                    }
+                    ts.End(); // ts.end();
+                }
+            }
+
+        } // ts.Dispose()
+
+
         /// <summary>
         /// blast some random strings through the analyzer </summary>
         [Test]


[lucenenet] 01/09: Lucene.Net.Analysis.ICU: Updated Segmentation files to Lucene 8.6.1 to account for the latest features of ICU

Posted by ni...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

nightowl888 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucenenet.git

commit 672c5a94abfc179121146cae4fbb13379f7b7203
Author: Shad Storhaug <sh...@shadstorhaug.com>
AuthorDate: Sat Aug 22 21:08:10 2020 +0700

    Lucene.Net.Analysis.ICU: Updated Segmentation files to Lucene 8.6.1 to account for the latest features of ICU
---
 .../Icu/Segmentation/BreakIteratorWrapper.cs       | 179 +++++++--------------
 .../Analysis/Icu/Segmentation/CharArrayIterator.cs |   2 +-
 .../Icu/Segmentation/CompositeBreakIterator.cs     |   6 +-
 .../Icu/Segmentation/DefaultICUTokenizerConfig.cs  |  16 +-
 .../Analysis/Icu/Segmentation/ICUTokenizer.cs      |  19 +--
 .../Icu/Segmentation/ICUTokenizerConfig.cs         |   8 +-
 .../Icu/Segmentation/ICUTokenizerFactory.cs        |  11 +-
 .../Analysis/Icu/Segmentation/ScriptIterator.cs    |   2 +-
 8 files changed, 93 insertions(+), 150 deletions(-)

diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/BreakIteratorWrapper.cs b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/BreakIteratorWrapper.cs
index af50927..d01aacc 100644
--- a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/BreakIteratorWrapper.cs
+++ b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/BreakIteratorWrapper.cs
@@ -1,7 +1,6 @@
-// Lucene version compatibility level 7.1.0
-using ICU4N;
-using ICU4N.Support.Text;
+// Lucene version compatibility level 8.6.1
 using ICU4N.Text;
+using J2N;
 
 namespace Lucene.Net.Analysis.Icu.Segmentation
 {
@@ -23,146 +22,88 @@ namespace Lucene.Net.Analysis.Icu.Segmentation
      */
 
     /// <summary>
-    /// Contain all the issues surrounding BreakIterators in ICU in one place.
-    /// Basically this boils down to the fact that they aren't very friendly to any
-    /// sort of OO design.
-    /// <para/>
-    /// http://bugs.icu-project.org/trac/ticket/5901: RBBI.getRuleStatus(), hoist to
-    /// BreakIterator from <see cref="RuleBasedBreakIterator"/>
-    /// <para/>
-    /// DictionaryBasedBreakIterator is a subclass of <see cref="RuleBasedBreakIterator"/>, but
-    /// doesn't actually behave as a subclass: it always returns 0 for
-    /// getRuleStatus(): 
-    /// http://bugs.icu-project.org/trac/ticket/4730: Thai RBBI, no boundary type
-    /// tags
+    /// Wraps <see cref="RuleBasedBreakIterator"/>, making object reuse convenient and
+    /// emitting a rule status for emoji sequences.
     /// <para/>
     /// @lucene.experimental
     /// </summary>
-    internal abstract class BreakIteratorWrapper
+    internal sealed class BreakIteratorWrapper
     {
-        protected readonly CharArrayIterator m_textIterator = new CharArrayIterator();
-        protected char[] m_text;
-        protected int m_start;
-        protected int m_length;
-
-        public abstract int Next();
-        public abstract int Current { get; }
-        public abstract int RuleStatus { get; }
-        public abstract void SetText(CharacterIterator text);
+        private readonly CharArrayIterator textIterator = new CharArrayIterator();
+        private readonly RuleBasedBreakIterator rbbi;
+        private char[] text;
+        private int start;
+        private int status;
 
-        public void SetText(char[] text, int start, int length)
+        internal BreakIteratorWrapper(RuleBasedBreakIterator rbbi)
         {
-            this.m_text = text;
-            this.m_start = start;
-            this.m_length = length;
-            m_textIterator.SetText(text, start, length);
-            SetText(m_textIterator);
+            this.rbbi = rbbi;
         }
 
-        /// <summary>
-        /// If its a <see cref="RuleBasedBreakIterator"/>, the rule status can be used for token type. If it's
-        /// any other <see cref="BreakIterator"/>, the rulestatus method is not available, so treat
-        /// it like a generic <see cref="BreakIterator"/>.
-        /// </summary>
-        /// <param name="breakIterator"></param>
-        /// <returns></returns>
-        public static BreakIteratorWrapper Wrap(BreakIterator breakIterator)
+        public int Current => rbbi.Current;
+
+        public int RuleStatus => status;
+
+        public int Next()
         {
-            if (breakIterator is RuleBasedBreakIterator)
-                return new RBBIWrapper((RuleBasedBreakIterator)breakIterator);
-            else
-                return new BIWrapper(breakIterator);
+            int current = rbbi.Current;
+            int next = rbbi.Next();
+            status = CalcStatus(current, next);
+            return next;
         }
 
-        /// <summary>
-        /// <see cref="RuleBasedBreakIterator"/> wrapper: <see cref="RuleBasedBreakIterator"/> (as long as it's not
-        /// a DictionaryBasedBreakIterator) behaves correctly.
-        /// </summary>
-        private sealed class RBBIWrapper : BreakIteratorWrapper
+        /// <summary>Returns current rule status for the text between breaks. (determines token type)</summary>
+        private int CalcStatus(int current, int next)
         {
-            private readonly RuleBasedBreakIterator rbbi;
-
-            internal RBBIWrapper(RuleBasedBreakIterator rbbi)
-            {
-                this.rbbi = rbbi;
-            }
-
-            public override int Current => rbbi.Current;
-
-            public override int RuleStatus => rbbi.RuleStatus;
-
-            public override int Next()
+            // to support presentation selectors, we need to handle alphanum, num, and none at least, so currently not worth optimizing.
+            // https://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5B%3AEmoji%3A%5D-%5B%3AEmoji_Presentation%3A%5D&g=Word_Break&i=
+            if (next != BreakIterator.Done && IsEmoji(current, next))
             {
-                return rbbi.Next();
+                return ICUTokenizerConfig.EMOJI_SEQUENCE_STATUS;
             }
-
-            public override void SetText(CharacterIterator text)
+            else
             {
-                rbbi.SetText(text);
+                return rbbi.RuleStatus;
             }
         }
 
-        /// <summary>
-        /// Generic <see cref="BreakIterator"/> wrapper: Either the rulestatus method is not
-        /// available or always returns 0. Calculate a rulestatus here so it behaves
-        /// like <see cref="RuleBasedBreakIterator"/>.
-        /// </summary>
-        /// <remarks>
-        /// Note: This is slower than <see cref="RuleBasedBreakIterator"/>.
-        /// </remarks>
-        private sealed class BIWrapper : BreakIteratorWrapper
-        {
-            private readonly BreakIterator bi;
-            private int status;
-
-            internal BIWrapper(BreakIterator bi)
-            {
-                this.bi = bi;
-            }
-
-            public override int Current => bi.Current;
-
-            public override int RuleStatus => status;
+        // See unicode doc L2/16-315 for rationale.
+        // basically for us the ambiguous cases (keycap/etc) as far as types go.
+        internal static readonly UnicodeSet EMOJI_RK = new UnicodeSet("[\u002a\u00230-9©®™〰〽]").Freeze();
+        // faster than doing hasBinaryProperty() checks, at the cost of 1KB ram
+        //internal static readonly UnicodeSet EMOJI = new UnicodeSet("[[:Emoji:][:Extended_Pictographic:]]").Freeze(); // LUCENENET: Extended_Pictographic wasn't added until ICU 62
+        internal static readonly UnicodeSet EMOJI = new UnicodeSet("[[:Emoji:]]").Freeze();
 
-            public override int Next()
-            {
-                int current = bi.Current;
-                int next = bi.Next();
-                status = CalcStatus(current, next);
-                return next;
-            }
-
-            private int CalcStatus(int current, int next)
+        /// <summary>Returns <c>true</c> if the current text represents emoji character or sequence.</summary>
+        private bool IsEmoji(int current, int next)
+        {
+            int begin = start + current;
+            int end = start + next;
+            int codepoint = UTF16.CharAt(text, 0, end, begin);
+            if (EMOJI.Contains(codepoint))
             {
-                if (current == BreakIterator.Done || next == BreakIterator.Done)
-                    return BreakIterator.WordNone;
-
-                int begin = m_start + current;
-                int end = m_start + next;
-
-                int codepoint;
-                for (int i = begin; i < end; i += UTF16.GetCharCount(codepoint))
+                if (EMOJI_RK.Contains(codepoint))
                 {
-                    codepoint = UTF16.CharAt(m_text, 0, end, begin);
-
-                    if (UChar.IsDigit(codepoint))
-                        return BreakIterator.WordNumber;
-                    else if (UChar.IsLetter(codepoint))
-                    {
-                        // TODO: try to separately specify ideographic, kana? 
-                        // [currently all bundled as letter for this case]
-                        return BreakIterator.WordLetter;
-                    }
+                    // if its in EmojiRK, we don't treat it as emoji unless there is evidence it forms emoji sequence,
+                    // an emoji presentation selector or keycap follows.
+                    int trailer = begin + Character.CharCount(codepoint);
+                    return trailer < end && (text[trailer] == 0xFE0F || text[trailer] == 0x20E3);
+                }
+                else
+                {
+                    return true;
                 }
-
-                return BreakIterator.WordNone;
             }
+            return false;
+        }
 
-            public override void SetText(CharacterIterator text)
-            {
-                bi.SetText(text);
-                status = BreakIterator.WordNone;
-            }
+        public void SetText(char[] text, int start, int length)
+        {
+            this.text = text;
+            this.start = start;
+            textIterator.SetText(text, start, length);
+            rbbi.SetText(textIterator);
+            status = RuleBasedBreakIterator.WordNone;
         }
     }
 }
diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/CharArrayIterator.cs b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/CharArrayIterator.cs
index 064604f..b4eef83 100644
--- a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/CharArrayIterator.cs
+++ b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/CharArrayIterator.cs
@@ -1,4 +1,4 @@
-// Lucene version compatibility level 7.1.0
+// Lucene version compatibility level 8.6.1
 #if FEATURE_BREAKITERATOR
 using ICU4N.Support.Text;
 using Lucene.Net.Support;
diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/CompositeBreakIterator.cs b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/CompositeBreakIterator.cs
index d697ae1..f628e81 100644
--- a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/CompositeBreakIterator.cs
+++ b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/CompositeBreakIterator.cs
@@ -1,4 +1,4 @@
-// Lucene version compatibility level 7.1.0
+// Lucene version compatibility level 8.6.1
 using ICU4N;
 using ICU4N.Globalization;
 using ICU4N.Text;
@@ -124,8 +124,8 @@ namespace Lucene.Net.Analysis.Icu.Segmentation
 
         private BreakIteratorWrapper GetBreakIterator(int scriptCode)
         {
-            if (wordBreakers[scriptCode] == null)
-                wordBreakers[scriptCode] = BreakIteratorWrapper.Wrap(config.GetBreakIterator(scriptCode));
+            if (wordBreakers[scriptCode] is null)
+                wordBreakers[scriptCode] = new BreakIteratorWrapper(config.GetBreakIterator(scriptCode));
             return wordBreakers[scriptCode];
         }
     }
diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/DefaultICUTokenizerConfig.cs b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/DefaultICUTokenizerConfig.cs
index 447567c..b6093cb 100644
--- a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/DefaultICUTokenizerConfig.cs
+++ b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/DefaultICUTokenizerConfig.cs
@@ -1,4 +1,4 @@
-// Lucene version compatibility level 7.1.0
+// Lucene version compatibility level 8.6.1
 using ICU4N.Globalization;
 using ICU4N.Text;
 using J2N;
@@ -53,6 +53,8 @@ namespace Lucene.Net.Analysis.Icu.Segmentation
         public static readonly string WORD_LETTER = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ALPHANUM];
         /// <summary>Token type for words that appear to be numbers</summary>
         public static readonly string WORD_NUMBER = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.NUM];
+        /// <summary>Token type for words that appear to be emoji sequences</summary>
+        public static readonly string WORD_EMOJI = "<EMOJI>"; //StandardTokenizer.TOKEN_TYPES[StandardTokenizer.EMOJI]; // LUCENENET: 4.8.1 StandardTokenizer doesn't contain EMOJI
 
         /// <summary>
         /// the default breakiterators in use. these can be expensive to
@@ -90,21 +92,21 @@ namespace Lucene.Net.Analysis.Icu.Segmentation
 
         public override bool CombineCJ => cjkAsWords;
 
-        public override BreakIterator GetBreakIterator(int script)
+        public override RuleBasedBreakIterator GetBreakIterator(int script)
         {
             switch (script)
             {
-                case UScript.Japanese: return (BreakIterator)cjkBreakIterator.Clone();
+                case UScript.Japanese: return (RuleBasedBreakIterator)cjkBreakIterator.Clone();
                 case UScript.Myanmar:
                     if (myanmarAsWords)
                     {
-                        return (BreakIterator)defaultBreakIterator.Clone();
+                        return (RuleBasedBreakIterator)defaultBreakIterator.Clone();
                     }
                     else
                     {
-                        return (BreakIterator)myanmarSyllableIterator.Clone();
+                        return (RuleBasedBreakIterator)myanmarSyllableIterator.Clone();
                     }
-                default: return (BreakIterator)defaultBreakIterator.Clone();
+                default: return (RuleBasedBreakIterator)defaultBreakIterator.Clone();
             }
         }
 
@@ -120,6 +122,8 @@ namespace Lucene.Net.Analysis.Icu.Segmentation
                     return script == UScript.Hangul ? WORD_HANGUL : WORD_LETTER;
                 case BreakIterator.WordNumber: //RuleBasedBreakIterator.WORD_NUMBER:
                     return WORD_NUMBER;
+                case EMOJI_SEQUENCE_STATUS:
+                    return WORD_EMOJI;
                 default: /* some other custom code */
                     return "<OTHER>";
             }
diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizer.cs b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizer.cs
index 1afbfc1..2b37cde 100644
--- a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizer.cs
+++ b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizer.cs
@@ -1,4 +1,4 @@
-// Lucene version compatibility level 4.8.1
+// Lucene version compatibility level 8.6.1
 using ICU4N;
 using ICU4N.Text;
 using Lucene.Net.Analysis.Icu.TokenAttributes;
@@ -27,6 +27,7 @@ namespace Lucene.Net.Analysis.Icu.Segmentation
      * See the License for the specific language governing permissions and
      * limitations under the License.
      */
+
     /// <summary>
     /// Breaks text into words according to UAX #29: Unicode Text Segmentation
     /// (http://www.unicode.org/reports/tr29/)
@@ -211,9 +212,9 @@ namespace Lucene.Net.Analysis.Icu.Segmentation
         }
 
         /// <summary>
-        /// Returns true if there is a token from the buffer, or null if it is exhausted.
+        /// Returns <c>true</c> if there is a token from the buffer, or <c>false</c> if it is exhausted.
         /// </summary>
-        /// <returns>true if there is a token from the buffer, or null if it is exhausted.</returns>
+        /// <returns><c>true</c> if there is a token from the buffer, or <c>false</c> if it is exhausted.</returns>
         private bool IncrementTokenBuffer()
         {
             int start = breaker.Current;
@@ -222,21 +223,13 @@ namespace Lucene.Net.Analysis.Icu.Segmentation
 
             // find the next set of boundaries, skipping over non-tokens (rule status 0)
             int end = breaker.Next();
-
-            // LUCENENET specific - ICU 60.1 does not set the rule status back to 0,
-            // so we need to explicitly check whether we went out of bounds.
-            // This is more efficient anyway, since we don't call Next() twice in
-            // this case.
-            if (end == BreakIterator.Done)
-                return false; // BreakIterator exhausted
-
-            while (start != BreakIterator.Done && breaker.RuleStatus == 0)
+            while (end != BreakIterator.Done && breaker.RuleStatus == 0)
             {
                 start = end;
                 end = breaker.Next();
             }
 
-            if (start == BreakIterator.Done)
+            if (end == BreakIterator.Done)
                 return false; // BreakIterator exhausted
 
             termAtt.CopyBuffer(buffer, start, end - start);
diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizerConfig.cs b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizerConfig.cs
index e8014f5..1d3ece1 100644
--- a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizerConfig.cs
+++ b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizerConfig.cs
@@ -1,4 +1,4 @@
-// Lucene version compatibility level 7.1.0
+// Lucene version compatibility level 8.6.1
 using ICU4N.Text;
 using Lucene.Net.Support;
 
@@ -30,15 +30,17 @@ namespace Lucene.Net.Analysis.Icu.Segmentation
     [ExceptionToClassNameConvention]
     public abstract class ICUTokenizerConfig
     {
+        public const int EMOJI_SEQUENCE_STATUS = 299;
+
         /// <summary>
         /// Sole constructor. (For invocation by subclass 
         /// constructors, typically implicit.)
         /// </summary>
-        public ICUTokenizerConfig() { }
+        protected ICUTokenizerConfig() { } // LUCENENET specific - marked protected instead of public
         /// <summary>
         /// Return a breakiterator capable of processing a given script.
         /// </summary>
-        public abstract BreakIterator GetBreakIterator(int script);
+        public abstract RuleBasedBreakIterator GetBreakIterator(int script);
         /// <summary>
         /// Return a token type value for a given script and BreakIterator rule status.
         /// </summary>
diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizerFactory.cs b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizerFactory.cs
index 823e8a6..fe38e72 100644
--- a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizerFactory.cs
+++ b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizerFactory.cs
@@ -1,4 +1,4 @@
-// Lucene version compatibility level < 7.1.0
+// Lucene version compatibility level 8.6.1
 using ICU4N;
 using ICU4N.Globalization;
 using ICU4N.Text;
@@ -69,7 +69,10 @@ namespace Lucene.Net.Analysis.Icu.Segmentation
     [ExceptionToClassNameConvention]
     public class ICUTokenizerFactory : TokenizerFactory, IResourceLoaderAware
     {
-        internal static readonly string RULEFILES = "rulefiles";
+        // SPI Name
+        //public const string NAME = "icu";
+
+        internal const string RULEFILES = "rulefiles";
         private readonly IDictionary<int, string> tailored;
         private ICUTokenizerConfig config;
         private readonly bool cjkAsWords;
@@ -128,11 +131,11 @@ namespace Lucene.Net.Analysis.Icu.Segmentation
                 }
             }
 
-            public override BreakIterator GetBreakIterator(int script)
+            public override RuleBasedBreakIterator GetBreakIterator(int script)
             {
                 if (breakers[script] != null)
                 {
-                    return (BreakIterator)breakers[script].Clone();
+                    return (RuleBasedBreakIterator)breakers[script].Clone();
                 }
                 else
                 {
diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ScriptIterator.cs b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ScriptIterator.cs
index ceda09c..1228c5d 100644
--- a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ScriptIterator.cs
+++ b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ScriptIterator.cs
@@ -1,4 +1,4 @@
-// Lucene version compatibility level 7.1.0
+// Lucene version compatibility level 8.6.1
 using ICU4N;
 using ICU4N.Globalization;
 using ICU4N.Text;


[lucenenet] 05/09: PERFORMANCE: Lucene.Net.Util: Streamlined DefaultAttributeFactory to make the get/update process of creating an attribute WeakReference atomic

Posted by ni...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

nightowl888 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucenenet.git

commit 990f929cca17a0fbfb9bb2a798258a77ebedd6a4
Author: Shad Storhaug <sh...@shadstorhaug.com>
AuthorDate: Mon Aug 24 20:16:52 2020 +0700

    PERFORMANCE: Lucene.Net.Util: Streamlined DefaultAttributeFactory to make the get/update process of creating an attribute WeakReference atomic
---
 src/Lucene.Net/Util/AttributeSource.cs | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/src/Lucene.Net/Util/AttributeSource.cs b/src/Lucene.Net/Util/AttributeSource.cs
index 0f0ed10..7cfa60b 100644
--- a/src/Lucene.Net/Util/AttributeSource.cs
+++ b/src/Lucene.Net/Util/AttributeSource.cs
@@ -108,20 +108,13 @@ namespace Lucene.Net.Util
                     var attClass = typeof(T);
                     Type clazz;
 
-#if !FEATURE_CONDITIONALWEAKTABLE_ADDORUPDATE
                     // LUCENENET: If the weakreference is dead, we need to explicitly remove and re-add its key.
                     // We synchronize on attClassImplMap only to make the operation atomic. This does not actually
                     // utilize the same lock as attClassImplMap does internally, but since this is the only place
                     // it is used, it is fine here.
-
-                    // In .NET Standard 2.1, we can use AddOrUpdate, so don't need the lock.
                     lock (attClassImplMap)
-#endif
                     {
-                        var @ref = attClassImplMap.GetValue(attClass, createValueCallback: (key) =>
-                            CreateAttributeWeakReference(key, out clazz));
-
-                        if (!@ref.TryGetTarget(out clazz))
+                        if (!attClassImplMap.TryGetValue(attClass, out var @ref) || !@ref.TryGetTarget(out clazz))
                         {
 #if FEATURE_CONDITIONALWEAKTABLE_ADDORUPDATE
                             // There is a small chance that multiple threads will get through here, but it doesn't matter


[lucenenet] 09/09: TO REVERT: Lucene.Net.ICU: Added locking to ICUTokenizer to only allow a single thread to manipulate the BreakIterator at a time. This can be reverted when the BreakIterator issue is fixed.

Posted by ni...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

nightowl888 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucenenet.git

commit 6eba1e0648e3460cabc7032fdc976a1f44999465
Author: Shad Storhaug <sh...@shadstorhaug.com>
AuthorDate: Mon Aug 24 15:10:10 2020 +0700

    TO REVERT: Lucene.Net.ICU: Added locking to ICUTokenizer to only allow a single thread to manipulate the BreakIterator at a time. This can be reverted when the BreakIterator issue is fixed.
---
 .../Analysis/Icu/Segmentation/ICUTokenizer.cs      | 29 ++++++++++++++--------
 1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizer.cs b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizer.cs
index 2b37cde..7b31755 100644
--- a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizer.cs
+++ b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizer.cs
@@ -57,6 +57,8 @@ namespace Lucene.Net.Analysis.Icu.Segmentation
         private readonly ITypeAttribute typeAtt;
         private readonly IScriptAttribute scriptAtt;
 
+        private static readonly object syncLock = new object(); // LUCENENET specific - workaround until BreakIterator is made thread safe (LUCENENET TODO: TO REVERT)
+
         /// <summary>
         /// Construct a new <see cref="ICUTokenizer"/> that breaks text into words from the given
         /// <see cref="TextReader"/>.
@@ -109,23 +111,27 @@ namespace Lucene.Net.Analysis.Icu.Segmentation
 
         public override bool IncrementToken()
         {
-            ClearAttributes();
-            if (length == 0)
-                Refill();
-            while (!IncrementTokenBuffer())
+            lock (syncLock)
             {
-                Refill();
-                if (length <= 0) // no more bytes to read;
-                    return false;
+                ClearAttributes();
+                if (length == 0)
+                    Refill();
+                while (!IncrementTokenBuffer())
+                {
+                    Refill();
+                    if (length <= 0) // no more bytes to read;
+                        return false;
+                }
+                return true;
             }
-            return true;
         }
 
 
         public override void Reset()
         {
             base.Reset();
-            breaker.SetText(buffer, 0, 0);
+            lock (syncLock)
+                breaker.SetText(buffer, 0, 0);
             length = usableLength = offset = 0;
         }
 
@@ -187,7 +193,8 @@ namespace Lucene.Net.Analysis.Icu.Segmentation
                                 */
             }
 
-            breaker.SetText(buffer, 0, Math.Max(0, usableLength));
+            lock (syncLock)
+                breaker.SetText(buffer, 0, Math.Max(0, usableLength));
         }
 
         // TODO: refactor to a shared readFully somewhere
@@ -236,7 +243,7 @@ namespace Lucene.Net.Analysis.Icu.Segmentation
             offsetAtt.SetOffset(CorrectOffset(offset + start), CorrectOffset(offset + end));
             typeAtt.Type = config.GetType(breaker.ScriptCode, breaker.RuleStatus);
             scriptAtt.Code = breaker.ScriptCode;
-
+            
             return true;
         }
     }


[lucenenet] 06/09: PERFORMANCE: Lucene.Net.Util.AttributeSource.DefaultAttributeFactory: Use external lock for better performance and removed redundant GetOrAdd() call

Posted by ni...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

nightowl888 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucenenet.git

commit 57ed84af858b74429e391ddc52d5190c2334651a
Author: Shad Storhaug <sh...@shadstorhaug.com>
AuthorDate: Mon Aug 24 20:39:51 2020 +0700

    PERFORMANCE: Lucene.Net.Util.AttributeSource.DefaultAttributeFactory: Use external lock for better performance and removed redundant GetOrAdd() call
---
 src/Lucene.Net/Util/AttributeSource.cs | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/Lucene.Net/Util/AttributeSource.cs b/src/Lucene.Net/Util/AttributeSource.cs
index 7cfa60b..0b5f922 100644
--- a/src/Lucene.Net/Util/AttributeSource.cs
+++ b/src/Lucene.Net/Util/AttributeSource.cs
@@ -61,6 +61,7 @@ namespace Lucene.Net.Util
                 // identity for a class, so there is no need for an identity wrapper for it.
                 private static readonly ConditionalWeakTable<Type, WeakReference<Type>> attClassImplMap =
                     new ConditionalWeakTable<Type, WeakReference<Type>>();
+                private static readonly object attClassImplMapLock = new object();
 
                 internal DefaultAttributeFactory()
                 {
@@ -108,16 +109,13 @@ namespace Lucene.Net.Util
                     var attClass = typeof(T);
                     Type clazz;
 
-                    // LUCENENET: If the weakreference is dead, we need to explicitly remove and re-add its key.
-                    // We synchronize on attClassImplMap only to make the operation atomic. This does not actually
-                    // utilize the same lock as attClassImplMap does internally, but since this is the only place
-                    // it is used, it is fine here.
-                    lock (attClassImplMap)
+                    // LUCENENET: If the weakreference is dead, we need to explicitly update its key.
+                    // We synchronize on attClassImplMapLock to make the operation atomic.
+                    lock (attClassImplMapLock)
                     {
                         if (!attClassImplMap.TryGetValue(attClass, out var @ref) || !@ref.TryGetTarget(out clazz))
                         {
 #if FEATURE_CONDITIONALWEAKTABLE_ADDORUPDATE
-                            // There is a small chance that multiple threads will get through here, but it doesn't matter
                             attClassImplMap.AddOrUpdate(attClass, CreateAttributeWeakReference(attClass, out clazz));
 #else
                             attClassImplMap.Remove(attClass);


[lucenenet] 08/09: Lucene.Net.ICU: Added locking to ThaiTokenizer to only allow a single thread to manipulate the BreakIterator at a time. This helps, but is only a partial fix.

Posted by ni...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

nightowl888 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucenenet.git

commit 5ff92583f219fa851375c9be12ae1b6bf52383c1
Author: Shad Storhaug <sh...@shadstorhaug.com>
AuthorDate: Mon Aug 24 15:09:37 2020 +0700

    Lucene.Net.ICU: Added locking to ThaiTokenizer to only allow a single thread to manipulate the BreakIterator at a time. This helps, but is only a partial fix.
---
 .../Analysis/Th/ThaiTokenizer.cs                   | 92 ++++++++++++++++------
 1 file changed, 66 insertions(+), 26 deletions(-)

diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs
index 7e0754c..d1f80f1 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs
@@ -41,13 +41,27 @@ namespace Lucene.Net.Analysis.Th
     /// </summary>
     public class ThaiTokenizer : SegmentingTokenizerBase
     {
+        private static readonly object syncLock = new object(); // LUCENENET specific - workaround until BreakIterator is made thread safe  (LUCENENET TODO: TO REVERT)
+
         // LUCENENET specific - DBBI_AVAILABLE removed because ICU always has a dictionary-based BreakIterator
-        private static readonly BreakIterator proto = BreakIterator.GetWordInstance(new CultureInfo("th"));
+        private static readonly BreakIterator proto = LoadProto();
 
         /// <summary>
         /// used for breaking the text into sentences
         /// </summary>
-        private static readonly BreakIterator sentenceProto = BreakIterator.GetSentenceInstance(CultureInfo.InvariantCulture);
+        private static readonly BreakIterator sentenceProto = LoadSentenceProto();
+
+        private static BreakIterator LoadProto()
+        {
+            lock (syncLock)
+                return BreakIterator.GetWordInstance(new CultureInfo("th"));
+        }
+
+        private static BreakIterator LoadSentenceProto()
+        {
+            lock (syncLock)
+                return BreakIterator.GetSentenceInstance(CultureInfo.InvariantCulture);
+        }
 
         private readonly ThaiWordBreaker wordBreaker;
         private readonly CharArrayIterator wrapper = Analysis.Util.CharArrayIterator.NewWordInstance();
@@ -68,48 +82,74 @@ namespace Lucene.Net.Analysis.Th
         /// <summary>
         /// Creates a new <see cref="ThaiTokenizer"/>, supplying the <see cref="Lucene.Net.Util.AttributeSource.AttributeFactory"/> </summary>
         public ThaiTokenizer(AttributeFactory factory, TextReader reader)
-            : base(factory, reader, (BreakIterator)sentenceProto.Clone())
+            : base(factory, reader, CreateSentenceClone())
         {
             // LUCENENET specific - DBBI_AVAILABLE removed because ICU always has a dictionary-based BreakIterator
 
-            wordBreaker = new ThaiWordBreaker((BreakIterator)proto.Clone());
+            lock (syncLock)
+                wordBreaker = new ThaiWordBreaker((BreakIterator)proto.Clone());
             termAtt = AddAttribute<ICharTermAttribute>();
             offsetAtt = AddAttribute<IOffsetAttribute>();
         }
 
-        protected override void SetNextSentence(int sentenceStart, int sentenceEnd)
+        private static BreakIterator CreateSentenceClone()
         {
-            this.sentenceStart = sentenceStart;
-            this.sentenceEnd = sentenceEnd;
-            wrapper.SetText(m_buffer, sentenceStart, sentenceEnd - sentenceStart);
-            wordBreaker.SetText(new string(wrapper.Text, wrapper.Start, wrapper.Length));
+            lock (syncLock)
+                return (BreakIterator)sentenceProto.Clone();
         }
 
-        protected override bool IncrementWord()
+        public override void Reset()
+        {
+            lock (syncLock)
+                base.Reset();
+        }
+
+        public override State CaptureState()
+        {
+            lock (syncLock)
+                return base.CaptureState();
+        }
+
+        protected override void SetNextSentence(int sentenceStart, int sentenceEnd)
         {
-            int start = wordBreaker.Current;
-            if (start == BreakIterator.Done)
+            lock (syncLock)
             {
-                return false; // BreakIterator exhausted
+                this.sentenceStart = sentenceStart;
+                this.sentenceEnd = sentenceEnd;
+                wrapper.SetText(m_buffer, sentenceStart, sentenceEnd - sentenceStart);
+                wordBreaker.SetText(new string(wrapper.Text, wrapper.Start, wrapper.Length));
             }
+        }
 
-            // find the next set of boundaries, skipping over non-tokens
-            int end = wordBreaker.Next();
-            while (end != BreakIterator.Done && !Character.IsLetterOrDigit(Character.CodePointAt(m_buffer, sentenceStart + start, sentenceEnd)))
+        protected override bool IncrementWord()
+        {
+            int start, end;
+            lock (syncLock)
             {
-                start = end;
+                start = wordBreaker.Current;
+                if (start == BreakIterator.Done)
+                {
+                    return false; // BreakIterator exhausted
+                }
+
+                // find the next set of boundaries, skipping over non-tokens
                 end = wordBreaker.Next();
-            }
+                while (end != BreakIterator.Done && !Character.IsLetterOrDigit(Character.CodePointAt(m_buffer, sentenceStart + start, sentenceEnd)))
+                {
+                    start = end;
+                    end = wordBreaker.Next();
+                }
 
-            if (end == BreakIterator.Done)
-            {
-                return false; // BreakIterator exhausted
-            }
+                if (end == BreakIterator.Done)
+                {
+                    return false; // BreakIterator exhausted
+                }
 
-            ClearAttributes();
-            termAtt.CopyBuffer(m_buffer, sentenceStart + start, end - start);
-            offsetAtt.SetOffset(CorrectOffset(m_offset + sentenceStart + start), CorrectOffset(m_offset + sentenceStart + end));
-            return true;
+                ClearAttributes();
+                termAtt.CopyBuffer(m_buffer, sentenceStart + start, end - start);
+                offsetAtt.SetOffset(CorrectOffset(m_offset + sentenceStart + start), CorrectOffset(m_offset + sentenceStart + end));
+                return true;
+            }
         }
     }
 


[lucenenet] 02/09: Lucene.Net.ICU: Reverted extra locking/cloning for ThaiTokenizer

Posted by ni...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

nightowl888 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucenenet.git

commit 6161f4f10cb1ddbd49cda0432dce0007d27a1891
Author: Shad Storhaug <sh...@shadstorhaug.com>
AuthorDate: Sun Aug 23 23:38:41 2020 +0700

    Lucene.Net.ICU: Reverted extra locking/cloning for ThaiTokenizer
---
 .../Analysis/Th/ThaiTokenizer.cs                   | 64 +++++++++-------------
 1 file changed, 25 insertions(+), 39 deletions(-)

diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs
index 283256f..7e0754c 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs
@@ -42,12 +42,12 @@ namespace Lucene.Net.Analysis.Th
     public class ThaiTokenizer : SegmentingTokenizerBase
     {
         // LUCENENET specific - DBBI_AVAILABLE removed because ICU always has a dictionary-based BreakIterator
-        private static readonly BreakIterator proto = (BreakIterator)BreakIterator.GetWordInstance(new CultureInfo("th")).Clone();
+        private static readonly BreakIterator proto = BreakIterator.GetWordInstance(new CultureInfo("th"));
 
         /// <summary>
         /// used for breaking the text into sentences
         /// </summary>
-        private static readonly BreakIterator sentenceProto = (BreakIterator)BreakIterator.GetSentenceInstance(CultureInfo.InvariantCulture).Clone();
+        private static readonly BreakIterator sentenceProto = BreakIterator.GetSentenceInstance(CultureInfo.InvariantCulture);
 
         private readonly ThaiWordBreaker wordBreaker;
         private readonly CharArrayIterator wrapper = Analysis.Util.CharArrayIterator.NewWordInstance();
@@ -58,8 +58,6 @@ namespace Lucene.Net.Analysis.Th
         private readonly ICharTermAttribute termAtt;
         private readonly IOffsetAttribute offsetAtt;
 
-        private readonly object syncLock = new object();
-
         /// <summary>
         /// Creates a new <see cref="ThaiTokenizer"/> </summary>
         public ThaiTokenizer(TextReader reader)
@@ -81,49 +79,37 @@ namespace Lucene.Net.Analysis.Th
 
         protected override void SetNextSentence(int sentenceStart, int sentenceEnd)
         {
-            // LUCENENET TODO: This class isn't passing thread safety checks.
-            // Adding locking and extra cloning of BreakIterator seems to help, but
-            // it is not a complete fix.
-            lock (syncLock)
-            {
-                this.sentenceStart = sentenceStart;
-                this.sentenceEnd = sentenceEnd;
-                wrapper.SetText(m_buffer, sentenceStart, sentenceEnd - sentenceStart);
-                wordBreaker.SetText(new string(wrapper.Text, wrapper.Start, wrapper.Length));
-            }
+            this.sentenceStart = sentenceStart;
+            this.sentenceEnd = sentenceEnd;
+            wrapper.SetText(m_buffer, sentenceStart, sentenceEnd - sentenceStart);
+            wordBreaker.SetText(new string(wrapper.Text, wrapper.Start, wrapper.Length));
         }
 
         protected override bool IncrementWord()
         {
-            // LUCENENET TODO: This class isn't passing thread safety checks.
-            // Adding locking and extra cloning of BreakIterator seems to help, but
-            // it is not a complete fix.
-            lock (syncLock)
+            int start = wordBreaker.Current;
+            if (start == BreakIterator.Done)
             {
-                int start = wordBreaker.Current;
-                if (start == BreakIterator.Done)
-                {
-                    return false; // BreakIterator exhausted
-                }
-
-                // find the next set of boundaries, skipping over non-tokens
-                int end = wordBreaker.Next();
-                while (end != BreakIterator.Done && !Character.IsLetterOrDigit(Character.CodePointAt(m_buffer, sentenceStart + start, sentenceEnd)))
-                {
-                    start = end;
-                    end = wordBreaker.Next();
-                }
+                return false; // BreakIterator exhausted
+            }
 
-                if (end == BreakIterator.Done)
-                {
-                    return false; // BreakIterator exhausted
-                }
+            // find the next set of boundaries, skipping over non-tokens
+            int end = wordBreaker.Next();
+            while (end != BreakIterator.Done && !Character.IsLetterOrDigit(Character.CodePointAt(m_buffer, sentenceStart + start, sentenceEnd)))
+            {
+                start = end;
+                end = wordBreaker.Next();
+            }
 
-                ClearAttributes();
-                termAtt.CopyBuffer(m_buffer, sentenceStart + start, end - start);
-                offsetAtt.SetOffset(CorrectOffset(m_offset + sentenceStart + start), CorrectOffset(m_offset + sentenceStart + end));
-                return true;
+            if (end == BreakIterator.Done)
+            {
+                return false; // BreakIterator exhausted
             }
+
+            ClearAttributes();
+            termAtt.CopyBuffer(m_buffer, sentenceStart + start, end - start);
+            offsetAtt.SetOffset(CorrectOffset(m_offset + sentenceStart + start), CorrectOffset(m_offset + sentenceStart + end));
+            return true;
         }
     }