You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by ni...@apache.org on 2020/08/24 21:19:33 UTC

[lucenenet] 01/09: Lucene.Net.Analysis.ICU: Updated Segmentation files to Lucene 8.6.1 to account for the latest features of ICU

This is an automated email from the ASF dual-hosted git repository.

nightowl888 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucenenet.git

commit 672c5a94abfc179121146cae4fbb13379f7b7203
Author: Shad Storhaug <sh...@shadstorhaug.com>
AuthorDate: Sat Aug 22 21:08:10 2020 +0700

    Lucene.Net.Analysis.ICU: Updated Segmentation files to Lucene 8.6.1 to account for the latest features of ICU
---
 .../Icu/Segmentation/BreakIteratorWrapper.cs       | 179 +++++++--------------
 .../Analysis/Icu/Segmentation/CharArrayIterator.cs |   2 +-
 .../Icu/Segmentation/CompositeBreakIterator.cs     |   6 +-
 .../Icu/Segmentation/DefaultICUTokenizerConfig.cs  |  16 +-
 .../Analysis/Icu/Segmentation/ICUTokenizer.cs      |  19 +--
 .../Icu/Segmentation/ICUTokenizerConfig.cs         |   8 +-
 .../Icu/Segmentation/ICUTokenizerFactory.cs        |  11 +-
 .../Analysis/Icu/Segmentation/ScriptIterator.cs    |   2 +-
 8 files changed, 93 insertions(+), 150 deletions(-)

diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/BreakIteratorWrapper.cs b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/BreakIteratorWrapper.cs
index af50927..d01aacc 100644
--- a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/BreakIteratorWrapper.cs
+++ b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/BreakIteratorWrapper.cs
@@ -1,7 +1,6 @@
-// Lucene version compatibility level 7.1.0
-using ICU4N;
-using ICU4N.Support.Text;
+// Lucene version compatibility level 8.6.1
 using ICU4N.Text;
+using J2N;
 
 namespace Lucene.Net.Analysis.Icu.Segmentation
 {
@@ -23,146 +22,88 @@ namespace Lucene.Net.Analysis.Icu.Segmentation
      */
 
     /// <summary>
-    /// Contain all the issues surrounding BreakIterators in ICU in one place.
-    /// Basically this boils down to the fact that they aren't very friendly to any
-    /// sort of OO design.
-    /// <para/>
-    /// http://bugs.icu-project.org/trac/ticket/5901: RBBI.getRuleStatus(), hoist to
-    /// BreakIterator from <see cref="RuleBasedBreakIterator"/>
-    /// <para/>
-    /// DictionaryBasedBreakIterator is a subclass of <see cref="RuleBasedBreakIterator"/>, but
-    /// doesn't actually behave as a subclass: it always returns 0 for
-    /// getRuleStatus(): 
-    /// http://bugs.icu-project.org/trac/ticket/4730: Thai RBBI, no boundary type
-    /// tags
+    /// Wraps <see cref="RuleBasedBreakIterator"/>, making object reuse convenient and
+    /// emitting a rule status for emoji sequences.
     /// <para/>
     /// @lucene.experimental
     /// </summary>
-    internal abstract class BreakIteratorWrapper
+    internal sealed class BreakIteratorWrapper
     {
-        protected readonly CharArrayIterator m_textIterator = new CharArrayIterator();
-        protected char[] m_text;
-        protected int m_start;
-        protected int m_length;
-
-        public abstract int Next();
-        public abstract int Current { get; }
-        public abstract int RuleStatus { get; }
-        public abstract void SetText(CharacterIterator text);
+        private readonly CharArrayIterator textIterator = new CharArrayIterator();
+        private readonly RuleBasedBreakIterator rbbi;
+        private char[] text;
+        private int start;
+        private int status;
 
-        public void SetText(char[] text, int start, int length)
+        internal BreakIteratorWrapper(RuleBasedBreakIterator rbbi)
         {
-            this.m_text = text;
-            this.m_start = start;
-            this.m_length = length;
-            m_textIterator.SetText(text, start, length);
-            SetText(m_textIterator);
+            this.rbbi = rbbi;
         }
 
-        /// <summary>
-        /// If its a <see cref="RuleBasedBreakIterator"/>, the rule status can be used for token type. If it's
-        /// any other <see cref="BreakIterator"/>, the rulestatus method is not available, so treat
-        /// it like a generic <see cref="BreakIterator"/>.
-        /// </summary>
-        /// <param name="breakIterator"></param>
-        /// <returns></returns>
-        public static BreakIteratorWrapper Wrap(BreakIterator breakIterator)
+        public int Current => rbbi.Current;
+
+        public int RuleStatus => status;
+
+        public int Next()
         {
-            if (breakIterator is RuleBasedBreakIterator)
-                return new RBBIWrapper((RuleBasedBreakIterator)breakIterator);
-            else
-                return new BIWrapper(breakIterator);
+            int current = rbbi.Current;
+            int next = rbbi.Next();
+            status = CalcStatus(current, next);
+            return next;
         }
 
-        /// <summary>
-        /// <see cref="RuleBasedBreakIterator"/> wrapper: <see cref="RuleBasedBreakIterator"/> (as long as it's not
-        /// a DictionaryBasedBreakIterator) behaves correctly.
-        /// </summary>
-        private sealed class RBBIWrapper : BreakIteratorWrapper
+        /// <summary>Returns current rule status for the text between breaks. (determines token type)</summary>
+        private int CalcStatus(int current, int next)
         {
-            private readonly RuleBasedBreakIterator rbbi;
-
-            internal RBBIWrapper(RuleBasedBreakIterator rbbi)
-            {
-                this.rbbi = rbbi;
-            }
-
-            public override int Current => rbbi.Current;
-
-            public override int RuleStatus => rbbi.RuleStatus;
-
-            public override int Next()
+            // to support presentation selectors, we need to handle alphanum, num, and none at least, so currently not worth optimizing.
+            // https://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5B%3AEmoji%3A%5D-%5B%3AEmoji_Presentation%3A%5D&g=Word_Break&i=
+            if (next != BreakIterator.Done && IsEmoji(current, next))
             {
-                return rbbi.Next();
+                return ICUTokenizerConfig.EMOJI_SEQUENCE_STATUS;
             }
-
-            public override void SetText(CharacterIterator text)
+            else
             {
-                rbbi.SetText(text);
+                return rbbi.RuleStatus;
             }
         }
 
-        /// <summary>
-        /// Generic <see cref="BreakIterator"/> wrapper: Either the rulestatus method is not
-        /// available or always returns 0. Calculate a rulestatus here so it behaves
-        /// like <see cref="RuleBasedBreakIterator"/>.
-        /// </summary>
-        /// <remarks>
-        /// Note: This is slower than <see cref="RuleBasedBreakIterator"/>.
-        /// </remarks>
-        private sealed class BIWrapper : BreakIteratorWrapper
-        {
-            private readonly BreakIterator bi;
-            private int status;
-
-            internal BIWrapper(BreakIterator bi)
-            {
-                this.bi = bi;
-            }
-
-            public override int Current => bi.Current;
-
-            public override int RuleStatus => status;
+        // See unicode doc L2/16-315 for rationale.
+        // Basically, for us these are the ambiguous cases (keycap, etc.) as far as token types go.
+        internal static readonly UnicodeSet EMOJI_RK = new UnicodeSet("[\u002a\u00230-9©®™〰〽]").Freeze();
+        // faster than doing hasBinaryProperty() checks, at the cost of 1KB ram
+        //internal static readonly UnicodeSet EMOJI = new UnicodeSet("[[:Emoji:][:Extended_Pictographic:]]").Freeze(); // LUCENENET: Extended_Pictographic wasn't added until ICU 62
+        internal static readonly UnicodeSet EMOJI = new UnicodeSet("[[:Emoji:]]").Freeze();
 
-            public override int Next()
-            {
-                int current = bi.Current;
-                int next = bi.Next();
-                status = CalcStatus(current, next);
-                return next;
-            }
-
-            private int CalcStatus(int current, int next)
+        /// <summary>Returns <c>true</c> if the current text represents an emoji character or sequence.</summary>
+        private bool IsEmoji(int current, int next)
+        {
+            int begin = start + current;
+            int end = start + next;
+            int codepoint = UTF16.CharAt(text, 0, end, begin);
+            if (EMOJI.Contains(codepoint))
             {
-                if (current == BreakIterator.Done || next == BreakIterator.Done)
-                    return BreakIterator.WordNone;
-
-                int begin = m_start + current;
-                int end = m_start + next;
-
-                int codepoint;
-                for (int i = begin; i < end; i += UTF16.GetCharCount(codepoint))
+                if (EMOJI_RK.Contains(codepoint))
                 {
-                    codepoint = UTF16.CharAt(m_text, 0, end, begin);
-
-                    if (UChar.IsDigit(codepoint))
-                        return BreakIterator.WordNumber;
-                    else if (UChar.IsLetter(codepoint))
-                    {
-                        // TODO: try to separately specify ideographic, kana? 
-                        // [currently all bundled as letter for this case]
-                        return BreakIterator.WordLetter;
-                    }
+                    // if it's in EMOJI_RK, we don't treat it as emoji unless there is evidence it forms an emoji sequence:
+                    // an emoji presentation selector or keycap follows.
+                    int trailer = begin + Character.CharCount(codepoint);
+                    return trailer < end && (text[trailer] == 0xFE0F || text[trailer] == 0x20E3);
+                }
+                else
+                {
+                    return true;
                 }
-
-                return BreakIterator.WordNone;
             }
+            return false;
+        }
 
-            public override void SetText(CharacterIterator text)
-            {
-                bi.SetText(text);
-                status = BreakIterator.WordNone;
-            }
+        public void SetText(char[] text, int start, int length)
+        {
+            this.text = text;
+            this.start = start;
+            textIterator.SetText(text, start, length);
+            rbbi.SetText(textIterator);
+            status = RuleBasedBreakIterator.WordNone;
         }
     }
 }
diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/CharArrayIterator.cs b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/CharArrayIterator.cs
index 064604f..b4eef83 100644
--- a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/CharArrayIterator.cs
+++ b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/CharArrayIterator.cs
@@ -1,4 +1,4 @@
-// Lucene version compatibility level 7.1.0
+// Lucene version compatibility level 8.6.1
 #if FEATURE_BREAKITERATOR
 using ICU4N.Support.Text;
 using Lucene.Net.Support;
diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/CompositeBreakIterator.cs b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/CompositeBreakIterator.cs
index d697ae1..f628e81 100644
--- a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/CompositeBreakIterator.cs
+++ b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/CompositeBreakIterator.cs
@@ -1,4 +1,4 @@
-// Lucene version compatibility level 7.1.0
+// Lucene version compatibility level 8.6.1
 using ICU4N;
 using ICU4N.Globalization;
 using ICU4N.Text;
@@ -124,8 +124,8 @@ namespace Lucene.Net.Analysis.Icu.Segmentation
 
         private BreakIteratorWrapper GetBreakIterator(int scriptCode)
         {
-            if (wordBreakers[scriptCode] == null)
-                wordBreakers[scriptCode] = BreakIteratorWrapper.Wrap(config.GetBreakIterator(scriptCode));
+            if (wordBreakers[scriptCode] is null)
+                wordBreakers[scriptCode] = new BreakIteratorWrapper(config.GetBreakIterator(scriptCode));
             return wordBreakers[scriptCode];
         }
     }
diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/DefaultICUTokenizerConfig.cs b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/DefaultICUTokenizerConfig.cs
index 447567c..b6093cb 100644
--- a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/DefaultICUTokenizerConfig.cs
+++ b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/DefaultICUTokenizerConfig.cs
@@ -1,4 +1,4 @@
-// Lucene version compatibility level 7.1.0
+// Lucene version compatibility level 8.6.1
 using ICU4N.Globalization;
 using ICU4N.Text;
 using J2N;
@@ -53,6 +53,8 @@ namespace Lucene.Net.Analysis.Icu.Segmentation
         public static readonly string WORD_LETTER = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ALPHANUM];
         /// <summary>Token type for words that appear to be numbers</summary>
         public static readonly string WORD_NUMBER = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.NUM];
+        /// <summary>Token type for words that appear to be emoji sequences</summary>
+        public static readonly string WORD_EMOJI = "<EMOJI>"; //StandardTokenizer.TOKEN_TYPES[StandardTokenizer.EMOJI]; // LUCENENET: 4.8.1 StandardTokenizer doesn't contain EMOJI
 
         /// <summary>
         /// the default breakiterators in use. these can be expensive to
@@ -90,21 +92,21 @@ namespace Lucene.Net.Analysis.Icu.Segmentation
 
         public override bool CombineCJ => cjkAsWords;
 
-        public override BreakIterator GetBreakIterator(int script)
+        public override RuleBasedBreakIterator GetBreakIterator(int script)
         {
             switch (script)
             {
-                case UScript.Japanese: return (BreakIterator)cjkBreakIterator.Clone();
+                case UScript.Japanese: return (RuleBasedBreakIterator)cjkBreakIterator.Clone();
                 case UScript.Myanmar:
                     if (myanmarAsWords)
                     {
-                        return (BreakIterator)defaultBreakIterator.Clone();
+                        return (RuleBasedBreakIterator)defaultBreakIterator.Clone();
                     }
                     else
                     {
-                        return (BreakIterator)myanmarSyllableIterator.Clone();
+                        return (RuleBasedBreakIterator)myanmarSyllableIterator.Clone();
                     }
-                default: return (BreakIterator)defaultBreakIterator.Clone();
+                default: return (RuleBasedBreakIterator)defaultBreakIterator.Clone();
             }
         }
 
@@ -120,6 +122,8 @@ namespace Lucene.Net.Analysis.Icu.Segmentation
                     return script == UScript.Hangul ? WORD_HANGUL : WORD_LETTER;
                 case BreakIterator.WordNumber: //RuleBasedBreakIterator.WORD_NUMBER:
                     return WORD_NUMBER;
+                case EMOJI_SEQUENCE_STATUS:
+                    return WORD_EMOJI;
                 default: /* some other custom code */
                     return "<OTHER>";
             }
diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizer.cs b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizer.cs
index 1afbfc1..2b37cde 100644
--- a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizer.cs
+++ b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizer.cs
@@ -1,4 +1,4 @@
-// Lucene version compatibility level 4.8.1
+// Lucene version compatibility level 8.6.1
 using ICU4N;
 using ICU4N.Text;
 using Lucene.Net.Analysis.Icu.TokenAttributes;
@@ -27,6 +27,7 @@ namespace Lucene.Net.Analysis.Icu.Segmentation
      * See the License for the specific language governing permissions and
      * limitations under the License.
      */
+
     /// <summary>
     /// Breaks text into words according to UAX #29: Unicode Text Segmentation
     /// (http://www.unicode.org/reports/tr29/)
@@ -211,9 +212,9 @@ namespace Lucene.Net.Analysis.Icu.Segmentation
         }
 
         /// <summary>
-        /// Returns true if there is a token from the buffer, or null if it is exhausted.
+        /// Returns <c>true</c> if there is a token from the buffer, or <c>false</c> if it is exhausted.
         /// </summary>
-        /// <returns>true if there is a token from the buffer, or null if it is exhausted.</returns>
+        /// <returns><c>true</c> if there is a token from the buffer, or <c>false</c> if it is exhausted.</returns>
         private bool IncrementTokenBuffer()
         {
             int start = breaker.Current;
@@ -222,21 +223,13 @@ namespace Lucene.Net.Analysis.Icu.Segmentation
 
             // find the next set of boundaries, skipping over non-tokens (rule status 0)
             int end = breaker.Next();
-
-            // LUCENENET specific - ICU 60.1 does not set the rule status back to 0,
-            // so we need to explicitly check whether we went out of bounds.
-            // This is more efficient anyway, since we don't call Next() twice in
-            // this case.
-            if (end == BreakIterator.Done)
-                return false; // BreakIterator exhausted
-
-            while (start != BreakIterator.Done && breaker.RuleStatus == 0)
+            while (end != BreakIterator.Done && breaker.RuleStatus == 0)
             {
                 start = end;
                 end = breaker.Next();
             }
 
-            if (start == BreakIterator.Done)
+            if (end == BreakIterator.Done)
                 return false; // BreakIterator exhausted
 
             termAtt.CopyBuffer(buffer, start, end - start);
diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizerConfig.cs b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizerConfig.cs
index e8014f5..1d3ece1 100644
--- a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizerConfig.cs
+++ b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizerConfig.cs
@@ -1,4 +1,4 @@
-// Lucene version compatibility level 7.1.0
+// Lucene version compatibility level 8.6.1
 using ICU4N.Text;
 using Lucene.Net.Support;
 
@@ -30,15 +30,17 @@ namespace Lucene.Net.Analysis.Icu.Segmentation
     [ExceptionToClassNameConvention]
     public abstract class ICUTokenizerConfig
     {
+        public const int EMOJI_SEQUENCE_STATUS = 299;
+
         /// <summary>
         /// Sole constructor. (For invocation by subclass 
         /// constructors, typically implicit.)
         /// </summary>
-        public ICUTokenizerConfig() { }
+        protected ICUTokenizerConfig() { } // LUCENENET specific - marked protected instead of public
         /// <summary>
         /// Return a breakiterator capable of processing a given script.
         /// </summary>
-        public abstract BreakIterator GetBreakIterator(int script);
+        public abstract RuleBasedBreakIterator GetBreakIterator(int script);
         /// <summary>
         /// Return a token type value for a given script and BreakIterator rule status.
         /// </summary>
diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizerFactory.cs b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizerFactory.cs
index 823e8a6..fe38e72 100644
--- a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizerFactory.cs
+++ b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizerFactory.cs
@@ -1,4 +1,4 @@
-// Lucene version compatibility level < 7.1.0
+// Lucene version compatibility level 8.6.1
 using ICU4N;
 using ICU4N.Globalization;
 using ICU4N.Text;
@@ -69,7 +69,10 @@ namespace Lucene.Net.Analysis.Icu.Segmentation
     [ExceptionToClassNameConvention]
     public class ICUTokenizerFactory : TokenizerFactory, IResourceLoaderAware
     {
-        internal static readonly string RULEFILES = "rulefiles";
+        // SPI Name
+        //public const string NAME = "icu";
+
+        internal const string RULEFILES = "rulefiles";
         private readonly IDictionary<int, string> tailored;
         private ICUTokenizerConfig config;
         private readonly bool cjkAsWords;
@@ -128,11 +131,11 @@ namespace Lucene.Net.Analysis.Icu.Segmentation
                 }
             }
 
-            public override BreakIterator GetBreakIterator(int script)
+            public override RuleBasedBreakIterator GetBreakIterator(int script)
             {
                 if (breakers[script] != null)
                 {
-                    return (BreakIterator)breakers[script].Clone();
+                    return (RuleBasedBreakIterator)breakers[script].Clone();
                 }
                 else
                 {
diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ScriptIterator.cs b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ScriptIterator.cs
index ceda09c..1228c5d 100644
--- a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ScriptIterator.cs
+++ b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ScriptIterator.cs
@@ -1,4 +1,4 @@
-// Lucene version compatibility level 7.1.0
+// Lucene version compatibility level 8.6.1
 using ICU4N;
 using ICU4N.Globalization;
 using ICU4N.Text;