You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by ni...@apache.org on 2020/08/24 21:19:33 UTC
[lucenenet] 01/09: Lucene.Net.Analysis.ICU: Updated Segmentation
files to Lucene 8.6.1 to account for the latest features of ICU
This is an automated email from the ASF dual-hosted git repository.
nightowl888 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucenenet.git
commit 672c5a94abfc179121146cae4fbb13379f7b7203
Author: Shad Storhaug <sh...@shadstorhaug.com>
AuthorDate: Sat Aug 22 21:08:10 2020 +0700
Lucene.Net.Analysis.ICU: Updated Segmentation files to Lucene 8.6.1 to account for the latest features of ICU
---
.../Icu/Segmentation/BreakIteratorWrapper.cs | 179 +++++++--------------
.../Analysis/Icu/Segmentation/CharArrayIterator.cs | 2 +-
.../Icu/Segmentation/CompositeBreakIterator.cs | 6 +-
.../Icu/Segmentation/DefaultICUTokenizerConfig.cs | 16 +-
.../Analysis/Icu/Segmentation/ICUTokenizer.cs | 19 +--
.../Icu/Segmentation/ICUTokenizerConfig.cs | 8 +-
.../Icu/Segmentation/ICUTokenizerFactory.cs | 11 +-
.../Analysis/Icu/Segmentation/ScriptIterator.cs | 2 +-
8 files changed, 93 insertions(+), 150 deletions(-)
diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/BreakIteratorWrapper.cs b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/BreakIteratorWrapper.cs
index af50927..d01aacc 100644
--- a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/BreakIteratorWrapper.cs
+++ b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/BreakIteratorWrapper.cs
@@ -1,7 +1,6 @@
-// Lucene version compatibility level 7.1.0
-using ICU4N;
-using ICU4N.Support.Text;
+// Lucene version compatibility level 8.6.1
using ICU4N.Text;
+using J2N;
namespace Lucene.Net.Analysis.Icu.Segmentation
{
@@ -23,146 +22,88 @@ namespace Lucene.Net.Analysis.Icu.Segmentation
*/
/// <summary>
- /// Contain all the issues surrounding BreakIterators in ICU in one place.
- /// Basically this boils down to the fact that they aren't very friendly to any
- /// sort of OO design.
- /// <para/>
- /// http://bugs.icu-project.org/trac/ticket/5901: RBBI.getRuleStatus(), hoist to
- /// BreakIterator from <see cref="RuleBasedBreakIterator"/>
- /// <para/>
- /// DictionaryBasedBreakIterator is a subclass of <see cref="RuleBasedBreakIterator"/>, but
- /// doesn't actually behave as a subclass: it always returns 0 for
- /// getRuleStatus():
- /// http://bugs.icu-project.org/trac/ticket/4730: Thai RBBI, no boundary type
- /// tags
+ /// Wraps <see cref="RuleBasedBreakIterator"/>, making object reuse convenient and
+ /// emitting a rule status for emoji sequences.
/// <para/>
/// @lucene.experimental
/// </summary>
- internal abstract class BreakIteratorWrapper
+ internal sealed class BreakIteratorWrapper
{
- protected readonly CharArrayIterator m_textIterator = new CharArrayIterator();
- protected char[] m_text;
- protected int m_start;
- protected int m_length;
-
- public abstract int Next();
- public abstract int Current { get; }
- public abstract int RuleStatus { get; }
- public abstract void SetText(CharacterIterator text);
+ private readonly CharArrayIterator textIterator = new CharArrayIterator();
+ private readonly RuleBasedBreakIterator rbbi;
+ private char[] text;
+ private int start;
+ private int status;
- public void SetText(char[] text, int start, int length)
+ internal BreakIteratorWrapper(RuleBasedBreakIterator rbbi)
{
- this.m_text = text;
- this.m_start = start;
- this.m_length = length;
- m_textIterator.SetText(text, start, length);
- SetText(m_textIterator);
+ this.rbbi = rbbi;
}
- /// <summary>
- /// If its a <see cref="RuleBasedBreakIterator"/>, the rule status can be used for token type. If it's
- /// any other <see cref="BreakIterator"/>, the rulestatus method is not available, so treat
- /// it like a generic <see cref="BreakIterator"/>.
- /// </summary>
- /// <param name="breakIterator"></param>
- /// <returns></returns>
- public static BreakIteratorWrapper Wrap(BreakIterator breakIterator)
+ public int Current => rbbi.Current;
+
+ public int RuleStatus => status;
+
+ public int Next()
{
- if (breakIterator is RuleBasedBreakIterator)
- return new RBBIWrapper((RuleBasedBreakIterator)breakIterator);
- else
- return new BIWrapper(breakIterator);
+ int current = rbbi.Current;
+ int next = rbbi.Next();
+ status = CalcStatus(current, next);
+ return next;
}
- /// <summary>
- /// <see cref="RuleBasedBreakIterator"/> wrapper: <see cref="RuleBasedBreakIterator"/> (as long as it's not
- /// a DictionaryBasedBreakIterator) behaves correctly.
- /// </summary>
- private sealed class RBBIWrapper : BreakIteratorWrapper
+ /// <summary>Returns current rule status for the text between breaks. (determines token type)</summary>
+ private int CalcStatus(int current, int next)
{
- private readonly RuleBasedBreakIterator rbbi;
-
- internal RBBIWrapper(RuleBasedBreakIterator rbbi)
- {
- this.rbbi = rbbi;
- }
-
- public override int Current => rbbi.Current;
-
- public override int RuleStatus => rbbi.RuleStatus;
-
- public override int Next()
+ // to support presentation selectors, we need to handle alphanum, num, and none at least, so currently not worth optimizing.
+ // https://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5B%3AEmoji%3A%5D-%5B%3AEmoji_Presentation%3A%5D&g=Word_Break&i=
+ if (next != BreakIterator.Done && IsEmoji(current, next))
{
- return rbbi.Next();
+ return ICUTokenizerConfig.EMOJI_SEQUENCE_STATUS;
}
-
- public override void SetText(CharacterIterator text)
+ else
{
- rbbi.SetText(text);
+ return rbbi.RuleStatus;
}
}
- /// <summary>
- /// Generic <see cref="BreakIterator"/> wrapper: Either the rulestatus method is not
- /// available or always returns 0. Calculate a rulestatus here so it behaves
- /// like <see cref="RuleBasedBreakIterator"/>.
- /// </summary>
- /// <remarks>
- /// Note: This is slower than <see cref="RuleBasedBreakIterator"/>.
- /// </remarks>
- private sealed class BIWrapper : BreakIteratorWrapper
- {
- private readonly BreakIterator bi;
- private int status;
-
- internal BIWrapper(BreakIterator bi)
- {
- this.bi = bi;
- }
-
- public override int Current => bi.Current;
-
- public override int RuleStatus => status;
+ // See unicode doc L2/16-315 for rationale.
+ // Basically, for us these are the ambiguous cases (keycap, etc.) as far as types go.
+ internal static readonly UnicodeSet EMOJI_RK = new UnicodeSet("[\u002a\u00230-9©®™〰〽]").Freeze();
+ // faster than doing hasBinaryProperty() checks, at the cost of 1KB ram
+ //internal static readonly UnicodeSet EMOJI = new UnicodeSet("[[:Emoji:][:Extended_Pictographic:]]").Freeze(); // LUCENENET: Extended_Pictographic wasn't added until ICU 62
+ internal static readonly UnicodeSet EMOJI = new UnicodeSet("[[:Emoji:]]").Freeze();
- public override int Next()
- {
- int current = bi.Current;
- int next = bi.Next();
- status = CalcStatus(current, next);
- return next;
- }
-
- private int CalcStatus(int current, int next)
+ /// <summary>Returns <c>true</c> if the current text represents an emoji character or sequence.</summary>
+ private bool IsEmoji(int current, int next)
+ {
+ int begin = start + current;
+ int end = start + next;
+ int codepoint = UTF16.CharAt(text, 0, end, begin);
+ if (EMOJI.Contains(codepoint))
{
- if (current == BreakIterator.Done || next == BreakIterator.Done)
- return BreakIterator.WordNone;
-
- int begin = m_start + current;
- int end = m_start + next;
-
- int codepoint;
- for (int i = begin; i < end; i += UTF16.GetCharCount(codepoint))
+ if (EMOJI_RK.Contains(codepoint))
{
- codepoint = UTF16.CharAt(m_text, 0, end, begin);
-
- if (UChar.IsDigit(codepoint))
- return BreakIterator.WordNumber;
- else if (UChar.IsLetter(codepoint))
- {
- // TODO: try to separately specify ideographic, kana?
- // [currently all bundled as letter for this case]
- return BreakIterator.WordLetter;
- }
+ // if it's in EmojiRK, we don't treat it as emoji unless there is evidence it forms an emoji sequence,
+ // i.e. an emoji presentation selector or keycap follows.
+ int trailer = begin + Character.CharCount(codepoint);
+ return trailer < end && (text[trailer] == 0xFE0F || text[trailer] == 0x20E3);
+ }
+ else
+ {
+ return true;
}
-
- return BreakIterator.WordNone;
}
+ return false;
+ }
- public override void SetText(CharacterIterator text)
- {
- bi.SetText(text);
- status = BreakIterator.WordNone;
- }
+ public void SetText(char[] text, int start, int length)
+ {
+ this.text = text;
+ this.start = start;
+ textIterator.SetText(text, start, length);
+ rbbi.SetText(textIterator);
+ status = RuleBasedBreakIterator.WordNone;
}
}
}
diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/CharArrayIterator.cs b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/CharArrayIterator.cs
index 064604f..b4eef83 100644
--- a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/CharArrayIterator.cs
+++ b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/CharArrayIterator.cs
@@ -1,4 +1,4 @@
-// Lucene version compatibility level 7.1.0
+// Lucene version compatibility level 8.6.1
#if FEATURE_BREAKITERATOR
using ICU4N.Support.Text;
using Lucene.Net.Support;
diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/CompositeBreakIterator.cs b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/CompositeBreakIterator.cs
index d697ae1..f628e81 100644
--- a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/CompositeBreakIterator.cs
+++ b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/CompositeBreakIterator.cs
@@ -1,4 +1,4 @@
-// Lucene version compatibility level 7.1.0
+// Lucene version compatibility level 8.6.1
using ICU4N;
using ICU4N.Globalization;
using ICU4N.Text;
@@ -124,8 +124,8 @@ namespace Lucene.Net.Analysis.Icu.Segmentation
private BreakIteratorWrapper GetBreakIterator(int scriptCode)
{
- if (wordBreakers[scriptCode] == null)
- wordBreakers[scriptCode] = BreakIteratorWrapper.Wrap(config.GetBreakIterator(scriptCode));
+ if (wordBreakers[scriptCode] is null)
+ wordBreakers[scriptCode] = new BreakIteratorWrapper(config.GetBreakIterator(scriptCode));
return wordBreakers[scriptCode];
}
}
diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/DefaultICUTokenizerConfig.cs b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/DefaultICUTokenizerConfig.cs
index 447567c..b6093cb 100644
--- a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/DefaultICUTokenizerConfig.cs
+++ b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/DefaultICUTokenizerConfig.cs
@@ -1,4 +1,4 @@
-// Lucene version compatibility level 7.1.0
+// Lucene version compatibility level 8.6.1
using ICU4N.Globalization;
using ICU4N.Text;
using J2N;
@@ -53,6 +53,8 @@ namespace Lucene.Net.Analysis.Icu.Segmentation
public static readonly string WORD_LETTER = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ALPHANUM];
/// <summary>Token type for words that appear to be numbers</summary>
public static readonly string WORD_NUMBER = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.NUM];
+ /// <summary>Token type for words that appear to be emoji sequences</summary>
+ public static readonly string WORD_EMOJI = "<EMOJI>"; //StandardTokenizer.TOKEN_TYPES[StandardTokenizer.EMOJI]; // LUCENENET: 4.8.1 StandardTokenizer doesn't contain EMOJI
/// <summary>
/// the default breakiterators in use. these can be expensive to
@@ -90,21 +92,21 @@ namespace Lucene.Net.Analysis.Icu.Segmentation
public override bool CombineCJ => cjkAsWords;
- public override BreakIterator GetBreakIterator(int script)
+ public override RuleBasedBreakIterator GetBreakIterator(int script)
{
switch (script)
{
- case UScript.Japanese: return (BreakIterator)cjkBreakIterator.Clone();
+ case UScript.Japanese: return (RuleBasedBreakIterator)cjkBreakIterator.Clone();
case UScript.Myanmar:
if (myanmarAsWords)
{
- return (BreakIterator)defaultBreakIterator.Clone();
+ return (RuleBasedBreakIterator)defaultBreakIterator.Clone();
}
else
{
- return (BreakIterator)myanmarSyllableIterator.Clone();
+ return (RuleBasedBreakIterator)myanmarSyllableIterator.Clone();
}
- default: return (BreakIterator)defaultBreakIterator.Clone();
+ default: return (RuleBasedBreakIterator)defaultBreakIterator.Clone();
}
}
@@ -120,6 +122,8 @@ namespace Lucene.Net.Analysis.Icu.Segmentation
return script == UScript.Hangul ? WORD_HANGUL : WORD_LETTER;
case BreakIterator.WordNumber: //RuleBasedBreakIterator.WORD_NUMBER:
return WORD_NUMBER;
+ case EMOJI_SEQUENCE_STATUS:
+ return WORD_EMOJI;
default: /* some other custom code */
return "<OTHER>";
}
diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizer.cs b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizer.cs
index 1afbfc1..2b37cde 100644
--- a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizer.cs
+++ b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizer.cs
@@ -1,4 +1,4 @@
-// Lucene version compatibility level 4.8.1
+// Lucene version compatibility level 8.6.1
using ICU4N;
using ICU4N.Text;
using Lucene.Net.Analysis.Icu.TokenAttributes;
@@ -27,6 +27,7 @@ namespace Lucene.Net.Analysis.Icu.Segmentation
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+
/// <summary>
/// Breaks text into words according to UAX #29: Unicode Text Segmentation
/// (http://www.unicode.org/reports/tr29/)
@@ -211,9 +212,9 @@ namespace Lucene.Net.Analysis.Icu.Segmentation
}
/// <summary>
- /// Returns true if there is a token from the buffer, or null if it is exhausted.
+ /// Returns <c>true</c> if there is a token from the buffer, or <c>false</c> if it is exhausted.
/// </summary>
- /// <returns>true if there is a token from the buffer, or null if it is exhausted.</returns>
+ /// <returns><c>true</c> if there is a token from the buffer, or <c>false</c> if it is exhausted.</returns>
private bool IncrementTokenBuffer()
{
int start = breaker.Current;
@@ -222,21 +223,13 @@ namespace Lucene.Net.Analysis.Icu.Segmentation
// find the next set of boundaries, skipping over non-tokens (rule status 0)
int end = breaker.Next();
-
- // LUCENENET specific - ICU 60.1 does not set the rule status back to 0,
- // so we need to explicitly check whether we went out of bounds.
- // This is more efficient anyway, since we don't call Next() twice in
- // this case.
- if (end == BreakIterator.Done)
- return false; // BreakIterator exhausted
-
- while (start != BreakIterator.Done && breaker.RuleStatus == 0)
+ while (end != BreakIterator.Done && breaker.RuleStatus == 0)
{
start = end;
end = breaker.Next();
}
- if (start == BreakIterator.Done)
+ if (end == BreakIterator.Done)
return false; // BreakIterator exhausted
termAtt.CopyBuffer(buffer, start, end - start);
diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizerConfig.cs b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizerConfig.cs
index e8014f5..1d3ece1 100644
--- a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizerConfig.cs
+++ b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizerConfig.cs
@@ -1,4 +1,4 @@
-// Lucene version compatibility level 7.1.0
+// Lucene version compatibility level 8.6.1
using ICU4N.Text;
using Lucene.Net.Support;
@@ -30,15 +30,17 @@ namespace Lucene.Net.Analysis.Icu.Segmentation
[ExceptionToClassNameConvention]
public abstract class ICUTokenizerConfig
{
+ public const int EMOJI_SEQUENCE_STATUS = 299;
+
/// <summary>
/// Sole constructor. (For invocation by subclass
/// constructors, typically implicit.)
/// </summary>
- public ICUTokenizerConfig() { }
+ protected ICUTokenizerConfig() { } // LUCENENET specific - marked protected instead of public
/// <summary>
/// Return a breakiterator capable of processing a given script.
/// </summary>
- public abstract BreakIterator GetBreakIterator(int script);
+ public abstract RuleBasedBreakIterator GetBreakIterator(int script);
/// <summary>
/// Return a token type value for a given script and BreakIterator rule status.
/// </summary>
diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizerFactory.cs b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizerFactory.cs
index 823e8a6..fe38e72 100644
--- a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizerFactory.cs
+++ b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizerFactory.cs
@@ -1,4 +1,4 @@
-// Lucene version compatibility level < 7.1.0
+// Lucene version compatibility level 8.6.1
using ICU4N;
using ICU4N.Globalization;
using ICU4N.Text;
@@ -69,7 +69,10 @@ namespace Lucene.Net.Analysis.Icu.Segmentation
[ExceptionToClassNameConvention]
public class ICUTokenizerFactory : TokenizerFactory, IResourceLoaderAware
{
- internal static readonly string RULEFILES = "rulefiles";
+ // SPI Name
+ //public const string NAME = "icu";
+
+ internal const string RULEFILES = "rulefiles";
private readonly IDictionary<int, string> tailored;
private ICUTokenizerConfig config;
private readonly bool cjkAsWords;
@@ -128,11 +131,11 @@ namespace Lucene.Net.Analysis.Icu.Segmentation
}
}
- public override BreakIterator GetBreakIterator(int script)
+ public override RuleBasedBreakIterator GetBreakIterator(int script)
{
if (breakers[script] != null)
{
- return (BreakIterator)breakers[script].Clone();
+ return (RuleBasedBreakIterator)breakers[script].Clone();
}
else
{
diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ScriptIterator.cs b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ScriptIterator.cs
index ceda09c..1228c5d 100644
--- a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ScriptIterator.cs
+++ b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ScriptIterator.cs
@@ -1,4 +1,4 @@
-// Lucene version compatibility level 7.1.0
+// Lucene version compatibility level 8.6.1
using ICU4N;
using ICU4N.Globalization;
using ICU4N.Text;