You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by sy...@apache.org on 2016/09/01 14:36:30 UTC
[10/22] lucenenet git commit: Wrapped ICU4NET BreakIterator with a new class named ThaiWordBreaker to fix the broken behavior of not splitting non-Thai and Thai characters into separate words.

Wrapped ICU4NET BreakIterator with a new class named ThaiWordBreaker to fix the broken behavior of not splitting non-Thai and Thai characters into separate words.


Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/63e3e22d
Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/63e3e22d
Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/63e3e22d

Branch: refs/heads/analysis-work
Commit: 63e3e22d8e4768e03295e7bdc07924120f307ad3
Parents: ddd93cb
Author: Shad Storhaug <sh...@shadstorhaug.com>
Authored: Thu Aug 25 15:49:56 2016 +0700
Committer: Shad Storhaug <sh...@shadstorhaug.com>
Committed: Thu Aug 25 15:49:56 2016 +0700

----------------------------------------------------------------------
 .../Analysis/Th/ThaiTokenizer.cs                | 103 ++++++++++++++++++-
 .../Analysis/Th/ThaiWordFilter.cs               |   2 +-
 2 files changed, 102 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucenenet/blob/63e3e22d/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs
index e11ebf8..d8625d9 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs
@@ -3,7 +3,11 @@ using ICU4NETExtension;
 using Lucene.Net.Analysis.Tokenattributes;
 using Lucene.Net.Analysis.Util;
 using Lucene.Net.Support;
+using System;
+using System.Collections.Generic;
 using System.IO;
+using System.Linq;
+using System.Text.RegularExpressions;
 
 namespace Lucene.Net.Analysis.Th
 {
@@ -47,7 +51,7 @@ namespace Lucene.Net.Analysis.Th
             DBBI_AVAILABLE = proto.IsBoundary(4);
         }
 
-        private readonly BreakIterator wordBreaker;
+        private readonly ThaiWordBreaker wordBreaker;
         private readonly CharArrayIterator wrapper = CharArrayIterator.NewWordInstance();
 
         internal int sentenceStart;
@@ -72,7 +76,7 @@ namespace Lucene.Net.Analysis.Th
             {
                 throw new System.NotSupportedException("This JRE does not have support for Thai segmentation");
             }
-            wordBreaker = BreakIterator.CreateWordInstance(Locale.GetUS());
+            wordBreaker = new ThaiWordBreaker(BreakIterator.CreateWordInstance(Locale.GetUS()));
             termAtt = AddAttribute<ICharTermAttribute>();
             offsetAtt = AddAttribute<IOffsetAttribute>();
         }
@@ -112,4 +116,99 @@ namespace Lucene.Net.Analysis.Th
             return true;
         }
     }
+
+    /// <summary>
+    /// LUCENENET specific class to patch the behavior of the ICU BreakIterator.
+    /// Corrects the breaking of words by adding a boundary wherever a Thai character is
+    /// directly adjacent to a non-Thai letter or digit within one ICU segment.
+    /// 
+    /// This logic assumes that the Java BreakIterator also breaks up Thai numerals from
+    /// Arabic numerals (1, 2, 3, etc.). That is, it assumes the first test below passes
+    /// and the second test fails in Lucene (not attempted).
+    /// 
+    /// ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET);
+    /// AssertAnalyzesTo(analyzer, "\u0e51\u0e52\u0e53456", new string[] { "\u0e51\u0e52\u0e53", "456" });
+    /// AssertAnalyzesTo(analyzer, "\u0e51\u0e52\u0e53456", new string[] { "\u0e51\u0e52\u0e53456" });
+    /// </summary>
+    internal class ThaiWordBreaker
+    {
+        private readonly BreakIterator wordBreaker;
+        private string text;
+        // Pending boundaries of the current ICU segment; the head is the current position.
+        private readonly Queue<int> transitions = new Queue<int>();
+
+        /// <summary>Wraps <paramref name="wordBreaker"/>; throws <see cref="ArgumentNullException"/> if it is null.</summary>
+        public ThaiWordBreaker(BreakIterator wordBreaker)
+        {
+            if (wordBreaker == null)
+            {
+                throw new ArgumentNullException("wordBreaker");
+            }
+            this.wordBreaker = wordBreaker;
+        }
+
+        /// <summary>Sets the text to segment and discards any boundaries still queued for the previous text.</summary>
+        public void SetText(string text)
+        {
+            this.text = text;
+            // Without this, abandoning a segment midway would leak its offsets into the new text.
+            transitions.Clear();
+            wordBreaker.SetText(text);
+        }
+
+        /// <summary>Returns the current boundary: the pending transition if there is one, else the wrapped iterator's position.</summary>
+        public int Current()
+        {
+            return transitions.Count > 0 ? transitions.Peek() : wordBreaker.Current();
+        }
+
+        /// <summary>Advances to the next boundary and returns it; with nothing queued, this is the result of advancing the wrapped iterator.</summary>
+        public int Next()
+        {
+            if (transitions.Count > 0)
+            {
+                transitions.Dequeue();
+            }
+            return transitions.Count > 0 ? transitions.Peek() : GetNext();
+        }
+
+        // Equivalent to the regex class \p{IsThai}: the Unicode "Thai" block, U+0E00 through U+0E7F.
+        private static bool IsThai(char c)
+        {
+            return c >= '\u0E00' && c <= '\u0E7F';
+        }
+
+        // Advances the wrapped iterator one segment; queues Thai/non-Thai transitions inside it, then its end.
+        private int GetNext()
+        {
+            int prev = wordBreaker.Current();
+            int current = wordBreaker.Next();
+            if (current == BreakIterator.DONE)
+            {
+                return current;
+            }
+
+            bool prevWasThai = false, prevWasNonThai = false;
+            for (int i = prev; i < current; i++)
+            {
+                char c = text[i];
+                bool isThai = IsThai(c);
+                // Characters that are not letters or digits belong to neither class, so they never cause a split.
+                bool isNonThai = !isThai && char.IsLetterOrDigit(c);
+                if ((prevWasThai && isNonThai) || (prevWasNonThai && isThai))
+                {
+                    transitions.Enqueue(i);
+                }
+                prevWasThai = isThai;
+                prevWasNonThai = isNonThai;
+            }
+
+            if (transitions.Count == 0)
+            {
+                return current;
+            }
+            transitions.Enqueue(current);
+            return transitions.Peek();
+        }
+    }
 }
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/63e3e22d/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiWordFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiWordFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiWordFilter.cs
index 9864b7c..cbd9b6a 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiWordFilter.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiWordFilter.cs
@@ -49,7 +49,7 @@ namespace Lucene.Net.Analysis.Th
         /// If this is false, this filter will not work at all!
         /// </summary>
         public static readonly bool DBBI_AVAILABLE = ThaiTokenizer.DBBI_AVAILABLE;
-        private readonly BreakIterator breaker = BreakIterator.CreateWordInstance(new Locale());
+        private readonly ThaiWordBreaker breaker = new ThaiWordBreaker(BreakIterator.CreateWordInstance(new Locale()));
         private readonly CharArrayIterator charIterator = CharArrayIterator.NewWordInstance();
 
         private readonly bool handlePosIncr;