You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by sy...@apache.org on 2016/09/01 14:36:30 UTC
[10/22] lucenenet git commit: Wrapped ICU4NET BreakIterator with a
new class named ThaiWordBreaker to fix the broken behavior of not splitting
non-Thai and Thai characters into separate words.
Wrapped ICU4NET BreakIterator with a new class named ThaiWordBreaker to fix the broken behavior of not splitting non-Thai and Thai characters into separate words.
Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/63e3e22d
Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/63e3e22d
Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/63e3e22d
Branch: refs/heads/analysis-work
Commit: 63e3e22d8e4768e03295e7bdc07924120f307ad3
Parents: ddd93cb
Author: Shad Storhaug <sh...@shadstorhaug.com>
Authored: Thu Aug 25 15:49:56 2016 +0700
Committer: Shad Storhaug <sh...@shadstorhaug.com>
Committed: Thu Aug 25 15:49:56 2016 +0700
----------------------------------------------------------------------
.../Analysis/Th/ThaiTokenizer.cs | 103 ++++++++++++++++++-
.../Analysis/Th/ThaiWordFilter.cs | 2 +-
2 files changed, 102 insertions(+), 3 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/63e3e22d/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs
index e11ebf8..d8625d9 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs
@@ -3,7 +3,11 @@ using ICU4NETExtension;
using Lucene.Net.Analysis.Tokenattributes;
using Lucene.Net.Analysis.Util;
using Lucene.Net.Support;
+using System;
+using System.Collections.Generic;
using System.IO;
+using System.Linq;
+using System.Text.RegularExpressions;
namespace Lucene.Net.Analysis.Th
{
@@ -47,7 +51,7 @@ namespace Lucene.Net.Analysis.Th
DBBI_AVAILABLE = proto.IsBoundary(4);
}
- private readonly BreakIterator wordBreaker;
+ private readonly ThaiWordBreaker wordBreaker;
private readonly CharArrayIterator wrapper = CharArrayIterator.NewWordInstance();
internal int sentenceStart;
@@ -72,7 +76,7 @@ namespace Lucene.Net.Analysis.Th
{
throw new System.NotSupportedException("This JRE does not have support for Thai segmentation");
}
- wordBreaker = BreakIterator.CreateWordInstance(Locale.GetUS());
+ wordBreaker = new ThaiWordBreaker(BreakIterator.CreateWordInstance(Locale.GetUS()));
termAtt = AddAttribute<ICharTermAttribute>();
offsetAtt = AddAttribute<IOffsetAttribute>();
}
@@ -112,4 +116,99 @@ namespace Lucene.Net.Analysis.Th
return true;
}
}
+
+    /// <summary>
+    /// LUCENENET specific class to patch the behavior of the ICU BreakIterator.
+    /// Wraps a word <see cref="BreakIterator"/> and inserts additional boundaries
+    /// wherever a segment reported by the wrapped iterator switches between Thai
+    /// characters and non-Thai letters or digits.
+    /// <para/>
+    /// This logic assumes that the Java BreakIterator also breaks up Thai numerals from
+    /// Arabic numerals (1, 2, 3, etc.). That is, it assumes the first test below passes
+    /// and the second test fails in Lucene (not attempted).
+    /// <para/>
+    /// ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET);
+    /// AssertAnalyzesTo(analyzer, "\u0e51\u0e52\u0e53456", new string[] { "\u0e51\u0e52\u0e53", "456" });
+    /// AssertAnalyzesTo(analyzer, "\u0e51\u0e52\u0e53456", new string[] { "\u0e51\u0e52\u0e53456" });
+    /// </summary>
+    internal class ThaiWordBreaker
+    {
+        private readonly BreakIterator wordBreaker;
+        private string text;
+
+        // Boundaries still to be handed out for the segment most recently read from
+        // the wrapped iterator. The head of the queue is the current position.
+        private readonly Queue<int> transitions = new Queue<int>();
+
+        /// <summary>
+        /// Creates a breaker that post-processes the boundaries of <paramref name="wordBreaker"/>.
+        /// </summary>
+        /// <param name="wordBreaker">The word break iterator to wrap.</param>
+        /// <exception cref="ArgumentNullException"><paramref name="wordBreaker"/> is <c>null</c>.</exception>
+        public ThaiWordBreaker(BreakIterator wordBreaker)
+        {
+            if (wordBreaker == null)
+            {
+                throw new ArgumentNullException("wordBreaker");
+            }
+            this.wordBreaker = wordBreaker;
+        }
+
+        /// <summary>
+        /// Sets the text to analyze and forwards it to the wrapped iterator.
+        /// </summary>
+        /// <param name="text">The text whose word boundaries will be iterated.</param>
+        public void SetText(string text)
+        {
+            this.text = text;
+            // Boundaries queued for the previous text are offsets into that text;
+            // discard them so they are never reported against the new text.
+            transitions.Clear();
+            wordBreaker.SetText(text);
+        }
+
+        /// <summary>
+        /// Returns the current boundary: the pending Thai/non-Thai transition if one
+        /// is queued, otherwise the current position of the wrapped iterator.
+        /// </summary>
+        public int Current()
+        {
+            if (transitions.Count > 0)
+            {
+                return transitions.Peek();
+            }
+            return wordBreaker.Current();
+        }
+
+        /// <summary>
+        /// Advances to the next boundary and returns it. Queued transitions are
+        /// handed out first; once they are exhausted the wrapped iterator is advanced.
+        /// </summary>
+        /// <returns>The next boundary, or the value returned by the wrapped iterator
+        /// (including <c>BreakIterator.DONE</c>) when no transition applies.</returns>
+        public int Next()
+        {
+            if (transitions.Count > 0)
+            {
+                // The head is the boundary that was current until now.
+                transitions.Dequeue();
+            }
+            if (transitions.Count > 0)
+            {
+                return transitions.Peek();
+            }
+            return GetNext();
+        }
+
+        /// <summary>
+        /// Advances the wrapped iterator by one segment and scans that segment for
+        /// positions where a Thai character is directly followed by a non-Thai letter
+        /// or digit, or vice versa. When such positions exist they are queued, followed
+        /// by the segment end, and the first one is returned.
+        /// </summary>
+        private int GetNext()
+        {
+            int prev = wordBreaker.Current();
+            int current = wordBreaker.Next();
+
+            if (current != BreakIterator.DONE && current - prev > 0)
+            {
+                bool prevWasThai = false, prevWasNonThai = false;
+
+                // Find all of the transitions between Thai and non-Thai characters and digits
+                for (int i = prev; i < current; i++)
+                {
+                    char c = text[i];
+                    bool isThai = IsThai(c);
+                    bool isNonThai = !isThai && char.IsLetterOrDigit(c);
+
+                    if ((prevWasThai && isNonThai) ||
+                        (prevWasNonThai && isThai))
+                    {
+                        transitions.Enqueue(i);
+                    }
+
+                    // record the values for comparison with the next loop
+                    prevWasThai = isThai;
+                    prevWasNonThai = isNonThai;
+                }
+
+                if (transitions.Count > 0)
+                {
+                    // The end of the segment is itself a boundary and must follow
+                    // the inserted transitions.
+                    transitions.Enqueue(current);
+                    return transitions.Peek();
+                }
+            }
+
+            return current;
+        }
+
+        /// <summary>
+        /// Returns <c>true</c> when <paramref name="c"/> lies in the Unicode Thai block
+        /// (U+0E00 - U+0E7F), the range named by the regex class <c>\p{IsThai}</c>.
+        /// A plain range check avoids a string allocation and regex match per character.
+        /// </summary>
+        private static bool IsThai(char c)
+        {
+            return c >= '\u0E00' && c <= '\u0E7F';
+        }
+    }
}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/63e3e22d/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiWordFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiWordFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiWordFilter.cs
index 9864b7c..cbd9b6a 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiWordFilter.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiWordFilter.cs
@@ -49,7 +49,7 @@ namespace Lucene.Net.Analysis.Th
/// If this is false, this filter will not work at all!
/// </summary>
public static readonly bool DBBI_AVAILABLE = ThaiTokenizer.DBBI_AVAILABLE;
- private readonly BreakIterator breaker = BreakIterator.CreateWordInstance(new Locale());
+ private readonly ThaiWordBreaker breaker = new ThaiWordBreaker(BreakIterator.CreateWordInstance(new Locale()));
private readonly CharArrayIterator charIterator = CharArrayIterator.NewWordInstance();
private readonly bool handlePosIncr;