Posted to commits@lucenenet.apache.org by sy...@apache.org on 2016/09/01 14:36:28 UTC

[08/22] lucenenet git commit: Fixed several bugs that were causing most of the Analysis.Th tests to fail.

Fixed several bugs that were causing most of the Analysis.Th tests to fail:

- ThaiWordFilter: a missing negation in the Thai-character guard made the filter
  pass Thai-leading tokens through unsegmented (and re-segment everything else).
- ThaiTokenizer, ThaiWordFilter, SegmentingTokenizerBase: BreakIterator.SetText was
  given a string built from the CharArrayIterator's entire backing buffer instead
  of the active Start/Length slice, so boundaries were computed over stale text
  outside the current window.
- ThaiTokenizer and ThaiWordFilter now create fresh BreakIterator instances per
  object instead of Clone()ing shared static prototypes, and the missing
  ICU4NETExtension using directive was added.
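
The recurring fix across all three files is the buffer-slice bug: CharArrayIterator's
Text property hands back the entire backing char[], with the active window described
by Start and Length (which is why the fix passes both to the string constructor).
A minimal C# sketch of the difference, with made-up buffer contents for illustration:

    // Building a string from the backing array alone leaks characters
    // outside the active window; the (array, start, length) overload
    // copies only the window itself.
    char[] buffer = "xxxHELLOxxx".ToCharArray();
    int start = 3, length = 5;                         // active window: "HELLO"

    string stale  = new string(buffer);                // "xxxHELLOxxx" - wrong
    string window = new string(buffer, start, length); // "HELLO" - what SetText needs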


Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/edde0fba
Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/edde0fba
Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/edde0fba

Branch: refs/heads/analysis-work
Commit: edde0fba58612e1c82aed16da6d1ffb763798612
Parents: ab40446
Author: Shad Storhaug <sh...@shadstorhaug.com>
Authored: Thu Aug 25 02:00:29 2016 +0700
Committer: Shad Storhaug <sh...@shadstorhaug.com>
Committed: Thu Aug 25 02:00:29 2016 +0700

----------------------------------------------------------------------
 .../Analysis/Th/ThaiTokenizer.cs                   | 14 +++++---------
 .../Analysis/Th/ThaiWordFilter.cs                  | 14 +++++++-------
 .../Analysis/Util/SegmentingTokenizerBase.cs       | 17 +++++++----------
 3 files changed, 19 insertions(+), 26 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucenenet/blob/edde0fba/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs
index ca41da1..e11ebf8 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs
@@ -1,4 +1,5 @@
 \ufeffusing ICU4NET;
+using ICU4NETExtension;
 using Lucene.Net.Analysis.Tokenattributes;
 using Lucene.Net.Analysis.Util;
 using Lucene.Net.Support;
@@ -38,18 +39,13 @@ namespace Lucene.Net.Analysis.Th
         /// If this is false, this tokenizer will not work at all!
         /// </summary>
         public static readonly bool DBBI_AVAILABLE;
-        private static readonly BreakIterator proto = BreakIterator.CreateWordInstance(Locale.GetUS());   //GetWordInstance(new Locale("th"));
+        private static readonly BreakIterator proto = BreakIterator.CreateWordInstance(Locale.GetUS());
         static ThaiTokenizer()
         {
             // check that we have a working dictionary-based break iterator for thai
             proto.SetText("\u0e20\u0e32\u0e29\u0e32\u0e44\u0e17\u0e22");
             DBBI_AVAILABLE = proto.IsBoundary(4);
         }
-        
-
-        /// <summary>
-        /// used for breaking the text into sentences </summary>
-        private static readonly BreakIterator sentenceProto = BreakIterator.CreateSentenceInstance(Locale.GetUS());    //GetSentenceInstance(Locale.ROOT);
 
         private readonly BreakIterator wordBreaker;
         private readonly CharArrayIterator wrapper = CharArrayIterator.NewWordInstance();
@@ -70,13 +66,13 @@ namespace Lucene.Net.Analysis.Th
         /// <summary>
         /// Creates a new ThaiTokenizer, supplying the AttributeFactory </summary>
         public ThaiTokenizer(AttributeFactory factory, TextReader reader)
-              : base(factory, reader, (BreakIterator)sentenceProto.Clone())
+              : base(factory, reader, BreakIterator.CreateSentenceInstance(Locale.GetUS()))
         {
             if (!DBBI_AVAILABLE)
             {
                 throw new System.NotSupportedException("This JRE does not have support for Thai segmentation");
             }
-            wordBreaker = (BreakIterator)proto.Clone();
+            wordBreaker = BreakIterator.CreateWordInstance(Locale.GetUS());
             termAtt = AddAttribute<ICharTermAttribute>();
             offsetAtt = AddAttribute<IOffsetAttribute>();
         }
@@ -86,7 +82,7 @@ namespace Lucene.Net.Analysis.Th
             this.sentenceStart = sentenceStart;
             this.sentenceEnd = sentenceEnd;
             wrapper.SetText(buffer, sentenceStart, sentenceEnd - sentenceStart);
-            wordBreaker.SetText(new string(wrapper.Text));
+            wordBreaker.SetText(new string(wrapper.Text, wrapper.Start, wrapper.Length));
         }
 
         protected internal override bool IncrementWord()
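
Two changes in this file: each tokenizer instance now creates its own sentence and
word BreakIterator rather than Clone()ing shared static prototypes (presumably
ICU4NET's Clone() did not yield a fully independent iterator), and SetDocument now
hands the word breaker only the current sentence's window. A sketch of the
per-instance construction, using the ICU4NET factory calls from the hunks above:

    // Fresh iterators per instance instead of clones of shared statics.
    var sentenceBreaker = BreakIterator.CreateSentenceInstance(Locale.GetUS());
    var wordBreaker = BreakIterator.CreateWordInstance(Locale.GetUS());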

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/edde0fba/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiWordFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiWordFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiWordFilter.cs
index b0a23a0..9864b7c 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiWordFilter.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiWordFilter.cs
@@ -1,9 +1,10 @@
 \ufeffusing ICU4NET;
-using System;
-using Lucene.Net.Analysis.Util;
+using ICU4NETExtension;
+using Lucene.Net.Analysis.Core;
 using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Analysis.Util;
 using Lucene.Net.Util;
-using Lucene.Net.Analysis.Core;
+using System;
 using System.Text.RegularExpressions;
 
 namespace Lucene.Net.Analysis.Th
@@ -48,8 +49,7 @@ namespace Lucene.Net.Analysis.Th
         /// If this is false, this filter will not work at all!
         /// </summary>
         public static readonly bool DBBI_AVAILABLE = ThaiTokenizer.DBBI_AVAILABLE;
-        private static readonly BreakIterator proto = BreakIterator.CreateWordInstance(new Locale());    //.getWordInstance(new Locale("th"));
-        private readonly BreakIterator breaker = (BreakIterator)proto.Clone();
+        private readonly BreakIterator breaker = BreakIterator.CreateWordInstance(new Locale());
         private readonly CharArrayIterator charIterator = CharArrayIterator.NewWordInstance();
 
         private readonly bool handlePosIncr;
@@ -111,7 +111,7 @@ namespace Lucene.Net.Analysis.Th
                 return false;
             }
 
-            if (termAtt.Length == 0 || Regex.IsMatch(termAtt.ToString().Substring(0, 1), @"\p{IsThai}"))
+            if (termAtt.Length == 0 || !Regex.IsMatch(termAtt.ToString().Substring(0, 1), @"\p{IsThai}"))
             {
                 return true;
             }
@@ -136,7 +136,7 @@ namespace Lucene.Net.Analysis.Th
 
             // reinit CharacterIterator
             charIterator.SetText(clonedTermAtt.Buffer(), 0, clonedTermAtt.Length);
-            breaker.SetText(new string(charIterator.Text));
+            breaker.SetText(new string(charIterator.Text, charIterator.Start, charIterator.Length));
             int end2 = breaker.Next();
             if (end2 != BreakIterator.DONE)
             {

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/edde0fba/src/Lucene.Net.Analysis.Common/Analysis/Util/SegmentingTokenizerBase.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Util/SegmentingTokenizerBase.cs b/src/Lucene.Net.Analysis.Common/Analysis/Util/SegmentingTokenizerBase.cs
index c4c4643..73944af 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Util/SegmentingTokenizerBase.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Util/SegmentingTokenizerBase.cs
@@ -1,14 +1,11 @@
-\ufeffusing System;
+\ufeffusing ICU4NET;
+using Lucene.Net.Analysis.Tokenattributes;
+using System;
 using System.Diagnostics;
 using System.IO;
-using ICU4NET;
-using Lucene.Net.Analysis.Tokenattributes;
-using Reader = System.IO.TextReader;
-using Version = Lucene.Net.Util.LuceneVersion;
 
 namespace Lucene.Net.Analysis.Util
 {
-
     /*
      * Licensed to the Apache Software Foundation (ASF) under one or more
      * contributor license agreements.  See the NOTICE file distributed with
@@ -70,7 +67,7 @@ namespace Lucene.Net.Analysis.Util
         /// be provided to this constructor.
         /// </para>
         /// </summary>
-        protected SegmentingTokenizerBase(Reader reader, BreakIterator iterator)
+        protected SegmentingTokenizerBase(TextReader reader, BreakIterator iterator)
             : this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, reader, iterator)
         {
         }
@@ -78,7 +75,7 @@ namespace Lucene.Net.Analysis.Util
         /// <summary>
         /// Construct a new SegmenterBase, also supplying the AttributeFactory
         /// </summary>
-        protected SegmentingTokenizerBase(AttributeFactory factory, Reader reader, BreakIterator iterator)
+        protected SegmentingTokenizerBase(AttributeFactory factory, TextReader reader, BreakIterator iterator)
             : base(factory, reader)
         {
             offsetAtt = AddAttribute<IOffsetAttribute>();
@@ -106,7 +103,7 @@ namespace Lucene.Net.Analysis.Util
         {
             base.Reset();
             wrapper.SetText(buffer, 0, 0);
-            iterator.SetText(new string(wrapper.Text));
+            iterator.SetText(new string(wrapper.Text, wrapper.Start, wrapper.Length));
             length = usableLength = offset = 0;
         }
 
@@ -177,7 +174,7 @@ namespace Lucene.Net.Analysis.Util
             }
 
             wrapper.SetText(buffer, 0, Math.Max(0, usableLength));
-            iterator.SetText(new string(wrapper.Text, 0, Math.Max(0, usableLength)));
+            iterator.SetText(new string(wrapper.Text, wrapper.Start, wrapper.Length));
         }
 
         // TODO: refactor to a shared readFully somewhere
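
The Reset change matters most for tokenizer reuse: after wrapper.SetText(buffer, 0, 0)
the active window is empty, but new string(wrapper.Text) still materialized the full
stale buffer, so a reused tokenizer could compute boundaries over the previous
document's text. With the slice overload the iterator sees exactly the (empty)
window. A sketch of the two call sites, mirroring the hunks above:

    // After Reset the window is (0, 0); the iterator must see "".
    wrapper.SetText(buffer, 0, 0);
    iterator.SetText(new string(wrapper.Text, wrapper.Start, wrapper.Length)); // ""

    // In Refill the same overload keeps the iterator in lock-step with the
    // wrapper's window instead of recomputing the bounds by hand.
    wrapper.SetText(buffer, 0, Math.Max(0, usableLength));
    iterator.SetText(new string(wrapper.Text, wrapper.Start, wrapper.Length));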