You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by sy...@apache.org on 2016/09/01 14:39:59 UTC
[38/52] [abbrv] lucenenet git commit: Fixed several bugs that were causing most of the Analysis.Th tests to fail.
Fixed several bugs that were causing most of the Analysis.Th tests to fail.
Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/edde0fba
Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/edde0fba
Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/edde0fba
Branch: refs/heads/master
Commit: edde0fba58612e1c82aed16da6d1ffb763798612
Parents: ab40446
Author: Shad Storhaug <sh...@shadstorhaug.com>
Authored: Thu Aug 25 02:00:29 2016 +0700
Committer: Shad Storhaug <sh...@shadstorhaug.com>
Committed: Thu Aug 25 02:00:29 2016 +0700
----------------------------------------------------------------------
.../Analysis/Th/ThaiTokenizer.cs | 14 +++++---------
.../Analysis/Th/ThaiWordFilter.cs | 14 +++++++-------
.../Analysis/Util/SegmentingTokenizerBase.cs | 17 +++++++----------
3 files changed, 19 insertions(+), 26 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/edde0fba/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs
index ca41da1..e11ebf8 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs
@@ -1,4 +1,5 @@
\ufeffusing ICU4NET;
+using ICU4NETExtension;
using Lucene.Net.Analysis.Tokenattributes;
using Lucene.Net.Analysis.Util;
using Lucene.Net.Support;
@@ -38,18 +39,13 @@ namespace Lucene.Net.Analysis.Th
/// If this is false, this tokenizer will not work at all!
/// </summary>
public static readonly bool DBBI_AVAILABLE;
- private static readonly BreakIterator proto = BreakIterator.CreateWordInstance(Locale.GetUS()); //GetWordInstance(new Locale("th"));
+ private static readonly BreakIterator proto = BreakIterator.CreateWordInstance(Locale.GetUS());
static ThaiTokenizer()
{
// check that we have a working dictionary-based break iterator for thai
proto.SetText("\u0e20\u0e32\u0e29\u0e32\u0e44\u0e17\u0e22");
DBBI_AVAILABLE = proto.IsBoundary(4);
}
-
-
- /// <summary>
- /// used for breaking the text into sentences </summary>
- private static readonly BreakIterator sentenceProto = BreakIterator.CreateSentenceInstance(Locale.GetUS()); //GetSentenceInstance(Locale.ROOT);
private readonly BreakIterator wordBreaker;
private readonly CharArrayIterator wrapper = CharArrayIterator.NewWordInstance();
@@ -70,13 +66,13 @@ namespace Lucene.Net.Analysis.Th
/// <summary>
/// Creates a new ThaiTokenizer, supplying the AttributeFactory </summary>
public ThaiTokenizer(AttributeFactory factory, TextReader reader)
- : base(factory, reader, (BreakIterator)sentenceProto.Clone())
+ : base(factory, reader, BreakIterator.CreateSentenceInstance(Locale.GetUS()))
{
if (!DBBI_AVAILABLE)
{
throw new System.NotSupportedException("This JRE does not have support for Thai segmentation");
}
- wordBreaker = (BreakIterator)proto.Clone();
+ wordBreaker = BreakIterator.CreateWordInstance(Locale.GetUS());
termAtt = AddAttribute<ICharTermAttribute>();
offsetAtt = AddAttribute<IOffsetAttribute>();
}
@@ -86,7 +82,7 @@ namespace Lucene.Net.Analysis.Th
this.sentenceStart = sentenceStart;
this.sentenceEnd = sentenceEnd;
wrapper.SetText(buffer, sentenceStart, sentenceEnd - sentenceStart);
- wordBreaker.SetText(new string(wrapper.Text));
+ wordBreaker.SetText(new string(wrapper.Text, wrapper.Start, wrapper.Length));
}
protected internal override bool IncrementWord()
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/edde0fba/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiWordFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiWordFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiWordFilter.cs
index b0a23a0..9864b7c 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiWordFilter.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiWordFilter.cs
@@ -1,9 +1,10 @@
\ufeffusing ICU4NET;
-using System;
-using Lucene.Net.Analysis.Util;
+using ICU4NETExtension;
+using Lucene.Net.Analysis.Core;
using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Analysis.Util;
using Lucene.Net.Util;
-using Lucene.Net.Analysis.Core;
+using System;
using System.Text.RegularExpressions;
namespace Lucene.Net.Analysis.Th
@@ -48,8 +49,7 @@ namespace Lucene.Net.Analysis.Th
/// If this is false, this filter will not work at all!
/// </summary>
public static readonly bool DBBI_AVAILABLE = ThaiTokenizer.DBBI_AVAILABLE;
- private static readonly BreakIterator proto = BreakIterator.CreateWordInstance(new Locale()); //.getWordInstance(new Locale("th"));
- private readonly BreakIterator breaker = (BreakIterator)proto.Clone();
+ private readonly BreakIterator breaker = BreakIterator.CreateWordInstance(new Locale());
private readonly CharArrayIterator charIterator = CharArrayIterator.NewWordInstance();
private readonly bool handlePosIncr;
@@ -111,7 +111,7 @@ namespace Lucene.Net.Analysis.Th
return false;
}
- if (termAtt.Length == 0 || Regex.IsMatch(termAtt.ToString().Substring(0, 1), @"\p{IsThai}"))
+ if (termAtt.Length == 0 || !Regex.IsMatch(termAtt.ToString().Substring(0, 1), @"\p{IsThai}"))
{
return true;
}
@@ -136,7 +136,7 @@ namespace Lucene.Net.Analysis.Th
// reinit CharacterIterator
charIterator.SetText(clonedTermAtt.Buffer(), 0, clonedTermAtt.Length);
- breaker.SetText(new string(charIterator.Text));
+ breaker.SetText(new string(charIterator.Text, charIterator.Start, charIterator.Length));
int end2 = breaker.Next();
if (end2 != BreakIterator.DONE)
{
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/edde0fba/src/Lucene.Net.Analysis.Common/Analysis/Util/SegmentingTokenizerBase.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Util/SegmentingTokenizerBase.cs b/src/Lucene.Net.Analysis.Common/Analysis/Util/SegmentingTokenizerBase.cs
index c4c4643..73944af 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Util/SegmentingTokenizerBase.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Util/SegmentingTokenizerBase.cs
@@ -1,14 +1,11 @@
-\ufeffusing System;
+\ufeffusing ICU4NET;
+using Lucene.Net.Analysis.Tokenattributes;
+using System;
using System.Diagnostics;
using System.IO;
-using ICU4NET;
-using Lucene.Net.Analysis.Tokenattributes;
-using Reader = System.IO.TextReader;
-using Version = Lucene.Net.Util.LuceneVersion;
namespace Lucene.Net.Analysis.Util
{
-
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -70,7 +67,7 @@ namespace Lucene.Net.Analysis.Util
/// be provided to this constructor.
/// </para>
/// </summary>
- protected SegmentingTokenizerBase(Reader reader, BreakIterator iterator)
+ protected SegmentingTokenizerBase(TextReader reader, BreakIterator iterator)
: this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, reader, iterator)
{
}
@@ -78,7 +75,7 @@ namespace Lucene.Net.Analysis.Util
/// <summary>
/// Construct a new SegmenterBase, also supplying the AttributeFactory
/// </summary>
- protected SegmentingTokenizerBase(AttributeFactory factory, Reader reader, BreakIterator iterator)
+ protected SegmentingTokenizerBase(AttributeFactory factory, TextReader reader, BreakIterator iterator)
: base(factory, reader)
{
offsetAtt = AddAttribute<IOffsetAttribute>();
@@ -106,7 +103,7 @@ namespace Lucene.Net.Analysis.Util
{
base.Reset();
wrapper.SetText(buffer, 0, 0);
- iterator.SetText(new string(wrapper.Text));
+ iterator.SetText(new string(wrapper.Text, wrapper.Start, wrapper.Length));
length = usableLength = offset = 0;
}
@@ -177,7 +174,7 @@ namespace Lucene.Net.Analysis.Util
}
wrapper.SetText(buffer, 0, Math.Max(0, usableLength));
- iterator.SetText(new string(wrapper.Text, 0, Math.Max(0, usableLength)));
+ iterator.SetText(new string(wrapper.Text, wrapper.Start, wrapper.Length));
}
// TODO: refactor to a shared readFully somewhere