You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by ni...@apache.org on 2020/08/24 21:19:40 UTC
[lucenenet] 08/09: Lucene.Net.ICU: Added locking to ThaiTokenizer to only allow a single thread to manipulate the BreakIterator at a time. This helps, but is only a partial fix.
This is an automated email from the ASF dual-hosted git repository.
nightowl888 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucenenet.git
commit 5ff92583f219fa851375c9be12ae1b6bf52383c1
Author: Shad Storhaug <sh...@shadstorhaug.com>
AuthorDate: Mon Aug 24 15:09:37 2020 +0700
Lucene.Net.ICU: Added locking to ThaiTokenizer to only allow a single thread to manipulate the BreakIterator at a time. This helps, but is only a partial fix.
---
.../Analysis/Th/ThaiTokenizer.cs | 92 ++++++++++++++++------
1 file changed, 66 insertions(+), 26 deletions(-)
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs
index 7e0754c..d1f80f1 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs
@@ -41,13 +41,27 @@ namespace Lucene.Net.Analysis.Th
/// </summary>
public class ThaiTokenizer : SegmentingTokenizerBase
{
+ private static readonly object syncLock = new object(); // LUCENENET specific - workaround until BreakIterator is made thread safe (LUCENENET TODO: TO REVERT)
+
// LUCENENET specific - DBBI_AVAILABLE removed because ICU always has a dictionary-based BreakIterator
- private static readonly BreakIterator proto = BreakIterator.GetWordInstance(new CultureInfo("th"));
+ private static readonly BreakIterator proto = LoadProto();
/// <summary>
/// used for breaking the text into sentences
/// </summary>
- private static readonly BreakIterator sentenceProto = BreakIterator.GetSentenceInstance(CultureInfo.InvariantCulture);
+ private static readonly BreakIterator sentenceProto = LoadSentenceProto();
+
+ private static BreakIterator LoadProto()
+ {
+ lock (syncLock)
+ return BreakIterator.GetWordInstance(new CultureInfo("th"));
+ }
+
+ private static BreakIterator LoadSentenceProto()
+ {
+ lock (syncLock)
+ return BreakIterator.GetSentenceInstance(CultureInfo.InvariantCulture);
+ }
private readonly ThaiWordBreaker wordBreaker;
private readonly CharArrayIterator wrapper = Analysis.Util.CharArrayIterator.NewWordInstance();
@@ -68,48 +82,74 @@ namespace Lucene.Net.Analysis.Th
/// <summary>
/// Creates a new <see cref="ThaiTokenizer"/>, supplying the <see cref="Lucene.Net.Util.AttributeSource.AttributeFactory"/> </summary>
public ThaiTokenizer(AttributeFactory factory, TextReader reader)
- : base(factory, reader, (BreakIterator)sentenceProto.Clone())
+ : base(factory, reader, CreateSentenceClone())
{
// LUCENENET specific - DBBI_AVAILABLE removed because ICU always has a dictionary-based BreakIterator
- wordBreaker = new ThaiWordBreaker((BreakIterator)proto.Clone());
+ lock (syncLock)
+ wordBreaker = new ThaiWordBreaker((BreakIterator)proto.Clone());
termAtt = AddAttribute<ICharTermAttribute>();
offsetAtt = AddAttribute<IOffsetAttribute>();
}
- protected override void SetNextSentence(int sentenceStart, int sentenceEnd)
+ private static BreakIterator CreateSentenceClone()
{
- this.sentenceStart = sentenceStart;
- this.sentenceEnd = sentenceEnd;
- wrapper.SetText(m_buffer, sentenceStart, sentenceEnd - sentenceStart);
- wordBreaker.SetText(new string(wrapper.Text, wrapper.Start, wrapper.Length));
+ lock (syncLock)
+ return (BreakIterator)sentenceProto.Clone();
}
- protected override bool IncrementWord()
+ public override void Reset()
+ {
+ lock (syncLock)
+ base.Reset();
+ }
+
+ public override State CaptureState()
+ {
+ lock (syncLock)
+ return base.CaptureState();
+ }
+
+ protected override void SetNextSentence(int sentenceStart, int sentenceEnd)
{
- int start = wordBreaker.Current;
- if (start == BreakIterator.Done)
+ lock (syncLock)
{
- return false; // BreakIterator exhausted
+ this.sentenceStart = sentenceStart;
+ this.sentenceEnd = sentenceEnd;
+ wrapper.SetText(m_buffer, sentenceStart, sentenceEnd - sentenceStart);
+ wordBreaker.SetText(new string(wrapper.Text, wrapper.Start, wrapper.Length));
}
+ }
- // find the next set of boundaries, skipping over non-tokens
- int end = wordBreaker.Next();
- while (end != BreakIterator.Done && !Character.IsLetterOrDigit(Character.CodePointAt(m_buffer, sentenceStart + start, sentenceEnd)))
+ protected override bool IncrementWord()
+ {
+ int start, end;
+ lock (syncLock)
{
- start = end;
+ start = wordBreaker.Current;
+ if (start == BreakIterator.Done)
+ {
+ return false; // BreakIterator exhausted
+ }
+
+ // find the next set of boundaries, skipping over non-tokens
end = wordBreaker.Next();
- }
+ while (end != BreakIterator.Done && !Character.IsLetterOrDigit(Character.CodePointAt(m_buffer, sentenceStart + start, sentenceEnd)))
+ {
+ start = end;
+ end = wordBreaker.Next();
+ }
- if (end == BreakIterator.Done)
- {
- return false; // BreakIterator exhausted
- }
+ if (end == BreakIterator.Done)
+ {
+ return false; // BreakIterator exhausted
+ }
- ClearAttributes();
- termAtt.CopyBuffer(m_buffer, sentenceStart + start, end - start);
- offsetAtt.SetOffset(CorrectOffset(m_offset + sentenceStart + start), CorrectOffset(m_offset + sentenceStart + end));
- return true;
+ ClearAttributes();
+ termAtt.CopyBuffer(m_buffer, sentenceStart + start, end - start);
+ offsetAtt.SetOffset(CorrectOffset(m_offset + sentenceStart + start), CorrectOffset(m_offset + sentenceStart + end));
+ return true;
+ }
}
}