You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by sy...@apache.org on 2016/08/23 23:18:41 UTC
[48/50] [abbrv] lucenenet git commit: Fix for
CharTokenizer.IsTokenChar() to revert the parameter back to int as was
intended. A char cannot represent a surrogate pair,
which makes it impossible to use IsTokenChar() with surrogate pairs.
Fix for CharTokenizer.IsTokenChar() to revert the parameter back to int as was intended. A char cannot represent a surrogate pair, which makes it impossible to use IsTokenChar() with surrogate pairs.
Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/053d3efc
Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/053d3efc
Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/053d3efc
Branch: refs/heads/analysis-work
Commit: 053d3efcb647dac4c681ddf3999eda18b3964b11
Parents: c36a0bd
Author: Shad Storhaug <sh...@shadstorhaug.com>
Authored: Tue Aug 23 14:37:12 2016 +0700
Committer: Shad Storhaug <sh...@shadstorhaug.com>
Committed: Tue Aug 23 16:05:50 2016 +0700
----------------------------------------------------------------------
.../Analysis/Ar/ArabicLetterTokenizer.cs | 5 +++--
src/Lucene.Net.Analysis.Common/Analysis/Core/LetterTokenizer.cs | 2 +-
.../Analysis/Core/WhitespaceTokenizer.cs | 4 ++--
src/Lucene.Net.Analysis.Common/Analysis/In/IndicTokenizer.cs | 2 +-
.../Analysis/Ru/RussianLetterTokenizer.cs | 5 +++--
src/Lucene.Net.Analysis.Common/Analysis/Util/CharTokenizer.cs | 4 ++--
.../Analysis/Util/TestCharTokenizers.cs | 2 +-
7 files changed, 13 insertions(+), 11 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/053d3efc/src/Lucene.Net.Analysis.Common/Analysis/Ar/ArabicLetterTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ar/ArabicLetterTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ar/ArabicLetterTokenizer.cs
index 9e36d25..5fa5827 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Ar/ArabicLetterTokenizer.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Ar/ArabicLetterTokenizer.cs
@@ -1,4 +1,5 @@
using Lucene.Net.Analysis.Core;
+using Lucene.Net.Support;
using Lucene.Net.Util;
using System;
using System.Globalization;
@@ -74,9 +75,9 @@ namespace Lucene.Net.Analysis.Ar
/// <summary>
/// Allows for Letter category or NonspacingMark category </summary>
/// <seealso cref= org.apache.lucene.analysis.core.LetterTokenizer#isTokenChar(int) </seealso>
- protected override bool IsTokenChar(char c)
+ protected override bool IsTokenChar(int c)
{
- return base.IsTokenChar(c) || char.GetUnicodeCategory((char)c) == UnicodeCategory.NonSpacingMark;
+ return base.IsTokenChar(c) || Character.GetType(c) == UnicodeCategory.NonSpacingMark;
}
}
}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/053d3efc/src/Lucene.Net.Analysis.Common/Analysis/Core/LetterTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Core/LetterTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Core/LetterTokenizer.cs
index 9a0b57d..9d3dc2b 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Core/LetterTokenizer.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Core/LetterTokenizer.cs
@@ -75,7 +75,7 @@ namespace Lucene.Net.Analysis.Core
/// Collects only characters which satisfy
/// <seealso cref="Character#isLetter(int)"/>.
/// </summary>
- protected override bool IsTokenChar(char c)
+ protected override bool IsTokenChar(int c)
{
return Character.IsLetter(c);
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/053d3efc/src/Lucene.Net.Analysis.Common/Analysis/Core/WhitespaceTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Core/WhitespaceTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Core/WhitespaceTokenizer.cs
index 1567daf..5ccdbbf 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Core/WhitespaceTokenizer.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Core/WhitespaceTokenizer.cs
@@ -69,9 +69,9 @@ namespace Lucene.Net.Analysis.Core
/// Collects only characters which do not satisfy
/// <seealso cref="Character#isWhitespace(int)"/>.
/// </summary>
- protected override bool IsTokenChar(char c)
+ protected override bool IsTokenChar(int c)
{
- return !char.IsWhiteSpace(c);
+ return !char.IsWhiteSpace((char)c);
}
}
}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/053d3efc/src/Lucene.Net.Analysis.Common/Analysis/In/IndicTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/In/IndicTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/In/IndicTokenizer.cs
index 5117267..2de7baa 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/In/IndicTokenizer.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/In/IndicTokenizer.cs
@@ -41,7 +41,7 @@ namespace Lucene.Net.Analysis.In
{
}
- protected override bool IsTokenChar(char c) // LUCENENET TODO: Change parameter back to int (for codepoint) rather than a single char since this could contain surrogate pairs
+ protected override bool IsTokenChar(int c)
{
UnicodeCategory category = Character.GetType(c);
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/053d3efc/src/Lucene.Net.Analysis.Common/Analysis/Ru/RussianLetterTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ru/RussianLetterTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ru/RussianLetterTokenizer.cs
index e48c33f..15db0f7 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Ru/RussianLetterTokenizer.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Ru/RussianLetterTokenizer.cs
@@ -1,4 +1,5 @@
using Lucene.Net.Analysis.Util;
+using Lucene.Net.Support;
using Lucene.Net.Util;
using System;
using System.IO;
@@ -73,9 +74,9 @@ namespace Lucene.Net.Analysis.Ru
/// Collects only characters which satisfy
/// <seealso cref="Character#isLetter(int)"/>.
/// </summary>
- protected override bool IsTokenChar(char c)
+ protected override bool IsTokenChar(int c)
{
- return char.IsLetter(c) || (c >= DIGIT_0 && c <= DIGIT_9);
+ return Character.IsLetter(c) || (c >= DIGIT_0 && c <= DIGIT_9);
}
}
}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/053d3efc/src/Lucene.Net.Analysis.Common/Analysis/Util/CharTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Util/CharTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Util/CharTokenizer.cs
index b4ea553..14047ca 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Util/CharTokenizer.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Util/CharTokenizer.cs
@@ -120,7 +120,7 @@ namespace Lucene.Net.Analysis.Util
/// predicate. Codepoints for which this is false are used to define token
/// boundaries and are not included in tokens.
/// </summary>
- protected abstract bool IsTokenChar(char c);
+ protected abstract bool IsTokenChar(int c);
/// <summary>
/// Called on each token character to normalize it before it is added to the
@@ -166,7 +166,7 @@ namespace Lucene.Net.Analysis.Util
int charCount = Character.CharCount(c);
bufferIndex += charCount;
- if (IsTokenChar((char)c)) // if it's a token char
+ if (IsTokenChar(c)) // if it's a token char
{
if (length == 0) // start of token
{
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/053d3efc/src/Lucene.Net.Tests.Analysis.Common/Analysis/Util/TestCharTokenizers.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Util/TestCharTokenizers.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Util/TestCharTokenizers.cs
index d452d83..40ae0bb 100644
--- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Util/TestCharTokenizers.cs
+++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Util/TestCharTokenizers.cs
@@ -273,7 +273,7 @@ namespace Lucene.Net.Tests.Analysis.Common.Analysis.Util
{
}
- protected override bool IsTokenChar(char c)
+ protected override bool IsTokenChar(int c)
{
if (char.IsNumber((char)c))
{