You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by sy...@apache.org on 2016/09/01 14:39:48 UTC
[27/52] [abbrv] lucenenet git commit: Added test to demonstrate a
problem with making the CharTokenizer.IsTokenChar() parameter a char rather
than an int.
Added test to demonstrate a problem with making the CharTokenizer.IsTokenChar() parameter a char rather than an int.
Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/c36a0bd1
Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/c36a0bd1
Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/c36a0bd1
Branch: refs/heads/master
Commit: c36a0bd1239061a07756b7735dcdd7f3dab016a8
Parents: 56cdc04
Author: Shad Storhaug <sh...@shadstorhaug.com>
Authored: Tue Aug 23 15:39:52 2016 +0700
Committer: Shad Storhaug <sh...@shadstorhaug.com>
Committed: Tue Aug 23 15:55:19 2016 +0700
----------------------------------------------------------------------
.../Analysis/Util/TestCharTokenizers.cs | 46 +++++++++++++++++++-
1 file changed, 45 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/c36a0bd1/src/Lucene.Net.Tests.Analysis.Common/Analysis/Util/TestCharTokenizers.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Util/TestCharTokenizers.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Util/TestCharTokenizers.cs
index 0d28101..d452d83 100644
--- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Util/TestCharTokenizers.cs
+++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Util/TestCharTokenizers.cs
@@ -240,6 +240,50 @@ namespace Lucene.Net.Tests.Analysis.Common.Analysis.Util
}
}
}
- }
+ /// <summary>
+ /// LUCENENET: Added this test as proof that making the IsTokenChar parameter a char
+ /// (rather than an int) is not going to work 100% of the time because of surrogate pairs.
+ /// </summary>
+
+ [Test]
+ public virtual void TestSurrogates()
+ {
+ var analyzer = new AnalyzerAnonymousInnerClassHelper3(); // analyzer built on NumberAndSurrogatePairTokenizer
+
+ AssertAnalyzesTo(analyzer, "bar 123" + (char)55404 + (char)56321 + "34 5te 987", new string[] { "123\U0002b00134", "5", "987" }); // (char)55404 + (char)56321 is the UTF-16 surrogate pair for U+2B001
+ AssertAnalyzesTo(analyzer, "787 " + (char)55297 + (char)56388 + "6" + (char)55404 + (char)56321 + " art true 734", new string[] { "787", "\U000104446\U0002b001", "734" }); // (char)55297 + (char)56388 is the UTF-16 surrogate pair for U+10444
+ }
+
+ private sealed class AnalyzerAnonymousInnerClassHelper3 : Analyzer // analyzer used by TestSurrogates
+ {
+ public AnalyzerAnonymousInnerClassHelper3()
+ { }
+
+ public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader) // fieldName is not used here
+ {
+ Tokenizer tokenizer = new NumberAndSurrogatePairTokenizer(TEST_VERSION_CURRENT, reader);
+ return new TokenStreamComponents(tokenizer, tokenizer); // the same tokenizer instance is passed for both arguments
+ }
+
+ private sealed class NumberAndSurrogatePairTokenizer : CharTokenizer // intended to keep numeric characters and surrogate pairs as token characters
+ {
+ public NumberAndSurrogatePairTokenizer(LuceneVersion matchVersion, TextReader reader)
+ : base(matchVersion, reader)
+ {
+ }
+
+ protected override bool IsTokenChar(char c) // c is a single UTF-16 code unit, not a full code point
+ {
+ if (char.IsNumber((char)c)) // the (char) cast is redundant here: c is already declared as char
+ {
+ return true;
+ }
+
+ string character = char.ConvertFromUtf32(c); // NOTE(review): the .NET docs say ConvertFromUtf32 throws ArgumentOutOfRangeException for values in the surrogate range U+D800..U+DFFF; presumably this is the failure the test is meant to expose - confirm
+ return char.IsSurrogatePair(character, 0); // NOTE(review): for a non-surrogate char the string above holds one char, so this presumably can never be true - confirm
+ }
+ }
+ }
+ }
}
\ No newline at end of file