You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by sy...@apache.org on 2016/09/01 14:39:48 UTC

[27/52] [abbrv] lucenenet git commit: Added test to demonstrate a problem with making the CharTokenizer.IsTokenChar() parameter a char rather than an int.

Added test to demonstrate a problem with making the CharTokenizer.IsTokenChar() parameter a char rather than an int.


Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/c36a0bd1
Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/c36a0bd1
Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/c36a0bd1

Branch: refs/heads/master
Commit: c36a0bd1239061a07756b7735dcdd7f3dab016a8
Parents: 56cdc04
Author: Shad Storhaug <sh...@shadstorhaug.com>
Authored: Tue Aug 23 15:39:52 2016 +0700
Committer: Shad Storhaug <sh...@shadstorhaug.com>
Committed: Tue Aug 23 15:55:19 2016 +0700

----------------------------------------------------------------------
 .../Analysis/Util/TestCharTokenizers.cs         | 46 +++++++++++++++++++-
 1 file changed, 45 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucenenet/blob/c36a0bd1/src/Lucene.Net.Tests.Analysis.Common/Analysis/Util/TestCharTokenizers.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Util/TestCharTokenizers.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Util/TestCharTokenizers.cs
index 0d28101..d452d83 100644
--- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Util/TestCharTokenizers.cs
+++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Util/TestCharTokenizers.cs
@@ -240,6 +240,50 @@ namespace Lucene.Net.Tests.Analysis.Common.Analysis.Util
                 }
             }
         }
-    }
 
+        /// <summary>
+        /// LUCENENET: Added this test as proof that an IsTokenChar parameter typed as char (rather than int)
+        /// does not work 100% of the time: input containing surrogate pairs reaches it one UTF-16 code unit at a time.
+        /// </summary>

+        [Test]
+        public virtual void TestSurrogates()
+        {
+            var analyzer = new AnalyzerAnonymousInnerClassHelper3();

+            AssertAnalyzesTo(analyzer, "bar 123" + (char)55404 + (char)56321 + "34 5te 987", new string[] { "123\U0002b00134", "5", "987" }); // (char)55404 + (char)56321 is the surrogate pair for U+2B001
+            AssertAnalyzesTo(analyzer, "787 " + (char)55297 + (char)56388 + "6" + (char)55404 + (char)56321 + " art true 734", new string[] { "787", "\U000104446\U0002b001", "734" }); // (char)55297 + (char)56388 is the surrogate pair for U+10444
+        }
+
+        private sealed class AnalyzerAnonymousInnerClassHelper3 : Analyzer // test-only analyzer built around NumberAndSurrogatePairTokenizer
+        {
+            public AnalyzerAnonymousInnerClassHelper3()
+            { }

+            public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
+            {
+                Tokenizer tokenizer = new NumberAndSurrogatePairTokenizer(TEST_VERSION_CURRENT, reader);
+                return new TokenStreamComponents(tokenizer, tokenizer); // the same tokenizer instance is passed for both constructor arguments
+            }

+            private sealed class NumberAndSurrogatePairTokenizer : CharTokenizer // keeps numeric characters and (intended) surrogate-pair characters as token characters
+            {
+                public NumberAndSurrogatePairTokenizer(LuceneVersion matchVersion, TextReader reader)
+                    : base(matchVersion, reader)
+                {
+                }

+                protected override bool IsTokenChar(char c) // receives a single UTF-16 code unit, never a full supplementary code point
+                {
+                    if (char.IsNumber((char)c)) // the cast is redundant: c is already a char
+                    {
+                        return true;
+                    }

+                    string character = char.ConvertFromUtf32(c); // c widens implicitly to int; NOTE(review): .NET docs say values in U+D800..U+DFFF are rejected here - confirm this is the failure the test is meant to expose
+                    return char.IsSurrogatePair(character, 0); // checks whether the converted string starts with a high/low surrogate pair
+                }
+            }
+        }
+    }
 }
\ No newline at end of file