You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by ni...@apache.org on 2020/08/02 15:18:45 UTC
[lucenenet] 02/02: BUG: Lucene.Net.Analysis.Common: Fixed classes that were originally using invariant culture to do so again. J2N's Character class default is to use the current culture, which had changed from the prior Character class that used invariant culture. Fixes TestICUFoldingFilter::TestRandomStrings().

This is an automated email from the ASF dual-hosted git repository.

nightowl888 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucenenet.git

commit 9ce76e99474ce818590210e5adfa2f2ec607ff26
Author: Shad Storhaug <sh...@shadstorhaug.com>
AuthorDate: Sun Aug 2 01:21:23 2020 +0700

    BUG: Lucene.Net.Analysis.Common: Fixed classes that were originally using invariant culture to do so again. J2N's Character class default is to use the current culture, which had changed from the prior Character class that used invariant culture. Fixes TestICUFoldingFilter::TestRandomStrings().
---
 .../Analysis/Core/LowerCaseTokenizer.cs                      |  5 +++--
 .../Analysis/Miscellaneous/StemmerOverrideFilter.cs          |  5 +++--
 .../Analysis/Synonym/SynonymFilter.cs                        | 10 ++++++----
 src/Lucene.Net.Analysis.Common/Analysis/Util/CharArrayMap.cs | 12 ++++++------
 src/Lucene.Net.TestFramework/Analysis/MockTokenizer.cs       |  3 ++-
 .../Analysis/Util/TestCharArraySet.cs                        |  6 +++---
 6 files changed, 23 insertions(+), 18 deletions(-)

diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Core/LowerCaseTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Core/LowerCaseTokenizer.cs
index 96d8958..4ea4db6 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Core/LowerCaseTokenizer.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Core/LowerCaseTokenizer.cs
@@ -1,5 +1,6 @@
 using J2N;
 using Lucene.Net.Util;
+using System.Globalization;
 using System.IO;
 
 namespace Lucene.Net.Analysis.Core
@@ -73,11 +74,11 @@ namespace Lucene.Net.Analysis.Core
 
         /// <summary>
         /// Converts char to lower case
-        /// <see cref="Character.ToLower(int)"/>.
+        /// <see cref="Character.ToLower(int, CultureInfo)"/> in the invariant culture.
         /// </summary>
         protected override int Normalize(int c)
         {
-            return Character.ToLower(c);
+            return Character.ToLower(c, CultureInfo.InvariantCulture); // LUCENENET specific - need to use invariant culture to match Java
         }
     }
 }
\ No newline at end of file
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/StemmerOverrideFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/StemmerOverrideFilter.cs
index ebd0178..bc81134 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/StemmerOverrideFilter.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/StemmerOverrideFilter.cs
@@ -3,6 +3,7 @@ using Lucene.Net.Analysis.TokenAttributes;
 using Lucene.Net.Util;
 using Lucene.Net.Util.Fst;
 using System.Collections.Generic;
+using System.Globalization;
 using System.IO;
 
 namespace Lucene.Net.Analysis.Miscellaneous
@@ -134,7 +135,7 @@ namespace Lucene.Net.Analysis.Miscellaneous
                 while (bufUpto < bufferLen)
                 {
                     int codePoint = Character.CodePointAt(buffer, bufUpto, bufferLen);
-                    if (fst.FindTargetArc(ignoreCase ? Character.ToLower(codePoint) : codePoint, scratchArc, scratchArc, fstReader) == null)
+                    if (fst.FindTargetArc(ignoreCase ? Character.ToLower(codePoint, CultureInfo.InvariantCulture) : codePoint, scratchArc, scratchArc, fstReader) == null)
                     {
                         return null;
                     }
@@ -192,7 +193,7 @@ namespace Lucene.Net.Analysis.Miscellaneous
                     char[] buffer = charsSpare.Chars;
                     for (int i = 0; i < length;)
                     {
-                        i += Character.ToChars(Character.ToLower(Character.CodePointAt(input, i)), buffer, i);
+                        i += Character.ToChars(Character.ToLower(Character.CodePointAt(input, i), CultureInfo.InvariantCulture), buffer, i);
                     }
                     UnicodeUtil.UTF16toUTF8(buffer, 0, length, spare);
                 }
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Synonym/SynonymFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Synonym/SynonymFilter.cs
index 1e840da..6b039ee 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Synonym/SynonymFilter.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Synonym/SynonymFilter.cs
@@ -5,6 +5,7 @@ using Lucene.Net.Util;
 using Lucene.Net.Util.Fst;
 using System;
 using System.Diagnostics;
+using System.Globalization;
 
 namespace Lucene.Net.Analysis.Synonym
 {
@@ -252,9 +253,10 @@ namespace Lucene.Net.Analysis.Synonym
 
         /// <param name="input"> input tokenstream </param>
         /// <param name="synonyms"> synonym map </param>
-        /// <param name="ignoreCase"> case-folds input for matching with <see cref="Character.ToLower(int)"/>.
-        ///                   Note, if you set this to true, its your responsibility to lowercase
-        ///                   the input entries when you create the <see cref="SynonymMap"/> </param>
+        /// <param name="ignoreCase"> case-folds input for matching with <see cref="Character.ToLower(int, CultureInfo)"/>
+        ///                   in using <see cref="CultureInfo.InvariantCulture"/>.
+        ///                   Note, if you set this to <c>true</c>, its your responsibility to lowercase
+        ///                   the input entries when you create the <see cref="SynonymMap"/>.</param>
         public SynonymFilter(TokenStream input, SynonymMap synonyms, bool ignoreCase) 
             : base(input)
         {
@@ -411,7 +413,7 @@ namespace Lucene.Net.Analysis.Synonym
                 while (bufUpto < bufferLen)
                 {
                     int codePoint = Character.CodePointAt(buffer, bufUpto, bufferLen);
-                    if (fst.FindTargetArc(ignoreCase ? Character.ToLower(codePoint) : codePoint, scratchArc, scratchArc, fstReader) == null)
+                    if (fst.FindTargetArc(ignoreCase ? Character.ToLower(codePoint, CultureInfo.InvariantCulture) : codePoint, scratchArc, scratchArc, fstReader) == null)
                     {
                         //System.out.println("    stop");
                         goto byTokenBreak;
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Util/CharArrayMap.cs b/src/Lucene.Net.Analysis.Common/Analysis/Util/CharArrayMap.cs
index f69094d..2bebdcb 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Util/CharArrayMap.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Util/CharArrayMap.cs
@@ -664,7 +664,7 @@ namespace Lucene.Net.Analysis.Util
                 for (int i = 0; i < length;)
                 {
                     var codePointAt = charUtils.CodePointAt(text1, offset + i, limit);
-                    if (Character.ToLower(codePointAt) != charUtils.CodePointAt(text2, i, text2.Length))
+                    if (Character.ToLower(codePointAt, CultureInfo.InvariantCulture) != charUtils.CodePointAt(text2, i, text2.Length)) // LUCENENET specific - need to use invariant culture to match Java
                     {
                         return false;
                     }
@@ -696,7 +696,7 @@ namespace Lucene.Net.Analysis.Util
                 for (int i = 0; i < length;)
                 {
                     int codePointAt = charUtils.CodePointAt(text1, i);
-                    if (Character.ToLower(codePointAt) != charUtils.CodePointAt(text2, i, text2.Length))
+                    if (Character.ToLower(codePointAt, CultureInfo.InvariantCulture) != charUtils.CodePointAt(text2, i, text2.Length)) // LUCENENET specific - need to use invariant culture to match Java
                     {
                         return false;
                     }
@@ -728,7 +728,7 @@ namespace Lucene.Net.Analysis.Util
                 for (int i = 0; i < length;)
                 {
                     int codePointAt = charUtils.CodePointAt(text1, i);
-                    if (Character.ToLower(codePointAt) != charUtils.CodePointAt(text2, i, text2.Length))
+                    if (Character.ToLower(codePointAt, CultureInfo.InvariantCulture) != charUtils.CodePointAt(text2, i, text2.Length)) // LUCENENET specific - need to use invariant culture to match Java
                     {
                         return false;
                     }
@@ -811,7 +811,7 @@ namespace Lucene.Net.Analysis.Util
                 for (int i = offset; i < stop;)
                 {
                     int codePointAt = charUtils.CodePointAt(text, i, stop);
-                    code = code * 31 + Character.ToLower(codePointAt);
+                    code = code * 31 + Character.ToLower(codePointAt, CultureInfo.InvariantCulture); // LUCENENET specific - need to use invariant culture to match Java
                     i += Character.CharCount(codePointAt);
                 }
             }
@@ -839,7 +839,7 @@ namespace Lucene.Net.Analysis.Util
                 for (int i = 0; i < length;)
                 {
                     int codePointAt = charUtils.CodePointAt(text, i);
-                    code = code * 31 + Character.ToLower(codePointAt);
+                    code = code * 31 + Character.ToLower(codePointAt, CultureInfo.InvariantCulture); // LUCENENET specific - need to use invariant culture to match Java
                     i += Character.CharCount(codePointAt);
                 }
             }
@@ -867,7 +867,7 @@ namespace Lucene.Net.Analysis.Util
                 for (int i = 0; i < length;)
                 {
                     int codePointAt = charUtils.CodePointAt(text, i);
-                    code = code * 31 + Character.ToLower(codePointAt);
+                    code = code * 31 + Character.ToLower(codePointAt, CultureInfo.InvariantCulture); // LUCENENET specific - need to use invariant culture to match Java
                     i += Character.CharCount(codePointAt);
                 }
             }
diff --git a/src/Lucene.Net.TestFramework/Analysis/MockTokenizer.cs b/src/Lucene.Net.TestFramework/Analysis/MockTokenizer.cs
index 2303aab..4be7f18 100644
--- a/src/Lucene.Net.TestFramework/Analysis/MockTokenizer.cs
+++ b/src/Lucene.Net.TestFramework/Analysis/MockTokenizer.cs
@@ -7,6 +7,7 @@ using CharacterRunAutomaton = Lucene.Net.Util.Automaton.CharacterRunAutomaton;
 using Debug = Lucene.Net.Diagnostics.Debug; // LUCENENET NOTE: We cannot use System.Diagnostics.Debug because those calls will be optimized out of the release!
 using RegExp = Lucene.Net.Util.Automaton.RegExp;
 using Assert = Lucene.Net.TestFramework.Assert;
+using System.Globalization;
 
 namespace Lucene.Net.Analysis
 {
@@ -290,7 +291,7 @@ namespace Lucene.Net.Analysis
 
         protected virtual int Normalize(int c)
         {
-            return lowerCase ? Character.ToLower(c) : c;
+            return lowerCase ? Character.ToLower(c, CultureInfo.InvariantCulture) : c; // LUCENENET specific - need to use invariant culture to match Java
         }
 
         public override void Reset()
diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Util/TestCharArraySet.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Util/TestCharArraySet.cs
index e3c54d6..e637483 100644
--- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Util/TestCharArraySet.cs
+++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Util/TestCharArraySet.cs
@@ -419,7 +419,7 @@ namespace Lucene.Net.Analysis.Util
             IList<string> stopwordsUpper = new List<string>();
             foreach (string @string in stopwords)
             {
-                stopwordsUpper.Add(@string.ToUpper());
+                stopwordsUpper.Add(@string.ToUpperInvariant());
             }
             setIngoreCase.addAll(TEST_STOP_WORDS);
             setIngoreCase.Add(Convert.ToInt32(1));
@@ -472,7 +472,7 @@ namespace Lucene.Net.Analysis.Util
             IList<string> stopwordsUpper = new List<string>();
             foreach (string @string in stopwords)
             {
-                stopwordsUpper.Add(@string.ToUpper());
+                stopwordsUpper.Add(@string.ToUpperInvariant());
             }
             setIngoreCase.addAll(TEST_STOP_WORDS);
             setIngoreCase.Add(Convert.ToInt32(1));
@@ -523,7 +523,7 @@ namespace Lucene.Net.Analysis.Util
             IList<string> stopwordsUpper = new List<string>();
             foreach (string @string in stopwords)
             {
-                stopwordsUpper.Add(@string.ToUpper());
+                stopwordsUpper.Add(@string.ToUpperInvariant());
             }
             set.addAll(TEST_STOP_WORDS);