You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by sy...@apache.org on 2016/09/01 14:36:40 UTC
[20/22] lucenenet git commit: Fixed a bug in the
Analysis.Tr.TurkishLowerCaseFilter that caused the
Analysis.Tr.TestTurkishLowerCaseFilter_.TestTurkishLowerCaseFilter() test to
fail.
Fixed a bug in the Analysis.Tr.TurkishLowerCaseFilter that caused the Analysis.Tr.TestTurkishLowerCaseFilter_.TestTurkishLowerCaseFilter() test to fail.
Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/3664f1d7
Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/3664f1d7
Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/3664f1d7
Branch: refs/heads/analysis-work
Commit: 3664f1d7dc4f31ce28ef36cbccc7d1cf9b79577f
Parents: bc48844
Author: Shad Storhaug <sh...@shadstorhaug.com>
Authored: Fri Aug 26 23:45:11 2016 +0700
Committer: Shad Storhaug <sh...@shadstorhaug.com>
Committed: Sat Aug 27 02:20:11 2016 +0700
----------------------------------------------------------------------
.../Analysis/Tr/TurkishLowerCaseFilter.cs | 34 ++++++--
src/Lucene.Net.Core/Lucene.Net.csproj | 1 +
src/Lucene.Net.Core/Support/CultureContext.cs | 81 ++++++++++++++++++++
3 files changed, 109 insertions(+), 7 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/3664f1d7/src/Lucene.Net.Analysis.Common/Analysis/Tr/TurkishLowerCaseFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Tr/TurkishLowerCaseFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Tr/TurkishLowerCaseFilter.cs
index 8b53666..4aaee6a 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Tr/TurkishLowerCaseFilter.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Tr/TurkishLowerCaseFilter.cs
@@ -34,10 +34,11 @@ namespace Lucene.Net.Analysis.Tr
public sealed class TurkishLowerCaseFilter : TokenFilter
{
private const int LATIN_CAPITAL_LETTER_I = '\u0049';
- //private const int LATIN_CAPITAL_LETTER_I = '\u0130';
+ private const int LATIN_CAPITAL_LETTER_DOTTED_I = '\u0130';
private const int LATIN_SMALL_LETTER_I = '\u0069';
private const int LATIN_SMALL_LETTER_DOTLESS_I = '\u0131';
private const int COMBINING_DOT_ABOVE = '\u0307';
+
private readonly ICharTermAttribute termAtt;
/// <summary>
@@ -62,12 +63,6 @@ namespace Lucene.Net.Analysis.Tr
int length = termAtt.Length;
for (int i = 0; i < length;)
{
-
- // LUCENENET TODO: This line is failing, causing the TestTurkishLowerCaseFilter() test to fail. According to the MSDN documentation
- // https://msdn.microsoft.com/en-us/library/system.globalization.unicodecategory(v=vs.110).aspx
- // a non-spacing mark is a modifier to a character. This logic is expecting the first codepoint to be an upper case Latin I,
- // and the second to be a non-spacing mark, but it is coming back as a single codepoint 304 that doesn't match Latin I.
- // Also, char.GetUnicodeCategory((char)304) returns UpperCaseLetter (not sure if that is pertinent).
int ch = Character.CodePointAt(buffer, i, length);
iOrAfter = (ch == LATIN_CAPITAL_LETTER_I || (iOrAfter && char.GetUnicodeCategory((char)ch) == UnicodeCategory.NonSpacingMark));
@@ -99,6 +94,31 @@ namespace Lucene.Net.Analysis.Tr
}
}
+ using (var culture = new CultureContext("tr-TR"))
+ {
+ switch (ch)
+ {
+ // LUCENENET: The .NET char.ToLower() function lowercases these characters
+ // correctly for Turkish as long as the current thread's culture is tr-TR (strictly,
+ // the culture change is only required for the LATIN_CAPITAL_LETTER_I case). .NET does
+ // not split these characters into separate letter/non-spacing mark characters,
+ // but the user might still input them that way, so we still need the above
+ // block to handle that case.
+ //
+ // LUCENENET TODO: Oddly, the Character.ToLowerCase() function below does not work correctly
+ // for Turkish, which raises the question: should this special case live there so Turkish works
+ // everywhere, or should it remain a special case here because that is how it works in Java?
+ //
+ // References:
+ // http://haacked.com/archive/2012/07/05/turkish-i-problem-and-why-you-should-care.aspx/
+ // http://www.i18nguy.com/unicode/turkish-i18n.html
+ case LATIN_CAPITAL_LETTER_I:
+ case LATIN_CAPITAL_LETTER_DOTTED_I:
+ i += Character.ToChars(char.ToLower((char)ch), buffer, i);
+ continue;
+ }
+ }
+
i += Character.ToChars(Character.ToLowerCase(ch), buffer, i);
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/3664f1d7/src/Lucene.Net.Core/Lucene.Net.csproj
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Core/Lucene.Net.csproj b/src/Lucene.Net.Core/Lucene.Net.csproj
index 26c8906..9dbcd4c 100644
--- a/src/Lucene.Net.Core/Lucene.Net.csproj
+++ b/src/Lucene.Net.Core/Lucene.Net.csproj
@@ -623,6 +623,7 @@
<Compile Include="Support\Compatibility\Collections.cs" />
<Compile Include="Support\ConcurrentHashMapWrapper.cs" />
<Compile Include="Support\ConcurrentHashSet.cs" />
+ <Compile Include="Support\CultureContext.cs" />
<Compile Include="Support\ErrorHandling.cs" />
<Compile Include="Support\FileStreamExtensions.cs" />
<Compile Include="Support\HashCodeMerge.cs" />
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/3664f1d7/src/Lucene.Net.Core/Support/CultureContext.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Core/Support/CultureContext.cs b/src/Lucene.Net.Core/Support/CultureContext.cs
new file mode 100644
index 0000000..59f578e
--- /dev/null
+++ b/src/Lucene.Net.Core/Support/CultureContext.cs
@@ -0,0 +1,81 @@
+\ufeffusing System;
+using System.Globalization;
+using System.Threading;
+
+namespace Lucene.Net.Support
+{
+ /// <summary>
+ /// Switches the current thread to a new culture for the duration of a using block; the
+ /// thread's previous culture and UI culture are restored automatically when the block is disposed.
+ /// </summary>
+ public class CultureContext : IDisposable
+ {
+ public CultureContext(int culture)
+ : this(new CultureInfo(culture), Thread.CurrentThread.CurrentUICulture)
+ {
+ }
+
+ public CultureContext(int culture, int uiCulture)
+ : this(new CultureInfo(culture), new CultureInfo(uiCulture))
+ {
+ }
+
+ public CultureContext(string cultureName)
+ : this(new CultureInfo(cultureName), Thread.CurrentThread.CurrentUICulture)
+ {
+ }
+
+ public CultureContext(string cultureName, string uiCultureName)
+ : this(new CultureInfo(cultureName), new CultureInfo(uiCultureName))
+ {
+ }
+
+ public CultureContext(CultureInfo culture)
+ : this(culture, Thread.CurrentThread.CurrentUICulture)
+ {
+ }
+
+ public CultureContext(CultureInfo culture, CultureInfo uiCulture)
+ {
+ if (culture == null)
+ throw new ArgumentNullException("culture");
+ if (uiCulture == null)
+ throw new ArgumentNullException("uiCulture");
+
+ this.currentThread = Thread.CurrentThread;
+
+ // Record the current culture settings so they can be restored later.
+ this.originalCulture = this.currentThread.CurrentCulture;
+ this.originalUICulture = this.currentThread.CurrentUICulture;
+
+ // Set both the culture and UI culture for this context.
+ this.currentThread.CurrentCulture = culture;
+ this.currentThread.CurrentUICulture = uiCulture;
+ }
+
+ private readonly Thread currentThread;
+ private readonly CultureInfo originalCulture;
+ private readonly CultureInfo originalUICulture;
+
+ public CultureInfo OriginalCulture
+ {
+ get { return this.originalCulture; }
+ }
+
+ public CultureInfo OriginalUICulture
+ {
+ get { return this.originalUICulture; }
+ }
+
+ public void RestoreOriginalCulture()
+ {
+ // Restore the culture to the way it was before the constructor was called.
+ this.currentThread.CurrentCulture = this.originalCulture;
+ this.currentThread.CurrentUICulture = this.originalUICulture;
+ }
+ public void Dispose()
+ {
+ RestoreOriginalCulture();
+ }
+ }
+}