You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by ni...@apache.org on 2017/07/25 08:33:25 UTC
[2/4] lucenenet git commit: Squashed commit of the following:
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/a4989ea1/src/Lucene.Net.Analysis.ICU/Collation/ICUCollationAttributeFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.ICU/Collation/ICUCollationAttributeFactory.cs b/src/Lucene.Net.Analysis.ICU/Collation/ICUCollationAttributeFactory.cs
new file mode 100644
index 0000000..ed4c7f9
--- /dev/null
+++ b/src/Lucene.Net.Analysis.ICU/Collation/ICUCollationAttributeFactory.cs
@@ -0,0 +1,75 @@
+using Icu.Collation;
+using Lucene.Net.Collation.TokenAttributes;
+using Lucene.Net.Support;
+using Lucene.Net.Util;
+using System.Reflection;
+
+namespace Lucene.Net.Collation
+{
+ /// <summary>
+ /// Converts each token into its <see cref="System.Globalization.SortKey"/>, and
+ /// then encodes bytes as an index term.
+ /// </summary>
+ /// <remarks>
+ /// <strong>WARNING:</strong> Make sure you use exactly the same <see cref="Collator"/> at
+ /// index and query time -- <see cref="System.Globalization.SortKey"/>s are only comparable when produced by
+ /// the same <see cref="Collator"/>. <see cref="RuleBasedCollator"/>s are
+ /// independently versioned, so it is safe to search against stored
+ /// <see cref="System.Globalization.SortKey"/>s if the following are exactly the same (best practice is
+ /// to store this information with the index and check that they remain the
+ /// same at query time):
+ /// <para/>
+ /// <list type="number">
+ /// <item><description>Collator version - see <see cref="Collator"/> Version</description></item>
+ /// <item><description>The collation strength used - see <see cref="Collator.Strength"/></description></item>
+ /// </list>
+ /// <para/>
+ /// <see cref="System.Globalization.SortKey"/>s generated by ICU Collators are not compatible with those
+ /// generated by java.text.Collators. Specifically, if you use
+ /// <see cref="ICUCollationAttributeFactory"/> to generate index terms, do not use
+ /// CollationAttributeFactory on the query side, or vice versa.
+ /// <para/>
+ /// <see cref="ICUCollationAttributeFactory"/> is significantly faster and generates significantly
+ /// shorter keys than CollationAttributeFactory. See
+ /// <a href="http://site.icu-project.org/charts/collation-icu4j-sun"
+ /// >http://site.icu-project.org/charts/collation-icu4j-sun</a> for key
+ /// generation timing and key length comparisons between ICU4J and
+ /// java.text.Collator over several languages.
+ /// </remarks>
+ [ExceptionToClassNameConvention]
+ public class ICUCollationAttributeFactory : AttributeSource.AttributeFactory
+ {
+ private readonly Collator collator;
+ private readonly AttributeSource.AttributeFactory @delegate;
+
+ /// <summary>
+ /// Initializes an <see cref="ICUCollationAttributeFactory"/> that falls back to
+ /// <see cref="AttributeSource.AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY"/> for all other attributes.
+ /// </summary>
+ /// <param name="collator">Collator handed to each <see cref="ICUCollatedTermAttribute"/> created here.</param>
+ public ICUCollationAttributeFactory(Collator collator)
+ : this(AttributeSource.AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, collator)
+ {
+ }
+
+ /// <summary>
+ /// Initializes an <see cref="ICUCollationAttributeFactory"/> that falls back to
+ /// <paramref name="delegate"/> for all other attributes.
+ /// </summary>
+ /// <param name="delegate">Attribute factory used for everything this factory does not create itself.</param>
+ /// <param name="collator">Collator handed to each <see cref="ICUCollatedTermAttribute"/> created here.</param>
+ public ICUCollationAttributeFactory(AttributeSource.AttributeFactory @delegate, Collator collator)
+ {
+ this.collator = collator;
+ this.@delegate = @delegate;
+ }
+
+ // Returns an ICUCollatedTermAttribute when T is assignable from it; any other request goes to the delegate.
+ public override Util.Attribute CreateAttributeInstance<T>()
+ {
+ if (typeof(T).GetTypeInfo().IsAssignableFrom(typeof(ICUCollatedTermAttribute)))
+ return new ICUCollatedTermAttribute(collator);
+ return @delegate.CreateAttributeInstance<T>();
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/a4989ea1/src/Lucene.Net.Analysis.ICU/Collation/ICUCollationDocValuesField.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.ICU/Collation/ICUCollationDocValuesField.cs b/src/Lucene.Net.Analysis.ICU/Collation/ICUCollationDocValuesField.cs
new file mode 100644
index 0000000..bddc095
--- /dev/null
+++ b/src/Lucene.Net.Analysis.ICU/Collation/ICUCollationDocValuesField.cs
@@ -0,0 +1,62 @@
+using Icu.Collation;
+using Lucene.Net.Documents;
+using Lucene.Net.Support;
+using Lucene.Net.Util;
+using System.Globalization;
+
+namespace Lucene.Net.Collation
+{
+ /// <summary>
+ /// Indexes sort keys as a single-valued <see cref="SortedDocValuesField"/>.
+ /// </summary>
+ /// <remarks>
+ /// This is more efficient than <see cref="ICUCollationKeyAnalyzer"/> if the field
+ /// only has one value: no uninversion is necessary to sort on the field,
+ /// locale-sensitive range queries can still work via <see cref="Search.FieldCacheRangeFilter"/>,
+ /// and the underlying data structures built at index-time are likely more efficient
+ /// and use less memory than FieldCache.
+ /// </remarks>
+ [ExceptionToClassNameConvention]
+ public sealed class ICUCollationDocValuesField : Field
+ {
+ private readonly string name;
+ private readonly Collator collator;
+ private readonly BytesRef bytes = new BytesRef();
+ private SortKey key;
+
+ /// <summary>
+ /// Initializes a new <see cref="ICUCollationDocValuesField"/> that works on its own clone of <paramref name="collator"/>.
+ /// <para/>
+ /// NOTE: build a single instance and reuse it for every document while indexing,
+ /// changing the value through <see cref="SetStringValue(string)"/>, rather than
+ /// allocating a new field per document.
+ /// </summary>
+ /// <param name="name">Field name.</param>
+ /// <param name="collator">Collator that is cloned and then used for generating collation keys.</param>
+ // TODO: can we make this trap-free? maybe just synchronize on the collator
+ // instead?
+ public ICUCollationDocValuesField(string name, Collator collator)
+ : base(name, SortedDocValuesField.TYPE)
+ {
+ m_fieldsData = bytes; // so wrong setters cannot be called
+ this.collator = (Collator)collator.Clone();
+ this.name = name;
+ }
+
+ /// <summary>Gets the field name that was passed to the constructor.</summary>
+ public override string Name
+ {
+ get { return name; }
+ }
+
+ /// <summary>Points the field's bytes at the collation sort key generated for <paramref name="value"/>.</summary>
+ public override void SetStringValue(string value)
+ {
+ key = collator.GetSortKey(value);
+ byte[] keyData = key.KeyData;
+ bytes.Bytes = keyData;
+ bytes.Offset = 0;
+ bytes.Length = keyData.Length;
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/a4989ea1/src/Lucene.Net.Analysis.ICU/Collation/ICUCollationKeyAnalyzer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.ICU/Collation/ICUCollationKeyAnalyzer.cs b/src/Lucene.Net.Analysis.ICU/Collation/ICUCollationKeyAnalyzer.cs
new file mode 100644
index 0000000..3b9d7c4
--- /dev/null
+++ b/src/Lucene.Net.Analysis.ICU/Collation/ICUCollationKeyAnalyzer.cs
@@ -0,0 +1,96 @@
+using Icu.Collation;
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Core;
+using Lucene.Net.Support;
+using Lucene.Net.Util;
+using System;
+using System.IO;
+
+namespace Lucene.Net.Collation
+{
+ /// <summary>
+ /// Configures <see cref="KeywordTokenizer"/> with <see cref="ICUCollationAttributeFactory"/>.
+ /// </summary>
+ /// <remarks>
+ /// Converts the token into its <see cref="System.Globalization.SortKey"/>, and
+ /// then encodes the <see cref="System.Globalization.SortKey"/> either directly or with
+ /// <see cref="IndexableBinaryStringTools"/> (see <a href="#version">below</a>), to allow it to
+ /// be stored as an index term.
+ /// <para/>
+ /// <strong>WARNING:</strong> Make sure you use exactly the same <see cref="Collator"/> at
+ /// index and query time -- CollationKeys are only comparable when produced by
+ /// the same <see cref="Collator"/>. <see cref="RuleBasedCollator"/>s are
+ /// independently versioned, so it is safe to search against stored
+ /// <see cref="System.Globalization.SortKey"/>s if the following are exactly the same (best practice is
+ /// to store this information with the index and check that they remain the
+ /// same at query time):
+ /// <list type="number">
+ /// <item><description>Collator version - see <see cref="Collator"/> Version</description></item>
+ /// <item><description>The collation strength used - see <see cref="Collator.Strength"/></description></item>
+ /// </list>
+ /// <para/>
+ /// <see cref="System.Globalization.SortKey"/>s generated by ICU Collators are not compatible with those
+ /// generated by java.text.Collators. Specifically, if you use
+ /// <see cref="ICUCollationKeyAnalyzer"/> to generate index terms, do not use
+ /// CollationKeyAnalyzer on the query side, or vice versa.
+ /// <para/>
+ /// ICUCollationKeyAnalyzer is significantly faster and generates significantly
+ /// shorter keys than CollationKeyAnalyzer. See
+ /// <a href="http://site.icu-project.org/charts/collation-icu4j-sun"
+ /// >http://site.icu-project.org/charts/collation-icu4j-sun</a> for key
+ /// generation timing and key length comparisons between ICU4J and
+ /// java.text.Collator over several languages.
+ /// <para/>
+ /// <a name="version"/>
+ /// You must specify the required <see cref="LuceneVersion"/>
+ /// compatibility when creating <see cref="ICUCollationKeyAnalyzer"/>:
+ /// <list type="bullet">
+ /// <item><description>As of 4.0, <see cref="System.Globalization.SortKey"/>s are directly encoded as bytes. Previous
+ /// versions will encode the bytes with <see cref="IndexableBinaryStringTools"/>.</description></item>
+ /// </list>
+ /// </remarks>
+ [ExceptionToClassNameConvention]
+ public sealed class ICUCollationKeyAnalyzer : Analyzer
+ {
+ private readonly Collator collator;
+ private readonly ICUCollationAttributeFactory factory;
+ private readonly LuceneVersion matchVersion;
+
+ /// <summary>
+ /// Initializes a new <see cref="ICUCollationKeyAnalyzer"/> that generates keys with <paramref name="collator"/>.
+ /// </summary>
+ /// <param name="matchVersion">Compatibility version; see <see cref="ICUCollationKeyAnalyzer"/>.</param>
+ /// <param name="collator"><see cref="System.Globalization.SortKey"/> generator.</param>
+ public ICUCollationKeyAnalyzer(LuceneVersion matchVersion, Collator collator)
+ {
+ this.collator = collator;
+ this.factory = new ICUCollationAttributeFactory(collator);
+ this.matchVersion = matchVersion;
+ }
+
+ [Obsolete("Use ICUCollationKeyAnalyzer.ICUCollationKeyAnalyzer(LuceneVersion, Collator) and specify a version instead. This ctor will be removed in Lucene 5.0")]
+ public ICUCollationKeyAnalyzer(Collator collator)
+ : this(LuceneVersion.LUCENE_31, collator)
+ {
+ }
+
+ /// <summary>
+ /// Builds the <see cref="KeywordTokenizer"/> chain; the way keys are encoded depends on the match version.
+ /// </summary>
+ protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
+ {
+#pragma warning disable 612, 618
+ bool encodeDirectly = matchVersion.OnOrAfter(LuceneVersion.LUCENE_40);
+ if (!encodeDirectly)
+ {
+ // Before 4.0 the obsolete filter encodes each key with IndexableBinaryStringTools.
+ KeywordTokenizer legacySource = new KeywordTokenizer(reader);
+ return new TokenStreamComponents(legacySource, new ICUCollationKeyFilter(legacySource, collator));
+ }
+#pragma warning restore 612, 618
+
+ KeywordTokenizer source = new KeywordTokenizer(factory, reader, KeywordTokenizer.DEFAULT_BUFFER_SIZE);
+ return new TokenStreamComponents(source, source);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/a4989ea1/src/Lucene.Net.Analysis.ICU/Collation/ICUCollationKeyFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.ICU/Collation/ICUCollationKeyFilter.cs b/src/Lucene.Net.Analysis.ICU/Collation/ICUCollationKeyFilter.cs
new file mode 100644
index 0000000..e6c595a
--- /dev/null
+++ b/src/Lucene.Net.Analysis.ICU/Collation/ICUCollationKeyFilter.cs
@@ -0,0 +1,86 @@
+using Icu.Collation;
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.TokenAttributes;
+using Lucene.Net.Support;
+using Lucene.Net.Util;
+using System;
+using System.Globalization;
+
+namespace Lucene.Net.Collation
+{
+ /// <summary>
+ /// Converts each token into its <see cref="SortKey"/>, and
+ /// then encodes the <see cref="SortKey"/> with <see cref="IndexableBinaryStringTools"/>, to
+ /// allow it to be stored as an index term.
+ /// </summary>
+ /// <remarks>
+ /// <strong>WARNING:</strong> Make sure you use exactly the same <see cref="Collator"/> at
+ /// index and query time -- CollationKeys are only comparable when produced by
+ /// the same <see cref="Collator"/>. <see cref="RuleBasedCollator"/>s are
+ /// independently versioned, so it is safe to search against stored
+ /// <see cref="System.Globalization.SortKey"/>s if the following are exactly the same (best practice is
+ /// to store this information with the index and check that they remain the
+ /// same at query time):
+ /// <list type="number">
+ /// <item><description>Collator version - see <see cref="Collator"/> Version</description></item>
+ /// <item><description>The collation strength used - see <see cref="Collator.Strength"/></description></item>
+ /// </list>
+ /// <para/>
+ /// <see cref="System.Globalization.SortKey"/>s generated by ICU Collators are not compatible with those
+ /// generated by java.text.Collators. Specifically, if you use
+ /// <see cref="ICUCollationKeyAnalyzer"/> to generate index terms, do not use
+ /// CollationKeyAnalyzer on the query side, or vice versa.
+ /// <para/>
+ /// ICUCollationKeyAnalyzer is significantly faster and generates significantly
+ /// shorter keys than CollationKeyAnalyzer. See
+ /// <a href="http://site.icu-project.org/charts/collation-icu4j-sun"
+ /// >http://site.icu-project.org/charts/collation-icu4j-sun</a> for key
+ /// generation timing and key length comparisons between ICU4J and
+ /// java.text.Collator over several languages.
+ /// </remarks>
+ [Obsolete("Use ICUCollationAttributeFactory instead, which encodes terms directly as bytes. This filter will be removed in Lucene 5.0")]
+ [ExceptionToClassNameConvention]
+ public sealed class ICUCollationKeyFilter : TokenFilter
+ {
+ private Collator collator = null;
+ private SortKey reusableKey;
+ private readonly ICharTermAttribute termAtt;
+
+ /// <summary>
+ /// Initializes a new <see cref="ICUCollationKeyFilter"/> over <paramref name="input"/>.
+ /// </summary>
+ /// <param name="input">Source token stream.</param>
+ /// <param name="collator"><see cref="SortKey"/> generator; the filter works on a clone of it.</param>
+ public ICUCollationKeyFilter(TokenStream input, Collator collator)
+ : base(input)
+ {
+ // clone the collator: see http://userguide.icu-project.org/collation/architecture
+ this.collator = (Collator)collator.Clone();
+ this.termAtt = AddAttribute<ICharTermAttribute>();
+ }
+
+ /// <summary>
+ /// Advances the input and overwrites the term text with its encoded sort key.
+ /// </summary>
+ /// <returns><c>false</c> when the input stream reports no further token; otherwise <c>true</c>.</returns>
+ public override bool IncrementToken()
+ {
+ if (!m_input.IncrementToken())
+ {
+ return false;
+ }
+ char[] originalBuffer = termAtt.Buffer;
+ reusableKey = collator.GetSortKey(new string(originalBuffer, 0, termAtt.Length));
+ byte[] keyData = reusableKey.KeyData;
+ int encodedLength = IndexableBinaryStringTools.GetEncodedLength(keyData, 0, keyData.Length);
+ if (encodedLength > originalBuffer.Length)
+ {
+ termAtt.ResizeBuffer(encodedLength);
+ }
+ termAtt.SetLength(encodedLength);
+ // Read Buffer again after the possible resize instead of reusing the earlier reference.
+ IndexableBinaryStringTools.Encode(keyData, 0, keyData.Length, termAtt.Buffer, 0, encodedLength);
+ return true;
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/a4989ea1/src/Lucene.Net.Analysis.ICU/Collation/ICUCollationKeyFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.ICU/Collation/ICUCollationKeyFilterFactory.cs b/src/Lucene.Net.Analysis.ICU/Collation/ICUCollationKeyFilterFactory.cs
new file mode 100644
index 0000000..7ecf357
--- /dev/null
+++ b/src/Lucene.Net.Analysis.ICU/Collation/ICUCollationKeyFilterFactory.cs
@@ -0,0 +1,245 @@
+using Icu.Collation;
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Util;
+using Lucene.Net.Support;
+using Lucene.Net.Util;
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Text;
+
+namespace Lucene.Net.Collation
+{
+ /// <summary>
+ /// Factory for <see cref="ICUCollationKeyFilter"/>.
+ /// </summary>
+ /// <remarks>
+ /// This factory can be created in two ways:
+ /// <list type="bullet">
+ /// <item><description>Based upon a system collator associated with a Locale.</description></item>
+ /// <item><description>Based upon a tailored ruleset.</description></item>
+ /// </list>
+ /// <para/>
+ /// Using a System collator:
+ /// <list type="bullet">
+ /// <item><description>locale: RFC 3066 locale ID (mandatory)</description></item>
+ /// <item><description>strength: 'primary','secondary','tertiary', 'quaternary', or 'identical' (optional)</description></item>
+ /// <item><description>decomposition: 'no', or 'canonical' (optional)</description></item>
+ /// </list>
+ /// <para/>
+ /// Using a Tailored ruleset:
+ /// <list type="bullet">
+ /// <item><description>custom: UTF-8 text file containing rules supported by RuleBasedCollator (mandatory)</description></item>
+ /// <item><description>strength: 'primary','secondary','tertiary', 'quaternary', or 'identical' (optional)</description></item>
+ /// <item><description>decomposition: 'no' or 'canonical' (optional)</description></item>
+ /// </list>
+ /// <para/>
+ /// Expert options:
+ /// <list type="bullet">
+ /// <item><description>alternate: 'shifted' or 'non-ignorable'. Can be used to ignore punctuation/whitespace.</description></item>
+ /// <item><description>caseLevel: 'true' or 'false'. Useful with strength=primary to ignore accents but not case.</description></item>
+ /// <item><description>caseFirst: 'lower' or 'upper'. Useful to control which is sorted first when case is not ignored.</description></item>
+ /// <item><description>numeric: 'true' or 'false'. Digits are sorted according to numeric value, e.g. foobar-9 sorts before foobar-10</description></item>
+ /// </list>
+ /// </remarks>
+ /// <seealso cref="Collator"/>
+ /// <seealso cref="RuleBasedCollator"/>
+ /// LUCENENET NOTE: variableTop is not supported by icu.net
+ [Obsolete("Use ICUCollationKeyAnalyzer instead.")]
+ [ExceptionToClassNameConvention]
+ public class ICUCollationKeyFilterFactory : TokenFilterFactory, IMultiTermAwareComponent, IResourceLoaderAware
+ {
+ private Collator collator;
+ private readonly string custom;
+ private readonly string localeID;
+ private readonly string strength;
+ private readonly string decomposition;
+
+ private readonly string alternate;
+ private readonly string caseLevel;
+ private readonly string caseFirst;
+ private readonly string numeric;
+ //private readonly string variableTop;
+
+ /// <summary>Creates the factory from its configuration <paramref name="args"/>.</summary>
+ /// <exception cref="ArgumentException">Neither or both of 'custom' and 'locale' are given, or unknown parameters remain.</exception>
+ public ICUCollationKeyFilterFactory(IDictionary<string, string> args)
+ : base(args)
+ {
+ custom = Get(args, "custom");
+ localeID = Get(args, "locale");
+ strength = Get(args, "strength");
+ decomposition = Get(args, "decomposition");
+
+ alternate = Get(args, "alternate");
+ caseLevel = Get(args, "caseLevel");
+ caseFirst = Get(args, "caseFirst");
+ numeric = Get(args, "numeric");
+
+ // LUCENENET TODO: variableTop is not supported by icu.net. Besides this,
+ // it is deprecated as of ICU 53 and has been superseded by maxVariable,
+ // but that feature is also not supported by icu.net at the time of this writing.
+ //variableTop = Get(args, "variableTop");
+
+ if (custom == null && localeID == null)
+ throw new ArgumentException("Either custom or locale is required.");
+ if (custom != null && localeID != null)
+ throw new ArgumentException("Cannot specify both locale and custom. "
+ + "To tailor rules for a built-in language, see the javadocs for RuleBasedCollator. "
+ + "Then save the entire customized ruleset to a file, and use with the custom parameter");
+
+ // List the leftover keys: concatenating a plain Dictionary only prints its type name.
+ if (args.Count != 0)
+ throw new ArgumentException("Unknown parameters: " + string.Join(", ", args.Keys));
+ }
+
+ /// <summary>
+ /// Creates the collator from the configured locale or custom rules file, then applies the optional settings.
+ /// </summary>
+ /// <param name="loader">Resource loader used to open the custom rules file when no locale is configured.</param>
+ public virtual void Inform(IResourceLoader loader)
+ {
+ // A configured locale selects a system collator; otherwise the custom ruleset is loaded.
+ collator = localeID != null
+ ? CreateFromLocale(localeID)
+ : CreateFromRules(custom, loader);
+
+ // Options that were not supplied keep the collator's defaults.
+ if (strength != null)
+ collator.Strength = ParseStrength(strength);
+ if (decomposition != null)
+ collator.NormalizationMode = ParseDecomposition(decomposition);
+
+ // expert options: concrete subclasses are always a RuleBasedCollator
+ RuleBasedCollator rbc = (RuleBasedCollator)collator;
+ if (alternate != null)
+ rbc.AlternateHandling = ParseAlternate(alternate);
+ if (caseLevel != null)
+ rbc.CaseLevel = bool.Parse(caseLevel) ? CaseLevel.On : CaseLevel.Off; // setCaseLevel(Boolean.parseBoolean(caseLevel));
+ if (caseFirst != null)
+ rbc.CaseFirst = ParseCaseFirst(caseFirst);
+ if (numeric != null)
+ rbc.NumericCollation = bool.Parse(numeric) ? NumericCollation.On : NumericCollation.Off; //.setNumericCollation(Boolean.parseBoolean(numeric));
+
+ // LUCENENET TODO: variableTop is not supported by icu.net. Besides this,
+ // it is deprecated as of ICU 53 and has been superseded by maxVariable,
+ // but that feature is also not supported by icu.net at the time of this writing.
+ //if (variableTop != null)
+ //{
+ // rbc.setVariableTop(variableTop);
+ //}
+ }
+
+ /// <summary>
+ /// Maps a 'strength' option value (compared ignoring case) to its <see cref="CollationStrength"/>.
+ /// </summary>
+ private static CollationStrength ParseStrength(string value)
+ {
+ if (value.Equals("primary", StringComparison.OrdinalIgnoreCase))
+ return CollationStrength.Primary;
+ if (value.Equals("secondary", StringComparison.OrdinalIgnoreCase))
+ return CollationStrength.Secondary;
+ if (value.Equals("tertiary", StringComparison.OrdinalIgnoreCase))
+ return CollationStrength.Tertiary;
+ if (value.Equals("quaternary", StringComparison.OrdinalIgnoreCase))
+ return CollationStrength.Quaternary;
+ if (value.Equals("identical", StringComparison.OrdinalIgnoreCase))
+ return CollationStrength.Identical;
+ throw new ArgumentException("Invalid strength: " + value);
+ }
+
+ /// <summary>
+ /// Maps a 'decomposition' option value (compared ignoring case) to its <see cref="NormalizationMode"/>.
+ /// </summary>
+ private static NormalizationMode ParseDecomposition(string value)
+ {
+ if (value.Equals("no", StringComparison.OrdinalIgnoreCase))
+ return NormalizationMode.Off; // Java: Collator.NO_DECOMPOSITION
+ if (value.Equals("canonical", StringComparison.OrdinalIgnoreCase))
+ return NormalizationMode.On; // Java: Collator.CANONICAL_DECOMPOSITION
+ throw new ArgumentException("Invalid decomposition: " + value);
+ }
+
+ /// <summary>
+ /// Maps an 'alternate' option value (compared ignoring case) to its <see cref="AlternateHandling"/>.
+ /// </summary>
+ private static AlternateHandling ParseAlternate(string value)
+ {
+ if (value.Equals("shifted", StringComparison.OrdinalIgnoreCase))
+ return AlternateHandling.Shifted; // Java: setAlternateHandlingShifted(true)
+ if (value.Equals("non-ignorable", StringComparison.OrdinalIgnoreCase))
+ return AlternateHandling.NonIgnorable; // Java: setAlternateHandlingShifted(false)
+ throw new ArgumentException("Invalid alternate: " + value);
+ }
+
+ /// <summary>
+ /// Maps a 'caseFirst' option value (compared ignoring case) to its <see cref="CaseFirst"/>.
+ /// </summary>
+ private static CaseFirst ParseCaseFirst(string value)
+ {
+ if (value.Equals("lower", StringComparison.OrdinalIgnoreCase))
+ return CaseFirst.LowerFirst; // Java: setLowerCaseFirst(true)
+ if (value.Equals("upper", StringComparison.OrdinalIgnoreCase))
+ return CaseFirst.UpperFirst; // Java: setUpperCaseFirst(true)
+ throw new ArgumentException("Invalid caseFirst: " + value);
+ }
+
+ /// <summary>Wraps <paramref name="input"/> in an <see cref="ICUCollationKeyFilter"/> that uses this factory's collator.</summary>
+ public override TokenStream Create(TokenStream input)
+ {
+ TokenStream collated = new ICUCollationKeyFilter(input, collator);
+ return collated;
+ }
+
+ /// <summary>
+ /// Asks <see cref="Collator"/> to create the collator for the locale named by <paramref name="localeID"/>.
+ /// </summary>
+ /// <returns>The collator created for that locale.</returns>
+ private Collator CreateFromLocale(string localeID)
+ {
+ return Collator.Create(localeID);
+ }
+
+ /// <summary>
+ /// Read custom rules from a file, and create a <see cref="RuleBasedCollator"/>.
+ /// The file cannot support comments, as # might be in the rules!
+ /// </summary>
+ private Collator CreateFromRules(string fileName, IResourceLoader loader)
+ {
+ Stream rulesStream = null;
+ try
+ {
+ rulesStream = loader.OpenResource(fileName);
+ return new RuleBasedCollator(ToUTF8String(rulesStream));
+ }
+ catch (Exception e)
+ {
+ // io error or invalid rules
+ throw new Exception(e.ToString(), e);
+ }
+ finally
+ {
+ // Runs on success and on failure; the stream is still null if OpenResource itself threw.
+ IOUtils.DisposeWhileHandlingException(rulesStream);
+ }
+ }
+
+ /// <summary>Returns this factory itself as the multi-term component.</summary>
+ public virtual AbstractAnalysisFactory GetMultiTermComponent()
+ {
+ return this;
+ }
+
+ /// <summary>
+ /// Decodes the whole of <paramref name="input"/> as UTF-8 text. The stream is not disposed here.
+ /// </summary>
+ /// <param name="input">Stream to read to its end.</param>
+ /// <returns>Every character decoded from the stream.</returns>
+ private string ToUTF8String(Stream input)
+ {
+ // ReadToEnd drains the reader just like a manual read-and-append loop would.
+ TextReader utf8Reader = IOUtils.GetDecodingReader(input, Encoding.UTF8);
+ return utf8Reader.ReadToEnd();
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/a4989ea1/src/Lucene.Net.Analysis.ICU/Collation/TokenAttributes/ICUCollatedTermAttributeImpl.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.ICU/Collation/TokenAttributes/ICUCollatedTermAttributeImpl.cs b/src/Lucene.Net.Analysis.ICU/Collation/TokenAttributes/ICUCollatedTermAttributeImpl.cs
new file mode 100644
index 0000000..ac1187e
--- /dev/null
+++ b/src/Lucene.Net.Analysis.ICU/Collation/TokenAttributes/ICUCollatedTermAttributeImpl.cs
@@ -0,0 +1,39 @@
+using Icu.Collation;
+using Lucene.Net.Analysis.TokenAttributes;
+using Lucene.Net.Support;
+using Lucene.Net.Util;
+using System.Globalization;
+
+namespace Lucene.Net.Collation.TokenAttributes
+{
+ /// <summary>
+ /// Extension of <see cref="CharTermAttribute"/> that encodes the term
+ /// text as a binary Unicode collation key instead of as UTF-8 bytes.
+ /// </summary>
+ [ExceptionToClassNameConvention]
+ public class ICUCollatedTermAttribute : CharTermAttribute
+ {
+ private readonly Collator collator;
+ //private readonly RawCollationKey key = new RawCollationKey();
+ private SortKey key;
+
+ /// <summary>Initializes a new ICUCollatedTermAttribute that works on its own clone of <paramref name="collator"/>.</summary>
+ /// <param name="collator"><see cref="SortKey"/> generator.</param>
+ public ICUCollatedTermAttribute(Collator collator)
+ {
+ // clone the collator: see http://userguide.icu-project.org/collation/architecture
+ this.collator = (Collator)collator.Clone();
+ }
+
+ // Points this attribute's BytesRef at the sort key generated from the current term text.
+ public override void FillBytesRef()
+ {
+ BytesRef target = this.BytesRef;
+ key = collator.GetSortKey(ToString());
+ byte[] keyData = key.KeyData;
+ target.Bytes = keyData;
+ target.Offset = 0;
+ target.Length = keyData.Length;
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/a4989ea1/src/Lucene.Net.ICU/Lucene.Net.ICU.csproj
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.ICU/Lucene.Net.ICU.csproj b/src/Lucene.Net.ICU/Lucene.Net.ICU.csproj
index b1510b9..079f5c1 100644
--- a/src/Lucene.Net.ICU/Lucene.Net.ICU.csproj
+++ b/src/Lucene.Net.ICU/Lucene.Net.ICU.csproj
@@ -80,6 +80,24 @@
<Compile Include="..\Lucene.Net.Analysis.Common\Analysis\Util\SegmentingTokenizerBase.cs">
<Link>Analysis\Util\SegmentingTokenizerBase.cs</Link>
</Compile>
+ <Compile Include="..\Lucene.Net.Analysis.ICU\Collation\ICUCollationAttributeFactory.cs">
+ <Link>Collation\ICUCollationAttributeFactory.cs</Link>
+ </Compile>
+ <Compile Include="..\Lucene.Net.Analysis.ICU\Collation\ICUCollationDocValuesField.cs">
+ <Link>Collation\ICUCollationDocValuesField.cs</Link>
+ </Compile>
+ <Compile Include="..\Lucene.Net.Analysis.ICU\Collation\ICUCollationKeyAnalyzer.cs">
+ <Link>Collation\ICUCollationKeyAnalyzer.cs</Link>
+ </Compile>
+ <Compile Include="..\Lucene.Net.Analysis.ICU\Collation\ICUCollationKeyFilter.cs">
+ <Link>Collation\ICUCollationKeyFilter.cs</Link>
+ </Compile>
+ <Compile Include="..\Lucene.Net.Analysis.ICU\Collation\ICUCollationKeyFilterFactory.cs">
+ <Link>Collation\ICUCollationKeyFilterFactory.cs</Link>
+ </Compile>
+ <Compile Include="..\Lucene.Net.Analysis.ICU\Collation\TokenAttributes\ICUCollatedTermAttributeImpl.cs">
+ <Link>Collation\TokenAttributes\ICUCollatedTermAttributeImpl.cs</Link>
+ </Compile>
<Compile Include="..\Lucene.Net.Highlighter\PostingsHighlight\DefaultPassageFormatter.cs">
<Link>Search\PostingsHighlight\DefaultPassageFormatter.cs</Link>
</Compile>
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/a4989ea1/src/Lucene.Net.TestFramework/Analysis/CollationTestBase.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.TestFramework/Analysis/CollationTestBase.cs b/src/Lucene.Net.TestFramework/Analysis/CollationTestBase.cs
index c079a6f..e8c6cdf 100644
--- a/src/Lucene.Net.TestFramework/Analysis/CollationTestBase.cs
+++ b/src/Lucene.Net.TestFramework/Analysis/CollationTestBase.cs
@@ -1,10 +1,9 @@
-#if FEATURE_COLLATION
using Icu.Collation;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.Search;
-using Lucene.Net.Support;
+using Lucene.Net.Support.Threading;
using Lucene.Net.Util;
using NUnit.Framework;
using System;
@@ -55,7 +54,7 @@ namespace Lucene.Net.Analysis
/// <returns> The encoded collation key for the original String </returns>
/// @deprecated only for testing deprecated filters
[Obsolete("only for testing deprecated filters")]
- protected internal virtual string EncodeCollationKey(sbyte[] keyBits)
+ protected internal virtual string EncodeCollationKey(byte[] keyBits)
{
// Ensure that the backing char[] array is large enough to hold the encoded
// Binary String
@@ -268,7 +267,7 @@ namespace Lucene.Net.Analysis
}
finally
{
- IOUtils.CloseWhileHandlingException(priorException, ts);
+ IOUtils.DisposeWhileHandlingException(priorException, ts);
}
}
@@ -328,7 +327,7 @@ namespace Lucene.Net.Analysis
}
finally
{
- IOUtils.CloseWhileHandlingException(priorException, ts);
+ IOUtils.DisposeWhileHandlingException(priorException, ts);
}
}
}
@@ -339,5 +338,4 @@ namespace Lucene.Net.Analysis
}
}
}
-}
-#endif
\ No newline at end of file
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/a4989ea1/src/Lucene.Net.TestFramework/project.json
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.TestFramework/project.json b/src/Lucene.Net.TestFramework/project.json
index 45e8d8a..ef35176 100644
--- a/src/Lucene.Net.TestFramework/project.json
+++ b/src/Lucene.Net.TestFramework/project.json
@@ -27,6 +27,7 @@
}
},
"dependencies": {
+ "icu.net": "54.1.1-alpha",
"Lucene.Net.Analysis.Common": "4.8.0",
"Lucene.Net.Codecs": "4.8.0",
"NUnit": "3.5.0"
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/a4989ea1/src/Lucene.Net.Tests.Analysis.ICU/Analysis/Icu/Segmentation/TestCharArrayIterator.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.ICU/Analysis/Icu/Segmentation/TestCharArrayIterator.cs b/src/Lucene.Net.Tests.Analysis.ICU/Analysis/Icu/Segmentation/TestCharArrayIterator.cs
new file mode 100644
index 0000000..cccd20a
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.ICU/Analysis/Icu/Segmentation/TestCharArrayIterator.cs
@@ -0,0 +1,110 @@
+using Lucene.Net.Support;
+using Lucene.Net.Util;
+using NUnit.Framework;
+using System;
+
+namespace Lucene.Net.Analysis.Icu.Segmentation
+{
+ public class TestCharArrayIterator : LuceneTestCase
+ {
+ [Test]
+ public void TestBasicUsage()
+ {
+ CharArrayIterator ci = new CharArrayIterator();
+ ci.SetText("testing".toCharArray(), 0, "testing".Length);
+ assertEquals(0, ci.BeginIndex);
+ assertEquals(7, ci.EndIndex);
+ assertEquals(0, ci.Index);
+ assertEquals('t', ci.Current);
+ assertEquals('e', ci.Next());
+ assertEquals('g', ci.Last());
+ assertEquals('n', ci.Previous());
+ assertEquals('t', ci.First());
+ assertEquals(CharacterIterator.DONE, ci.Previous());
+ }
+
+ [Test]
+ public void TestFirst()
+ {
+ CharArrayIterator ci = new CharArrayIterator();
+ ci.SetText("testing".toCharArray(), 0, "testing".Length);
+ ci.Next();
+ // Sets the position to getBeginIndex() and returns the character at that position.
+ assertEquals('t', ci.First());
+ assertEquals(ci.BeginIndex, ci.Index);
+ // or DONE if the text is empty
+ ci.SetText(new char[] { }, 0, 0);
+ assertEquals(CharacterIterator.DONE, ci.First());
+ }
+
+ [Test]
+ public void TestLast()
+ {
+ CharArrayIterator ci = new CharArrayIterator();
+ ci.SetText("testing".toCharArray(), 0, "testing".Length);
+ // Sets the position to getEndIndex()-1 (getEndIndex() if the text is empty)
+ // and returns the character at that position.
+ assertEquals('g', ci.Last());
+ assertEquals(ci.Index, ci.EndIndex - 1);
+ // or DONE if the text is empty
+ ci.SetText(new char[] { }, 0, 0);
+ assertEquals(CharacterIterator.DONE, ci.Last());
+ assertEquals(ci.EndIndex, ci.Index);
+ }
+
+ [Test]
+ public void TestCurrent()
+ {
+ CharArrayIterator ci = new CharArrayIterator();
+ // Gets the character at the current position (as returned by getIndex()).
+ ci.SetText("testing".toCharArray(), 0, "testing".Length);
+ assertEquals('t', ci.Current);
+ ci.Last();
+ ci.Next();
+ // or DONE if the current position is off the end of the text.
+ assertEquals(CharacterIterator.DONE, ci.Current);
+ }
+
+ [Test]
+ public void TestNext()
+ {
+ CharArrayIterator ci = new CharArrayIterator();
+ ci.SetText("te".toCharArray(), 0, 2);
+ // Increments the iterator's index by one and returns the character at the new index.
+ assertEquals('e', ci.Next());
+ assertEquals(1, ci.Index);
+ // or DONE if the new position is off the end of the text range.
+ assertEquals(CharacterIterator.DONE, ci.Next());
+ assertEquals(ci.EndIndex, ci.Index);
+ }
+
+ [Test]
+ public void TestSetIndex()
+ {
+ CharArrayIterator ci = new CharArrayIterator();
+ ci.SetText("test".toCharArray(), 0, "test".Length);
+ try
+ {
+ ci.SetIndex(5);
+ fail();
+ }
+ catch (Exception e)
+ {
+ assertTrue(e is ArgumentException);
+ }
+ }
+
+ [Test]
+ public void TestClone()
+ {
+ char[] text = "testing".toCharArray();
+ CharArrayIterator ci = new CharArrayIterator();
+ ci.SetText(text, 0, text.Length);
+ ci.Next();
+ CharArrayIterator ci2 = (CharArrayIterator)ci.Clone();
+ assertEquals(ci.Index, ci2.Index);
+ assertEquals(ci.Next(), ci2.Next());
+ assertEquals(ci.Last(), ci2.Last());
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/a4989ea1/src/Lucene.Net.Tests.Analysis.ICU/Analysis/Icu/TestICUNormalizer2Filter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.ICU/Analysis/Icu/TestICUNormalizer2Filter.cs b/src/Lucene.Net.Tests.Analysis.ICU/Analysis/Icu/TestICUNormalizer2Filter.cs
new file mode 100644
index 0000000..da7cf0f
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.ICU/Analysis/Icu/TestICUNormalizer2Filter.cs
@@ -0,0 +1,92 @@
+// LUCENENET TODO: Port issues - missing Normalizer2 dependency from icu.net
+
+//using Lucene.Net.Analysis.Core;
+//using Lucene.Net.Support;
+//using NUnit.Framework;
+//using System;
+
+//namespace Lucene.Net.Analysis.ICU
+//{
+// /// <summary>
+// /// Tests the ICUNormalizer2Filter
+// /// </summary>
+// public class TestICUNormalizer2Filter : BaseTokenStreamTestCase
+// {
+// private readonly Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+// {
+// Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+// return new TokenStreamComponents(tokenizer, new ICUNormalizer2Filter(tokenizer));
+// });
+
+// [Test]
+// public void TestDefaults()
+// {
+// // case folding
+// AssertAnalyzesTo(a, "This is a test", new String[] { "this", "is", "a", "test" });
+
+// // case folding
+// AssertAnalyzesTo(a, "Ruß", new String[] { "russ" });
+
+// // case folding
+// AssertAnalyzesTo(a, "ΜΆΪΟΣ", new String[] { "μάϊοσ" });
+// AssertAnalyzesTo(a, "Μάϊος", new String[] { "μάϊοσ" });
+
+// // supplementary case folding
+// AssertAnalyzesTo(a, "𐐖", new String[] { "𐐾" });
+
+// // normalization
+// AssertAnalyzesTo(a, "ﴳﴺﰧ", new String[] { "طمطمطم" });
+
+// // removal of default ignorables
+// AssertAnalyzesTo(a, "क्ष", new String[] { "क्ष" });
+// }
+
+// [Test]
+// public void TestAlternate()
+// {
+// // Analyzer a = new Analyzer()
+// //{
+// // @Override
+// // public TokenStreamComponents createComponents(String fieldName, Reader reader)
+// //{
+// // Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+// // return new TokenStreamComponents(tokenizer, new ICUNormalizer2Filter(
+// // tokenizer,
+// // /* specify nfc with decompose to get nfd */
+// // Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.DECOMPOSE)));
+// //}
+// // };
+
+// Analyzer a = Analysis.Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+// {
+// Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+// return new TokenStreamComponents(tokenizer, new ICUNormalizer2Filter(
+// tokenizer,
+// /* specify nfc with decompose to get nfd */
+// //Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.DECOMPOSE)));
+// new Normalizer2(global::Icu.Normalizer.UNormalizationMode.UNORM_NFD))); // LUCENENET NOTE: "nfc" + "DECOMPOSE" = "UNORM_NFD"
+// });
+
+// // decompose EAcute into E + combining Acute
+// AssertAnalyzesTo(a, "\u00E9", new String[] { "\u0065\u0301" });
+// }
+
+// /** blast some random strings through the analyzer */
+// [Test]
+// public void TestRandomStrings()
+// {
+// CheckRandomData(Random(), a, 1000 * RANDOM_MULTIPLIER);
+// }
+
+// [Test]
+// public void TestEmptyTerm()
+// {
+// Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+// {
+// Tokenizer tokenizer = new KeywordTokenizer(reader);
+// return new TokenStreamComponents(tokenizer, new ICUNormalizer2Filter(tokenizer));
+// });
+// CheckOneTerm(a, "", "");
+// }
+// }
+//}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/a4989ea1/src/Lucene.Net.Tests.Analysis.ICU/Analysis/Icu/TestICUNormalizer2FilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.ICU/Analysis/Icu/TestICUNormalizer2FilterFactory.cs b/src/Lucene.Net.Tests.Analysis.ICU/Analysis/Icu/TestICUNormalizer2FilterFactory.cs
new file mode 100644
index 0000000..8ee65a1
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.ICU/Analysis/Icu/TestICUNormalizer2FilterFactory.cs
@@ -0,0 +1,45 @@
+// LUCENENET TODO: Port issues - missing Normalizer2 dependency from icu.net
+
+//using NUnit.Framework;
+//using System;
+//using System.Collections.Generic;
+//using System.IO;
+
+//namespace Lucene.Net.Analysis.ICU
+//{
+// /// <summary>
+// /// basic tests for <see cref="ICUNormalizer2FilterFactory"/>
+// /// </summary>
+// public class TestICUNormalizer2FilterFactory : BaseTokenStreamTestCase
+// {
+// /** Test nfkc_cf defaults */
+// [Test]
+// public void TestDefaults()
+// {
+// TextReader reader = new StringReader("This is a Test");
+// ICUNormalizer2FilterFactory factory = new ICUNormalizer2FilterFactory(new Dictionary<String, String>());
+// TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+// stream = factory.Create(stream);
+// AssertTokenStreamContents(stream, new String[] { "this", "is", "a", "test" });
+// }
+
+// /** Test that bogus arguments result in exception */
+// [Test]
+// public void TestBogusArguments()
+// {
+// try
+// {
+// new ICUNormalizer2FilterFactory(new Dictionary<String, String>() {
+// { "bogusArg", "bogusValue" }
+// });
+// fail();
+// }
+// catch (ArgumentException expected)
+// {
+// assertTrue(expected.Message.Contains("Unknown parameters"));
+// }
+// }
+
+// // TODO: add tests for different forms
+// }
+//}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/a4989ea1/src/Lucene.Net.Tests.Analysis.ICU/Collation/TestICUCollationDocValuesField.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.ICU/Collation/TestICUCollationDocValuesField.cs b/src/Lucene.Net.Tests.Analysis.ICU/Collation/TestICUCollationDocValuesField.cs
new file mode 100644
index 0000000..ecfbdf6
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.ICU/Collation/TestICUCollationDocValuesField.cs
@@ -0,0 +1,121 @@
+using Icu.Collation;
+using Lucene.Net.Documents;
+using Lucene.Net.Index;
+using Lucene.Net.Search;
+using Lucene.Net.Store;
+using Lucene.Net.Util;
+using NUnit.Framework;
+using System;
+using System.Globalization;
+
+namespace Lucene.Net.Collation
+{
+ /// <summary>
+ /// trivial test of ICUCollationDocValuesField
+ /// </summary>
+ [SuppressCodecs("Lucene3x")]
+ public class TestICUCollationDocValuesField : LuceneTestCase
+ {
+ [Test]
+ public void TestBasic()
+ {
+ Directory dir = NewDirectory();
+ RandomIndexWriter iw = new RandomIndexWriter(Random(), dir, Similarity, TimeZone);
+ Document doc = new Document();
+ Field field = NewField("field", "", StringField.TYPE_STORED);
+ ICUCollationDocValuesField collationField = new ICUCollationDocValuesField("collated", Collator.Create(new CultureInfo("en")));
+ doc.Add(field);
+ doc.Add(collationField);
+
+ field.SetStringValue("ABC");
+ collationField.SetStringValue("ABC");
+ iw.AddDocument(doc);
+
+ field.SetStringValue("abc");
+ collationField.SetStringValue("abc");
+ iw.AddDocument(doc);
+
+ IndexReader ir = iw.Reader;
+ iw.Dispose();
+
+ IndexSearcher @is = NewSearcher(ir);
+
+ SortField sortField = new SortField("collated", SortFieldType.STRING);
+
+ TopDocs td = @is.Search(new MatchAllDocsQuery(), 5, new Sort(sortField));
+ assertEquals("abc", ir.Document(td.ScoreDocs[0].Doc).Get("field"));
+ assertEquals("ABC", ir.Document(td.ScoreDocs[1].Doc).Get("field"));
+ ir.Dispose();
+ dir.Dispose();
+ }
+
+ [Test]
+ public void TestRanges()
+ {
+ Directory dir = NewDirectory();
+ RandomIndexWriter iw = new RandomIndexWriter(Random(), dir, Similarity, TimeZone);
+ Document doc = new Document();
+ Field field = NewField("field", "", StringField.TYPE_STORED);
+ Collator collator = Collator.Create(CultureInfo.CurrentCulture, Collator.Fallback.FallbackAllowed); // uses -Dtests.locale
+ if (Random().nextBoolean())
+ {
+ collator.Strength = CollationStrength.Primary;
+ }
+ ICUCollationDocValuesField collationField = new ICUCollationDocValuesField("collated", collator);
+ doc.Add(field);
+ doc.Add(collationField);
+
+ int numDocs = AtLeast(500);
+ for (int i = 0; i < numDocs; i++)
+ {
+ String value = TestUtil.RandomSimpleString(Random());
+ field.SetStringValue(value);
+ collationField.SetStringValue(value);
+ iw.AddDocument(doc);
+ }
+
+ IndexReader ir = iw.Reader;
+ iw.Dispose();
+ IndexSearcher @is = NewSearcher(ir);
+
+ int numChecks = AtLeast(100);
+ for (int i = 0; i < numChecks; i++)
+ {
+ String start = TestUtil.RandomSimpleString(Random());
+ String end = TestUtil.RandomSimpleString(Random());
+ BytesRef lowerVal = new BytesRef(collator.GetSortKey(start).KeyData);
+ BytesRef upperVal = new BytesRef(collator.GetSortKey(end).KeyData);
+ Query query = new ConstantScoreQuery(FieldCacheRangeFilter.NewBytesRefRange("collated", lowerVal, upperVal, true, true));
+ DoTestRanges(@is, start, end, query, collator);
+ }
+
+ ir.Dispose();
+ dir.Dispose();
+ }
+
+ private void DoTestRanges(IndexSearcher @is, String startPoint, String endPoint, Query query, Collator collator)
+ {
+ QueryUtils.Check(query);
+
+ // positive test
+ TopDocs docs = @is.Search(query, @is.IndexReader.MaxDoc);
+ foreach (ScoreDoc doc in docs.ScoreDocs)
+ {
+ String value = @is.Doc(doc.Doc).Get("field");
+ assertTrue(collator.Compare(value, startPoint) >= 0);
+ assertTrue(collator.Compare(value, endPoint) <= 0);
+ }
+
+ // negative test
+ BooleanQuery bq = new BooleanQuery();
+ bq.Add(new MatchAllDocsQuery(), Occur.SHOULD);
+ bq.Add(query, Occur.MUST_NOT);
+ docs = @is.Search(bq, @is.IndexReader.MaxDoc);
+ foreach (ScoreDoc doc in docs.ScoreDocs)
+ {
+ String value = @is.Doc(doc.Doc).Get("field");
+ assertTrue(collator.Compare(value, startPoint) < 0 || collator.Compare(value, endPoint) > 0);
+ }
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/a4989ea1/src/Lucene.Net.Tests.Analysis.ICU/Collation/TestICUCollationKeyAnalyzer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.ICU/Collation/TestICUCollationKeyAnalyzer.cs b/src/Lucene.Net.Tests.Analysis.ICU/Collation/TestICUCollationKeyAnalyzer.cs
new file mode 100644
index 0000000..55b0b3b
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.ICU/Collation/TestICUCollationKeyAnalyzer.cs
@@ -0,0 +1,98 @@
+using Icu.Collation;
+using Lucene.Net.Analysis;
+using Lucene.Net.Util;
+using NUnit.Framework;
+using System.Globalization;
+
+namespace Lucene.Net.Collation
+{
+ [SuppressCodecs("Lucene3x")]
+ public class TestICUCollationKeyAnalyzer : CollationTestBase
+ {
+ private Collator collator = Collator.Create(new CultureInfo("fa"));
+ private Analyzer analyzer;
+
+ private BytesRef firstRangeBeginning;
+ private BytesRef firstRangeEnd;
+ private BytesRef secondRangeBeginning;
+ private BytesRef secondRangeEnd;
+
+ public override void SetUp()
+ {
+ base.SetUp();
+
+ this.analyzer = new ICUCollationKeyAnalyzer(TEST_VERSION_CURRENT, collator);
+ this.firstRangeBeginning = new BytesRef
+ (collator.GetSortKey(FirstRangeBeginningOriginal).KeyData);
+ this.firstRangeEnd = new BytesRef
+ (collator.GetSortKey(FirstRangeEndOriginal).KeyData);
+ this.secondRangeBeginning = new BytesRef
+ (collator.GetSortKey(SecondRangeBeginningOriginal).KeyData);
+ this.secondRangeEnd = new BytesRef
+ (collator.GetSortKey(SecondRangeEndOriginal).KeyData);
+ }
+
+ [Test]
+ public void TestFarsiRangeFilterCollating()
+ {
+ TestFarsiRangeFilterCollating(analyzer, firstRangeBeginning, firstRangeEnd,
+ secondRangeBeginning, secondRangeEnd);
+ }
+
+ [Test]
+ public void TestFarsiRangeQueryCollating()
+ {
+ TestFarsiRangeQueryCollating(analyzer, firstRangeBeginning, firstRangeEnd,
+ secondRangeBeginning, secondRangeEnd);
+ }
+
+ [Test]
+ public void TestFarsiTermRangeQuery()
+ {
+ TestFarsiTermRangeQuery
+ (analyzer, firstRangeBeginning, firstRangeEnd,
+ secondRangeBeginning, secondRangeEnd);
+ }
+
+ // Test using various international locales with accented characters (which
+ // sort differently depending on locale)
+ //
+ // Copied (and slightly modified) from
+ // org.apache.lucene.search.TestSort.testInternationalSort()
+ //
+ [Test]
+ public void TestCollationKeySort()
+ {
+ Analyzer usAnalyzer = new ICUCollationKeyAnalyzer
+ (TEST_VERSION_CURRENT, Collator.Create(new CultureInfo("en-us"), Collator.Fallback.FallbackAllowed));
+
+ Analyzer franceAnalyzer = new ICUCollationKeyAnalyzer
+ (TEST_VERSION_CURRENT, Collator.Create(new CultureInfo("fr")));
+
+ Analyzer swedenAnalyzer = new ICUCollationKeyAnalyzer
+ (TEST_VERSION_CURRENT, Collator.Create(new CultureInfo("sv-se"), Collator.Fallback.FallbackAllowed));
+
+ Analyzer denmarkAnalyzer = new ICUCollationKeyAnalyzer
+ (TEST_VERSION_CURRENT, Collator.Create(new CultureInfo("da-dk"), Collator.Fallback.FallbackAllowed));
+
+ // The ICU Collator and java.text.Collator implementations differ in their
+ // orderings - "BFJHD" is the ordering for the ICU Collator created for "en-us" above.
+ TestCollationKeySort
+ (usAnalyzer, franceAnalyzer, swedenAnalyzer, denmarkAnalyzer,
+ "BFJHD", "ECAGI", "BJDFH", "BJDHF");
+ }
+
+ [Test]
+ public void TestThreadSafe()
+ {
+ int iters = 20 * RANDOM_MULTIPLIER;
+ for (int i = 0; i < iters; i++)
+ {
+ CultureInfo locale = new CultureInfo("de");
+ Collator collator = Collator.Create(locale);
+ collator.Strength = CollationStrength.Identical;
+ AssertThreadSafe(new ICUCollationKeyAnalyzer(TEST_VERSION_CURRENT, collator));
+ }
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/a4989ea1/src/Lucene.Net.Tests.Analysis.ICU/Collation/TestICUCollationKeyFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.ICU/Collation/TestICUCollationKeyFilter.cs b/src/Lucene.Net.Tests.Analysis.ICU/Collation/TestICUCollationKeyFilter.cs
new file mode 100644
index 0000000..a8a8cba
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.ICU/Collation/TestICUCollationKeyFilter.cs
@@ -0,0 +1,101 @@
+using Icu.Collation;
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Core;
+using Lucene.Net.Util;
+using NUnit.Framework;
+using System;
+using System.Globalization;
+using System.IO;
+
+namespace Lucene.Net.Collation
+{
+ [Obsolete("remove this when ICUCollationKeyFilter is removed")]
+ public class TestICUCollationKeyFilter : CollationTestBase
+ {
+ private Collator collator = Collator.Create(new CultureInfo("fa"));
+ private Analyzer analyzer;
+
+ private BytesRef firstRangeBeginning;
+ private BytesRef firstRangeEnd;
+ private BytesRef secondRangeBeginning;
+ private BytesRef secondRangeEnd;
+
+
+ public override void SetUp()
+ {
+ base.SetUp();
+
+ this.analyzer = new TestAnalyzer(collator);
+ this.firstRangeBeginning = new BytesRef(EncodeCollationKey
+ (collator.GetSortKey(FirstRangeBeginningOriginal).KeyData));
+ this.firstRangeEnd = new BytesRef(EncodeCollationKey
+ (collator.GetSortKey(FirstRangeEndOriginal).KeyData));
+ this.secondRangeBeginning = new BytesRef(EncodeCollationKey
+ (collator.GetSortKey(SecondRangeBeginningOriginal).KeyData));
+ this.secondRangeEnd = new BytesRef(EncodeCollationKey
+ (collator.GetSortKey(SecondRangeEndOriginal).KeyData));
+ }
+
+ public sealed class TestAnalyzer : Analyzer
+ {
+ private Collator _collator;
+
+ internal TestAnalyzer(Collator collator)
+ {
+ _collator = collator;
+ }
+
+ protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
+ {
+ Tokenizer result = new KeywordTokenizer(reader);
+ return new TokenStreamComponents(result, new ICUCollationKeyFilter(result, _collator));
+ }
+ }
+
+ [Test]
+ public void TestFarsiRangeFilterCollating()
+ {
+ TestFarsiRangeFilterCollating(analyzer, firstRangeBeginning, firstRangeEnd,
+ secondRangeBeginning, secondRangeEnd);
+ }
+
+ [Test]
+ public void TestFarsiRangeQueryCollating()
+ {
+ TestFarsiRangeQueryCollating(analyzer, firstRangeBeginning, firstRangeEnd,
+ secondRangeBeginning, secondRangeEnd);
+ }
+
+ [Test]
+ public void TestFarsiTermRangeQuery()
+ {
+ TestFarsiTermRangeQuery
+ (analyzer, firstRangeBeginning, firstRangeEnd,
+ secondRangeBeginning, secondRangeEnd);
+ }
+
+ // Test using various international locales with accented characters (which
+ // sort differently depending on locale)
+ //
+ // Copied (and slightly modified) from
+ // org.apache.lucene.search.TestSort.testInternationalSort()
+ //
+ [Test]
+ public void TestCollationKeySort()
+ {
+ Analyzer usAnalyzer = new TestAnalyzer(Collator.Create(new CultureInfo("en-us"), Collator.Fallback.FallbackAllowed));
+ Analyzer franceAnalyzer
+ = new TestAnalyzer(Collator.Create(new CultureInfo("fr")));
+ Analyzer swedenAnalyzer
+ = new TestAnalyzer(Collator.Create(new CultureInfo("sv-se"), Collator.Fallback.FallbackAllowed));
+ Analyzer denmarkAnalyzer
+ = new TestAnalyzer(Collator.Create(new CultureInfo("da-dk"), Collator.Fallback.FallbackAllowed));
+
+ // The ICU Collator and java.text.Collator implementations differ in their
+ // orderings - "BFJHD" is the ordering for the ICU Collator for Locale.US.
+ TestCollationKeySort
+ (usAnalyzer, franceAnalyzer, swedenAnalyzer, denmarkAnalyzer,
+ "BFJHD", "ECAGI", "BJDFH", "BJDHF");
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/a4989ea1/src/Lucene.Net.Tests.Analysis.ICU/Collation/TestICUCollationKeyFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.ICU/Collation/TestICUCollationKeyFilterFactory.cs b/src/Lucene.Net.Tests.Analysis.ICU/Collation/TestICUCollationKeyFilterFactory.cs
new file mode 100644
index 0000000..80aa910
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.ICU/Collation/TestICUCollationKeyFilterFactory.cs
@@ -0,0 +1,331 @@
+using Icu.Collation;
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Core;
+using Lucene.Net.Analysis.TokenAttributes;
+using Lucene.Net.Analysis.Util;
+using Lucene.Net.Support;
+using NUnit.Framework;
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Reflection;
+using System.Text;
+
+namespace Lucene.Net.Collation
+{
+ [Obsolete]
+ public class TestICUCollationKeyFilterFactory : BaseTokenStreamTestCase
+ {
+ /// <summary>
+ /// Turkish has some funny casing.
+ /// This test shows how you can solve this kind of thing easily with collation.
+ /// Instead of using LowerCaseFilter, use a turkish collator with primary strength.
+ /// Then things will sort and match correctly.
+ /// </summary>
+ [Test]
+ public void TestBasicUsage()
+ {
+ String turkishUpperCase = "I WİLL USE TURKİSH CASING";
+ String turkishLowerCase = "ı will use turkish casıng";
+ TokenFilterFactory factory = tokenFilterFactory("ICUCollationKey",
+ "locale", "tr",
+ "strength", "primary");
+ TokenStream tsUpper = factory.Create(
+ new KeywordTokenizer(new StringReader(turkishUpperCase)));
+ TokenStream tsLower = factory.Create(
+ new KeywordTokenizer(new StringReader(turkishLowerCase)));
+ assertCollatesToSame(tsUpper, tsLower);
+ }
+
+ /*
+ * Test usage of the decomposition option for unicode normalization.
+ */
+ [Test]
+ public void TestNormalization()
+ {
+ String turkishUpperCase = "I W\u0049\u0307LL USE TURKİSH CASING";
+ String turkishLowerCase = "ı will use turkish casıng";
+ TokenFilterFactory factory = tokenFilterFactory("ICUCollationKey",
+ "locale", "tr",
+ "strength", "primary",
+ "decomposition", "canonical");
+ TokenStream tsUpper = factory.Create(
+ new KeywordTokenizer(new StringReader(turkishUpperCase)));
+ TokenStream tsLower = factory.Create(
+ new KeywordTokenizer(new StringReader(turkishLowerCase)));
+ assertCollatesToSame(tsUpper, tsLower);
+ }
+
+ /*
+ * Test secondary strength, for english case is not significant.
+ */
+ [Test]
+ public void TestSecondaryStrength()
+ {
+ String upperCase = "TESTING";
+ String lowerCase = "testing";
+ TokenFilterFactory factory = tokenFilterFactory("ICUCollationKey",
+ "locale", "en",
+ "strength", "secondary",
+ "decomposition", "no");
+ TokenStream tsUpper = factory.Create(
+ new KeywordTokenizer(new StringReader(upperCase)));
+ TokenStream tsLower = factory.Create(
+ new KeywordTokenizer(new StringReader(lowerCase)));
+ assertCollatesToSame(tsUpper, tsLower);
+ }
+
+ /*
+ * Setting alternate=shifted to shift whitespace, punctuation and symbols
+ * to quaternary level
+ */
+ [Test]
+ public void TestIgnorePunctuation()
+ {
+ String withPunctuation = "foo-bar";
+ String withoutPunctuation = "foo bar";
+ TokenFilterFactory factory = tokenFilterFactory("ICUCollationKey",
+ "locale", "en",
+ "strength", "primary",
+ "alternate", "shifted");
+ TokenStream tsPunctuation = factory.Create(
+ new KeywordTokenizer(new StringReader(withPunctuation)));
+ TokenStream tsWithoutPunctuation = factory.Create(
+ new KeywordTokenizer(new StringReader(withoutPunctuation)));
+ assertCollatesToSame(tsPunctuation, tsWithoutPunctuation);
+ }
+
+ // LUCENENET TODO: variableTop is not supported by icu.net. Besides this,
+ // it is deprecated as of ICU 53 and has been superseded by maxVariable,
+ // but that feature is also not supported by icu.net at the time of this writing.
+
+ ///*
+ // * Setting alternate=shifted and variableTop to shift whitespace, but not
+ // * punctuation or symbols, to quaternary level
+ // */
+ //[Test]
+ //public void TestIgnoreWhitespace()
+ //{
+ // String withSpace = "foo bar";
+ // String withoutSpace = "foobar";
+ // String withPunctuation = "foo-bar";
+ // TokenFilterFactory factory = tokenFilterFactory("ICUCollationKey",
+ // "locale", "en",
+ // "strength", "primary",
+ // "alternate", "shifted",
+ // "variableTop", " ");
+ // TokenStream tsWithSpace = factory.Create(
+ // new KeywordTokenizer(new StringReader(withSpace)));
+ // TokenStream tsWithoutSpace = factory.Create(
+ // new KeywordTokenizer(new StringReader(withoutSpace)));
+ // assertCollatesToSame(tsWithSpace, tsWithoutSpace);
+ // // now assert that punctuation still matters: foo-bar < foo bar
+ // tsWithSpace = factory.Create(
+ // new KeywordTokenizer(new StringReader(withSpace)));
+ // TokenStream tsWithPunctuation = factory.Create(
+ // new KeywordTokenizer(new StringReader(withPunctuation)));
+ // assertCollation(tsWithPunctuation, tsWithSpace, -1);
+ //}
+
+ /*
+ * Setting numeric to encode digits with numeric value, so that
+ * foobar-9 sorts before foobar-10
+ */
+ [Test]
+ public void TestNumerics()
+ {
+ String nine = "foobar-9";
+ String ten = "foobar-10";
+ TokenFilterFactory factory = tokenFilterFactory("ICUCollationKey",
+ "locale", "en",
+ "numeric", "true");
+ TokenStream tsNine = factory.Create(
+ new KeywordTokenizer(new StringReader(nine)));
+ TokenStream tsTen = factory.Create(
+ new KeywordTokenizer(new StringReader(ten)));
+ assertCollation(tsNine, tsTen, -1);
+ }
+
+ /*
+ * Setting caseLevel=true to create an additional case level between
+ * secondary and tertiary
+ */
+ [Test]
+ public void TestIgnoreAccentsButNotCase()
+ {
+ String withAccents = "résumé";
+ String withoutAccents = "resume";
+ String withAccentsUpperCase = "Résumé";
+ String withoutAccentsUpperCase = "Resume";
+ TokenFilterFactory factory = tokenFilterFactory("ICUCollationKey",
+ "locale", "en",
+ "strength", "primary",
+ "caseLevel", "true");
+ TokenStream tsWithAccents = factory.Create(
+ new KeywordTokenizer(new StringReader(withAccents)));
+ TokenStream tsWithoutAccents = factory.Create(
+ new KeywordTokenizer(new StringReader(withoutAccents)));
+ assertCollatesToSame(tsWithAccents, tsWithoutAccents);
+
+ TokenStream tsWithAccentsUpperCase = factory.Create(
+ new KeywordTokenizer(new StringReader(withAccentsUpperCase)));
+ TokenStream tsWithoutAccentsUpperCase = factory.Create(
+ new KeywordTokenizer(new StringReader(withoutAccentsUpperCase)));
+ assertCollatesToSame(tsWithAccentsUpperCase, tsWithoutAccentsUpperCase);
+
+ // now assert that case still matters: resume < Resume
+ TokenStream tsLower = factory.Create(
+ new KeywordTokenizer(new StringReader(withoutAccents)));
+ TokenStream tsUpper = factory.Create(
+ new KeywordTokenizer(new StringReader(withoutAccentsUpperCase)));
+ assertCollation(tsLower, tsUpper, -1);
+ }
+
+ /*
+ * Setting caseFirst=upper to cause uppercase strings to sort
+ * before lowercase ones.
+ */
+ [Test]
+ public void TestUpperCaseFirst()
+ {
+ String lower = "resume";
+ String upper = "Resume";
+ TokenFilterFactory factory = tokenFilterFactory("ICUCollationKey",
+ "locale", "en",
+ "strength", "tertiary",
+ "caseFirst", "upper");
+ TokenStream tsLower = factory.Create(
+ new KeywordTokenizer(new StringReader(lower)));
+ TokenStream tsUpper = factory.Create(
+ new KeywordTokenizer(new StringReader(upper)));
+ assertCollation(tsUpper, tsLower, -1);
+ }
+
+ /*
+ * For German, you might want oe to sort and match with o umlaut (likewise ae/ue, in both cases).
+ * This is not the default, but you can make a customized ruleset to do this.
+ *
+ * The default is DIN 5007-1, this shows how to tailor a collator to get DIN 5007-2 behavior.
+ * http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4423383
+ */
+ [Test]
+ public void TestCustomRules()
+ {
+ String DIN5007_2_tailorings =
+ "& ae , a\u0308 & AE , A\u0308" +
+ "& oe , o\u0308 & OE , O\u0308" +
+ "& ue , u\u0308 & UE , U\u0308";
+
+ string baseRules = RuleBasedCollator.GetCollationRules(new Icu.Locale("de-DE"), UColRuleOption.UCOL_TAILORING_ONLY);
+ //RuleBasedCollator tailoredCollator = new RuleBasedCollator(baseRules + DIN5007_2_tailorings);
+
+ string tailoredRules = baseRules + DIN5007_2_tailorings;
+ //
+ // at this point, you would save these tailoredRules to a file,
+ // and use the custom parameter.
+ //
+ String germanUmlaut = "Töne";
+ String germanOE = "Toene";
+ IDictionary<String, String> args = new Dictionary<String, String>();
+ args.Put("custom", "rules.txt");
+ args.Put("strength", "primary");
+ ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory(args);
+ factory.Inform(new StringMockResourceLoader(tailoredRules));
+ TokenStream tsUmlaut = factory.Create(
+ new KeywordTokenizer(new StringReader(germanUmlaut)));
+ TokenStream tsOE = factory.Create(
+ new KeywordTokenizer(new StringReader(germanOE)));
+
+ assertCollatesToSame(tsUmlaut, tsOE);
+ }
+
+ private void assertCollatesToSame(TokenStream stream1, TokenStream stream2)
+ {
+ assertCollation(stream1, stream2, 0);
+ }
+
+ private void assertCollation(TokenStream stream1, TokenStream stream2, int comparison)
+ {
+ ICharTermAttribute term1 = stream1
+ .AddAttribute<ICharTermAttribute>();
+ ICharTermAttribute term2 = stream2
+ .AddAttribute<ICharTermAttribute>();
+ stream1.Reset();
+ stream2.Reset();
+ assertTrue(stream1.IncrementToken());
+ assertTrue(stream2.IncrementToken());
+ assertEquals(Number.Signum(comparison), Number.Signum(term1.toString().CompareToOrdinal(term2.toString())));
+ assertFalse(stream1.IncrementToken());
+ assertFalse(stream2.IncrementToken());
+ stream1.End();
+ stream2.End();
+ stream1.Dispose();
+ stream2.Dispose();
+ }
+
+ private class StringMockResourceLoader : IResourceLoader
+ {
+ String text;
+
+ internal StringMockResourceLoader(String text)
+ {
+ this.text = text;
+ }
+
+ public T NewInstance<T>(String cname)
+ {
+ return default(T);
+ }
+
+ public Type FindType(String cname)
+ {
+ return null;
+ }
+
+ public Stream OpenResource(String resource)
+ {
+ return new MemoryStream(Encoding.UTF8.GetBytes(text));
+ }
+ }
+
+ // Reflectively creates the named TokenFilterFactory from alternating key/value arguments
+ // (plus luceneMatchVersion) and informs it with a ClasspathResourceLoader when applicable.
+ private TokenFilterFactory tokenFilterFactory(String name, params String[] keysAndValues)
+ {
+ Type clazz = TokenFilterFactory.LookupClass(name);
+ if (keysAndValues.Length % 2 == 1)
+ {
+ throw new ArgumentException("invalid keysAndValues map");
+ }
+ IDictionary<String, String> args = new Dictionary<String, String>();
+ for (int i = 0; i < keysAndValues.Length; i += 2)
+ {
+ String prev = args.Put(keysAndValues[i], keysAndValues[i + 1]);
+ assertNull("duplicate values for key: " + keysAndValues[i], prev);
+ }
+ String previous = args.Put("luceneMatchVersion", TEST_VERSION_CURRENT.toString());
+ assertNull("duplicate values for key: luceneMatchVersion", previous);
+ TokenFilterFactory factory = null;
+ try
+ {
+ //factory = clazz.getConstructor(Map.class).newInstance(args);
+ factory = (TokenFilterFactory)Activator.CreateInstance(clazz, args);
+ }
+ catch (TargetInvocationException e)
+ {
+ // to simplify tests that check for illegal parameters
+ if (e.InnerException is ArgumentException)
+ {
+ throw (ArgumentException)e.InnerException;
+ }
+ // otherwise rethrow as-is; "throw;" (unlike "throw e;") keeps the original stack trace
+ throw;
+ }
+ if (factory is IResourceLoaderAware)
+ {
+ ((IResourceLoaderAware)factory).Inform(new ClasspathResourceLoader(GetType()));
+ }
+ return factory;
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/a4989ea1/src/Lucene.Net.Tests.ICU/Lucene.Net.Tests.ICU.csproj
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.ICU/Lucene.Net.Tests.ICU.csproj b/src/Lucene.Net.Tests.ICU/Lucene.Net.Tests.ICU.csproj
index 84d660a..8f5e312 100644
--- a/src/Lucene.Net.Tests.ICU/Lucene.Net.Tests.ICU.csproj
+++ b/src/Lucene.Net.Tests.ICU/Lucene.Net.Tests.ICU.csproj
@@ -74,6 +74,18 @@
<Compile Include="..\Lucene.Net.Tests.Analysis.Common\Analysis\Util\TestSegmentingTokenizerBase.cs">
<Link>Analysis\Util\TestSegmentingTokenizerBase.cs</Link>
</Compile>
+ <Compile Include="..\Lucene.Net.Tests.Analysis.ICU\Collation\TestICUCollationDocValuesField.cs">
+ <Link>Collation\TestICUCollationDocValuesField.cs</Link>
+ </Compile>
+ <Compile Include="..\Lucene.Net.Tests.Analysis.ICU\Collation\TestICUCollationKeyAnalyzer.cs">
+ <Link>Collation\TestICUCollationKeyAnalyzer.cs</Link>
+ </Compile>
+ <Compile Include="..\Lucene.Net.Tests.Analysis.ICU\Collation\TestICUCollationKeyFilter.cs">
+ <Link>Collation\TestICUCollationKeyFilter.cs</Link>
+ </Compile>
+ <Compile Include="..\Lucene.Net.Tests.Analysis.ICU\Collation\TestICUCollationKeyFilterFactory.cs">
+ <Link>Collation\TestICUCollationKeyFilterFactory.cs</Link>
+ </Compile>
<Compile Include="..\Lucene.Net.Tests.Highlighter\PostingsHighlight\TestMultiTermHighlighting.cs">
<Link>Search\PostingsHighlight\TestMultiTermHighlighting.cs</Link>
</Compile>
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/a4989ea1/src/Lucene.Net.Tests.ICU/project.json
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.ICU/project.json b/src/Lucene.Net.Tests.ICU/project.json
index 4dc5c7a..1c3c0cd 100644
--- a/src/Lucene.Net.Tests.ICU/project.json
+++ b/src/Lucene.Net.Tests.ICU/project.json
@@ -26,6 +26,7 @@
},
"dependencies": {
"dotnet-test-nunit-teamcity": "3.4.0-beta-3",
+ "icu.net": "54.1.1-alpha",
"Lucene.Net": "4.8.0",
"Lucene.Net.Analysis.Common": "4.8.0",
"Lucene.Net.Highlighter": "4.8.0",
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/a4989ea1/src/Lucene.Net/Support/StringBuilderExtensions.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net/Support/StringBuilderExtensions.cs b/src/Lucene.Net/Support/StringBuilderExtensions.cs
index 5200e02..ae68811 100644
--- a/src/Lucene.Net/Support/StringBuilderExtensions.cs
+++ b/src/Lucene.Net/Support/StringBuilderExtensions.cs
@@ -110,6 +110,29 @@ namespace Lucene.Net.Support
}
/// <summary>
+ /// Returns the character (Unicode code point) at the specified index.
+ /// The index refers to char values (Unicode code units) and ranges from 0 to Length - 1.
+ /// <para/>
+ /// If the char value specified at the given index is in the high-surrogate range,
+ /// the following index is less than the length of this sequence, and the char value
+ /// at the following index is in the low-surrogate range, then the
+ /// supplementary code point corresponding to this surrogate pair is returned.
+ /// Otherwise, the char value at the given index is returned.
+ /// </summary>
+ /// <param name="text">this <see cref="StringBuilder"/></param>
+ /// <param name="index">the index to the char values</param>
+ /// <returns>the code point value of the character at the index</returns>
+ /// <exception cref="IndexOutOfRangeException">if the index argument is negative or not less than the length of this sequence.</exception>
+ public static int CodePointAt(this StringBuilder text, int index)
+ {
+     int length = text.Length;
+     if ((index < 0) || (index >= length))
+     {
+         throw new IndexOutOfRangeException();
+     }
+     // Read the code units directly from the builder: calling ToString() here would
+     // copy the entire buffer just to inspect at most two chars.
+     char first = text[index];
+     int following = index + 1;
+     if (char.IsHighSurrogate(first) && following < length)
+     {
+         char second = text[following];
+         if (char.IsLowSurrogate(second))
+         {
+             // Valid surrogate pair: combine into the supplementary code point.
+             return char.ConvertToUtf32(first, second);
+         }
+     }
+     // BMP character, or an unpaired surrogate: return the char value itself.
+     return first;
+ }
+
+ /// <summary>
/// Copies the array from the <see cref="StringBuilder"/> into a new array
/// and returns it.
/// </summary>