You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by ni...@apache.org on 2017/07/25 08:33:26 UTC
[3/4] lucenenet git commit: Squashed commit of the following:
Squashed commit of the following:
commit 480d8cf0f169258b1536d376fc63031de7fe9091
Author: Shad Storhaug <sh...@shadstorhaug.com>
Date: Tue Jul 25 15:21:15 2017 +0700
Fixed .NET Core compilation issues
commit dc336e73b7a45e822d74cd14228d728dc0d6bf22
Author: Shad Storhaug <sh...@shadstorhaug.com>
Date: Tue Jul 25 14:50:09 2017 +0700
Fixed up documentation comments
commit 25f40064828b32f3f30b688dd6a7a42ea5069b8b
Author: Shad Storhaug <sh...@shadstorhaug.com>
Date: Tue Jul 25 13:59:06 2017 +0700
Integrated Collation into Lucene.Net.ICU
commit 4145820997b5329ce7116f9bd723e36a7e064a17
Author: Shad Storhaug <sh...@shadstorhaug.com>
Date: Sat Jul 22 15:30:16 2017 +0700
Lucene.Net.Analysis.ICU: Fixed all collation bugs, added tests for TestCharArrayIterator, TestICUNormalizer2Filter, and TestICUNormalizer2FilterFactory.
commit 2580df83fe6f5b04639279455fdf61568f34e451
Author: Shad Storhaug <sh...@shadstorhaug.com>
Date: Fri Jul 21 02:58:37 2017 +0700
WIP: Analysis.ICU
commit 7be4d43ae06663610d0ff321dcf93ae5cc8499b7
Author: Shad Storhaug <sh...@shadstorhaug.com>
Date: Thu Jul 20 15:26:53 2017 +0700
First poke at Analysis.ICU
Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/a4989ea1
Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/a4989ea1
Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/a4989ea1
Branch: refs/heads/master
Commit: a4989ea194f4b73f5cd9dcb81b6ae4ef358588b9
Parents: 7e6b0bc
Author: Shad Storhaug <sh...@shadstorhaug.com>
Authored: Tue Jul 25 15:22:31 2017 +0700
Committer: Shad Storhaug <sh...@shadstorhaug.com>
Committed: Tue Jul 25 15:25:20 2017 +0700
----------------------------------------------------------------------
.../Analysis/ICU/ICUFoldingFilter.cs | 32 ++
.../Analysis/ICU/ICUFoldingFilterFactory.cs | 31 ++
.../Analysis/ICU/ICUNormalizer2CharFilter.cs | 235 +++++++++++++
.../ICU/ICUNormalizer2CharFilterFactory.cs | 60 ++++
.../Analysis/ICU/ICUNormalizer2Filter.cs | 60 ++++
.../Analysis/ICU/ICUNormalizer2FilterFactory.cs | 59 ++++
.../Analysis/ICU/ICUTransformFilter.cs | 152 +++++++++
.../Analysis/ICU/ICUTransformFilterFactory.cs | 38 +++
.../ICU/Segmentation/BreakIteratorWrapper.cs | 166 ++++++++++
.../ICU/Segmentation/CharArrayIterator.cs | 134 ++++++++
.../ICU/Segmentation/CompositeBreakIterator.cs | 132 ++++++++
.../Segmentation/DefaultICUTokenizerConfig.cs | 127 +++++++
.../Analysis/ICU/Segmentation/ICUTokenizer.cs | 229 +++++++++++++
.../ICU/Segmentation/ICUTokenizerConfig.cs | 33 ++
.../ICU/Segmentation/ICUTokenizerFactory.cs | 139 ++++++++
.../Analysis/ICU/Segmentation/ScriptIterator.cs | 206 ++++++++++++
.../ICU/TokenAttributes/ScriptAttribute.cs | 42 +++
.../ICU/TokenAttributes/ScriptAttributeImpl.cs | 80 +++++
.../Collation/ICUCollationAttributeFactory.cs | 75 +++++
.../Collation/ICUCollationDocValuesField.cs | 62 ++++
.../Collation/ICUCollationKeyAnalyzer.cs | 96 ++++++
.../Collation/ICUCollationKeyFilter.cs | 86 +++++
.../Collation/ICUCollationKeyFilterFactory.cs | 245 ++++++++++++++
.../ICUCollatedTermAttributeImpl.cs | 39 +++
src/Lucene.Net.ICU/Lucene.Net.ICU.csproj | 18 +
.../Analysis/CollationTestBase.cs | 12 +-
src/Lucene.Net.TestFramework/project.json | 1 +
.../Icu/Segmentation/TestCharArrayIterator.cs | 110 ++++++
.../Analysis/Icu/TestICUNormalizer2Filter.cs | 92 ++++++
.../Icu/TestICUNormalizer2FilterFactory.cs | 45 +++
.../Collation/TestICUCollationDocValuesField.cs | 121 +++++++
.../Collation/TestICUCollationKeyAnalyzer.cs | 98 ++++++
.../Collation/TestICUCollationKeyFilter.cs | 101 ++++++
.../TestICUCollationKeyFilterFactory.cs | 331 +++++++++++++++++++
.../Lucene.Net.Tests.ICU.csproj | 12 +
src/Lucene.Net.Tests.ICU/project.json | 1 +
.../Support/StringBuilderExtensions.cs | 23 ++
37 files changed, 3516 insertions(+), 7 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/a4989ea1/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUFoldingFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUFoldingFilter.cs b/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUFoldingFilter.cs
new file mode 100644
index 0000000..4ca8278
--- /dev/null
+++ b/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUFoldingFilter.cs
@@ -0,0 +1,32 @@
+// LUCENENET TODO: Port issues - missing Normalizer2 dependency from icu.net
+
+//using Icu;
+//using Lucene.Net.Support;
+//using System;
+//using System.Collections.Generic;
+//using System.Linq;
+//using System.Text;
+//using System.Threading.Tasks;
+
+//namespace Lucene.Net.Analysis.ICU
+//{
+// public sealed class ICUFoldingFilter : ICUNormalizer2Filter
+// {
+// private static readonly Normalizer2 normalizer;
+
+// /// <summary>
+// /// Create a new ICUFoldingFilter on the specified input
+// /// </summary>
+// public ICUFoldingFilter(TokenStream input)
+// : base(input, normalizer)
+// {
+// }
+
+// static ICUFoldingFilter()
+// {
+// normalizer = Normalizer2.GetInstance(
+// typeof(ICUFoldingFilter).Assembly.FindAndGetManifestResourceStream(typeof(ICUFoldingFilter), "utr30.nrm"),
+// "utr30", Normalizer2.Mode.COMPOSE);
+// }
+// }
+//}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/a4989ea1/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUFoldingFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUFoldingFilterFactory.cs b/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUFoldingFilterFactory.cs
new file mode 100644
index 0000000..c25cf93
--- /dev/null
+++ b/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUFoldingFilterFactory.cs
@@ -0,0 +1,31 @@
+// LUCENENET TODO: Port issues - missing Normalizer2 dependency from icu.net
+
+//using Lucene.Net.Analysis.Util;
+//using System;
+//using System.Collections.Generic;
+
+//namespace Lucene.Net.Analysis.ICU
+//{
+// public class ICUFoldingFilterFactory : TokenFilterFactory, IMultiTermAwareComponent
+// {
+// /// <summary>Creates a new ICUFoldingFilterFactory</summary>
+// public ICUFoldingFilterFactory(IDictionary<string, string> args)
+// : base(args)
+// {
+// if (args.Count != 0)
+// {
+// throw new ArgumentException("Unknown parameters: " + args);
+// }
+// }
+
+// public override TokenStream Create(TokenStream input)
+// {
+// return new ICUFoldingFilter(input);
+// }
+
+// public virtual AbstractAnalysisFactory GetMultiTermComponent()
+// {
+// return this;
+// }
+// }
+//}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/a4989ea1/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUNormalizer2CharFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUNormalizer2CharFilter.cs b/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUNormalizer2CharFilter.cs
new file mode 100644
index 0000000..4254298
--- /dev/null
+++ b/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUNormalizer2CharFilter.cs
@@ -0,0 +1,235 @@
+// LUCENENET TODO: Port issues - missing Normalizer2 dependency from icu.net
+
+//using Lucene.Net.Analysis.CharFilters;
+//using Lucene.Net.Support;
+//using System;
+//using System.Collections.Generic;
+//using System.IO;
+//using System.Linq;
+//using System.Text;
+//using System.Threading.Tasks;
+
+//namespace Lucene.Net.Analysis.ICU
+//{
+// /// <summary>
+// /// Normalize token text with ICU's <see cref="Normalizer2"/>.
+// /// </summary>
+// public sealed class ICUNormalizer2CharFilter : BaseCharFilter
+// {
+// private static readonly int IO_BUFFER_SIZE = 128;
+
+// private readonly Normalizer2 normalizer;
+// private readonly StringBuilder inputBuffer = new StringBuilder();
+// private readonly StringBuilder resultBuffer = new StringBuilder();
+
+// private bool inputFinished;
+// private bool afterQuickCheckYes;
+// private int checkedInputBoundary;
+// private int charCount;
+
+
+// /**
+// * Create a new Normalizer2CharFilter that combines NFKC normalization, Case
+// * Folding, and removes Default Ignorables (NFKC_Casefold)
+// */
+// public ICUNormalizer2CharFilter(TextReader input)
+// : this(input, new Normalizer2(Icu.Normalizer.UNormalizationMode.UNORM_NFKC) /*Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE)*/)
+// {
+// }
+
+// /**
+// * Create a new Normalizer2CharFilter with the specified Normalizer2
+// * @param in text
+// * @param normalizer normalizer to use
+// */
+// public ICUNormalizer2CharFilter(TextReader input, Normalizer2 normalizer)
+// : base(input)
+// {
+// if (normalizer == null)
+// {
+// throw new ArgumentNullException("normalizer");
+// }
+// this.normalizer = normalizer;
+// }
+
+// public override int Read(char[] cbuf, int off, int len)
+// {
+// if (off < 0) throw new ArgumentException("off < 0");
+// if (off >= cbuf.Length) throw new ArgumentException("off >= cbuf.length");
+// if (len <= 0) throw new ArgumentException("len <= 0");
+
+// while (!inputFinished || inputBuffer.Length > 0 || resultBuffer.Length > 0)
+// {
+// int retLen;
+
+// if (resultBuffer.Length > 0)
+// {
+// retLen = OutputFromResultBuffer(cbuf, off, len);
+// if (retLen > 0)
+// {
+// return retLen;
+// }
+// }
+
+// int resLen = ReadAndNormalizeFromInput();
+// if (resLen > 0)
+// {
+// retLen = OutputFromResultBuffer(cbuf, off, len);
+// if (retLen > 0)
+// {
+// return retLen;
+// }
+// }
+
+// ReadInputToBuffer();
+// }
+
+// return -1;
+// }
+
+// private readonly char[] tmpBuffer = new char[IO_BUFFER_SIZE];
+
+// private int ReadInputToBuffer()
+// {
+// int len = m_input.Read(tmpBuffer, 0, tmpBuffer.Length);
+// if (len == -1)
+// {
+// inputFinished = true;
+// return 0;
+// }
+// inputBuffer.Append(tmpBuffer, 0, len);
+
+// // if checkedInputBoundary was at the end of a buffer, we need to check that char again
+// checkedInputBoundary = Math.Max(checkedInputBoundary - 1, 0);
+// // this loop depends on 'isInert' (changes under normalization) but looks only at characters.
+// // so we treat all surrogates as non-inert for simplicity
+// if (normalizer.IsInert(tmpBuffer[len - 1]) && !char.IsSurrogate(tmpBuffer[len - 1]))
+// {
+// return len;
+// }
+// else return len + ReadInputToBuffer();
+// }
+
+// private int ReadAndNormalizeFromInput()
+// {
+// if (inputBuffer.Length <= 0)
+// {
+// afterQuickCheckYes = false;
+// return 0;
+// }
+// if (!afterQuickCheckYes)
+// {
+// int resLen2 = ReadFromInputWhileSpanQuickCheckYes();
+// afterQuickCheckYes = true;
+// if (resLen2 > 0) return resLen2;
+// }
+// int resLen = ReadFromIoNormalizeUptoBoundary();
+// if (resLen > 0)
+// {
+// afterQuickCheckYes = false;
+// }
+// return resLen;
+// }
+
+// private int ReadFromInputWhileSpanQuickCheckYes()
+// {
+// int end = normalizer.spanQuickCheckYes(inputBuffer);
+// if (end > 0)
+// {
+// //resultBuffer.Append(inputBuffer.subSequence(0, end));
+// resultBuffer.Append(inputBuffer.ToString(0, end));
+// //inputBuffer.delete(0, end);
+// inputBuffer.Remove(0, end);
+// checkedInputBoundary = Math.Max(checkedInputBoundary - end, 0);
+// charCount += end;
+// }
+// return end;
+// }
+
+// private int ReadFromIoNormalizeUptoBoundary()
+// {
+// // if there's no buffer to normalize, return 0
+// if (inputBuffer.Length <= 0)
+// {
+// return 0;
+// }
+
+// bool foundBoundary = false;
+// int bufLen = inputBuffer.Length;
+
+// while (checkedInputBoundary <= bufLen - 1)
+// {
+// int charLen = Character.CharCount(inputBuffer.CodePointAt(checkedInputBoundary));
+// checkedInputBoundary += charLen;
+// if (checkedInputBoundary < bufLen && normalizer.HasBoundaryBefore(inputBuffer
+// .CodePointAt(checkedInputBoundary)))
+// {
+// foundBoundary = true;
+// break;
+// }
+// }
+// if (!foundBoundary && checkedInputBoundary >= bufLen && inputFinished)
+// {
+// foundBoundary = true;
+// checkedInputBoundary = bufLen;
+// }
+
+// if (!foundBoundary)
+// {
+// return 0;
+// }
+
+// return NormalizeInputUpto(checkedInputBoundary);
+// }
+
+// private int NormalizeInputUpto(int length)
+// {
+// int destOrigLen = resultBuffer.Length;
+// normalizer.NormalizeSecondAndAppend(resultBuffer, inputBuffer.ToString(0, length));
+// //inputBuffer.SubSequence(0, length));
+
+// //inputBuffer.Delete(0, length);
+// inputBuffer.Remove(0, length);
+// checkedInputBoundary = Math.Max(checkedInputBoundary - length, 0);
+// int resultLength = resultBuffer.Length - destOrigLen;
+// RecordOffsetDiff(length, resultLength);
+// return resultLength;
+// }
+
+// private void RecordOffsetDiff(int inputLength, int outputLength)
+// {
+// if (inputLength == outputLength)
+// {
+// charCount += outputLength;
+// return;
+// }
+// int diff = inputLength - outputLength;
+// int cumuDiff = LastCumulativeDiff;
+// if (diff < 0)
+// {
+// for (int i = 1; i <= -diff; ++i)
+// {
+// AddOffCorrectMap(charCount + i, cumuDiff - i);
+// }
+// }
+// else
+// {
+// AddOffCorrectMap(charCount + outputLength, cumuDiff + diff);
+// }
+// charCount += outputLength;
+// }
+
+// private int OutputFromResultBuffer(char[] cbuf, int begin, int len)
+// {
+// len = Math.Min(resultBuffer.Length, len);
+// //resultBuffer.GetChars(0, len, cbuf, begin);
+// resultBuffer.CopyTo(0, cbuf, begin, len);
+// if (len > 0)
+// {
+// //resultBuffer.delete(0, len);
+// resultBuffer.Remove(0, len);
+// }
+// return len;
+// }
+// }
+//}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/a4989ea1/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUNormalizer2CharFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUNormalizer2CharFilterFactory.cs b/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUNormalizer2CharFilterFactory.cs
new file mode 100644
index 0000000..bd4cbe5
--- /dev/null
+++ b/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUNormalizer2CharFilterFactory.cs
@@ -0,0 +1,60 @@
+// LUCENENET TODO: Port issues - missing Normalizer2 dependency from icu.net
+
+//using Icu;
+//using Lucene.Net.Analysis.Util;
+//using Lucene.Net.Support;
+//using System;
+//using System.Collections.Generic;
+//using System.IO;
+//using System.Linq;
+//using System.Text;
+//using System.Threading.Tasks;
+
+//namespace Lucene.Net.Analysis.ICU
+//{
+// public class ICUNormalizer2CharFilterFactory : CharFilterFactory, IMultiTermAwareComponent
+// {
+// private readonly Normalizer2 normalizer;
+
+// /// <summary>Creates a new ICUNormalizer2CharFilterFactory</summary>
+// public ICUNormalizer2CharFilterFactory(IDictionary<string, string> args)
+// : base(args)
+// {
+// string name = Get(args, "name", "NFKC");
+// //string name = Get(args, "name", "nfkc_cf");
+// //string mode = Get(args, "mode", new string[] { "compose", "decompose" }, "compose");
+// //Normalizer2 normalizer = Normalizer2.getInstance
+// // (null, name, "compose".Equals(mode) ? Normalizer2.Mode.COMPOSE : Normalizer2.Mode.DECOMPOSE);
+
+// var mode = (Icu.Normalizer.UNormalizationMode)Enum.Parse(typeof(Icu.Normalizer.UNormalizationMode), "UNORM_" + name);
+// Normalizer2 normalizer = new Normalizer2(mode);
+
+// string filter = Get(args, "filter");
+// if (filter != null)
+// {
+// //UnicodeSet set = new UnicodeSet(filter);
+// var set = UnicodeSet.ToCharacters(filter);
+// if (set.Any())
+// {
+// //set.freeze();
+// normalizer = new FilteredNormalizer2(normalizer, set);
+// }
+// }
+// if (args.Count != 0)
+// {
+// throw new ArgumentException("Unknown parameters: " + args);
+// }
+// this.normalizer = normalizer;
+// }
+
+// public override TextReader Create(TextReader input)
+// {
+// return new ICUNormalizer2CharFilter(input, normalizer);
+// }
+
+// public virtual AbstractAnalysisFactory GetMultiTermComponent()
+// {
+// return this;
+// }
+// }
+//}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/a4989ea1/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUNormalizer2Filter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUNormalizer2Filter.cs b/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUNormalizer2Filter.cs
new file mode 100644
index 0000000..bca3d24
--- /dev/null
+++ b/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUNormalizer2Filter.cs
@@ -0,0 +1,60 @@
+// LUCENENET TODO: Port issues - missing Normalizer2 dependency from icu.net
+
+//using Icu;
+//using Lucene.Net.Analysis.TokenAttributes;
+//using Lucene.Net.Support;
+
+//namespace Lucene.Net.Analysis.ICU
+//{
+// public class ICUNormalizer2Filter : TokenFilter
+// {
+// private readonly ICharTermAttribute termAtt;
+// private readonly Normalizer2 normalizer;
+
+// /// <summary>
+// /// Create a new <see cref="Normalizer2Filter"/> that combines NFKC normalization, Case
+// /// Folding, and removes Default Ignorables (NFKC_Casefold)
+// /// </summary>
+// /// <param name="input"></param>
+// public ICUNormalizer2Filter(TokenStream input)
+// : this(input, new Normalizer2(Normalizer.UNormalizationMode.UNORM_NFKC) /*Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE)*/)
+// {
+// }
+
+// /// <summary>
+// /// Create a new <see cref="Normalizer2Filter"/> with the specified <see cref="Normalizer2"/>
+// /// </summary>
+// /// <param name="input">stream</param>
+// /// <param name="normalizer">normalizer to use</param>
+// public ICUNormalizer2Filter(TokenStream input, Normalizer2 normalizer)
+// : base(input)
+// {
+// this.normalizer = normalizer;
+// this.termAtt = AddAttribute<ICharTermAttribute>();
+// }
+
+// public override sealed bool IncrementToken()
+// {
+// if (m_input.IncrementToken())
+// {
+// var term = termAtt.ToString();
+// try
+// {
+// if (!normalizer.IsNormalized(term))
+// {
+// termAtt.SetEmpty().Append(normalizer.Normalize(term));
+// }
+// }
+// catch (System.Exception ex)
+// {
+
+// }
+// return true;
+// }
+// else
+// {
+// return false;
+// }
+// }
+// }
+//}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/a4989ea1/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUNormalizer2FilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUNormalizer2FilterFactory.cs b/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUNormalizer2FilterFactory.cs
new file mode 100644
index 0000000..c17fb7f
--- /dev/null
+++ b/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUNormalizer2FilterFactory.cs
@@ -0,0 +1,59 @@
+// LUCENENET TODO: Port issues - missing Normalizer2 dependency from icu.net
+
+//using Icu;
+//using Lucene.Net.Analysis.Util;
+//using Lucene.Net.Support;
+//using System;
+//using System.Collections.Generic;
+//using System.Linq;
+
+//namespace Lucene.Net.Analysis.ICU
+//{
+// public class ICUNormalizer2FilterFactory : TokenFilterFactory, IMultiTermAwareComponent
+// {
+// private readonly Normalizer2 normalizer;
+
+// /// <summary>Creates a new ICUNormalizer2FilterFactory</summary>
+// public ICUNormalizer2FilterFactory(IDictionary<string, string> args)
+// : base(args)
+// {
+// string name = Get(args, "name", "NFKC");
+// //string name = Get(args, "name", "nfkc_cf");
+// //string mode = Get(args, "mode", new string[] { "compose", "decompose" }, "compose");
+
+// var mode = (Normalizer.UNormalizationMode)Enum.Parse(typeof(Normalizer.UNormalizationMode), "UNORM_" + name);
+// Normalizer2 normalizer = new Normalizer2(mode);
+
+// //Normalizer2 normalizer = Normalizer2.getInstance
+// // (null, name, "compose".Equals(mode) ? Normalizer2.Mode.COMPOSE : Normalizer2.Mode.DECOMPOSE);
+
+// string filter = Get(args, "filter");
+// if (filter != null)
+// {
+// //UnicodeSet set = new UnicodeSet(filter);
+// var set = UnicodeSet.ToCharacters(filter);
+// if (set.Any())
+// {
+// //set.freeze();
+// normalizer = new FilteredNormalizer2(normalizer, set);
+// }
+// }
+// if (args.Count != 0)
+// {
+// throw new ArgumentException("Unknown parameters: " + args);
+// }
+// this.normalizer = normalizer;
+// }
+
+// // TODO: support custom normalization
+// public override TokenStream Create(TokenStream input)
+// {
+// return new ICUNormalizer2Filter(input, normalizer);
+// }
+
+// public virtual AbstractAnalysisFactory GetMultiTermComponent()
+// {
+// return this;
+// }
+// }
+//}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/a4989ea1/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUTransformFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUTransformFilter.cs b/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUTransformFilter.cs
new file mode 100644
index 0000000..7f22c3d
--- /dev/null
+++ b/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUTransformFilter.cs
@@ -0,0 +1,152 @@
+// LUCENENET TODO: Port issues - missing Transliterator dependency from icu.net
+
+//using Lucene.Net.Analysis.TokenAttributes;
+
+//namespace Lucene.Net.Analysis.ICU
+//{
+// public sealed class ICUTransformFilter : TokenFilter
+// {
+// // Transliterator to transform the text
+// private readonly Transliterator transform;
+
+// // Reusable position object
+// private readonly Transliterator.Position position = new Transliterator.Position();
+
+// // term attribute, will be updated with transformed text.
+// private readonly ICharTermAttribute termAtt;
+
+// // Wraps a termAttribute around the replaceable interface.
+// private readonly ReplaceableTermAttribute replaceableAttribute = new ReplaceableTermAttribute();
+
+// /// <summary>
+// /// Create a new ICUTransformFilter that transforms text on the given stream.
+// /// </summary>
+// /// <param name="input"><see cref="TokenStream"/> to filter.</param>
+// /// <param name="transform">Transliterator to transform the text.</param>
+// public ICUTransformFilter(TokenStream input, Transliterator transform)
+// : base(input)
+// {
+// this.transform = transform;
+// this.termAtt = AddAttribute<ICharTermAttribute>();
+
+// /*
+// * This is cheating, but speeds things up a lot.
+// * If we wanted to use pkg-private APIs we could probably do better.
+// */
+// if (transform.getFilter() == null && transform is com.ibm.icu.text.RuleBasedTransliterator)
+// {
+// UnicodeSet sourceSet = transform.getSourceSet();
+// if (sourceSet != null && !sourceSet.isEmpty())
+// transform.setFilter(sourceSet);
+// }
+// }
+
+// public override bool IncrementToken()
+// {
+// /*
+// * Wrap around replaceable. clear the positions, and transliterate.
+// */
+// if (m_input.IncrementToken())
+// {
+// replaceableAttribute.SetText(termAtt);
+
+// int length = termAtt.Length;
+// position.start = 0;
+// position.limit = length;
+// position.contextStart = 0;
+// position.contextLimit = length;
+
+// transform.FilteredTransliterate(replaceableAttribute, position, false);
+// return true;
+// }
+// else
+// {
+// return false;
+// }
+// }
+
+// /// <summary>
+// /// Wrap a <see cref="ICharTermAttribute"/> with the Replaceable API.
+// /// </summary>
+// private sealed class ReplaceableTermAttribute //: IReplaceable
+// {
+// private char[] buffer;
+// private int length;
+// private ICharTermAttribute token;
+
+// public void SetText(ICharTermAttribute token)
+// {
+// this.token = token;
+// this.buffer = token.Buffer;
+// this.length = token.Length;
+// }
+
+// public int Char32At(int pos)
+// {
+// return UTF16.charAt(buffer, 0, length, pos);
+// }
+
+// public char CharAt(int pos)
+// {
+// return buffer[pos];
+// }
+
+// public void Copy(int start, int limit, int dest)
+// {
+// char[] text = new char[limit - start];
+// GetChars(start, limit, text, 0);
+// Replace(dest, dest, text, 0, limit - start);
+// }
+
+// public void GetChars(int srcStart, int srcLimit, char[] dst, int dstStart)
+// {
+// System.Array.Copy(buffer, srcStart, dst, dstStart, srcLimit - srcStart);
+// }
+
+// public bool HasMetaData
+// {
+// get { return false; }
+// }
+
+// public int Length
+// {
+// get { return length; }
+// }
+
+// public void Replace(int start, int limit, string text)
+// {
+// int charsLen = text.Length;
+// int newLength = ShiftForReplace(start, limit, charsLen);
+// // insert the replacement text
+// //text.getChars(0, charsLen, buffer, start);
+// text.CopyTo(0, buffer, start, charsLen);
+// token.Length = (length = newLength);
+// }
+
+// public void Replace(int start, int limit, char[] text, int charsStart,
+// int charsLen)
+// {
+// // shift text if necessary for the replacement
+// int newLength = ShiftForReplace(start, limit, charsLen);
+// // insert the replacement text
+// System.Array.Copy(text, charsStart, buffer, start, charsLen);
+// token.Length = (length = newLength);
+// }
+
+// /// <summary>shift text (if necessary) for a replacement operation</summary>
+// private int ShiftForReplace(int start, int limit, int charsLen)
+// {
+// int replacementLength = limit - start;
+// int newLength = length - replacementLength + charsLen;
+// // resize if necessary
+// if (newLength > length)
+// buffer = token.ResizeBuffer(newLength);
+// // if the substring being replaced is longer or shorter than the
+// // replacement, need to shift things around
+// if (replacementLength != charsLen && limit < length)
+// System.Array.Copy(buffer, limit, buffer, start + charsLen, length - limit);
+// return newLength;
+// }
+// }
+// }
+//}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/a4989ea1/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUTransformFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUTransformFilterFactory.cs b/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUTransformFilterFactory.cs
new file mode 100644
index 0000000..081ebf5
--- /dev/null
+++ b/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUTransformFilterFactory.cs
@@ -0,0 +1,38 @@
+// LUCENENET TODO: Port issues - missing Transliterator dependency from icu.net
+
+//using Lucene.Net.Analysis.Util;
+//using System;
+//using System.Collections.Generic;
+
+//namespace Lucene.Net.Analysis.ICU
+//{
+// public class ICUTransformFilterFactory : TokenFilterFactory, IMultiTermAwareComponent
+// {
+// private readonly Transliterator transliterator;
+
+// // TODO: add support for custom rules
+// /// <summary>Creates a new ICUTransformFilterFactory</summary>
+// public ICUTransformFilterFactory(IDictionary<string, string> args)
+// : base(args)
+// {
+// string id = Require(args, "id");
+// string direction = Get(args, "direction", new string[] { "forward", "reverse" }, "forward", false);
+// int dir = "forward".Equals(direction) ? Transliterator.FORWARD : Transliterator.REVERSE;
+// transliterator = Transliterator.getInstance(id, dir);
+// if (args.Count != 0)
+// {
+// throw new ArgumentException("Unknown parameters: " + args);
+// }
+// }
+
+// public override TokenStream Create(TokenStream input)
+// {
+// return new ICUTransformFilter(input, transliterator);
+// }
+
+// public virtual AbstractAnalysisFactory GetMultiTermComponent()
+// {
+// return this;
+// }
+// }
+//}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/a4989ea1/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/BreakIteratorWrapper.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/BreakIteratorWrapper.cs b/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/BreakIteratorWrapper.cs
new file mode 100644
index 0000000..c124a88
--- /dev/null
+++ b/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/BreakIteratorWrapper.cs
@@ -0,0 +1,166 @@
+// LUCENENET TODO: Port issues - missing dependencies
+
+//using Icu;
+//using Lucene.Net.Analysis.Util;
+//using Lucene.Net.Support;
+//using System;
+//using System.Collections.Generic;
+//using System.Linq;
+//using System.Text;
+//using System.Threading.Tasks;
+
+//namespace Lucene.Net.Analysis.ICU.Segmentation
+//{
+// /// <summary>
+// /// Contain all the issues surrounding BreakIterators in ICU in one place.
+// /// Basically this boils down to the fact that they aren't very friendly to any
+// /// sort of OO design.
+// /// <para/>
+// /// http://bugs.icu-project.org/trac/ticket/5901: RBBI.getRuleStatus(), hoist to
+// /// BreakIterator from RuleBasedBreakIterator
+// /// <para/>
+// /// DictionaryBasedBreakIterator is a subclass of RuleBasedBreakIterator, but
+// /// doesn't actually behave as a subclass: it always returns 0 for
+// /// getRuleStatus():
+// /// http://bugs.icu-project.org/trac/ticket/4730: Thai RBBI, no boundary type
+// /// tags
+// /// <para/>
+// /// @lucene.experimental
+// /// </summary>
+// internal abstract class BreakIteratorWrapper
+// {
+// protected readonly CharArrayIterator textIterator = new CharArrayIterator();
+// protected char[] text;
+// protected int start;
+// protected int length;
+
+// public abstract int Next();
+// public abstract int Current { get; }
+// public abstract int GetRuleStatus();
+// public abstract void SetText(CharacterIterator text);
+
+// public void SetText(char[] text, int start, int length)
+// {
+// this.text = text;
+// this.start = start;
+// this.length = length;
+// textIterator.SetText(text, start, length);
+// SetText(textIterator);
+// }
+
+// /**
+// * If its a RuleBasedBreakIterator, the rule status can be used for token type. If its
+// * any other BreakIterator, the rulestatus method is not available, so treat
+// * it like a generic BreakIterator.
+// */
+// public static BreakIteratorWrapper Wrap(Icu.BreakIterator breakIterator)
+// {
+// if (breakIterator is Icu.RuleBasedBreakIterator)
+// return new RBBIWrapper((Icu.RuleBasedBreakIterator)breakIterator);
+// else
+// return new BIWrapper(breakIterator);
+// }
+
+// /**
+// * RuleBasedBreakIterator wrapper: RuleBasedBreakIterator (as long as its not
+// * a DictionaryBasedBreakIterator) behaves correctly.
+// */
+// private sealed class RBBIWrapper : BreakIteratorWrapper
+// {
+// private readonly Icu.RuleBasedBreakIterator rbbi;
+
+// internal RBBIWrapper(Icu.RuleBasedBreakIterator rbbi)
+// {
+// this.rbbi = rbbi;
+// }
+
+// public override int Current
+// {
+// get { return rbbi.Current; }
+// }
+
+// public override int GetRuleStatus()
+// {
+// return rbbi.GetRuleStatus();
+// }
+
+// public override int Next()
+// {
+// return rbbi.Next();
+// }
+
+// public override void SetText(CharacterIterator text)
+// {
+// rbbi.SetText(text);
+// }
+// }
+
+// /**
+// * Generic BreakIterator wrapper: Either the rulestatus method is not
+// * available or always returns 0. Calculate a rulestatus here so it behaves
+// * like RuleBasedBreakIterator.
+// *
+// * Note: This is slower than RuleBasedBreakIterator.
+// */
+// private sealed class BIWrapper : BreakIteratorWrapper
+// {
+// private readonly Support.BreakIterator bi;
+// private int status;
+
+// internal BIWrapper(Support.BreakIterator bi)
+// {
+// this.bi = bi;
+// }
+
+// public override int Current
+// {
+// get { return bi.Current; }
+// }
+
+// public override int GetRuleStatus()
+// {
+// return status;
+// }
+
+// public override int Next()
+// {
+// int current = bi.Current;
+// int next = bi.Next();
+// status = CalcStatus(current, next);
+// return next;
+// }
+
+// private int CalcStatus(int current, int next)
+// {
+// if (current == Support.BreakIterator.DONE || next == Support.BreakIterator.DONE)
+// return RuleBasedBreakIterator.WORD_NONE;
+
+// int begin = start + current;
+// int end = start + next;
+
+// int codepoint;
+// for (int i = begin; i < end; i += UTF16.getCharCount(codepoint))
+// {
+// codepoint = UTF16.charAt(text, 0, end, begin);
+
+// if (UCharacter.isDigit(codepoint))
+// return RuleBasedBreakIterator.WORD_NUMBER;
+// else if (UCharacter.isLetter(codepoint))
+// {
+// // TODO: try to separately specify ideographic, kana?
+// // [currently all bundled as letter for this case]
+// return RuleBasedBreakIterator.WORD_LETTER;
+// }
+// }
+
+// return RuleBasedBreakIterator.WORD_NONE;
+// }
+
+// public override void SetText(CharacterIterator text)
+// {
+// bi.SetText(text);
+// status = RuleBasedBreakIterator.WORD_NONE;
+// }
+// }
+// }
+//}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/a4989ea1/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/CharArrayIterator.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/CharArrayIterator.cs b/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/CharArrayIterator.cs
new file mode 100644
index 0000000..209d583
--- /dev/null
+++ b/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/CharArrayIterator.cs
@@ -0,0 +1,134 @@
+using Lucene.Net.Support;
+using System;
+using System.Diagnostics.CodeAnalysis;
+
+namespace Lucene.Net.Analysis.Icu.Segmentation
+{
+ /// <summary>
+ /// Wraps a char[] as CharacterIterator for processing with a BreakIterator
+ /// <para/>
+ /// @lucene.experimental
+ /// </summary>
+ internal sealed class CharArrayIterator : CharacterIterator
+ {
+ // Backing buffer and the active window [start, start + length).
+ // 'index' is the absolute cursor into 'array'; 'limit' caches start + length
+ // so the end-of-region check in Current/Next is a single comparison.
+ private char[] array;
+ private int start;
+ private int index;
+ private int length;
+ private int limit;
+
+ /// <summary>
+ /// Gets the wrapped text buffer. NOTE(review): this exposes the entire
+ /// backing array, not just the active [start, start + length) region.
+ /// </summary>
+ [WritableArray]
+ [SuppressMessage("Microsoft.Performance", "CA1819", Justification = "Lucene's design requires some writable array properties")]
+ public char[] Text
+ {
+ get
+ {
+ return array;
+ }
+ }
+
+ /// <summary>Gets the offset into the buffer where the active region begins.</summary>
+ public int Start
+ {
+ get { return start; }
+ }
+
+ /// <summary>Gets the length of the active region.</summary>
+ public int Length
+ {
+ get { return length; }
+ }
+
+ /// <summary>
+ /// Set a new region of text to be examined by this iterator
+ /// </summary>
+ /// <param name="array">text buffer to examine</param>
+ /// <param name="start">offset into buffer</param>
+ /// <param name="length">maximum length to examine</param>
+ public void SetText(char[] array, int start, int length)
+ {
+ this.array = array;
+ this.start = start;
+ this.index = start; // cursor begins at the start of the region
+ this.length = length;
+ this.limit = start + length;
+ }
+
+ /// <summary>
+ /// Gets the character at the current position, or the DONE sentinel when
+ /// the cursor sits at the end of the region (index == limit).
+ /// </summary>
+ public override char Current
+ {
+ get { return (index == limit) ? DONE : array[index]; }
+ }
+
+ /// <summary>Moves the cursor to the first character of the region and returns it.</summary>
+ public override char First()
+ {
+ index = start;
+ return Current;
+ }
+
+ // Positions reported to callers are region-relative: BeginIndex is always 0,
+ // EndIndex is the region length, and Index subtracts 'start' from the
+ // absolute cursor to stay in that coordinate system.
+ public override int BeginIndex
+ {
+ get { return 0; }
+ }
+
+ public override int EndIndex
+ {
+ get { return length; }
+ }
+
+ public override int Index
+ {
+ get { return index - start; }
+ }
+
+ /// <summary>
+ /// Moves the cursor to the last character of the region and returns it;
+ /// for an empty region (limit == start) the cursor stays at limit and DONE is returned.
+ /// </summary>
+ public override char Last()
+ {
+ index = (limit == start) ? limit : limit - 1;
+ return Current;
+ }
+
+ /// <summary>
+ /// Advances the cursor one position and returns the character there,
+ /// or DONE once the end of the region is reached (cursor is clamped to limit).
+ /// </summary>
+ public override char Next()
+ {
+ if (++index >= limit)
+ {
+ index = limit;
+ return DONE;
+ }
+ else
+ {
+ return Current;
+ }
+ }
+
+ /// <summary>
+ /// Moves the cursor back one position and returns the character there,
+ /// or DONE when already at the start of the region (cursor is clamped to start).
+ /// </summary>
+ public override char Previous()
+ {
+ if (--index < start)
+ {
+ index = start;
+ return DONE;
+ }
+ else
+ {
+ return Current;
+ }
+ }
+
+ /// <summary>
+ /// Moves the cursor to the given region-relative position and returns the
+ /// character there (or DONE at EndIndex).
+ /// </summary>
+ /// <exception cref="ArgumentException">if position is outside [BeginIndex, EndIndex]</exception>
+ public override char SetIndex(int position)
+ {
+ if (position < BeginIndex || position > EndIndex)
+ throw new ArgumentException("Illegal Position: " + position);
+ index = start + position;
+ return Current;
+ }
+
+ /// <summary>
+ /// Returns the wrapped text as a string. NOTE(review): converts the ENTIRE
+ /// backing array, not just the [start, start + length) region — confirm
+ /// callers expect the full buffer rather than the active slice.
+ /// </summary>
+ public override string GetTextAsString()
+ {
+ return new string(array);
+ }
+
+ /// <summary>
+ /// Creates an independent copy over the same buffer and region, then
+ /// restores the current cursor position (SetText resets index to start).
+ /// </summary>
+ public override object Clone()
+ {
+ CharArrayIterator clone = new CharArrayIterator();
+ clone.SetText(array, start, length);
+ clone.index = index;
+ return clone;
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/a4989ea1/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/CompositeBreakIterator.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/CompositeBreakIterator.cs b/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/CompositeBreakIterator.cs
new file mode 100644
index 0000000..a004193
--- /dev/null
+++ b/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/CompositeBreakIterator.cs
@@ -0,0 +1,132 @@
+// LUCENENET TODO: Port issues - missing dependencies
+
+//using System;
+//using System.Collections.Generic;
+//using System.Linq;
+//using System.Text;
+//using System.Threading.Tasks;
+
+//namespace Lucene.Net.Analysis.ICU.Segmentation
+//{
+// /// <summary>
+// /// An internal BreakIterator for multilingual text, following recommendations
+// /// from: UAX #29: Unicode Text Segmentation. (http://unicode.org/reports/tr29/)
+// /// <para/>
+// /// See http://unicode.org/reports/tr29/#Tailoring for the motivation of this
+// /// design.
+// /// <para/>
+// /// Text is first divided into script boundaries. The processing is then
+// /// delegated to the appropriate break iterator for that specific script.
+// /// <para/>
+// /// This break iterator also allows you to retrieve the ISO 15924 script code
+// /// associated with a piece of text.
+// /// <para/>
+// /// See also UAX #29, UTR #24
+// /// <para/>
+// /// @lucene.experimental
+// /// </summary>
+// internal sealed class CompositeBreakIterator
+// {
+// private readonly ICUTokenizerConfig config;
+// private readonly BreakIteratorWrapper[] wordBreakers = new BreakIteratorWrapper[UScript.CODE_LIMIT];
+
+// private BreakIteratorWrapper rbbi;
+// private readonly ScriptIterator scriptIterator;
+
+// private char[] text;
+
+// public CompositeBreakIterator(ICUTokenizerConfig config)
+// {
+// this.config = config;
+// this.scriptIterator = new ScriptIterator(config.CombineCJ);
+// }
+
+// /**
+// * Retrieve the next break position. If the RBBI range is exhausted within the
+// * script boundary, examine the next script boundary.
+// *
+// * @return the next break position or BreakIterator.DONE
+// */
+// public int Next()
+// {
+// int next = rbbi.Next();
+// while (next == Support.BreakIterator.DONE && scriptIterator.Next())
+// {
+// rbbi = GetBreakIterator(scriptIterator.GetScriptCode());
+// rbbi.SetText(text, scriptIterator.GetScriptStart(),
+// scriptIterator.GetScriptLimit() - scriptIterator.GetScriptStart());
+// next = rbbi.Next();
+// }
+// return (next == Support.BreakIterator.DONE) ? Support.BreakIterator.DONE : next
+// + scriptIterator.GetScriptStart();
+// }
+
+// /**
+// * Retrieve the current break position.
+// *
+// * @return the current break position or BreakIterator.DONE
+// */
+// public int Current
+// {
+// get
+// {
+// int current = rbbi.Current;
+// return (current == Support.BreakIterator.DONE) ? Support.BreakIterator.DONE : current
+// + scriptIterator.GetScriptStart();
+// }
+// }
+
+// /**
+// * Retrieve the rule status code (token type) from the underlying break
+// * iterator
+// *
+// * @return rule status code (see RuleBasedBreakIterator constants)
+// */
+// public int GetRuleStatus()
+// {
+// return rbbi.GetRuleStatus();
+// }
+
+// /**
+// * Retrieve the UScript script code for the current token. This code can be
+// * decoded with UScript into a name or ISO 15924 code.
+// *
+// * @return UScript script code for the current token.
+// */
+// public int GetScriptCode()
+// {
+// return scriptIterator.GetScriptCode();
+// }
+
+// /**
+// * Set a new region of text to be examined by this iterator
+// *
+// * @param text buffer of text
+// * @param start offset into buffer
+// * @param length maximum length to examine
+// */
+// public void SetText(char[] text, int start, int length)
+// {
+// this.text = text;
+// scriptIterator.SetText(text, start, length);
+// if (scriptIterator.Next())
+// {
+// rbbi = GetBreakIterator(scriptIterator.GetScriptCode());
+// rbbi.SetText(text, scriptIterator.GetScriptStart(),
+// scriptIterator.GetScriptLimit() - scriptIterator.GetScriptStart());
+// }
+// else
+// {
+// rbbi = GetBreakIterator(UScript.COMMON);
+// rbbi.SetText(text, 0, 0);
+// }
+// }
+
+// private BreakIteratorWrapper GetBreakIterator(int scriptCode)
+// {
+// if (wordBreakers[scriptCode] == null)
+// wordBreakers[scriptCode] = BreakIteratorWrapper.Wrap(config.GetBreakIterator(scriptCode));
+// return wordBreakers[scriptCode];
+// }
+// }
+//}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/a4989ea1/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/DefaultICUTokenizerConfig.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/DefaultICUTokenizerConfig.cs b/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/DefaultICUTokenizerConfig.cs
new file mode 100644
index 0000000..fc2a989
--- /dev/null
+++ b/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/DefaultICUTokenizerConfig.cs
@@ -0,0 +1,127 @@
+// LUCENENET TODO: Port issues - missing dependencies
+
+//using Icu;
+//using Lucene.Net.Analysis.Standard;
+//using Lucene.Net.Support;
+//using System;
+//using System.Collections.Generic;
+//using System.Globalization;
+//using System.IO;
+//using System.Linq;
+//using System.Text;
+//using System.Threading.Tasks;
+
+//namespace Lucene.Net.Analysis.ICU.Segmentation
+//{
+// /// <summary>
+// /// Default <see cref="ICUTokenizerConfig"/> that is generally applicable
+// /// to many languages.
+// /// </summary>
+// /// <remarks>
+// /// Generally tokenizes Unicode text according to UAX#29
+// /// ({@link BreakIterator#getWordInstance(ULocale) BreakIterator.getWordInstance(ULocale.ROOT)}),
+// /// but with the following tailorings:
+// /// <list type="bullet">
+// /// <item><description>Thai, Lao, and CJK text is broken into words with a dictionary.</description></item>
+// /// <item><description>Myanmar, and Khmer text is broken into syllables based on custom BreakIterator rules.</description></item>
+// /// </list>
+// /// <para/>
+// /// @lucene.experimental
+// /// </remarks>
+// public class DefaultICUTokenizerConfig : ICUTokenizerConfig
+// {
+// /** Token type for words containing ideographic characters */
+// public static readonly string WORD_IDEO = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.IDEOGRAPHIC];
+// /** Token type for words containing Japanese hiragana */
+// public static readonly string WORD_HIRAGANA = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HIRAGANA];
+// /** Token type for words containing Japanese katakana */
+// public static readonly string WORD_KATAKANA = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.KATAKANA];
+// /** Token type for words containing Korean hangul */
+// public static readonly string WORD_HANGUL = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HANGUL];
+// /** Token type for words that contain letters */
+// public static readonly string WORD_LETTER = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ALPHANUM];
+// /** Token type for words that appear to be numbers */
+// public static readonly string WORD_NUMBER = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.NUM];
+
+// /*
+// * the default breakiterators in use. these can be expensive to
+// * instantiate, cheap to clone.
+// */
+// // we keep the cjk breaking separate, thats because it cannot be customized (because dictionary
+// // is only triggered when kind = WORD, but kind = LINE by default and we have no non-evil way to change it)
+// private static readonly Icu.BreakIterator cjkBreakIterator = new Icu.RuleBasedBreakIterator(Icu.BreakIterator.UBreakIteratorType.WORD, new Locale()); //BreakIterator.getWordInstance(ULocale.ROOT);
+// // the same as ROOT, except no dictionary segmentation for cjk
+// private static readonly Icu.BreakIterator defaultBreakIterator =
+// ReadBreakIterator("Default.brk");
+// private static readonly Icu.BreakIterator khmerBreakIterator =
+// ReadBreakIterator("Khmer.brk");
+// private static readonly Icu.BreakIterator myanmarBreakIterator =
+// ReadBreakIterator("Myanmar.brk");
+
+// // TODO: deprecate this boolean? you only care if you are doing super-expert stuff...
+// private readonly bool cjkAsWords;
+
+// /**
+// * Creates a new config. This object is lightweight, but the first
+// * time the class is referenced, breakiterators will be initialized.
+// * @param cjkAsWords true if cjk text should undergo dictionary-based segmentation,
+// * otherwise text will be segmented according to UAX#29 defaults.
+// * If this is true, all Han+Hiragana+Katakana words will be tagged as
+// * IDEOGRAPHIC.
+// */
+// public DefaultICUTokenizerConfig(bool cjkAsWords)
+// {
+// this.cjkAsWords = cjkAsWords;
+// }
+
+// public override bool CombineCJ
+// {
+// get { return cjkAsWords; }
+// }
+
+// public override Icu.BreakIterator GetBreakIterator(int script)
+// {
+// switch (script)
+// {
+// case UScript.KHMER: return (Icu.BreakIterator)khmerBreakIterator.Clone();
+// case UScript.MYANMAR: return (Icu.BreakIterator)myanmarBreakIterator.Clone();
+// case UScript.JAPANESE: return (Icu.BreakIterator)cjkBreakIterator.Clone();
+// default: return (Icu.BreakIterator)defaultBreakIterator.Clone();
+// }
+// }
+
+// public override string GetType(int script, int ruleStatus)
+// {
+// switch (ruleStatus)
+// {
+// case RuleBasedBreakIterator.WORD_IDEO:
+// return WORD_IDEO;
+// case RuleBasedBreakIterator.WORD_KANA:
+// return script == UScript.HIRAGANA ? WORD_HIRAGANA : WORD_KATAKANA;
+// case RuleBasedBreakIterator.WORD_LETTER:
+// return script == UScript.HANGUL ? WORD_HANGUL : WORD_LETTER;
+// case RuleBasedBreakIterator.WORD_NUMBER:
+// return WORD_NUMBER;
+// default: /* some other custom code */
+// return "<OTHER>";
+// }
+// }
+
+// private static RuleBasedBreakIterator ReadBreakIterator(string filename)
+// {
+// Stream @is =
+// typeof(DefaultICUTokenizerConfig).Assembly.FindAndGetManifestResourceStream(typeof(DefaultICUTokenizerConfig), filename);
+// try
+// {
+// RuleBasedBreakIterator bi =
+// RuleBasedBreakIterator.GetInstanceFromCompiledRules(@is);
+// @is.Dispose();
+// return bi;
+// }
+// catch (IOException e)
+// {
+// throw new Exception(e.ToString(), e);
+// }
+// }
+// }
+//}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/a4989ea1/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/ICUTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/ICUTokenizer.cs b/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/ICUTokenizer.cs
new file mode 100644
index 0000000..7677c0c
--- /dev/null
+++ b/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/ICUTokenizer.cs
@@ -0,0 +1,229 @@
+// LUCENENET TODO: Port issues - missing dependencies
+
+//using Lucene.Net.Analysis.ICU.TokenAttributes;
+//using Lucene.Net.Analysis.TokenAttributes;
+//using System;
+//using System.Collections.Generic;
+//using System.Diagnostics;
+//using System.IO;
+//using System.Linq;
+//using System.Text;
+//using System.Threading.Tasks;
+
+//namespace Lucene.Net.Analysis.ICU.Segmentation
+//{
+// /// <summary>
+// /// Breaks text into words according to UAX #29: Unicode Text Segmentation
+// /// (http://www.unicode.org/reports/tr29/)
+// /// <para/>
+// /// Words are broken across script boundaries, then segmented according to
+// /// the BreakIterator and typing provided by the <see cref="ICUTokenizerConfig"/>
+// /// <para/>
+// /// @lucene.experimental
+// /// </summary>
+// /// <seealso cref="ICUTokenizerConfig"/>
+// public sealed class ICUTokenizer : Tokenizer
+// {
+// private static readonly int IOBUFFER = 4096;
+// private readonly char[] buffer = new char[IOBUFFER];
+// /** true length of text in the buffer */
+// private int length = 0;
+// /** length in buffer that can be evaluated safely, up to a safe end point */
+// private int usableLength = 0;
+// /** accumulated offset of previous buffers for this reader, for offsetAtt */
+// private int offset = 0;
+
+// private readonly CompositeBreakIterator breaker; /* tokenizes a char[] of text */
+// private readonly ICUTokenizerConfig config;
+// private readonly IOffsetAttribute offsetAtt;
+// private readonly ICharTermAttribute termAtt;
+// private readonly ITypeAttribute typeAtt;
+// private readonly IScriptAttribute scriptAtt;
+
+// /**
+// * Construct a new ICUTokenizer that breaks text into words from the given
+// * Reader.
+// * <p>
+// * The default script-specific handling is used.
+// * <p>
+// * The default attribute factory is used.
+// *
+// * @param input Reader containing text to tokenize.
+// * @see DefaultICUTokenizerConfig
+// */
+// public ICUTokenizer(TextReader input)
+// : this(input, new DefaultICUTokenizerConfig(true))
+// {
+// }
+
+// /**
+// * Construct a new ICUTokenizer that breaks text into words from the given
+// * Reader, using a tailored BreakIterator configuration.
+// * <p>
+// * The default attribute factory is used.
+// *
+// * @param input Reader containing text to tokenize.
+// * @param config Tailored BreakIterator configuration
+// */
+// public ICUTokenizer(TextReader input, ICUTokenizerConfig config)
+// : this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, input, config)
+// {
+// }
+
+// /**
+// * Construct a new ICUTokenizer that breaks text into words from the given
+// * Reader, using a tailored BreakIterator configuration.
+// *
+// * @param factory AttributeFactory to use
+// * @param input Reader containing text to tokenize.
+// * @param config Tailored BreakIterator configuration
+// */
+// public ICUTokenizer(AttributeFactory factory, TextReader input, ICUTokenizerConfig config)
+// : base(factory, input)
+// {
+// this.config = config;
+// breaker = new CompositeBreakIterator(config);
+
+// this.offsetAtt = AddAttribute<IOffsetAttribute>();
+// this.termAtt = AddAttribute<ICharTermAttribute>();
+// this.typeAtt = AddAttribute<ITypeAttribute>();
+// this.scriptAtt = AddAttribute<IScriptAttribute>();
+// }
+
+
+// public override bool IncrementToken()
+// {
+// ClearAttributes();
+// if (length == 0)
+// Refill();
+// while (!IncrementTokenBuffer())
+// {
+// Refill();
+// if (length <= 0) // no more bytes to read;
+// return false;
+// }
+// return true;
+// }
+
+
+// public override void Reset()
+// {
+// base.Reset();
+// breaker.SetText(buffer, 0, 0);
+// length = usableLength = offset = 0;
+// }
+
+// public override void End()
+// {
+// base.End();
+// int finalOffset = (length < 0) ? offset : offset + length;
+// offsetAtt.SetOffset(CorrectOffset(finalOffset), CorrectOffset(finalOffset));
+// }
+
+// /*
+// * This tokenizes text based upon the longest matching rule, and because of
+// * this, isn't friendly to a Reader.
+// *
+// * Text is read from the input stream in 4kB chunks. Within a 4kB chunk of
+// * text, the last unambiguous break point is found (in this implementation:
+// * white space character) Any remaining characters represent possible partial
+// * words, so are appended to the front of the next chunk.
+// *
+// * There is the possibility that there are no unambiguous break points within
+// * an entire 4kB chunk of text (binary data). So there is a maximum word limit
+// * of 4kB since it will not try to grow the buffer in this case.
+// */
+
+// /**
+// * Returns the last unambiguous break position in the text.
+// *
+// * @return position of character, or -1 if one does not exist
+// */
+// private int FindSafeEnd()
+// {
+// for (int i = length - 1; i >= 0; i--)
+// if (char.IsWhiteSpace(buffer[i]))
+// return i + 1;
+// return -1;
+// }
+
+// /**
+// * Refill the buffer, accumulating the offset and setting usableLength to the
+// * last unambiguous break position
+// *
+// * @throws IOException If there is a low-level I/O error.
+// */
+// private void Refill()
+// {
+// offset += usableLength;
+// int leftover = length - usableLength;
+// System.Array.Copy(buffer, usableLength, buffer, 0, leftover);
+// int requested = buffer.Length - leftover;
+// int returned = Read(m_input, buffer, leftover, requested);
+// length = returned + leftover;
+// if (returned < requested) /* reader has been emptied, process the rest */
+// usableLength = length;
+// else
+// { /* still more data to be read, find a safe-stopping place */
+// usableLength = FindSafeEnd();
+// if (usableLength < 0)
+// usableLength = length; /*
+// * more than IOBUFFER of text without space,
+// * gonna possibly truncate tokens
+// */
+// }
+
+// breaker.SetText(buffer, 0, Math.Max(0, usableLength));
+// }
+
+// // TODO: refactor to a shared readFully somewhere
+// // (NGramTokenizer does this too):
+// /** commons-io's readFully, but without bugs if offset != 0 */
+// private static int Read(TextReader input, char[] buffer, int offset, int length)
+// {
+// Debug.Assert(length >= 0, "length must not be negative: " + length);
+
+// int remaining = length;
+// while (remaining > 0)
+// {
+// int location = length - remaining;
+// int count = input.Read(buffer, offset + location, remaining);
+// if (-1 == count)
+// { // EOF
+// break;
+// }
+// remaining -= count;
+// }
+// return length - remaining;
+// }
+
+// /*
+// * return true if there is a token from the buffer, or null if it is
+// * exhausted.
+// */
+// private bool IncrementTokenBuffer()
+// {
+// int start = breaker.Current;
+// if (start == Support.BreakIterator.DONE)
+// return false; // BreakIterator exhausted
+
+// // find the next set of boundaries, skipping over non-tokens (rule status 0)
+// int end = breaker.Next();
+// while (start != Support.BreakIterator.DONE && breaker.GetRuleStatus() == 0)
+// {
+// start = end;
+// end = breaker.Next();
+// }
+
+// if (start == Support.BreakIterator.DONE)
+// return false; // BreakIterator exhausted
+
+// termAtt.CopyBuffer(buffer, start, end - start);
+// offsetAtt.SetOffset(CorrectOffset(offset + start), CorrectOffset(offset + end));
+// typeAtt.Type = config.GetType(breaker.GetScriptCode(), breaker.GetRuleStatus());
+// scriptAtt.Code = breaker.GetScriptCode();
+
+// return true;
+// }
+// }
+//}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/a4989ea1/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/ICUTokenizerConfig.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/ICUTokenizerConfig.cs b/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/ICUTokenizerConfig.cs
new file mode 100644
index 0000000..0c13316
--- /dev/null
+++ b/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/ICUTokenizerConfig.cs
@@ -0,0 +1,33 @@
+// LUCENENET TODO: Port issues - missing dependencies
+
+//using Lucene.Net.Support;
+
+//namespace Lucene.Net.Analysis.ICU.Segmentation
+//{
+// /// <summary>
+// /// Class that allows for tailored Unicode Text Segmentation on
+// /// a per-writing system basis.
+// /// <para/>
+// /// @lucene.experimental
+// /// </summary>
+// public abstract class ICUTokenizerConfig
+// {
+// /// <summary>
+// /// Sole constructor. (For invocation by subclass
+// /// constructors, typically implicit.)
+// /// </summary>
+// public ICUTokenizerConfig() { }
+// /// <summary>
+// /// Return a breakiterator capable of processing a given script.
+// /// </summary>
+// public abstract Icu.BreakIterator GetBreakIterator(int script);
+// /// <summary>
+// /// Return a token type value for a given script and BreakIterator rule status.
+// /// </summary>
+// public abstract string GetType(int script, int ruleStatus);
+// /// <summary>
+// /// true if Han, Hiragana, and Katakana scripts should all be returned as Japanese
+// /// </summary>
+// public abstract bool CombineCJ { get; }
+// }
+//}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/a4989ea1/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/ICUTokenizerFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/ICUTokenizerFactory.cs b/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/ICUTokenizerFactory.cs
new file mode 100644
index 0000000..14aa9c0
--- /dev/null
+++ b/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/ICUTokenizerFactory.cs
@@ -0,0 +1,139 @@
+// LUCENENET TODO: Port issues - missing dependencies
+
+//using Icu;
+//using Lucene.Net.Analysis.Util;
+//using Lucene.Net.Support;
+//using Lucene.Net.Util;
+//using System;
+//using System.Collections.Generic;
+//using System.Diagnostics;
+//using System.IO;
+//using System.Linq;
+//using System.Text;
+//using System.Threading.Tasks;
+
+//namespace Lucene.Net.Analysis.ICU.Segmentation
+//{
+
+// public class ICUTokenizerFactory : TokenizerFactory, IResourceLoaderAware
+// {
+// internal static readonly string RULEFILES = "rulefiles";
+// private readonly IDictionary<int, string> tailored;
+// private ICUTokenizerConfig config;
+// private readonly bool cjkAsWords;
+
+// /// <summary>Creates a new ICUTokenizerFactory</summary>
+// public ICUTokenizerFactory(IDictionary<string, string> args)
+// : base(args)
+// {
+// tailored = new Dictionary<int, string>();
+// string rulefilesArg = Get(args, RULEFILES);
+// if (rulefilesArg != null)
+// {
+// IList<string> scriptAndResourcePaths = SplitFileNames(rulefilesArg);
+// foreach (string scriptAndResourcePath in scriptAndResourcePaths)
+// {
+// int colonPos = scriptAndResourcePath.IndexOf(":");
+// string scriptCode = scriptAndResourcePath.Substring(0, colonPos - 0).Trim();
+// string resourcePath = scriptAndResourcePath.Substring(colonPos + 1).Trim();
+// tailored[UCharacter.getPropertyValueEnum(UProperty.SCRIPT, scriptCode)] = resourcePath;
+// }
+// }
+// cjkAsWords = GetBoolean(args, "cjkAsWords", true);
+// if (args.Count != 0)
+// {
+// throw new ArgumentException("Unknown parameters: " + args);
+// }
+// }
+
+// public virtual void Inform(IResourceLoader loader)
+// {
+// Debug.Assert(tailored != null, "init must be called first!");
+// if (tailored.Count == 0)
+// {
+// config = new DefaultICUTokenizerConfig(cjkAsWords);
+// }
+// else
+// {
+// config = new DefaultICUTokenizerConfigAnonymousHelper(cjkAsWords, tailored, loader);
+
+// //BreakIterator[] breakers = new BreakIterator[UScript.CODE_LIMIT];
+// //foreach (var entry in tailored)
+// //{
+// // int code = entry.Key;
+// // string resourcePath = entry.Value;
+// // breakers[code] = ParseRules(resourcePath, loader);
+// //}
+// // config = new DefaultICUTokenizerConfig(cjkAsWords)
+// // {
+
+// // public override BreakIterator GetBreakIterator(int script)
+// // {
+// // if (breakers[script] != null)
+// // {
+// // return (BreakIterator)breakers[script].clone();
+// // }
+// // else
+// // {
+// // return base.GetBreakIterator(script);
+// // }
+// // }
+// // // TODO: we could also allow codes->types mapping
+// //};
+// }
+// }
+
+// private class DefaultICUTokenizerConfigAnonymousHelper : DefaultICUTokenizerConfig
+// {
+// private readonly Icu.BreakIterator[] breakers;
+// public DefaultICUTokenizerConfigAnonymousHelper(bool cjkAsWords, IDictionary<int, string> tailored, IResourceLoader loader)
+// : base(cjkAsWords)
+// {
+// breakers = new Icu.BreakIterator[UScript.CODE_LIMIT];
+// foreach (var entry in tailored)
+// {
+// int code = entry.Key;
+// string resourcePath = entry.Value;
+// breakers[code] = ParseRules(resourcePath, loader);
+// }
+// }
+
+// public override Icu.BreakIterator GetBreakIterator(int script)
+// {
+// if (breakers[script] != null)
+// {
+// return (Icu.BreakIterator)breakers[script].Clone();
+// }
+// else
+// {
+// return base.GetBreakIterator(script);
+// }
+// }
+
+// private Icu.BreakIterator ParseRules(string filename, IResourceLoader loader)
+// {
+// StringBuilder rules = new StringBuilder();
+// Stream rulesStream = loader.OpenResource(filename);
+// using (TextReader reader = IOUtils.GetDecodingReader(rulesStream, Encoding.UTF8))
+// {
+// string line = null;
+// while ((line = reader.ReadLine()) != null)
+// {
+// if (!line.StartsWith("#", StringComparison.Ordinal))
+// {
+// rules.Append(line);
+// }
+// rules.Append('\n');
+// }
+// }
+// return new RuleBasedBreakIterator(rules.ToString());
+// }
+// }
+
+// public override Tokenizer Create(AttributeSource.AttributeFactory factory, TextReader input)
+// {
+// Debug.Assert(config != null, "inform must be called first!");
+// return new ICUTokenizer(factory, input, config);
+// }
+// }
+//}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/a4989ea1/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/ScriptIterator.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/ScriptIterator.cs b/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/ScriptIterator.cs
new file mode 100644
index 0000000..f328851
--- /dev/null
+++ b/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/ScriptIterator.cs
@@ -0,0 +1,206 @@
+// LUCENENET TODO: Port issues - missing dependencies
+
+//using System;
+//using System.Collections.Generic;
+//using System.Linq;
+//using System.Text;
+//using System.Text.RegularExpressions;
+//using System.Threading.Tasks;
+
+//namespace Lucene.Net.Analysis.ICU.Segmentation
+//{
+// /// <summary>
+// /// An iterator that locates ISO 15924 script boundaries in text.
+// /// </summary>
+// /// <remarks>
+// /// This is not the same as simply looking at the Unicode block, or even the
+// /// Script property. Some characters are 'common' across multiple scripts, and
+// /// some 'inherit' the script value of text surrounding them.
+// /// <para/>
+// /// This is similar to ICU (internal-only) UScriptRun, with the following
+// /// differences:
+// /// <list type="bullet">
+// /// <item><description>
+// /// Doesn't attempt to match paired punctuation. For tokenization purposes, this
+// /// is not necessary. Its also quite expensive.
+// /// </description></item>
+// /// <item><description>
+// /// Non-spacing marks inherit the script of their base character, following
+// /// recommendations from UTR #24.
+// /// </description></item>
+// /// </list>
+// /// <para/>
+// /// @lucene.experimental
+// /// </remarks>
+// internal sealed class ScriptIterator
+// {
+// private char[] text;
+// private int start;
+// private int limit;
+// private int index;
+
+// private int scriptStart;
+// private int scriptLimit;
+// private int scriptCode;
+
+// private readonly bool combineCJ;
+
+// /**
+// * @param combineCJ if true: Han,Hiragana,Katakana will all return as {@link UScript#JAPANESE}
+// */
+// internal ScriptIterator(bool combineCJ)
+// {
+// this.combineCJ = combineCJ;
+// }
+
+// /**
+// * Get the start of this script run
+// *
+// * @return start position of script run
+// */
+// public int ScriptStart
+// {
+// get { return scriptStart; }
+// }
+
+// /**
+// * Get the index of the first character after the end of this script run
+// *
+// * @return position of the first character after this script run
+// */
+// public int ScriptLimit
+// {
+// get { return scriptLimit; }
+// }
+
+// /**
+// * Get the UScript script code for this script run
+// *
+// * @return code for the script of the current run
+// */
+// public int ScriptCode
+// {
+// get { return scriptCode; }
+// }
+
+// /**
+// * Iterates to the next script run, returning true if one exists.
+// *
+// * @return true if there is another script run, false otherwise.
+// */
+// public bool Next()
+// {
+// if (scriptLimit >= limit)
+// return false;
+
+// scriptCode = UScript.COMMON;
+// scriptStart = scriptLimit;
+
+// while (index < limit)
+// {
+// //int ch = UTF16.charAt(text, start, limit, index - start);
+// int ch = Encoding.Unicode.(text, start, limit);
+// int sc = GetScript(ch);
+
+// /*
+// * From UTR #24: Implementations that determine the boundaries between
+// * characters of given scripts should never break between a non-spacing
+// * mark and its base character. Thus for boundary determinations and
+// * similar sorts of processing, a non-spacing mark — whatever its script
+// * value — should inherit the script value of its base character.
+// */
+// if (isSameScript(scriptCode, sc)
+// || UCharacter.getType(ch) == ECharacterCategory.NON_SPACING_MARK)
+// {
+// //index += UTF16.getCharCount(ch);
+// index += Encoding.Unicode.GetCharCount()
+
+// /*
+// * Inherited or Common becomes the script code of the surrounding text.
+// */
+// if (scriptCode <= UScript.INHERITED && sc > UScript.INHERITED)
+// {
+// scriptCode = sc;
+// }
+
+// }
+// else
+// {
+// break;
+// }
+// }
+
+// scriptLimit = index;
+// return true;
+// }
+
+// /** Determine if two scripts are compatible. */
+// private static bool IsSameScript(int scriptOne, int scriptTwo)
+// {
+// return scriptOne <= UScript.INHERITED || scriptTwo <= UScript.INHERITED
+// || scriptOne == scriptTwo;
+// }
+
+// /**
+// * Set a new region of text to be examined by this iterator
+// *
+// * @param text text buffer to examine
+// * @param start offset into buffer
+// * @param length maximum length to examine
+// */
+// public void SetText(char[] text, int start, int length)
+// {
+// this.text = text;
+// this.start = start;
+// this.index = start;
+// this.limit = start + length;
+// this.scriptStart = start;
+// this.scriptLimit = start;
+// this.scriptCode = UScript.INVALID_CODE;
+// }
+
+// /** linear fast-path for basic latin case */
+// private static readonly int[] basicLatin = new int[128];
+
+// static ScriptIterator()
+// {
+// for (int i = 0; i < basicLatin.Length; i++)
+// basicLatin[i] = UScript.GetScript(i);
+// }
+
+// /** fast version of UScript.getScript(). Basic Latin is an array lookup */
+// private int GetScript(int codepoint)
+// {
+// if (0 <= codepoint && codepoint < basicLatin.Length)
+// {
+// return basicLatin[codepoint];
+// }
+// else
+// {
+// //int script = UScript.GetScript(codepoint);
+// if (combineCJ)
+// {
+// if (Regex.IsMatch(new string(Support.Character.ToChars(codepoint)), @"\p{IsHangulCompatibilityJamo}+|\p{IsHiragana}+|\p{IsKatakana}+"))
+// //if (script == UScript.HAN || script == UScript.HIRAGANA || script == UScript.KATAKANA)
+// {
+// return UScript.JAPANESE;
+// }
+// else if (codepoint >= 0xFF10 && codepoint <= 0xFF19)
+// {
+// // when using CJK dictionary breaking, don't let full width numbers go to it, otherwise
+// // they are treated as punctuation. we currently have no cleaner way to fix this!
+// return UScript.LATIN;
+// }
+// else
+// {
+// return script;
+// }
+// }
+// else
+// {
+// return script;
+// }
+// }
+// }
+// }
+//}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/a4989ea1/src/Lucene.Net.Analysis.ICU/Analysis/ICU/TokenAttributes/ScriptAttribute.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/ICU/TokenAttributes/ScriptAttribute.cs b/src/Lucene.Net.Analysis.ICU/Analysis/ICU/TokenAttributes/ScriptAttribute.cs
new file mode 100644
index 0000000..abc1ae2
--- /dev/null
+++ b/src/Lucene.Net.Analysis.ICU/Analysis/ICU/TokenAttributes/ScriptAttribute.cs
@@ -0,0 +1,42 @@
+// LUCENENET TODO: Port issues - missing dependencies
+
+//using Lucene.Net.Util;
+//using System;
+//using System.Collections.Generic;
+//using System.Linq;
+//using System.Text;
+//using System.Threading.Tasks;
+
+//namespace Lucene.Net.Analysis.ICU.TokenAttributes
+//{
+// /// <summary>
+// /// This attribute stores the UTR #24 script value for a token of text.
+// /// <para/>
+// /// @lucene.experimental
+// /// </summary>
+// public interface IScriptAttribute : IAttribute
+// {
+// /**
+// * Get the numeric code for this script value.
+// * This is the constant value from {@link UScript}.
+// * @return numeric code
+// */
+// int Code { get; set; }
+// ///**
+// // * Set the numeric code for this script value.
+// // * This is the constant value from {@link UScript}.
+// // * @param code numeric code
+// // */
+// //public void setCode(int code);
+// /**
+// * Get the full name.
+// * @return UTR #24 full name.
+// */
+// string GetName();
+// /**
+// * Get the abbreviated name.
+// * @return UTR #24 abbreviated name.
+// */
+// string GetShortName();
+// }
+//}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/a4989ea1/src/Lucene.Net.Analysis.ICU/Analysis/ICU/TokenAttributes/ScriptAttributeImpl.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/ICU/TokenAttributes/ScriptAttributeImpl.cs b/src/Lucene.Net.Analysis.ICU/Analysis/ICU/TokenAttributes/ScriptAttributeImpl.cs
new file mode 100644
index 0000000..6fa4512
--- /dev/null
+++ b/src/Lucene.Net.Analysis.ICU/Analysis/ICU/TokenAttributes/ScriptAttributeImpl.cs
@@ -0,0 +1,80 @@
+// LUCENENET TODO: Port issues - missing dependencies
+
+//using Lucene.Net.Util;
+//using System.Collections.Generic;
+//using System.Linq;
+//using System.Text;
+//using System.Threading.Tasks;
+
+//namespace Lucene.Net.Analysis.ICU.TokenAttributes
+//{
+// /// <summary>
+// /// Implementation of <see cref="IScriptAttribute"/> that stores the script
+// /// as an integer.
+// /// <para/>
+// /// @lucene.experimental
+// /// </summary>
+// public class ScriptAttribute : Attribute, IScriptAttribute
+// {
+// private int code = UScript.COMMON;
+
+// /** Initializes this attribute with <code>UScript.COMMON</code> */
+// public ScriptAttribute() { }
+
+// public virtual int Code
+// {
+// get { return code; }
+// set { code = value; }
+// }
+
+// public virtual string GetName()
+// {
+// return UScript.GetName(code);
+// }
+
+// public virtual string GetShortName()
+// {
+// return UScript.GetShortName(code);
+// }
+
+// public override void Clear()
+// {
+// code = UScript.COMMON;
+// }
+
+// public override void CopyTo(IAttribute target)
+// {
+// ScriptAttribute t = (ScriptAttribute)target;
+// t.Code = code;
+// }
+
+// public override bool Equals(object other)
+// {
+// if (this == other)
+// {
+// return true;
+// }
+
+// if (other is ScriptAttribute)
+// {
+// return ((ScriptAttribute)other).code == code;
+// }
+
+// return false;
+// }
+
+// public override int GetHashCode()
+// {
+// return code;
+// }
+
+// public override void ReflectWith(IAttributeReflector reflector)
+// {
+// // when wordbreaking CJK, we use the 15924 code Japanese (Han+Hiragana+Katakana) to
+// // mark runs of Chinese/Japanese. our use is correct (as for chinese Han is a subset),
+// // but this is just to help prevent confusion.
+// string name = code == UScript.JAPANESE ? "Chinese/Japanese" : GetName();
+// reflector.Reflect<IScriptAttribute>("script", name);
+// }
+// }
+//}