You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by sy...@apache.org on 2014/11/08 00:12:36 UTC
[32/34] lucenenet git commit: Raw porting of
Lucene.Net.Analysis.Common
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKTokenizer.cs
new file mode 100644
index 0000000..c091904
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKTokenizer.cs
@@ -0,0 +1,370 @@
+using System;
+
+namespace org.apache.lucene.analysis.cjk
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+ using OffsetAttribute = org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+ using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+ using TypeAttribute = org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+
/// <summary>
/// CJKTokenizer is designed for Chinese, Japanese, and Korean languages.
/// <para>
/// The tokens returned are every two adjacent characters with overlap match.
/// </para>
/// <para>
/// Example: "java C1C2C3C4" will be segmented to: "java" "C1C2" "C2C3" "C3C4".
/// </para>
/// Additionally, the following is applied to Latin text (such as English):
/// <ul>
/// <li>Text is converted to lowercase.
/// <li>Numeric digits, '+', '#', and '_' are tokenized as letters.
/// <li>Full-width forms are converted to half-width forms.
/// </ul>
/// For more info on Asian language (Chinese, Japanese, and Korean) text segmentation:
/// please search <a
/// href="http://www.google.com/search?q=word+chinese+segment">google</a>
/// </summary>
/// @deprecated Use StandardTokenizer, CJKWidthFilter, CJKBigramFilter, and LowerCaseFilter instead.
[Obsolete("Use StandardTokenizer, CJKWidthFilter, CJKBigramFilter, and LowerCaseFilter instead.")]
public sealed class CJKTokenizer : Tokenizer
{
    //~ Static fields/initializers ---------------------------------------------

    /// <summary>
    /// Word token type (default; never actually emitted by incrementToken) </summary>
    internal const int WORD_TYPE = 0;

    /// <summary>
    /// Single byte token type: runs of ASCII/Latin letters, digits, '_', '+', '#' </summary>
    internal const int SINGLE_TOKEN_TYPE = 1;

    /// <summary>
    /// Double byte token type: overlapping CJK character bigrams </summary>
    internal const int DOUBLE_TOKEN_TYPE = 2;

    /// <summary>
    /// Names for token types, indexed by the type constants above </summary>
    internal static readonly string[] TOKEN_TYPE_NAMES = new string[] {"word", "single", "double"};

    /// <summary>
    /// Max word length; a Latin token is cut off when it reaches this size </summary>
    private const int MAX_WORD_LEN = 255;

    /// <summary>
    /// I/O buffer size (chars read from the input per refill) </summary>
    private const int IO_BUFFER_SIZE = 256;

    //~ Instance fields --------------------------------------------------------

    /// <summary>
    /// word offset: absolute position (in characters consumed from the input)
    /// of the character currently being examined; used to compute token offsets </summary>
    private int offset = 0;

    /// <summary>
    /// the read index used only for ioBuffer </summary>
    private int bufferIndex = 0;

    /// <summary>
    /// data length: number of chars last read into ioBuffer, or -1 once the
    /// underlying reader reports end of stream </summary>
    private int dataLen = 0;

    /// <summary>
    /// character buffer, stores the characters which are used to compose
    /// the returned Token
    /// </summary>
    private readonly char[] buffer = new char[MAX_WORD_LEN];

    /// <summary>
    /// I/O buffer, used to store the content of the input (one of the
    /// members of Tokenizer)
    /// </summary>
    private readonly char[] ioBuffer = new char[IO_BUFFER_SIZE];

    /// <summary>
    /// word type: single=>ASCII double=>non-ASCII word=>default </summary>
    private int tokenType = WORD_TYPE;

    /// <summary>
    /// tag: the buffered character was already emitted as the second half of a
    /// previous bigram. E.g. for "C1C2C3C4": emit C1C2 (C1 is tokened), then
    /// C2C3 (C2 is tokened), then C3C4, producing "C1C2 C2C3 C3C4".
    /// </summary>
    private bool preIsTokened = false;

    // NOTE(review): C# forbids referencing the instance method addAttribute in
    // a field initializer (CS0236) - these raw-port initializers need to move
    // into the constructors before this file can compile.
    private readonly CharTermAttribute termAtt = addAttribute(typeof(CharTermAttribute));
    private readonly OffsetAttribute offsetAtt = addAttribute(typeof(OffsetAttribute));
    private readonly TypeAttribute typeAtt = addAttribute(typeof(TypeAttribute));

    //~ Constructors -----------------------------------------------------------

    /// <summary>
    /// Construct a token stream processing the given input.
    /// </summary>
    /// <param name="in"> I/O reader (NOTE(review): Java Reader type from the raw
    /// port - presumably becomes TextReader in the finished port; confirm) </param>
    public CJKTokenizer(Reader @in) : base(@in)
    {
    }

    /// <summary>
    /// Construct a token stream processing the given input, using the supplied
    /// AttributeFactory to create attribute instances.
    /// </summary>
    public CJKTokenizer(AttributeFactory factory, Reader @in) : base(factory, @in)
    {
    }

    //~ Methods ----------------------------------------------------------------

    /// <summary>
    /// Returns true for the next token in the stream, or false at EOS.
    /// See http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html
    /// for detail.
    /// </summary>
    /// <returns> false for end of stream, true otherwise </returns>
    /// <exception cref="java.io.IOException"> when a read error happens in the
    /// underlying input (Java artifact; .NET would surface System.IO.IOException) </exception>
    public override bool incrementToken()
    {
        clearAttributes();

        while (true) // loop until we find a non-empty token
        {
            // how many character(s) have been stored in buffer
            int length = 0;

            // the start position used to create the Token
            int start = offset;

            while (true) // loop until we've found a full token
            {
                // current character
                char c;

                // unicode block of current character
                // NOTE(review): char.UnicodeBlock is a Java artifact
                // (java.lang.Character.UnicodeBlock); it does not exist in
                // .NET and needs a support class or explicit range checks.
                char.UnicodeBlock ub;

                offset++;

                if (bufferIndex >= dataLen)
                {
                    // refill the I/O buffer; read() returns -1 at end of input
                    dataLen = input.read(ioBuffer);
                    bufferIndex = 0;
                }

                if (dataLen == -1)
                {
                    // end of stream reached
                    if (length > 0)
                    {
                        if (preIsTokened == true)
                        {
                            // the single buffered char was already emitted as
                            // part of a previous bigram; discard it
                            length = 0;
                            preIsTokened = false;
                        }
                        else
                        {
                            offset--;
                        }

                        break;
                    }
                    else
                    {
                        offset--;
                        return false;
                    }
                }
                else
                {
                    //get current character
                    c = ioBuffer[bufferIndex++];

                    //get the UnicodeBlock of the current character
                    ub = char.UnicodeBlock.of(c);
                }

                //if the current character is ASCII or Extend ASCII
                if ((ub == char.UnicodeBlock.BASIC_LATIN) || (ub == char.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS))
                {
                    if (ub == char.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS)
                    {
                        int i = (int) c;
                        if (i >= 65281 && i <= 65374)
                        {
                            // convert certain HALFWIDTH_AND_FULLWIDTH_FORMS
                            // (U+FF01..U+FF5E) to BASIC_LATIN by the fixed
                            // fullwidth/halfwidth delta of 65248 (0xFEE0)
                            i = i - 65248;
                            c = (char) i;
                        }
                    }

                    // if the current character is a letter or "_" "+" "#"
                    if (char.IsLetterOrDigit(c) || ((c == '_') || (c == '+') || (c == '#')))
                    {
                        if (length == 0)
                        {
                            // "javaC1C2C3C4linux" <br>
                            //      ^--: the current character begins an ASCII token
                            start = offset - 1;
                        }
                        else if (tokenType == DOUBLE_TOKEN_TYPE)
                        {
                            // "javaC1C2C3C4linux" <br>
                            //              ^--: previous char was non-ASCII and the
                            // current one is ASCII: push the current char back and
                            // emit the buffered CJK token first
                            offset--;
                            bufferIndex--;

                            if (preIsTokened == true)
                            {
                                // only one non-ASCII char is buffered and it was
                                // already emitted inside a previous bigram
                                length = 0;
                                preIsTokened = false;
                                break;
                            }
                            else
                            {
                                break;
                            }
                        }

                        // store the LowerCase(c) in the buffer
                        buffer[length++] = char.ToLower(c);
                        tokenType = SINGLE_TOKEN_TYPE;

                        // break the procedure if buffer overflowed!
                        if (length == MAX_WORD_LEN)
                        {
                            break;
                        }
                    }
                    else if (length > 0)
                    {
                        // non-token ASCII char (punctuation/whitespace) ends the token
                        if (preIsTokened == true)
                        {
                            length = 0;
                            preIsTokened = false;
                        }
                        else
                        {
                            break;
                        }
                    }
                }
                else
                {
                    // non-ASCII letter, e.g."C1C2C3C4"
                    if (char.IsLetter(c))
                    {
                        if (length == 0)
                        {
                            // start a new CJK bigram
                            start = offset - 1;
                            buffer[length++] = c;
                            tokenType = DOUBLE_TOKEN_TYPE;
                        }
                        else
                        {
                            if (tokenType == SINGLE_TOKEN_TYPE)
                            {
                                // ASCII run ends at a CJK char: push it back and
                                // return the previous ASCII characters
                                offset--;
                                bufferIndex--;

                                break;
                            }
                            else
                            {
                                buffer[length++] = c;
                                tokenType = DOUBLE_TOKEN_TYPE;

                                if (length == 2)
                                {
                                    // bigram complete; push the second char back so
                                    // it also starts the next (overlapping) bigram
                                    offset--;
                                    bufferIndex--;
                                    preIsTokened = true;

                                    break;
                                }
                            }
                        }
                    }
                    else if (length > 0)
                    {
                        // non-letter ends the current token
                        if (preIsTokened == true)
                        {
                            // empty the buffer
                            length = 0;
                            preIsTokened = false;
                        }
                        else
                        {
                            break;
                        }
                    }
                }
            }

            if (length > 0)
            {
                // publish term text, offsets, and the token-type name
                termAtt.copyBuffer(buffer, 0, length);
                offsetAtt.setOffset(correctOffset(start), correctOffset(start + length));
                typeAtt.Type = TOKEN_TYPE_NAMES[tokenType];
                return true;
            }
            else if (dataLen == -1)
            {
                offset--;
                return false;
            }

            // Cycle back and try for the next token (don't
            // return an empty string)
        }
    }

    /// <summary>
    /// Sets the final offset on the offset attribute once the stream is exhausted. </summary>
    public override void end()
    {
        base.end();
        // set final offset
        int finalOffset = correctOffset(offset);
        this.offsetAtt.setOffset(finalOffset, finalOffset);
    }

    /// <summary>
    /// Resets all tokenizer state so the same instance can re-tokenize a new input. </summary>
    public override void reset()
    {
        base.reset();
        offset = bufferIndex = dataLen = 0;
        preIsTokened = false;
        tokenType = WORD_TYPE;
    }
}
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKTokenizerFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKTokenizerFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKTokenizerFactory.cs
new file mode 100644
index 0000000..526b1b4
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKTokenizerFactory.cs
@@ -0,0 +1,58 @@
+using System;
+using System.Collections.Generic;
+
+namespace org.apache.lucene.analysis.cjk
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ using TokenizerFactory = org.apache.lucene.analysis.util.TokenizerFactory;
+ using AttributeFactory = org.apache.lucene.util.AttributeSource.AttributeFactory;
+
+
/// <summary>
/// Factory for <seealso cref="CJKTokenizer"/>.
/// <pre class="prettyprint" >
/// &lt;fieldType name="text_cjk" class="solr.TextField" positionIncrementGap="100"&gt;
///   &lt;analyzer&gt;
///     &lt;tokenizer class="solr.CJKTokenizerFactory"/&gt;
///   &lt;/analyzer&gt;
/// &lt;/fieldType&gt;</pre> </summary>
/// @deprecated Use <seealso cref="CJKBigramFilterFactory"/> instead.
// Fixed from the raw port: the Obsolete message contained unescaped nested
// double quotes ("Use <seealso cref="..."/> ..."), which is a C# syntax error.
[Obsolete("Use CJKBigramFilterFactory instead.")]
public class CJKTokenizerFactory : TokenizerFactory
{
    /// <summary>
    /// Creates a new CJKTokenizerFactory </summary>
    /// <param name="args"> factory arguments; must be empty once consumed by the base class </param>
    /// <exception cref="System.ArgumentException"> if any unconsumed arguments remain </exception>
    public CJKTokenizerFactory(IDictionary<string, string> args) : base(args)
    {
        if (args.Count > 0)
        {
            throw new System.ArgumentException("Unknown parameters: " + args);
        }
    }

    /// <summary>
    /// Creates a <seealso cref="CJKTokenizer"/> over the given reader. </summary>
    public override CJKTokenizer create(AttributeFactory factory, Reader @in)
    {
        return new CJKTokenizer(factory, @in);
    }
}
+
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKWidthFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKWidthFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKWidthFilter.cs
new file mode 100644
index 0000000..8beffcc
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKWidthFilter.cs
@@ -0,0 +1,113 @@
+namespace org.apache.lucene.analysis.cjk
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+ using StemmerUtil = org.apache.lucene.analysis.util.StemmerUtil;
+
/// <summary>
/// A <seealso cref="TokenFilter"/> that normalizes CJK width differences:
/// <ul>
/// <li>Folds fullwidth ASCII variants into the equivalent basic latin
/// <li>Folds halfwidth Katakana variants into the equivalent kana
/// </ul>
/// <para>
/// NOTE: this filter can be viewed as a (practical) subset of NFKC/NFKD
/// Unicode normalization. See the normalization support in the ICU package
/// for full normalization.
/// </para>
/// </summary>
public sealed class CJKWidthFilter : TokenFilter
{
    // Fixed from the raw port: addAttribute is an instance method and cannot
    // be called from a C# field initializer (CS0236); it is resolved in the
    // constructor instead.
    private readonly CharTermAttribute termAtt;

    /* halfwidth kana mappings: 0xFF65-0xFF9D
     *
     * note: 0xFF9C and 0xFF9D are only mapped to 0x3099 and 0x309A
     * as a fallback when they cannot properly combine with a preceding
     * character into a composed form.
     */
    // Fixed from the raw port: C# has no implicit int->char conversion, so the
    // table must use char literals (same code points as before).
    private static readonly char[] KANA_NORM = new char[] {
        '\u30FB', '\u30F2', '\u30A1', '\u30A3', '\u30A5', '\u30A7', '\u30A9', '\u30E3',
        '\u30E5', '\u30E7', '\u30C3', '\u30FC', '\u30A2', '\u30A4', '\u30A6', '\u30A8',
        '\u30AA', '\u30AB', '\u30AD', '\u30AF', '\u30B1', '\u30B3', '\u30B5', '\u30B7',
        '\u30B9', '\u30BB', '\u30BD', '\u30BF', '\u30C1', '\u30C4', '\u30C6', '\u30C8',
        '\u30CA', '\u30CB', '\u30CC', '\u30CD', '\u30CE', '\u30CF', '\u30D2', '\u30D5',
        '\u30D8', '\u30DB', '\u30DE', '\u30DF', '\u30E0', '\u30E1', '\u30E2', '\u30E4',
        '\u30E6', '\u30E8', '\u30E9', '\u30EA', '\u30EB', '\u30EC', '\u30ED', '\u30EF',
        '\u30F3', '\u3099', '\u309A'};

    /// <summary>
    /// Creates a new CJKWidthFilter wrapping the given stream. </summary>
    /// <param name="input"> token stream whose terms will be width-normalized in place </param>
    public CJKWidthFilter(TokenStream input) : base(input)
    {
        termAtt = addAttribute(typeof(CharTermAttribute));
    }

    /// <summary>
    /// Normalizes the current term buffer in place; combining a halfwidth
    /// voice mark into the preceding kana may shorten the term by one char. </summary>
    public override bool incrementToken()
    {
        if (input.incrementToken())
        {
            char[] text = termAtt.buffer();
            int length = termAtt.length();
            for (int i = 0; i < length; i++)
            {
                char ch = text[i];
                if (ch >= 0xFF01 && ch <= 0xFF5E)
                {
                    // Fullwidth ASCII variants: fixed delta down to BASIC_LATIN.
                    // (explicit cast: char compound assignment with an int
                    // constant does not compile in C#)
                    text[i] = (char)(text[i] - 0xFEE0);
                }
                else if (ch >= 0xFF65 && ch <= 0xFF9F)
                {
                    // Halfwidth Katakana variants
                    if ((ch == 0xFF9E || ch == 0xFF9F) && i > 0 && combine(text, i, ch))
                    {
                        // voice mark was merged into the previous kana; remove it
                        length = StemmerUtil.delete(text, i--, length);
                    }
                    else
                    {
                        text[i] = KANA_NORM[ch - 0xFF65];
                    }
                }
            }
            termAtt.Length = length;
            return true;
        }
        else
        {
            return false;
        }
    }

    /* kana combining diffs: 0x30A6-0x30FD */
    private static readonly sbyte[] KANA_COMBINE_VOICED = new sbyte[] {78, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};

    private static readonly sbyte[] KANA_COMBINE_HALF_VOICED = new sbyte[] {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 2, 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

    /// <summary>
    /// Returns true if we successfully combined the voice mark: text[pos-1]
    /// is replaced by its (half-)voiced composed form when a mapping exists. </summary>
    private static bool combine(char[] text, int pos, char ch)
    {
        char prev = text[pos - 1];
        if (prev >= 0x30A6 && prev <= 0x30FD)
        {
            // Fixed from the raw port: char += sbyte compound assignment does
            // not compile in C#; compute and cast explicitly. A zero table
            // entry leaves prev unchanged, so the method reports failure.
            text[pos - 1] = (char)(prev + ((ch == 0xFF9F) ? KANA_COMBINE_HALF_VOICED[prev - 0x30A6] : KANA_COMBINE_VOICED[prev - 0x30A6]));
            return text[pos - 1] != prev;
        }
        return false;
    }
}
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKWidthFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKWidthFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKWidthFilterFactory.cs
new file mode 100644
index 0000000..a917f90
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKWidthFilterFactory.cs
@@ -0,0 +1,66 @@
+using System.Collections.Generic;
+
+namespace org.apache.lucene.analysis.cjk
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ using AbstractAnalysisFactory = org.apache.lucene.analysis.util.AbstractAnalysisFactory;
+ using MultiTermAwareComponent = org.apache.lucene.analysis.util.MultiTermAwareComponent;
+ using TokenFilterFactory = org.apache.lucene.analysis.util.TokenFilterFactory;
+
/// <summary>
/// Factory for <seealso cref="CJKWidthFilter"/>.
/// <pre class="prettyprint">
/// &lt;fieldType name="text_cjk" class="solr.TextField"&gt;
///   &lt;analyzer&gt;
///     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
///     &lt;filter class="solr.CJKWidthFilterFactory"/&gt;
///     &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
///     &lt;filter class="solr.CJKBigramFilterFactory"/&gt;
///   &lt;/analyzer&gt;
/// &lt;/fieldType&gt;</pre>
/// </summary>
public class CJKWidthFilterFactory : TokenFilterFactory, MultiTermAwareComponent
{
    /// <summary>
    /// Creates a new CJKWidthFilterFactory; any argument left unconsumed by
    /// the base class is rejected. </summary>
    public CJKWidthFilterFactory(IDictionary<string, string> args) : base(args)
    {
        if (args.Count != 0)
        {
            throw new System.ArgumentException("Unknown parameters: " + args);
        }
    }

    /// <summary>
    /// Wraps the given stream in a <seealso cref="CJKWidthFilter"/>. </summary>
    public override TokenStream create(TokenStream input)
    {
        return new CJKWidthFilter(input);
    }

    /// <summary>
    /// This factory also applies to multi-term queries, so it returns itself. </summary>
    public virtual AbstractAnalysisFactory MultiTermComponent
    {
        get { return this; }
    }
}
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Ckb/SoraniAnalyzer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ckb/SoraniAnalyzer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ckb/SoraniAnalyzer.cs
new file mode 100644
index 0000000..d964550
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Ckb/SoraniAnalyzer.cs
@@ -0,0 +1,139 @@
+using System;
+
+namespace org.apache.lucene.analysis.ckb
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+ using LowerCaseFilter = org.apache.lucene.analysis.core.LowerCaseFilter;
+ using StopFilter = org.apache.lucene.analysis.core.StopFilter;
+ using SetKeywordMarkerFilter = org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+ using StandardFilter = org.apache.lucene.analysis.standard.StandardFilter;
+ using StandardTokenizer = org.apache.lucene.analysis.standard.StandardTokenizer;
+ using CharArraySet = org.apache.lucene.analysis.util.CharArraySet;
+ using StopwordAnalyzerBase = org.apache.lucene.analysis.util.StopwordAnalyzerBase;
+ using WordlistLoader = org.apache.lucene.analysis.util.WordlistLoader;
+ using IOUtils = org.apache.lucene.util.IOUtils;
+ using Version = org.apache.lucene.util.Version;
+
/// <summary>
/// <seealso cref="Analyzer"/> for Sorani Kurdish.
/// </summary>
public sealed class SoraniAnalyzer : StopwordAnalyzerBase
{
    // terms excluded from stemming; never null, but may be the empty set
    private readonly CharArraySet stemExclusionSet;

    /// <summary>
    /// File containing default Kurdish stopwords. </summary>
    public const string DEFAULT_STOPWORD_FILE = "stopwords.txt";

    /// <summary>
    /// Returns an unmodifiable instance of the default stop words set. </summary>
    /// <returns> default stop words set. </returns>
    public static CharArraySet DefaultStopSet
    {
        get
        {
            return DefaultSetHolder.DEFAULT_STOP_SET;
        }
    }

    /// <summary>
    /// Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer
    /// class accesses the static set the first time (static-holder idiom:
    /// the .NET static constructor gives the same once-only guarantee).
    /// </summary>
    private class DefaultSetHolder
    {
        internal static readonly CharArraySet DEFAULT_STOP_SET;

        static DefaultSetHolder()
        {
            try
            {
                // NOTE(review): StandardCharsets.UTF_8 and the bare IOException
                // below are Java artifacts of the automated port - presumably
                // become System.Text.Encoding.UTF8 / System.IO.IOException in
                // the finished port; confirm when wiring up the support layer.
                DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(typeof(SoraniAnalyzer), DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
            }
            catch (IOException)
            {
                // default set should always be present as it is part of the
                // distribution (JAR)
                throw new Exception("Unable to load default stopword set");
            }
        }
    }

    /// <summary>
    /// Builds an analyzer with the default stop words: <seealso cref="#DEFAULT_STOPWORD_FILE"/>.
    /// </summary>
    public SoraniAnalyzer(Version matchVersion) : this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET)
    {
    }

    /// <summary>
    /// Builds an analyzer with the given stop words.
    /// </summary>
    /// <param name="matchVersion"> lucene compatibility version </param>
    /// <param name="stopwords"> a stopword set </param>
    public SoraniAnalyzer(Version matchVersion, CharArraySet stopwords) : this(matchVersion, stopwords, CharArraySet.EMPTY_SET)
    {
    }

    /// <summary>
    /// Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
    /// provided this analyzer will add a <seealso cref="SetKeywordMarkerFilter"/> before
    /// stemming.
    /// </summary>
    /// <param name="matchVersion"> lucene compatibility version </param>
    /// <param name="stopwords"> a stopword set </param>
    /// <param name="stemExclusionSet"> a set of terms not to be stemmed </param>
    public SoraniAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) : base(matchVersion, stopwords)
    {
        // defensive copy so later mutation by the caller cannot affect us
        this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
    }

    /// <summary>
    /// Creates a
    /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
    /// which tokenizes all the text in the provided <seealso cref="Reader"/>.
    /// </summary>
    /// <returns> A
    /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
    /// built from an <seealso cref="StandardTokenizer"/> filtered with
    /// <seealso cref="StandardFilter"/>, <seealso cref="SoraniNormalizationFilter"/>,
    /// <seealso cref="LowerCaseFilter"/>, <seealso cref="StopFilter"/>
    /// , <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
    /// provided and <seealso cref="SoraniStemFilter"/>. </returns>
    protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
    {
        // filter chain order matters: normalize orthography before
        // lowercasing/stopping, and mark keywords before stemming
        Tokenizer source = new StandardTokenizer(matchVersion, reader);
        TokenStream result = new StandardFilter(matchVersion, source);
        result = new SoraniNormalizationFilter(result);
        result = new LowerCaseFilter(matchVersion, result);
        result = new StopFilter(matchVersion, result, stopwords);
        if (!stemExclusionSet.Empty)
        {
            result = new SetKeywordMarkerFilter(result, stemExclusionSet);
        }
        result = new SoraniStemFilter(result);
        return new TokenStreamComponents(source, result);
    }
}
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Ckb/SoraniNormalizationFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ckb/SoraniNormalizationFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ckb/SoraniNormalizationFilter.cs
new file mode 100644
index 0000000..17133ba
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Ckb/SoraniNormalizationFilter.cs
@@ -0,0 +1,52 @@
+namespace org.apache.lucene.analysis.ckb
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
/// <summary>
/// A <seealso cref="TokenFilter"/> that applies <seealso cref="SoraniNormalizer"/> to normalize the
/// orthography.
/// </summary>
public sealed class SoraniNormalizationFilter : TokenFilter
{
    private readonly SoraniNormalizer normalizer = new SoraniNormalizer();

    // Fixed from the raw port: addAttribute is an instance method and cannot
    // be called from a C# field initializer (CS0236); resolved in the ctor.
    private readonly CharTermAttribute termAtt;

    /// <summary>
    /// Creates a new SoraniNormalizationFilter wrapping the given stream. </summary>
    /// <param name="input"> token stream whose terms will be normalized in place </param>
    public SoraniNormalizationFilter(TokenStream input) : base(input)
    {
        termAtt = addAttribute(typeof(CharTermAttribute));
    }

    /// <summary>
    /// Normalizes the current term buffer in place; normalization may shorten
    /// the term, so the term length is updated from the normalizer's result. </summary>
    public override bool incrementToken()
    {
        if (input.incrementToken())
        {
            int newlen = normalizer.normalize(termAtt.buffer(), termAtt.length());
            termAtt.Length = newlen;
            return true;
        }
        return false;
    }
}
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Ckb/SoraniNormalizationFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ckb/SoraniNormalizationFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ckb/SoraniNormalizationFilterFactory.cs
new file mode 100644
index 0000000..5f68eb7
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Ckb/SoraniNormalizationFilterFactory.cs
@@ -0,0 +1,64 @@
+using System.Collections.Generic;
+
+namespace org.apache.lucene.analysis.ckb
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ using AbstractAnalysisFactory = org.apache.lucene.analysis.util.AbstractAnalysisFactory;
+ using MultiTermAwareComponent = org.apache.lucene.analysis.util.MultiTermAwareComponent;
+ using TokenFilterFactory = org.apache.lucene.analysis.util.TokenFilterFactory;
+
/// <summary>
/// Factory for <seealso cref="SoraniNormalizationFilter"/>.
/// <pre class="prettyprint">
/// &lt;fieldType name="text_ckbnormal" class="solr.TextField" positionIncrementGap="100"&gt;
///   &lt;analyzer&gt;
///     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
///     &lt;filter class="solr.SoraniNormalizationFilterFactory"/&gt;
///   &lt;/analyzer&gt;
/// &lt;/fieldType&gt;</pre>
/// </summary>
public class SoraniNormalizationFilterFactory : TokenFilterFactory, MultiTermAwareComponent
{
    /// <summary>
    /// Creates a new SoraniNormalizationFilterFactory; any argument left
    /// unconsumed by the base class is rejected. </summary>
    public SoraniNormalizationFilterFactory(IDictionary<string, string> args) : base(args)
    {
        if (args.Count != 0)
        {
            throw new System.ArgumentException("Unknown parameters: " + args);
        }
    }

    /// <summary>
    /// Wraps the given stream in a <seealso cref="SoraniNormalizationFilter"/>. </summary>
    public override SoraniNormalizationFilter create(TokenStream input)
    {
        return new SoraniNormalizationFilter(input);
    }

    /// <summary>
    /// This factory also applies to multi-term queries, so it returns itself. </summary>
    public virtual AbstractAnalysisFactory MultiTermComponent
    {
        get { return this; }
    }
}
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Ckb/SoraniNormalizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ckb/SoraniNormalizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ckb/SoraniNormalizer.cs
new file mode 100644
index 0000000..9c3f551
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Ckb/SoraniNormalizer.cs
@@ -0,0 +1,140 @@
+namespace org.apache.lucene.analysis.ckb
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
/// <summary>
/// Normalizes the Unicode representation of Sorani text.
/// <para>
/// Normalization consists of:
/// <ul>
/// <li>Alternate forms of 'y' (064A, 0649) are converted to 06CC (FARSI YEH)
/// <li>Alternate form of 'k' (0643) is converted to 06A9 (KEHEH)
/// <li>Alternate forms of vowel 'e' (0647+200C, word-final 0647, 0629) are converted to 06D5 (AE)
/// <li>Alternate (joining) form of 'h' (06BE) is converted to 0647
/// <li>Alternate forms of 'rr' (0692, word-initial 0631) are converted to 0695 (REH WITH SMALL V BELOW)
/// <li>Harakat, tatweel, and formatting characters such as directional controls are removed.
/// </ul>
/// </para>
/// </summary>
public class SoraniNormalizer
{
    internal const char YEH = '\u064A';
    internal const char DOTLESS_YEH = '\u0649';
    internal const char FARSI_YEH = '\u06CC';

    internal const char KAF = '\u0643';
    internal const char KEHEH = '\u06A9';

    internal const char HEH = '\u0647';
    internal const char AE = '\u06D5';
    internal const char ZWNJ = '\u200C';
    internal const char HEH_DOACHASHMEE = '\u06BE';
    internal const char TEH_MARBUTA = '\u0629';

    internal const char REH = '\u0631';
    internal const char RREH = '\u0695';
    internal const char RREH_ABOVE = '\u0692';

    internal const char TATWEEL = '\u0640';
    internal const char FATHATAN = '\u064B';
    internal const char DAMMATAN = '\u064C';
    internal const char KASRATAN = '\u064D';
    internal const char FATHA = '\u064E';
    internal const char DAMMA = '\u064F';
    internal const char KASRA = '\u0650';
    internal const char SHADDA = '\u0651';
    internal const char SUKUN = '\u0652';

    /// <summary>
    /// Normalize an input buffer of Sorani text in place.
    /// </summary>
    /// <param name="s"> input buffer </param>
    /// <param name="len"> length of input buffer </param>
    /// <returns> length of input buffer after normalization </returns>
    public virtual int normalize(char[] s, int len)
    {
        for (int i = 0; i < len; i++)
        {
            switch (s[i])
            {
                case YEH:
                case DOTLESS_YEH:
                    s[i] = FARSI_YEH;
                    break;
                case KAF:
                    s[i] = KEHEH;
                    break;
                case ZWNJ:
                    // HEH + ZWNJ represents the vowel 'e': fold the HEH to AE
                    // and drop the joiner itself.
                    if (i > 0 && s[i - 1] == HEH)
                    {
                        s[i - 1] = AE;
                    }
                    len = delete(s, i, len);
                    i--; // re-examine the character shifted into slot i
                    break;
                case HEH:
                    if (i == len - 1) // word-final HEH only
                    {
                        s[i] = AE;
                    }
                    break;
                case TEH_MARBUTA:
                    s[i] = AE;
                    break;
                case HEH_DOACHASHMEE:
                    s[i] = HEH;
                    break;
                case REH:
                    if (i == 0) // word-initial REH only
                    {
                        s[i] = RREH;
                    }
                    break;
                case RREH_ABOVE:
                    s[i] = RREH;
                    break;
                case TATWEEL:
                case KASRATAN:
                case DAMMATAN:
                case FATHATAN:
                case FATHA:
                case DAMMA:
                case KASRA:
                case SHADDA:
                case SUKUN:
                    // Harakat and tatweel carry no lexical information: remove.
                    len = delete(s, i, len);
                    i--;
                    break;
                default:
                    // Replaces the Java Character.getType(...) == FORMAT test,
                    // which does not exist on System.Char: strip formatting
                    // characters such as directional controls.
                    if (System.Globalization.CharUnicodeInfo.GetUnicodeCategory(s[i]) == System.Globalization.UnicodeCategory.Format)
                    {
                        len = delete(s, i, len);
                        i--;
                    }
                    break;
            }
        }
        return len;
    }

    /// <summary>
    /// Removes the character at <paramref name="pos"/> by shifting the tail
    /// of the buffer one slot to the left. Local port of the Java static
    /// import StemmerUtil.delete that the converter could not translate. </summary>
    /// <param name="s"> buffer to edit in place </param>
    /// <param name="pos"> index of the character to remove </param>
    /// <param name="len"> current valid length of the buffer </param>
    /// <returns> the new valid length (always len - 1) </returns>
    private static int delete(char[] s, int pos, int len)
    {
        if (pos < len - 1) // nothing to shift when deleting the last char
        {
            System.Array.Copy(s, pos + 1, s, pos, len - pos - 1);
        }
        return len - 1;
    }
}
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Ckb/SoraniStemFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ckb/SoraniStemFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ckb/SoraniStemFilter.cs
new file mode 100644
index 0000000..5d79be0
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Ckb/SoraniStemFilter.cs
@@ -0,0 +1,66 @@
+namespace org.apache.lucene.analysis.ckb
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ using SetKeywordMarkerFilter = org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; // javadoc @link
+ using KeywordAttribute = org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+ using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
/// <summary>
/// A <seealso cref="TokenFilter"/> that applies <seealso cref="SoraniStemmer"/> to stem Sorani words.
/// <para>
/// To prevent terms from being stemmed use an instance of
/// <seealso cref="SetKeywordMarkerFilter"/> or a custom <seealso cref="TokenFilter"/> that sets
/// the <seealso cref="KeywordAttribute"/> before this <seealso cref="TokenStream"/>.
/// </para> </summary>
/// <seealso cref= SetKeywordMarkerFilter </seealso>
public sealed class SoraniStemFilter : TokenFilter
{
    private readonly SoraniStemmer stemmer = new SoraniStemmer();
    private readonly CharTermAttribute termAtt;
    private readonly KeywordAttribute keywordAttr;

    public SoraniStemFilter(TokenStream input) : base(input)
    {
        // C# field initializers may not reference the inherited instance
        // method addAttribute (compile error CS0236), so the attributes are
        // resolved here instead of at the declarations.
        termAtt = addAttribute(typeof(CharTermAttribute));
        keywordAttr = addAttribute(typeof(KeywordAttribute));
    }

    /// <summary>
    /// Advances to the next token and stems it in place, unless the token
    /// is marked as a keyword. </summary>
    /// <returns> true if a token was produced; false at end of stream </returns>
    public override bool incrementToken()
    {
        if (input.incrementToken())
        {
            if (!keywordAttr.Keyword)
            {
                // The stemmer shortens the term buffer in place; only the
                // attribute's length needs updating afterwards.
                int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
                termAtt.Length = newlen;
            }
            return true;
        }
        else
        {
            return false;
        }
    }
}
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Ckb/SoraniStemFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ckb/SoraniStemFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ckb/SoraniStemFilterFactory.cs
new file mode 100644
index 0000000..67018ad
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Ckb/SoraniStemFilterFactory.cs
@@ -0,0 +1,55 @@
+using System.Collections.Generic;
+
+namespace org.apache.lucene.analysis.ckb
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ using TokenFilterFactory = org.apache.lucene.analysis.util.TokenFilterFactory;
+
/// <summary>
/// Factory that creates <seealso cref="SoraniStemFilter"/> instances.
/// Example Solr configuration:
/// <pre class="prettyprint">
/// <fieldType name="text_ckbstem" class="solr.TextField" positionIncrementGap="100">
///   <analyzer>
///     <tokenizer class="solr.StandardTokenizerFactory"/>
///     <filter class="solr.SoraniNormalizationFilterFactory"/>
///     <filter class="solr.SoraniStemFilterFactory"/>
///   </analyzer>
/// </fieldType></pre>
/// </summary>
public class SoraniStemFilterFactory : TokenFilterFactory
{
    /// <summary>
    /// Creates a new SoraniStemFilterFactory; no arguments are supported. </summary>
    /// <param name="args"> factory arguments; anything left after the base
    /// constructor runs is unrecognized </param>
    public SoraniStemFilterFactory(IDictionary<string, string> args) : base(args)
    {
        if (args.Count != 0)
        {
            throw new System.ArgumentException("Unknown parameters: " + args);
        }
    }

    /// <summary>
    /// Wraps the given stream in a <seealso cref="SoraniStemFilter"/>. </summary>
    public override SoraniStemFilter create(TokenStream input)
    {
        return new SoraniStemFilter(input);
    }
}
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Ckb/SoraniStemmer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ckb/SoraniStemmer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ckb/SoraniStemmer.cs
new file mode 100644
index 0000000..4ec57cb
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Ckb/SoraniStemmer.cs
@@ -0,0 +1,139 @@
+namespace org.apache.lucene.analysis.ckb
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
/// <summary>
/// Light stemmer for Sorani: strips postpositions, possessive pronouns, and
/// singular/plural/ezafe endings from the tail of the term buffer.
/// </summary>
public class SoraniStemmer
{
    /// <summary>
    /// Stem an input buffer of Sorani text in place.
    /// </summary>
    /// <param name="s"> input buffer </param>
    /// <param name="len"> length of input buffer </param>
    /// <returns> length of input buffer after normalization </returns>
    public virtual int stem(char[] s, int len)
    {
        // postposition
        if (len > 5 && endsWith(s, len, "دا"))
        {
            len -= 2;
        }
        else if (len > 4 && endsWith(s, len, "نا"))
        {
            len--;
        }
        else if (len > 6 && endsWith(s, len, "ەوە"))
        {
            len -= 3;
        }

        // possessive pronoun
        if (len > 6 && (endsWith(s, len, "مان") || endsWith(s, len, "یان") || endsWith(s, len, "تان")))
        {
            len -= 3;
        }

        // indefinite singular ezafe
        if (len > 6 && endsWith(s, len, "ێکی"))
        {
            return len - 3;
        }
        else if (len > 7 && endsWith(s, len, "یەکی"))
        {
            return len - 4;
        }
        // indefinite singular
        if (len > 5 && endsWith(s, len, "ێک"))
        {
            return len - 2;
        }
        else if (len > 6 && endsWith(s, len, "یەک"))
        {
            return len - 3;
        }
        // definite singular
        else if (len > 6 && endsWith(s, len, "ەکە"))
        {
            return len - 3;
        }
        else if (len > 5 && endsWith(s, len, "کە"))
        {
            return len - 2;
        }
        // definite plural
        else if (len > 7 && endsWith(s, len, "ەکان"))
        {
            return len - 4;
        }
        else if (len > 6 && endsWith(s, len, "کان"))
        {
            return len - 3;
        }
        // indefinite plural ezafe
        else if (len > 7 && endsWith(s, len, "یانی"))
        {
            return len - 4;
        }
        else if (len > 6 && endsWith(s, len, "انی"))
        {
            return len - 3;
        }
        // indefinite plural
        else if (len > 6 && endsWith(s, len, "یان"))
        {
            return len - 3;
        }
        else if (len > 5 && endsWith(s, len, "ان"))
        {
            return len - 2;
        }
        // demonstrative plural
        else if (len > 7 && endsWith(s, len, "یانە"))
        {
            return len - 4;
        }
        else if (len > 6 && endsWith(s, len, "انە"))
        {
            return len - 3;
        }
        // demonstrative singular
        else if (len > 5 && (endsWith(s, len, "ایە") || endsWith(s, len, "ەیە")))
        {
            return len - 2;
        }
        else if (len > 4 && endsWith(s, len, "ە"))
        {
            return len - 1;
        }
        // absolute singular ezafe
        else if (len > 4 && endsWith(s, len, "ی"))
        {
            return len - 1;
        }
        return len;
    }

    /// <summary>
    /// Returns true if the first <paramref name="len"/> chars of
    /// <paramref name="s"/> end with <paramref name="suffix"/>. Local port of
    /// the Java static import StemmerUtil.endsWith that the converter could
    /// not translate. </summary>
    private static bool endsWith(char[] s, int len, string suffix)
    {
        int suffixLen = suffix.Length;
        if (suffixLen > len)
        {
            return false;
        }
        for (int i = suffixLen - 1; i >= 0; i--)
        {
            if (s[len - (suffixLen - i)] != suffix[i])
            {
                return false;
            }
        }
        return true;
    }
}
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseAnalyzer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseAnalyzer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseAnalyzer.cs
new file mode 100644
index 0000000..9023664
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseAnalyzer.cs
@@ -0,0 +1,49 @@
+using System;
+
+namespace org.apache.lucene.analysis.cn
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ using StandardAnalyzer = org.apache.lucene.analysis.standard.StandardAnalyzer; // javadoc @link
+
/// <summary>
/// An <seealso cref="Analyzer"/> that tokenizes text with <seealso cref="ChineseTokenizer"/> and
/// filters with <seealso cref="ChineseFilter"/> </summary>
/// @deprecated (3.1) Use <seealso cref="StandardAnalyzer"/> instead, which has the same functionality.
/// This analyzer will be removed in Lucene 5.0
[Obsolete("(3.1) Use StandardAnalyzer instead, which has the same functionality.")]
public sealed class ChineseAnalyzer : Analyzer
{
    /// <summary>
    /// Creates
    /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
    /// used to tokenize all the text in the provided <seealso cref="Reader"/>.
    /// </summary>
    /// <returns> <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
    /// built from a <seealso cref="ChineseTokenizer"/> filtered with
    /// <seealso cref="ChineseFilter"/> </returns>
    protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
    {
        Tokenizer source = new ChineseTokenizer(reader);
        return new TokenStreamComponents(source, new ChineseFilter(source));
    }
}
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseFilter.cs
new file mode 100644
index 0000000..a631a04
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseFilter.cs
@@ -0,0 +1,104 @@
+using System;
+
+namespace org.apache.lucene.analysis.cn
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+ using StopFilter = org.apache.lucene.analysis.core.StopFilter;
+ using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+ using CharArraySet = org.apache.lucene.analysis.util.CharArraySet;
+ using Version = org.apache.lucene.util.Version;
+
/// <summary>
/// A <seealso cref="TokenFilter"/> with a stop word table.
/// <ul>
/// <li>Numeric tokens are removed.
/// <li>English tokens must be larger than 1 character.
/// <li>One Chinese character as one Chinese word.
/// </ul>
/// TO DO:
/// <ol>
/// <li>Add Chinese stop words, such as \ue400
/// <li>Dictionary based Chinese word extraction
/// <li>Intelligent Chinese word extraction
/// </ol>
/// </summary>
/// @deprecated (3.1) Use <seealso cref="StopFilter"/> instead, which has the same functionality.
/// This filter will be removed in Lucene 5.0
[Obsolete("(3.1) Use StopFilter instead, which has the same functionality.")]
public sealed class ChineseFilter : TokenFilter
{
    /// <summary>
    /// Words removed by this filter. Only English now, Chinese to be added later. </summary>
    public static readonly string[] STOP_WORDS = new string[] {"and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"};

    private CharArraySet stopTable;

    private CharTermAttribute termAtt;

    public ChineseFilter(TokenStream @in) : base(@in)
    {
        // C# field initializers may not reference the inherited instance
        // method addAttribute (compile error CS0236), so resolve it here.
        termAtt = addAttribute(typeof(CharTermAttribute));

        // The Java source wrapped STOP_WORDS in Arrays.asList; the array is
        // passed directly here. NOTE(review): assumes the ported CharArraySet
        // constructor accepts an array as its collection argument — confirm.
        stopTable = new CharArraySet(Version.LUCENE_CURRENT, STOP_WORDS, false);
    }

    /// <summary>
    /// Advances past stop words, bare digits, and single-character English
    /// tokens; emits multi-character English tokens and individual CJK
    /// characters. </summary>
    /// <returns> true if a token was produced; false at end of stream </returns>
    public override bool incrementToken()
    {
        while (input.incrementToken())
        {
            char[] text = termAtt.buffer();
            int termLength = termAtt.length();

            // why not key off token type here assuming ChineseTokenizer comes first?
            if (!stopTable.contains(text, 0, termLength))
            {
                // Replaces Java's Character.getType switch, which has no
                // System.Char equivalent in C#.
                switch (System.Globalization.CharUnicodeInfo.GetUnicodeCategory(text[0]))
                {
                    case System.Globalization.UnicodeCategory.LowercaseLetter:
                    case System.Globalization.UnicodeCategory.UppercaseLetter:
                        // English word/token should larger than 1 character.
                        if (termLength > 1)
                        {
                            return true;
                        }
                        break;

                    case System.Globalization.UnicodeCategory.OtherLetter:
                        // One Chinese character as one Chinese word.
                        // Chinese word extraction to be added later here.
                        return true;
                }
            }
        }
        return false;
    }
}
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseFilterFactory.cs
new file mode 100644
index 0000000..8e496d7
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseFilterFactory.cs
@@ -0,0 +1,51 @@
+using System;
+using System.Collections.Generic;
+
+namespace org.apache.lucene.analysis.cn
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ using StopFilterFactory = org.apache.lucene.analysis.core.StopFilterFactory; // javadocs
+ using TokenFilterFactory = org.apache.lucene.analysis.util.TokenFilterFactory;
+
/// <summary>
/// Factory for <seealso cref="ChineseFilter"/> </summary>
/// @deprecated Use <seealso cref="StopFilterFactory"/> instead.
[Obsolete("Use StopFilterFactory instead.")]
public class ChineseFilterFactory : TokenFilterFactory
{
    /// <summary>
    /// Creates a new ChineseFilterFactory </summary>
    /// <param name="args"> factory arguments; none are supported, so any
    /// leftovers after the base constructor are rejected </param>
    public ChineseFilterFactory(IDictionary<string, string> args) : base(args)
    {
        if (args.Count > 0)
        {
            throw new System.ArgumentException("Unknown parameters: " + args);
        }
    }

    /// <summary>
    /// Wraps the given stream in a <seealso cref="ChineseFilter"/>. </summary>
    public override ChineseFilter create(TokenStream @in)
    {
        return new ChineseFilter(@in);
    }
}
+
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseTokenizer.cs
new file mode 100644
index 0000000..b2fb638
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseTokenizer.cs
@@ -0,0 +1,199 @@
+using System;
+
+namespace org.apache.lucene.analysis.cn
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+
+ using StandardTokenizer = org.apache.lucene.analysis.standard.StandardTokenizer;
+ using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+ using OffsetAttribute = org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+
+
/// <summary>
/// Tokenize Chinese text as individual chinese characters.
///
/// <para>
/// The difference between ChineseTokenizer and
/// CJKTokenizer is that they have different
/// token parsing logic.
/// </para>
/// <para>
/// For example, if the Chinese text
/// "C1C2C3C4" is to be indexed:
/// <ul>
/// <li>The tokens returned from ChineseTokenizer are C1, C2, C3, C4.
/// <li>The tokens returned from the CJKTokenizer are C1C2, C2C3, C3C4.
/// </ul>
/// </para>
/// <para>
/// Therefore the index created by CJKTokenizer is much larger.
/// </para>
/// <para>
/// The problem is that when searching for C1, C1C2, C1C3,
/// C4C2, C1C2C3 ... the ChineseTokenizer works, but the
/// CJKTokenizer will not work.
/// </para> </summary>
/// @deprecated (3.1) Use <seealso cref="StandardTokenizer"/> instead, which has the same functionality.
/// This filter will be removed in Lucene 5.0
[Obsolete("(3.1) Use StandardTokenizer instead, which has the same functionality.")]
public sealed class ChineseTokenizer : Tokenizer
{
    private const int MAX_WORD_LEN = 255;
    private const int IO_BUFFER_SIZE = 1024;

    // offset: absolute position in the input stream.
    // bufferIndex: next unread index in ioBuffer.
    // dataLen: number of chars last read into ioBuffer.
    private int offset = 0, bufferIndex = 0, dataLen = 0;
    private readonly char[] buffer = new char[MAX_WORD_LEN];
    private readonly char[] ioBuffer = new char[IO_BUFFER_SIZE];

    private int length; // number of chars buffered for the current token
    private int start;  // start offset of the current token

    private readonly CharTermAttribute termAtt;
    private readonly OffsetAttribute offsetAtt;

    public ChineseTokenizer(Reader @in) : base(@in)
    {
        // C# field initializers may not reference the inherited instance
        // method addAttribute (compile error CS0236), so resolve them here.
        termAtt = addAttribute(typeof(CharTermAttribute));
        offsetAtt = addAttribute(typeof(OffsetAttribute));
    }

    public ChineseTokenizer(AttributeFactory factory, Reader @in) : base(factory, @in)
    {
        termAtt = addAttribute(typeof(CharTermAttribute));
        offsetAtt = addAttribute(typeof(OffsetAttribute));
    }

    /// <summary>
    /// Appends c (lowercased) to the current token buffer, recording the
    /// token's start offset on the first char. </summary>
    private void push(char c)
    {
        if (length == 0) // start of token
        {
            start = offset - 1;
        }
        buffer[length++] = char.ToLower(c); // buffer it
    }

    /// <summary>
    /// Emits the buffered token, if any, into the term and offset
    /// attributes. </summary>
    /// <returns> true if a token was emitted </returns>
    private bool flush()
    {
        if (length > 0)
        {
            termAtt.copyBuffer(buffer, 0, length);
            offsetAtt.setOffset(correctOffset(start), correctOffset(start + length));
            return true;
        }
        else
        {
            return false;
        }
    }

    /// <summary>
    /// Reads chars one at a time: alphanumeric runs accumulate into a single
    /// token, while each CJK (OtherLetter) char is emitted as its own token. </summary>
    public override bool incrementToken()
    {
        clearAttributes();

        length = 0;
        start = offset;

        while (true)
        {
            char c;
            offset++;

            if (bufferIndex >= dataLen)
            {
                dataLen = input.read(ioBuffer);
                bufferIndex = 0;
            }

            // NOTE(review): relies on the Java contract of read() returning
            // -1 at end of stream; .NET TextReader.Read returns 0 there —
            // confirm the ported Reader preserves the Java semantics.
            if (dataLen == -1)
            {
                offset--;
                return flush();
            }
            else
            {
                c = ioBuffer[bufferIndex++];
            }

            // Replaces Java's Character.getType switch, which has no
            // System.Char equivalent in C#.
            switch (System.Globalization.CharUnicodeInfo.GetUnicodeCategory(c))
            {
                case System.Globalization.UnicodeCategory.DecimalDigitNumber:
                case System.Globalization.UnicodeCategory.LowercaseLetter:
                case System.Globalization.UnicodeCategory.UppercaseLetter:
                    push(c);
                    if (length == MAX_WORD_LEN)
                    {
                        return flush();
                    }
                    break;

                case System.Globalization.UnicodeCategory.OtherLetter:
                    if (length > 0)
                    {
                        // An alphanumeric token is pending: emit it first and
                        // re-read this char on the next call.
                        bufferIndex--;
                        offset--;
                        return flush();
                    }
                    push(c);
                    return flush(); // one Chinese character as one token

                default:
                    if (length > 0)
                    {
                        return flush();
                    }
                    break;
            }
        }
    }

    public override void end()
    {
        base.end();
        // set final offset
        int finalOffset = correctOffset(offset);
        this.offsetAtt.setOffset(finalOffset, finalOffset);
    }

    public override void reset()
    {
        base.reset();
        offset = bufferIndex = dataLen = 0;
    }
}
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseTokenizerFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseTokenizerFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseTokenizerFactory.cs
new file mode 100644
index 0000000..3abb93f
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseTokenizerFactory.cs
@@ -0,0 +1,52 @@
+using System;
+using System.Collections.Generic;
+
+namespace org.apache.lucene.analysis.cn
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+ using TokenizerFactory = org.apache.lucene.analysis.util.TokenizerFactory;
+ using AttributeFactory = org.apache.lucene.util.AttributeSource.AttributeFactory;
+
/// <summary>
/// Factory for <seealso cref="ChineseTokenizer"/> </summary>
/// @deprecated Use <seealso cref="org.apache.lucene.analysis.standard.StandardTokenizerFactory"/> instead.
[Obsolete("Use org.apache.lucene.analysis.standard.StandardTokenizerFactory instead.")]
public class ChineseTokenizerFactory : TokenizerFactory
{
    /// <summary>
    /// Creates a new ChineseTokenizerFactory </summary>
    /// <param name="args"> factory arguments; none are supported, so any
    /// leftovers after the base constructor are rejected </param>
    public ChineseTokenizerFactory(IDictionary<string, string> args) : base(args)
    {
        if (args.Count > 0)
        {
            throw new System.ArgumentException("Unknown parameters: " + args);
        }
    }

    /// <summary>
    /// Creates a <seealso cref="ChineseTokenizer"/> over the given reader,
    /// using the supplied attribute factory. </summary>
    public override ChineseTokenizer create(AttributeFactory factory, Reader @in)
    {
        return new ChineseTokenizer(factory, @in);
    }
}
+
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/CommonGrams/CommonGramsFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/CommonGrams/CommonGramsFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/CommonGrams/CommonGramsFilter.cs
new file mode 100644
index 0000000..2b97da8
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/CommonGrams/CommonGramsFilter.cs
@@ -0,0 +1,199 @@
+using System.Text;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace org.apache.lucene.analysis.commongrams
+{
+
+ using OffsetAttribute = org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+ using PositionIncrementAttribute = org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+ using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+ using PositionLengthAttribute = org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+ using TypeAttribute = org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+ using CharArraySet = org.apache.lucene.analysis.util.CharArraySet;
+ using Version = org.apache.lucene.util.Version;
+
+ /*
+ * TODO: Consider implementing https://issues.apache.org/jira/browse/LUCENE-1688 changes to stop list and associated constructors
+ */
+
/// <summary>
/// Construct bigrams for frequently occurring terms while indexing. Single terms
/// are still indexed too, with bigrams overlaid. This is achieved through the
/// use of <seealso cref="PositionIncrementAttribute#setPositionIncrement(int)"/>. Bigrams have a type
/// of <seealso cref="#GRAM_TYPE"/> Example:
/// <ul>
/// <li>input:"the quick brown fox"</li>
/// <li>output:|"the","the-quick"|"brown"|"fox"|</li>
/// <li>"the-quick" has a position increment of 0 so it is in the same position
/// as "the" "the-quick" has a term.type() of "gram"</li>
///
/// </ul>
/// </summary>

/*
 * Constructors and makeCommonSet based on similar code in StopFilter
 */
public sealed class CommonGramsFilter : TokenFilter
{

    /// <summary>
    /// Token type assigned to every emitted bigram. </summary>
    public const string GRAM_TYPE = "gram";

    // Character joining the two words of a bigram.
    private const char SEPARATOR = '_';

    // One or both members of an emitted bigram must be in this set.
    private readonly CharArraySet commonWords;

    // Holds the left word of a potential bigram, followed by SEPARATOR.
    private readonly StringBuilder buffer = new StringBuilder();

    private readonly CharTermAttribute termAttribute;
    private readonly OffsetAttribute offsetAttribute;
    private readonly TypeAttribute typeAttribute;
    private readonly PositionIncrementAttribute posIncAttribute;
    private readonly PositionLengthAttribute posLenAttribute;

    private int lastStartOffset;  // start offset of the buffered (left) token
    private bool lastWasCommon;   // whether the buffered token was a common word
    private State savedState;     // pending unigram to emit after a bigram

    /// <summary>
    /// Construct a token stream filtering the given input using a Set of common
    /// words to create bigrams. Outputs both unigrams with position increment and
    /// bigrams with position increment 0 type=gram where one or both of the words
    /// in a potential bigram are in the set of common words .
    /// </summary>
    /// <param name="matchVersion"> Lucene version compatibility (currently unused, kept for API parity) </param>
    /// <param name="input"> TokenStream input in filter chain </param>
    /// <param name="commonWords"> The set of common words. </param>
    public CommonGramsFilter(Version matchVersion, TokenStream input, CharArraySet commonWords) : base(input)
    {
        this.commonWords = commonWords;
        // FIX: attribute registration moved out of the field initializers.
        // Unlike Java, C# cannot invoke the inherited instance method
        // addAttribute from a field initializer (no implicit 'this' there).
        termAttribute = addAttribute(typeof(CharTermAttribute));
        offsetAttribute = addAttribute(typeof(OffsetAttribute));
        typeAttribute = addAttribute(typeof(TypeAttribute));
        posIncAttribute = addAttribute(typeof(PositionIncrementAttribute));
        posLenAttribute = addAttribute(typeof(PositionLengthAttribute));
    }

    /// <summary>
    /// Inserts bigrams for common words into a token stream. For each input token,
    /// output the token. If the token and/or the following token are in the list
    /// of common words also output a bigram with position increment 0 and
    /// type="gram"
    ///
    /// TODO:Consider adding an option to not emit unigram stopwords
    /// as in CDL XTF BigramStopFilter, CommonGramsQueryFilter would need to be
    /// changed to work with this.
    ///
    /// TODO: Consider optimizing for the case of three
    /// commongrams i.e "man of the year" normally produces 3 bigrams: "man-of",
    /// "of-the", "the-year" but with proper management of positions we could
    /// eliminate the middle bigram "of-the"and save a disk seek and a whole set of
    /// position lookups.
    /// </summary>
    public override bool incrementToken()
    {
        // If the previous call emitted a bigram, the unigram for the current
        // token is still pending in savedState: emit it now.
        if (savedState != null)
        {
            restoreState(savedState);
            savedState = null;
            saveTermBuffer();
            return true;
        }
        else if (!input.incrementToken())
        {
            return false;
        }

        /* We build n-grams before and after stopwords.
         * When valid, the buffer always contains at least the separator.
         * If its empty, there is nothing before this stopword.
         */
        if (lastWasCommon || (Common && buffer.Length > 0))
        {
            savedState = captureState();
            gramToken();
            return true;
        }

        saveTermBuffer();
        return true;
    }

    /// <summary>
    /// {@inheritDoc}
    /// </summary>
    public override void reset()
    {
        base.reset();
        lastWasCommon = false;
        savedState = null;
        buffer.Length = 0;
    }

    // ================================================= Helper Methods ================================================

    /// <summary>
    /// Determines if the current token is a common term
    /// </summary>
    /// <returns> {@code true} if the current token is a common term, {@code false} otherwise </returns>
    private bool Common
    {
        get
        {
            return commonWords != null && commonWords.contains(termAttribute.buffer(), 0, termAttribute.length());
        }
    }

    /// <summary>
    /// Saves this information to form the left part of a gram
    /// </summary>
    private void saveTermBuffer()
    {
        buffer.Length = 0;
        buffer.Append(termAttribute.buffer(), 0, termAttribute.length());
        buffer.Append(SEPARATOR);
        lastStartOffset = offsetAttribute.startOffset();
        lastWasCommon = Common;
    }

    /// <summary>
    /// Constructs a compound token.
    /// </summary>
    private void gramToken()
    {
        buffer.Append(termAttribute.buffer(), 0, termAttribute.length());
        int endOffset = offsetAttribute.endOffset();

        clearAttributes();

        int length = buffer.Length;
        char[] termText = termAttribute.buffer();
        if (length > termText.Length)
        {
            termText = termAttribute.resizeBuffer(length);
        }

        // FIX: StringBuilder.getChars is the Java API; the .NET equivalent is
        // CopyTo(sourceIndex, destination, destinationIndex, count).
        buffer.CopyTo(0, termText, 0, length);
        termAttribute.Length = length;
        posIncAttribute.PositionIncrement = 0;
        posLenAttribute.PositionLength = 2; // bigram
        offsetAttribute.setOffset(lastStartOffset, endOffset);
        typeAttribute.Type = GRAM_TYPE;
        buffer.Length = 0;
    }
}
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/CommonGrams/CommonGramsFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/CommonGrams/CommonGramsFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/CommonGrams/CommonGramsFilterFactory.cs
new file mode 100644
index 0000000..2233e83
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/CommonGrams/CommonGramsFilterFactory.cs
@@ -0,0 +1,104 @@
+using System;
+using System.Collections.Generic;
+using Lucene.Net.Analysis.Core;
+using Lucene.Net.Analysis.Util;
+using org.apache.lucene.analysis.commongrams;
+using org.apache.lucene.analysis.util;
+
+namespace Lucene.Net.Analysis.CommonGrams
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
/// <summary>
/// Constructs a <seealso cref="CommonGramsFilter"/>.
/// <pre class="prettyprint">
/// &lt;fieldType name="text_cmmngrms" class="solr.TextField" positionIncrementGap="100"&gt;
///   &lt;analyzer&gt;
///     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
///     &lt;filter class="solr.CommonGramsFilterFactory" words="commongramsstopwords.txt" ignoreCase="false"/&gt;
///   &lt;/analyzer&gt;
/// &lt;/fieldType&gt;</pre>
/// </summary>
public class CommonGramsFilterFactory : TokenFilterFactory, ResourceLoaderAware
{
    // TODO: shared base class for Stop/Keep/CommonGrams?

    // Resolved set of common words; populated by inform().
    private CharArraySet commonWords;

    // Raw configuration captured from the factory arguments.
    private readonly string commonWordFiles;
    private readonly string format;
    private readonly bool ignoreCase;

    /// <summary>
    /// Creates a new CommonGramsFilterFactory </summary>
    /// <param name="args"> factory arguments; "words", "format" and "ignoreCase"
    /// are consumed here, anything left over is rejected </param>
    public CommonGramsFilterFactory(IDictionary<string, string> args) : base(args)
    {
        commonWordFiles = get(args, "words");
        format = get(args, "format");
        ignoreCase = getBoolean(args, "ignoreCase", false);
        if (args.Count > 0)
        {
            throw new System.ArgumentException("Unknown parameters: " + args);
        }
    }

    /// <summary>
    /// Loads the common-words set from the configured resource, or falls back
    /// to the built-in English stop word set when no word file was given.
    /// </summary>
    public virtual void inform(ResourceLoader loader)
    {
        if (commonWordFiles == null)
        {
            // Nothing configured: use the default English stop words.
            commonWords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
            return;
        }

        bool snowballFormat = "snowball".Equals(format, StringComparison.CurrentCultureIgnoreCase);
        commonWords = snowballFormat
            ? getSnowballWordSet(loader, commonWordFiles, ignoreCase)
            : GetWordSet(loader, commonWordFiles, ignoreCase);
    }

    /// <summary>
    /// Whether common-word matching is case-insensitive. </summary>
    public virtual bool IgnoreCase
    {
        get { return ignoreCase; }
    }

    /// <summary>
    /// The resolved set of common words (available after inform()). </summary>
    public virtual CharArraySet CommonWords
    {
        get { return commonWords; }
    }

    public override TokenFilter Create(TokenStream input)
    {
        return new CommonGramsFilter(luceneMatchVersion, input, commonWords);
    }
}
+
+
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/CommonGrams/CommonGramsQueryFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/CommonGrams/CommonGramsQueryFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/CommonGrams/CommonGramsQueryFilter.cs
new file mode 100644
index 0000000..b787bde
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/CommonGrams/CommonGramsQueryFilter.cs
@@ -0,0 +1,139 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+namespace org.apache.lucene.analysis.commongrams
+{
+
+ using PositionIncrementAttribute = org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+ using TypeAttribute = org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+
+//JAVA TO C# CONVERTER TODO TASK: This Java 'import static' statement cannot be converted to C#:
+// import static org.apache.lucene.analysis.commongrams.CommonGramsFilter.GRAM_TYPE;
+
/// <summary>
/// Wrap a CommonGramsFilter optimizing phrase queries by only returning single
/// words when they are not a member of a bigram.
///
/// Example:
/// <ul>
/// <li>query input to CommonGramsFilter: "the rain in spain falls mainly"
/// <li>output of CommomGramsFilter/input to CommonGramsQueryFilter:
/// |"the, "the-rain"|"rain" "rain-in"|"in, "in-spain"|"spain"|"falls"|"mainly"
/// <li>output of CommonGramsQueryFilter:"the-rain", "rain-in" ,"in-spain",
/// "falls", "mainly"
/// </ul>
/// </summary>

/*
 * See:http://hudson.zones.apache.org/hudson/job/Lucene-trunk/javadoc//all/org/apache/lucene/analysis/TokenStream.html and
 * http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/analysis/package.html?revision=718798
 */
public sealed class CommonGramsQueryFilter : TokenFilter
{

    private readonly TypeAttribute typeAttribute;
    private readonly PositionIncrementAttribute posIncAttribute;

    private State previous;       // last token seen, not yet emitted
    private string previousType;  // type of the token held in 'previous'
    private bool exhausted;       // true once the wrapped stream is consumed

    /// <summary>
    /// Constructs a new CommonGramsQueryFilter based on the provided CommomGramsFilter
    /// </summary>
    /// <param name="input"> CommonGramsFilter the QueryFilter will use </param>
    public CommonGramsQueryFilter(CommonGramsFilter input) : base(input)
    {
        // FIX: attribute registration moved out of the field initializers.
        // Unlike Java, C# cannot invoke the inherited instance method
        // addAttribute from a field initializer (no implicit 'this' there).
        typeAttribute = addAttribute(typeof(TypeAttribute));
        posIncAttribute = addAttribute(typeof(PositionIncrementAttribute));
    }

    /// <summary>
    /// {@inheritDoc}
    /// </summary>
    public override void reset()
    {
        base.reset();
        previous = null;
        previousType = null;
        exhausted = false;
    }

    /// <summary>
    /// Output bigrams whenever possible to optimize queries. Only output unigrams
    /// when they are not a member of a bigram. Example:
    /// <ul>
    /// <li>input: "the rain in spain falls mainly"
    /// <li>output:"the-rain", "rain-in" ,"in-spain", "falls", "mainly"
    /// </ul>
    /// </summary>
    public override bool incrementToken()
    {
        while (!exhausted && input.incrementToken())
        {
            State current = captureState();

            if (previous != null && !GramType)
            {
                // Emit the held-back token; a unigram only reaches here when
                // no bigram covering it follows.
                restoreState(previous);
                previous = current;
                previousType = typeAttribute.type();

                if (GramType)
                {
                    posIncAttribute.PositionIncrement = 1;
                }
                return true;
            }

            previous = current;
        }

        exhausted = true;

        // FIX: GRAM_TYPE came from a Java static import the converter could not
        // translate; it must be qualified with the declaring class in C#.
        if (previous == null || CommonGramsFilter.GRAM_TYPE.Equals(previousType))
        {
            return false;
        }

        restoreState(previous);
        previous = null;

        if (GramType)
        {
            posIncAttribute.PositionIncrement = 1;
        }
        return true;
    }

    // ================================================= Helper Methods ================================================

    /// <summary>
    /// Convenience method to check if the current type is a gram type
    /// </summary>
    /// <returns> {@code true} if the current type is a gram type, {@code false} otherwise </returns>
    public bool GramType
    {
        get
        {
            return CommonGramsFilter.GRAM_TYPE.Equals(typeAttribute.type());
        }
    }
}
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/CommonGrams/CommonGramsQueryFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/CommonGrams/CommonGramsQueryFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/CommonGrams/CommonGramsQueryFilterFactory.cs
new file mode 100644
index 0000000..ddee353
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/CommonGrams/CommonGramsQueryFilterFactory.cs
@@ -0,0 +1,55 @@
+using System.Collections.Generic;
+using Lucene.Net.Analysis.CommonGrams;
+
+namespace org.apache.lucene.analysis.commongrams
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
/// <summary>
/// Construct <seealso cref="CommonGramsQueryFilter"/>.
///
/// <pre class="prettyprint">
/// &lt;fieldType name="text_cmmngrmsqry" class="solr.TextField" positionIncrementGap="100"&gt;
///   &lt;analyzer&gt;
///     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
///     &lt;filter class="solr.CommonGramsQueryFilterFactory" words="commongramsquerystopwords.txt" ignoreCase="false"/&gt;
///   &lt;/analyzer&gt;
/// &lt;/fieldType&gt;</pre>
/// </summary>
public class CommonGramsQueryFilterFactory : CommonGramsFilterFactory
{

    /// <summary>
    /// Creates a new CommonGramsQueryFilterFactory </summary>
    public CommonGramsQueryFilterFactory(IDictionary<string, string> args) : base(args)
    {
    }

    /// <summary>
    /// Create a CommonGramsFilter and wrap it with a CommonGramsQueryFilter
    /// </summary>
    // FIX: the base class declares Create(TokenStream) (capital C); the raw
    // port overrode a nonexistent lowercase 'create' and called 'base.create',
    // which does not compile. Renamed to match the base member.
    public override TokenFilter Create(TokenStream input)
    {
        CommonGramsFilter commonGrams = (CommonGramsFilter) base.Create(input);
        return new CommonGramsQueryFilter(commonGrams);
    }
}
+
+}
\ No newline at end of file