You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by ni...@apache.org on 2017/02/04 20:32:58 UTC
[39/39] lucenenet git commit: Lucene.Net.Analysis.Ngram - renamed
NGram in Git
Lucene.Net.Analysis.Ngram - renamed NGram in Git
Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/ab81d913
Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/ab81d913
Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/ab81d913
Branch: refs/heads/api-work
Commit: ab81d91313149500e6c88b4ceabd6ff5aa4e0d63
Parents: 3201465
Author: Shad Storhaug <sh...@shadstorhaug.com>
Authored: Sun Feb 5 03:17:39 2017 +0700
Committer: Shad Storhaug <sh...@shadstorhaug.com>
Committed: Sun Feb 5 03:29:11 2017 +0700
----------------------------------------------------------------------
.../Analysis/NGram/EdgeNGramFilterFactory.cs | 60 +++
.../Analysis/NGram/EdgeNGramTokenFilter.cs | 245 ++++++++++++
.../Analysis/NGram/EdgeNGramTokenizer.cs | 72 ++++
.../Analysis/NGram/EdgeNGramTokenizerFactory.cs | 75 ++++
.../NGram/Lucene43EdgeNGramTokenizer.cs | 297 ++++++++++++++
.../Analysis/NGram/Lucene43NGramTokenizer.cs | 173 ++++++++
.../Analysis/NGram/NGramFilterFactory.cs | 56 +++
.../Analysis/NGram/NGramTokenFilter.cs | 252 ++++++++++++
.../Analysis/NGram/NGramTokenizer.cs | 319 +++++++++++++++
.../Analysis/NGram/NGramTokenizerFactory.cs | 70 ++++
.../Analysis/Ngram/EdgeNGramFilterFactory.cs | 60 ---
.../Analysis/Ngram/EdgeNGramTokenFilter.cs | 245 ------------
.../Analysis/Ngram/EdgeNGramTokenizer.cs | 72 ----
.../Analysis/Ngram/EdgeNGramTokenizerFactory.cs | 75 ----
.../Ngram/Lucene43EdgeNGramTokenizer.cs | 297 --------------
.../Analysis/Ngram/Lucene43NGramTokenizer.cs | 173 --------
.../Analysis/Ngram/NGramFilterFactory.cs | 56 ---
.../Analysis/Ngram/NGramTokenFilter.cs | 252 ------------
.../Analysis/Ngram/NGramTokenizer.cs | 319 ---------------
.../Analysis/Ngram/NGramTokenizerFactory.cs | 70 ----
.../Analysis/NGram/EdgeNGramTokenFilterTest.cs | 390 +++++++++++++++++++
.../Analysis/NGram/EdgeNGramTokenizerTest.cs | 278 +++++++++++++
.../Analysis/NGram/NGramTokenFilterTest.cs | 249 ++++++++++++
.../Analysis/NGram/NGramTokenizerTest.cs | 303 ++++++++++++++
.../Analysis/NGram/TestNGramFilters.cs | 196 ++++++++++
.../Analysis/Ngram/EdgeNGramTokenFilterTest.cs | 390 -------------------
.../Analysis/Ngram/EdgeNGramTokenizerTest.cs | 278 -------------
.../Analysis/Ngram/NGramTokenFilterTest.cs | 249 ------------
.../Analysis/Ngram/NGramTokenizerTest.cs | 303 --------------
.../Analysis/Ngram/TestNGramFilters.cs | 196 ----------
30 files changed, 3035 insertions(+), 3035 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Analysis.Common/Analysis/NGram/EdgeNGramFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/NGram/EdgeNGramFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/NGram/EdgeNGramFilterFactory.cs
new file mode 100644
index 0000000..70b44d3
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/NGram/EdgeNGramFilterFactory.cs
@@ -0,0 +1,60 @@
+\ufeffusing Lucene.Net.Analysis.Util;
+using System.Collections.Generic;
+
+namespace Lucene.Net.Analysis.NGram
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Creates new instances of <see cref="EdgeNGramTokenFilter"/>.
+ /// <code>
+ /// <fieldType name="text_edgngrm" class="solr.TextField" positionIncrementGap="100">
+ /// <analyzer>
+ /// <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ /// <filter class="solr.EdgeNGramFilterFactory" minGramSize="1" maxGramSize="1"/>
+ /// </analyzer>
+ /// </fieldType></code>
+ /// </summary>
+ public class EdgeNGramFilterFactory : TokenFilterFactory
+ {
+ private readonly int maxGramSize;
+ private readonly int minGramSize;
+ private readonly string side;
+
+ /// <summary>
+ /// Creates a new <see cref="EdgeNGramFilterFactory"/> </summary>
+ public EdgeNGramFilterFactory(IDictionary<string, string> args)
+ : base(args)
+ {
+ minGramSize = GetInt(args, "minGramSize", EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE);
+ maxGramSize = GetInt(args, "maxGramSize", EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE);
+ side = Get(args, "side", EdgeNGramTokenFilter.Side.FRONT.ToString());
+ if (args.Count > 0)
+ {
+ throw new System.ArgumentException("Unknown parameters: " + args);
+ }
+ }
+
+ public override TokenStream Create(TokenStream input)
+ {
+#pragma warning disable 612, 618
+ return new EdgeNGramTokenFilter(m_luceneMatchVersion, input, side, minGramSize, maxGramSize);
+#pragma warning restore 612, 618
+ }
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Analysis.Common/Analysis/NGram/EdgeNGramTokenFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/NGram/EdgeNGramTokenFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/NGram/EdgeNGramTokenFilter.cs
new file mode 100644
index 0000000..8cf8172
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/NGram/EdgeNGramTokenFilter.cs
@@ -0,0 +1,245 @@
+\ufeffusing Lucene.Net.Analysis.TokenAttributes;
+using Lucene.Net.Analysis.Util;
+using Lucene.Net.Util;
+using System;
+
+namespace Lucene.Net.Analysis.NGram
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Tokenizes the given token into n-grams of given size(s).
+ /// <para>
+ /// This <see cref="TokenFilter"/> creates n-grams from the beginning edge or ending edge of an input token.
+ /// </para>
+ /// <para>As of Lucene 4.4, this filter does not support
+ /// <see cref="Side.BACK"/> (you can use <see cref="Reverse.ReverseStringFilter"/> up-front and
+ /// afterward to get the same behavior), handles supplementary characters
+ /// correctly and does not update offsets anymore.
+ /// </para>
+ /// </summary>
+ public sealed class EdgeNGramTokenFilter : TokenFilter
+ {
+ public const Side DEFAULT_SIDE = Side.FRONT;
+ public const int DEFAULT_MAX_GRAM_SIZE = 1;
+ public const int DEFAULT_MIN_GRAM_SIZE = 1;
+
+ /// <summary>
+ /// Specifies which side of the input the n-gram should be generated from </summary>
+ public enum Side
+ {
+ /// <summary>
+ /// Get the n-gram from the front of the input </summary>
+ FRONT,
+
+ /// <summary>
+ /// Get the n-gram from the end of the input </summary>
+ [System.Obsolete]
+ BACK,
+ }
+
+ /// <summary>
+ /// Get the appropriate <see cref="Side"/> from a string
+ /// </summary>
+ public static Side GetSide(string sideName)
+ {
+ Side result;
+ if (!Enum.TryParse(sideName, true, out result))
+ {
+ result = Side.FRONT;
+ }
+ return result;
+ }
+
+ private readonly LuceneVersion version;
+ private readonly CharacterUtils charUtils;
+ private readonly int minGram;
+ private readonly int maxGram;
+ private Side side;
+ private char[] curTermBuffer;
+ private int curTermLength;
+ private int curCodePointCount;
+ private int curGramSize;
+ private int tokStart;
+ private int tokEnd; // only used if the length changed before this filter
+ private bool updateOffsets; // never if the length changed before this filter
+ private int savePosIncr;
+ private int savePosLen;
+
+ private readonly ICharTermAttribute termAtt;
+ private readonly IOffsetAttribute offsetAtt;
+ private readonly IPositionIncrementAttribute posIncrAtt;
+ private readonly IPositionLengthAttribute posLenAtt;
+
+ /// <summary>
+ /// Creates <see cref="EdgeNGramTokenFilter"/> that can generate n-grams in the sizes of the given range
+ /// </summary>
+ /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
+ /// <param name="input"> <see cref="TokenStream"/> holding the input to be tokenized </param>
+ /// <param name="side"> the <see cref="Side"/> from which to chop off an n-gram </param>
+ /// <param name="minGram"> the smallest n-gram to generate </param>
+ /// <param name="maxGram"> the largest n-gram to generate </param>
+ [Obsolete]
+ public EdgeNGramTokenFilter(LuceneVersion version, TokenStream input, Side side, int minGram, int maxGram)
+ : base(input)
+ {
+
+ //if (version == null)
+ //{
+ // throw new System.ArgumentException("version must not be null");
+ //}
+
+ if (version.OnOrAfter(LuceneVersion.LUCENE_44) && side == Side.BACK)
+ {
+ throw new System.ArgumentException("Side.BACK is not supported anymore as of Lucene 4.4, use ReverseStringFilter up-front and afterward");
+ }
+
+ if (!Enum.IsDefined(typeof(Side), side))
+ {
+ throw new System.ArgumentException("sideLabel must be either front or back");
+ }
+
+ if (minGram < 1)
+ {
+ throw new System.ArgumentException("minGram must be greater than zero");
+ }
+
+ if (minGram > maxGram)
+ {
+ throw new System.ArgumentException("minGram must not be greater than maxGram");
+ }
+
+ this.version = version;
+ this.charUtils = version.OnOrAfter(LuceneVersion.LUCENE_44) ? CharacterUtils.GetInstance(version) : CharacterUtils.Java4Instance;
+ this.minGram = minGram;
+ this.maxGram = maxGram;
+ this.side = side;
+
+ this.termAtt = AddAttribute<ICharTermAttribute>();
+ this.offsetAtt = AddAttribute<IOffsetAttribute>();
+ this.posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
+ this.posLenAtt = AddAttribute<IPositionLengthAttribute>();
+ }
+
+ /// <summary>
+ /// Creates <see cref="EdgeNGramTokenFilter"/> that can generate n-grams in the sizes of the given range
+ /// </summary>
+ /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
+ /// <param name="input"> <see cref="TokenStream"/> holding the input to be tokenized </param>
+ /// <param name="sideLabel"> the name of the <see cref="Side"/> from which to chop off an n-gram </param>
+ /// <param name="minGram"> the smallest n-gram to generate </param>
+ /// <param name="maxGram"> the largest n-gram to generate </param>
+ [Obsolete]
+ public EdgeNGramTokenFilter(LuceneVersion version, TokenStream input, string sideLabel, int minGram, int maxGram)
+ : this(version, input, GetSide(sideLabel), minGram, maxGram)
+ {
+ }
+
+ /// <summary>
+ /// Creates <see cref="EdgeNGramTokenFilter"/> that can generate n-grams in the sizes of the given range
+ /// </summary>
+ /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
+ /// <param name="input"> <see cref="TokenStream"/> holding the input to be tokenized </param>
+ /// <param name="minGram"> the smallest n-gram to generate </param>
+ /// <param name="maxGram"> the largest n-gram to generate </param>
+ public EdgeNGramTokenFilter(LuceneVersion version, TokenStream input, int minGram, int maxGram)
+#pragma warning disable 612, 618
+ : this(version, input, Side.FRONT, minGram, maxGram)
+#pragma warning restore 612, 618
+ {
+ }
+
+ public override sealed bool IncrementToken()
+ {
+ while (true)
+ {
+ if (curTermBuffer == null)
+ {
+ if (!m_input.IncrementToken())
+ {
+ return false;
+ }
+ else
+ {
+ curTermBuffer = (char[])termAtt.Buffer.Clone();
+ curTermLength = termAtt.Length;
+ curCodePointCount = charUtils.CodePointCount(termAtt.ToString());
+ curGramSize = minGram;
+ tokStart = offsetAtt.StartOffset;
+ tokEnd = offsetAtt.EndOffset;
+#pragma warning disable 612, 618
+ if (version.OnOrAfter(LuceneVersion.LUCENE_44))
+#pragma warning restore 612, 618
+ {
+ // Never update offsets
+ updateOffsets = false;
+ }
+ else
+ {
+ // if length by start + end offsets doesn't match the term text then assume
+ // this is a synonym and don't adjust the offsets.
+ updateOffsets = (tokStart + curTermLength) == tokEnd;
+ }
+ savePosIncr += posIncrAtt.PositionIncrement;
+ savePosLen = posLenAtt.PositionLength;
+ }
+ }
+ if (curGramSize <= maxGram) // while we are still within our n-gram size range (quit once curGramSize exceeds maxGram)
+ {
+ if (curGramSize <= curCodePointCount) // only emit if the remaining input is long enough for this gram size
+ {
+ // grab gramSize chars from front or back
+ int start = side == Side.FRONT ? 0 : charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, curTermLength, -curGramSize);
+ int end = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
+ ClearAttributes();
+ if (updateOffsets)
+ {
+ offsetAtt.SetOffset(tokStart + start, tokStart + end);
+ }
+ else
+ {
+ offsetAtt.SetOffset(tokStart, tokEnd);
+ }
+ // first ngram gets increment, others don't
+ if (curGramSize == minGram)
+ {
+ posIncrAtt.PositionIncrement = savePosIncr;
+ savePosIncr = 0;
+ }
+ else
+ {
+ posIncrAtt.PositionIncrement = 0;
+ }
+ posLenAtt.PositionLength = savePosLen;
+ termAtt.CopyBuffer(curTermBuffer, start, end - start);
+ curGramSize++;
+ return true;
+ }
+ }
+ curTermBuffer = null;
+ }
+ }
+
+ public override void Reset()
+ {
+ base.Reset();
+ curTermBuffer = null;
+ savePosIncr = 0;
+ }
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Analysis.Common/Analysis/NGram/EdgeNGramTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/NGram/EdgeNGramTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/NGram/EdgeNGramTokenizer.cs
new file mode 100644
index 0000000..ed2cb3d
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/NGram/EdgeNGramTokenizer.cs
@@ -0,0 +1,72 @@
+\ufeffusing Lucene.Net.Util;
+using System.IO;
+
+namespace Lucene.Net.Analysis.NGram
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Tokenizes the input from an edge into n-grams of given size(s).
+ /// <para>
+ /// This <see cref="Tokenizer"/> creates n-grams from the beginning edge or ending edge of an input token.
+ /// </para>
+ /// <para>As of Lucene 4.4, this tokenizer
+ /// <list type="bullet">
+ /// <item>can handle <code>maxGram</code> larger than 1024 chars, but beware that this will result in increased memory usage</item>
+ /// <item>doesn't trim the input,</item>
+ /// <item>always sets position increments equal to 1 (previously it was 1 for the first token and 0 for all other ones)</item>
+ /// <item>doesn't support backward n-grams anymore.</item>
+ /// <item>supports <see cref="Util.CharTokenizer.IsTokenChar(int)"/> pre-tokenization,</item>
+ /// <item>correctly handles supplementary characters.</item>
+ /// </list>
+ /// </para>
+ /// <para>Although <b style="color:red">highly</b> discouraged, it is still possible
+ /// to use the old behavior through <see cref="Lucene43EdgeNGramTokenizer"/>.
+ /// </para>
+ /// </summary>
+ public class EdgeNGramTokenizer : NGramTokenizer
+ {
+ public const int DEFAULT_MAX_GRAM_SIZE = 1;
+ public const int DEFAULT_MIN_GRAM_SIZE = 1;
+
+ /// <summary>
+ /// Creates <see cref="EdgeNGramTokenizer"/> that can generate n-grams in the sizes of the given range
+ /// </summary>
+ /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
+ /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
+ /// <param name="minGram"> the smallest n-gram to generate </param>
+ /// <param name="maxGram"> the largest n-gram to generate </param>
+ public EdgeNGramTokenizer(LuceneVersion version, TextReader input, int minGram, int maxGram)
+ : base(version, input, minGram, maxGram, true)
+ {
+ }
+
+ /// <summary>
+ /// Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+ /// </summary>
+ /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
+ /// <param name="factory"> <see cref="AttributeSource.AttributeFactory"/> to use </param>
+ /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
+ /// <param name="minGram"> the smallest n-gram to generate </param>
+ /// <param name="maxGram"> the largest n-gram to generate </param>
+ public EdgeNGramTokenizer(LuceneVersion version, AttributeSource.AttributeFactory factory, TextReader input, int minGram, int maxGram)
+ : base(version, factory, input, minGram, maxGram, true)
+ {
+ }
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Analysis.Common/Analysis/NGram/EdgeNGramTokenizerFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/NGram/EdgeNGramTokenizerFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/NGram/EdgeNGramTokenizerFactory.cs
new file mode 100644
index 0000000..00325f5
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/NGram/EdgeNGramTokenizerFactory.cs
@@ -0,0 +1,75 @@
+\ufeffusing Lucene.Net.Analysis.Util;
+using Lucene.Net.Util;
+using System;
+using System.Collections.Generic;
+using System.IO;
+
+namespace Lucene.Net.Analysis.NGram
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Creates new instances of <see cref="EdgeNGramTokenizer"/>.
+ /// <code>
+ /// <fieldType name="text_edgngrm" class="solr.TextField" positionIncrementGap="100">
+ /// <analyzer>
+ /// <tokenizer class="solr.EdgeNGramTokenizerFactory" minGramSize="1" maxGramSize="1"/>
+ /// </analyzer>
+ /// </fieldType></code>
+ /// </summary>
+ public class EdgeNGramTokenizerFactory : TokenizerFactory
+ {
+ private readonly int maxGramSize;
+ private readonly int minGramSize;
+ private readonly string side;
+
+ /// <summary>
+ /// Creates a new <see cref="EdgeNGramTokenizerFactory"/> </summary>
+ public EdgeNGramTokenizerFactory(IDictionary<string, string> args) : base(args)
+ {
+ minGramSize = GetInt(args, "minGramSize", EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE);
+ maxGramSize = GetInt(args, "maxGramSize", EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE);
+ side = Get(args, "side", EdgeNGramTokenFilter.Side.FRONT.ToString());
+ if (args.Count > 0)
+ {
+ throw new System.ArgumentException("Unknown parameters: " + args);
+ }
+ }
+
+ public override Tokenizer Create(AttributeSource.AttributeFactory factory, TextReader input)
+ {
+#pragma warning disable 612, 618
+ if (m_luceneMatchVersion.OnOrAfter(LuceneVersion.LUCENE_44))
+#pragma warning restore 612, 618
+ {
+ EdgeNGramTokenFilter.Side sideEnum;
+ if (!Enum.TryParse(this.side, true, out sideEnum))
+ {
+ throw new System.ArgumentException(typeof(EdgeNGramTokenizer).Name + " does not support backward n-grams as of Lucene 4.4");
+ }
+ return new EdgeNGramTokenizer(m_luceneMatchVersion, input, minGramSize, maxGramSize);
+ }
+ else
+ {
+#pragma warning disable 612, 618
+ return new Lucene43EdgeNGramTokenizer(m_luceneMatchVersion, input, side, minGramSize, maxGramSize);
+#pragma warning restore 612, 618
+ }
+ }
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Analysis.Common/Analysis/NGram/Lucene43EdgeNGramTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/NGram/Lucene43EdgeNGramTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/NGram/Lucene43EdgeNGramTokenizer.cs
new file mode 100644
index 0000000..4dadbed
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/NGram/Lucene43EdgeNGramTokenizer.cs
@@ -0,0 +1,297 @@
+\ufeffusing Lucene.Net.Analysis.TokenAttributes;
+using Lucene.Net.Util;
+using System;
+using System.IO;
+
+namespace Lucene.Net.Analysis.NGram
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Old version of <see cref="EdgeNGramTokenizer"/> which doesn't handle correctly
+ /// supplementary characters.
+ /// </summary>
+ [Obsolete]
+ public sealed class Lucene43EdgeNGramTokenizer : Tokenizer
+ {
+ public const Side DEFAULT_SIDE = Side.FRONT;
+ public const int DEFAULT_MAX_GRAM_SIZE = 1;
+ public const int DEFAULT_MIN_GRAM_SIZE = 1;
+
+ private ICharTermAttribute termAtt;
+ private IOffsetAttribute offsetAtt;
+ private IPositionIncrementAttribute posIncrAtt;
+
+ /// <summary>
+ /// Specifies which side of the input the n-gram should be generated from </summary>
+ public enum Side
+ {
+ /// <summary>
+ /// Get the n-gram from the front of the input </summary>
+ FRONT,
+
+ /// <summary>
+ /// Get the n-gram from the end of the input </summary>
+ BACK,
+ }
+
+ // Get the appropriate Side from a string
+ public static Side GetSide(string sideName)
+ {
+ Side result;
+ if (!Enum.TryParse(sideName, true, out result))
+ {
+ result = Side.FRONT;
+ }
+ return result;
+ }
+
+ private int minGram;
+ private int maxGram;
+ private int gramSize;
+ private Side side;
+ private bool started;
+ private int inLen; // length of the input AFTER trim()
+ private int charsRead; // length of the input
+ private string inStr;
+
+
+ /// <summary>
+ /// Creates <see cref="Lucene43EdgeNGramTokenizer"/> that can generate n-grams in the sizes of the given range
+ /// </summary>
+ /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
+ /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
+ /// <param name="side"> the <see cref="Side"/> from which to chop off an n-gram </param>
+ /// <param name="minGram"> the smallest n-gram to generate </param>
+ /// <param name="maxGram"> the largest n-gram to generate </param>
+ [Obsolete]
+ public Lucene43EdgeNGramTokenizer(LuceneVersion version, TextReader input, Side side, int minGram, int maxGram)
+ : base(input)
+ {
+ Init(version, side, minGram, maxGram);
+ }
+
+ /// <summary>
+ /// Creates <see cref="Lucene43EdgeNGramTokenizer"/> that can generate n-grams in the sizes of the given range
+ /// </summary>
+ /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
+ /// <param name="factory"> <see cref="AttributeSource.AttributeFactory"/> to use </param>
+ /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
+ /// <param name="side"> the <see cref="Side"/> from which to chop off an n-gram </param>
+ /// <param name="minGram"> the smallest n-gram to generate </param>
+ /// <param name="maxGram"> the largest n-gram to generate </param>
+ [Obsolete]
+ public Lucene43EdgeNGramTokenizer(LuceneVersion version, AttributeFactory factory, TextReader input, Side side, int minGram, int maxGram)
+ : base(factory, input)
+ {
+ Init(version, side, minGram, maxGram);
+ }
+
+ /// <summary>
+ /// Creates <see cref="Lucene43EdgeNGramTokenizer"/> that can generate n-grams in the sizes of the given range
+ /// </summary>
+ /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
+ /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
+ /// <param name="sideLabel"> the name of the <see cref="Side"/> from which to chop off an n-gram </param>
+ /// <param name="minGram"> the smallest n-gram to generate </param>
+ /// <param name="maxGram"> the largest n-gram to generate </param>
+ [Obsolete]
+ public Lucene43EdgeNGramTokenizer(LuceneVersion version, TextReader input, string sideLabel, int minGram, int maxGram)
+ : this(version, input, GetSide(sideLabel), minGram, maxGram)
+ {
+ }
+
+ /// <summary>
+ /// Creates <see cref="Lucene43EdgeNGramTokenizer"/> that can generate n-grams in the sizes of the given range
+ /// </summary>
+ /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
+ /// <param name="factory"> <see cref="AttributeSource.AttributeFactory"/> to use </param>
+ /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
+ /// <param name="sideLabel"> the name of the <see cref="Side"/> from which to chop off an n-gram </param>
+ /// <param name="minGram"> the smallest n-gram to generate </param>
+ /// <param name="maxGram"> the largest n-gram to generate </param>
+ [Obsolete]
+ public Lucene43EdgeNGramTokenizer(LuceneVersion version, AttributeFactory factory, TextReader input, string sideLabel, int minGram, int maxGram)
+ : this(version, factory, input, GetSide(sideLabel), minGram, maxGram)
+ {
+ }
+
+ /// <summary>
+ /// Creates <see cref="Lucene43EdgeNGramTokenizer"/> that can generate n-grams in the sizes of the given range
+ /// </summary>
+ /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
+ /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
+ /// <param name="minGram"> the smallest n-gram to generate </param>
+ /// <param name="maxGram"> the largest n-gram to generate </param>
+ public Lucene43EdgeNGramTokenizer(LuceneVersion version, TextReader input, int minGram, int maxGram)
+ : this(version, input, Side.FRONT, minGram, maxGram)
+ {
+ }
+
+ /// <summary>
+ /// Creates <see cref="Lucene43EdgeNGramTokenizer"/> that can generate n-grams in the sizes of the given range
+ /// </summary>
+ /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
+ /// <param name="factory"> <see cref="AttributeSource.AttributeFactory"/> to use </param>
+ /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
+ /// <param name="minGram"> the smallest n-gram to generate </param>
+ /// <param name="maxGram"> the largest n-gram to generate </param>
+ public Lucene43EdgeNGramTokenizer(LuceneVersion version, AttributeFactory factory, TextReader input, int minGram, int maxGram)
+ : this(version, factory, input, Side.FRONT, minGram, maxGram)
+ {
+ }
+
+ private void Init(LuceneVersion version, Side side, int minGram, int maxGram)
+ {
+ //if (version == null)
+ //{
+ // throw new System.ArgumentException("version must not be null");
+ //}
+
+ if (!Enum.IsDefined(typeof(Side), side))
+ {
+ throw new System.ArgumentException("sideLabel must be either front or back");
+ }
+
+ if (minGram < 1)
+ {
+ throw new System.ArgumentException("minGram must be greater than zero");
+ }
+
+ if (minGram > maxGram)
+ {
+ throw new System.ArgumentException("minGram must not be greater than maxGram");
+ }
+
+ if (version.OnOrAfter(LuceneVersion.LUCENE_44))
+ {
+ if (side == Side.BACK)
+ {
+ throw new System.ArgumentException("Side.BACK is not supported anymore as of Lucene 4.4");
+ }
+ }
+ else
+ {
+ maxGram = Math.Min(maxGram, 1024);
+ }
+
+ this.minGram = minGram;
+ this.maxGram = maxGram;
+ this.side = side;
+ this.termAtt = AddAttribute<ICharTermAttribute>();
+ this.offsetAtt = AddAttribute<IOffsetAttribute>();
+ this.posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
+ }
+
+ /// <summary>
+ /// Returns the next token in the stream, or null at EOS. </summary>
+ public override bool IncrementToken()
+ {
+ ClearAttributes();
+ // if we are just starting, read the whole input
+ if (!started)
+ {
+ started = true;
+ gramSize = minGram;
+ int limit = side == Side.FRONT ? maxGram : 1024;
+ char[] chars = new char[Math.Min(1024, limit)];
+ charsRead = 0;
+ // TODO: refactor to a shared readFully somewhere:
+ bool exhausted = false;
+ while (charsRead < limit)
+ {
+ int inc = m_input.Read(chars, charsRead, chars.Length - charsRead);
+ if (inc <= 0)
+ {
+ exhausted = true;
+ break;
+ }
+ charsRead += inc;
+ if (charsRead == chars.Length && charsRead < limit)
+ {
+ chars = ArrayUtil.Grow(chars);
+ }
+ }
+
+ inStr = new string(chars, 0, charsRead);
+ inStr = inStr.Trim();
+
+ if (!exhausted)
+ {
+ // Read extra throwaway chars so that on end() we
+ // report the correct offset:
+ var throwaway = new char[1024];
+ while (true)
+ {
+ int inc = m_input.Read(throwaway, 0, throwaway.Length);
+ if (inc <= 0)
+ {
+ break;
+ }
+ charsRead += inc;
+ }
+ }
+
+ inLen = inStr.Length;
+ if (inLen == 0)
+ {
+ return false;
+ }
+ posIncrAtt.PositionIncrement = 1;
+ }
+ else
+ {
+ posIncrAtt.PositionIncrement = 0;
+ }
+
+ // if the remaining input is too short, we can't generate any n-grams
+ if (gramSize > inLen)
+ {
+ return false;
+ }
+
+ // if we have hit the end of our n-gram size range, quit
+ if (gramSize > maxGram || gramSize > inLen)
+ {
+ return false;
+ }
+
+ // grab gramSize chars from front or back
+ int start = side == Side.FRONT ? 0 : inLen - gramSize;
+ int end = start + gramSize;
+ termAtt.SetEmpty().Append(inStr, start, end);
+ offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(end));
+ gramSize++;
+ return true;
+ }
+
+ public override void End()
+ {
+ base.End();
+ // set final offset
+ int finalOffset = CorrectOffset(charsRead);
+ this.offsetAtt.SetOffset(finalOffset, finalOffset);
+ }
+
+ public override void Reset()
+ {
+ base.Reset();
+ started = false;
+ }
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Analysis.Common/Analysis/NGram/Lucene43NGramTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/NGram/Lucene43NGramTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/NGram/Lucene43NGramTokenizer.cs
new file mode 100644
index 0000000..b806345
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/NGram/Lucene43NGramTokenizer.cs
@@ -0,0 +1,173 @@
+\ufeffusing Lucene.Net.Analysis.TokenAttributes;
+using System;
+using System.IO;
+
+namespace Lucene.Net.Analysis.NGram
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Old broken version of <see cref="NGramTokenizer"/>.
+ /// </summary>
+ [Obsolete]
+ public sealed class Lucene43NGramTokenizer : Tokenizer
+ {
+ public const int DEFAULT_MIN_NGRAM_SIZE = 1;
+ public const int DEFAULT_MAX_NGRAM_SIZE = 2;
+
+ private int minGram, maxGram;
+ private int gramSize;
+ private int pos;
+ private int inLen; // length of the input AFTER trim()
+ private int charsRead; // length of the input
+ private string inStr;
+ private bool started;
+
+ private ICharTermAttribute termAtt;
+ private IOffsetAttribute offsetAtt;
+
+ /// <summary>
+ /// Creates <see cref="Lucene43NGramTokenizer"/> with given min and max n-grams. </summary>
+ /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
+ /// <param name="minGram"> the smallest n-gram to generate </param>
+ /// <param name="maxGram"> the largest n-gram to generate </param>
+ public Lucene43NGramTokenizer(TextReader input, int minGram, int maxGram)
+ : base(input)
+ {
+ Init(minGram, maxGram);
+ }
+
+ /// <summary>
+ /// Creates <see cref="Lucene43NGramTokenizer"/> with given min and max n-grams. </summary>
+ /// <param name="factory"> <see cref="Lucene.Net.Util.AttributeSource.AttributeFactory"/> to use </param>
+ /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
+ /// <param name="minGram"> the smallest n-gram to generate </param>
+ /// <param name="maxGram"> the largest n-gram to generate </param>
+ public Lucene43NGramTokenizer(AttributeFactory factory, TextReader input, int minGram, int maxGram)
+ : base(factory, input)
+ {
+ Init(minGram, maxGram);
+ }
+
+ /// <summary>
+ /// Creates <see cref="Lucene43NGramTokenizer"/> with default min and max n-grams. </summary>
+ /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
+ public Lucene43NGramTokenizer(TextReader input)
+ : this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE)
+ {
+ }
+
+ private void Init(int minGram, int maxGram)
+ {
+ if (minGram < 1)
+ {
+ throw new System.ArgumentException("minGram must be greater than zero");
+ }
+ if (minGram > maxGram)
+ {
+ throw new System.ArgumentException("minGram must not be greater than maxGram");
+ }
+ this.minGram = minGram;
+ this.maxGram = maxGram;
+ termAtt = AddAttribute<ICharTermAttribute>();
+ offsetAtt = AddAttribute<IOffsetAttribute>();
+ }
+
+ /// <summary>
+ /// Returns the next token in the stream, or null at EOS. </summary>
+ public override bool IncrementToken()
+ {
+ ClearAttributes();
+ if (!started)
+ {
+ started = true;
+ gramSize = minGram;
+ char[] chars = new char[1024];
+ charsRead = 0;
+ // TODO: refactor to a shared readFully somewhere:
+ while (charsRead < chars.Length)
+ {
+ int inc = m_input.Read(chars, charsRead, chars.Length - charsRead);
+ if (inc == -1)
+ {
+ break;
+ }
+ charsRead += inc;
+ }
+ inStr = (new string(chars, 0, charsRead)).Trim(); // remove any trailing empty strings
+
+ if (charsRead == chars.Length)
+ {
+ // Read extra throwaway chars so that on end() we
+ // report the correct offset:
+ var throwaway = new char[1024];
+ while (true)
+ {
+ int inc = m_input.Read(throwaway, 0, throwaway.Length);
+ if (inc == -1)
+ {
+ break;
+ }
+ charsRead += inc;
+ }
+ }
+
+ inLen = inStr.Length;
+ if (inLen == 0)
+ {
+ return false;
+ }
+ }
+
+ if (pos + gramSize > inLen) // if we hit the end of the string
+ {
+ pos = 0; // reset to beginning of string
+ gramSize++; // increase n-gram size
+ if (gramSize > maxGram) // we are done
+ {
+ return false;
+ }
+ if (pos + gramSize > inLen)
+ {
+ return false;
+ }
+ }
+
+ int oldPos = pos;
+ pos++;
+ termAtt.SetEmpty().Append(inStr, oldPos, oldPos + gramSize);
+ offsetAtt.SetOffset(CorrectOffset(oldPos), CorrectOffset(oldPos + gramSize));
+ return true;
+ }
+
+ public override void End()
+ {
+ base.End();
+ // set final offset
+ int finalOffset = CorrectOffset(charsRead);
+ this.offsetAtt.SetOffset(finalOffset, finalOffset);
+ }
+
+ public override void Reset()
+ {
+ base.Reset();
+ started = false;
+ pos = 0;
+ }
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Analysis.Common/Analysis/NGram/NGramFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/NGram/NGramFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/NGram/NGramFilterFactory.cs
new file mode 100644
index 0000000..ca1d0bc
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/NGram/NGramFilterFactory.cs
@@ -0,0 +1,56 @@
+\ufeffusing Lucene.Net.Analysis.Util;
+using System.Collections.Generic;
+
+namespace Lucene.Net.Analysis.NGram
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Factory for <see cref="NGramTokenFilter"/>.
+ /// <code>
+ /// <fieldType name="text_ngrm" class="solr.TextField" positionIncrementGap="100">
+ /// <analyzer>
+ /// <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ /// <filter class="solr.NGramFilterFactory" minGramSize="1" maxGramSize="2"/>
+ /// </analyzer>
+ /// </fieldType></code>
+ /// </summary>
+ public class NGramFilterFactory : TokenFilterFactory
+ {
+ private readonly int maxGramSize;
+ private readonly int minGramSize;
+
+ /// <summary>
+ /// Creates a new <see cref="NGramFilterFactory"/> </summary>
+ public NGramFilterFactory(IDictionary<string, string> args)
+ : base(args)
+ {
+ minGramSize = GetInt(args, "minGramSize", NGramTokenFilter.DEFAULT_MIN_NGRAM_SIZE);
+ maxGramSize = GetInt(args, "maxGramSize", NGramTokenFilter.DEFAULT_MAX_NGRAM_SIZE);
+ if (args.Count > 0)
+ {
+ throw new System.ArgumentException("Unknown parameters: " + args);
+ }
+ }
+
+ public override TokenStream Create(TokenStream input)
+ {
+ return new NGramTokenFilter(m_luceneMatchVersion, input, minGramSize, maxGramSize);
+ }
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Analysis.Common/Analysis/NGram/NGramTokenFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/NGram/NGramTokenFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/NGram/NGramTokenFilter.cs
new file mode 100644
index 0000000..f1c82c5
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/NGram/NGramTokenFilter.cs
@@ -0,0 +1,252 @@
+\ufeffusing Lucene.Net.Analysis.Miscellaneous;
+using Lucene.Net.Analysis.TokenAttributes;
+using Lucene.Net.Analysis.Util;
+using Lucene.Net.Util;
+
+namespace Lucene.Net.Analysis.NGram
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Tokenizes the input into n-grams of the given size(s).
+ /// <para>You must specify the required <see cref="LuceneVersion"/> compatibility when
+ /// creating a <see cref="NGramTokenFilter"/>. As of Lucene 4.4, this token filters:
+ /// <list type="bullet">
+ /// <item>handles supplementary characters correctly,</item>
+ /// <item>emits all n-grams for the same token at the same position,</item>
+ /// <item>does not modify offsets,</item>
+ /// <item>sorts n-grams by their offset in the original token first, then
+ /// increasing length (meaning that "abc" will give "a", "ab", "abc", "b", "bc",
+ /// "c").</item>
+ /// </list>
+ /// </para>
+ /// <para>You can make this filter use the old behavior by providing a version <
+ /// <see cref="LuceneVersion.LUCENE_44"/> in the constructor but this is not recommended as
+ /// it will lead to broken <see cref="TokenStream"/>s that will cause highlighting
+ /// bugs.
+ /// </para>
+ /// <para>If you were using this <see cref="TokenFilter"/> to perform partial highlighting,
+ /// this won't work anymore since this filter doesn't update offsets. You should
+ /// modify your analysis chain to use <see cref="NGramTokenizer"/>, and potentially
+ /// override <see cref="NGramTokenizer.IsTokenChar(int)"/> to perform pre-tokenization.
+ /// </para>
+ /// </summary>
+ public sealed class NGramTokenFilter : TokenFilter
+ {
+ public const int DEFAULT_MIN_NGRAM_SIZE = 1;
+ public const int DEFAULT_MAX_NGRAM_SIZE = 2;
+
+ private readonly int minGram, maxGram;
+
+ private char[] curTermBuffer;
+ private int curTermLength;
+ private int curCodePointCount;
+ private int curGramSize;
+ private int curPos;
+ private int curPosInc, curPosLen;
+ private int tokStart;
+ private int tokEnd;
+ private bool hasIllegalOffsets; // only if the length changed before this filter
+
+ private readonly LuceneVersion version;
+ private readonly CharacterUtils charUtils;
+ private readonly ICharTermAttribute termAtt;
+ private readonly IPositionIncrementAttribute posIncAtt;
+ private readonly IPositionLengthAttribute posLenAtt;
+ private readonly IOffsetAttribute offsetAtt;
+
+ /// <summary>
+ /// Creates <see cref="NGramTokenFilter"/> with given min and max n-grams. </summary>
+ /// <param name="version"> Lucene version to enable correct position increments.
+ /// See <see cref="NGramTokenFilter"/> for details. </param>
+ /// <param name="input"> <see cref="TokenStream"/> holding the input to be tokenized </param>
+ /// <param name="minGram"> the smallest n-gram to generate </param>
+ /// <param name="maxGram"> the largest n-gram to generate </param>
+ public NGramTokenFilter(LuceneVersion version, TokenStream input, int minGram, int maxGram)
+ : base(new CodepointCountFilter(version, input, minGram, int.MaxValue))
+ {
+ this.version = version;
+ this.charUtils = version.OnOrAfter(
+#pragma warning disable 612, 618
+ LuceneVersion.LUCENE_44) ?
+#pragma warning restore 612, 618
+ CharacterUtils.GetInstance(version) : CharacterUtils.Java4Instance;
+ if (minGram < 1)
+ {
+ throw new System.ArgumentException("minGram must be greater than zero");
+ }
+ if (minGram > maxGram)
+ {
+ throw new System.ArgumentException("minGram must not be greater than maxGram");
+ }
+ this.minGram = minGram;
+ this.maxGram = maxGram;
+#pragma warning disable 612, 618
+ if (version.OnOrAfter(LuceneVersion.LUCENE_44))
+#pragma warning restore 612, 618
+ {
+ posIncAtt = AddAttribute<IPositionIncrementAttribute>();
+ posLenAtt = AddAttribute<IPositionLengthAttribute>();
+ }
+ else
+ {
+ posIncAtt = new PositionIncrementAttributeAnonymousInnerClassHelper(this);
+ posLenAtt = new PositionLengthAttributeAnonymousInnerClassHelper(this);
+ }
+ termAtt = AddAttribute<ICharTermAttribute>();
+ offsetAtt = AddAttribute<IOffsetAttribute>();
+ }
+
+ private class PositionIncrementAttributeAnonymousInnerClassHelper : PositionIncrementAttribute
+ {
+ private readonly NGramTokenFilter outerInstance;
+
+ public PositionIncrementAttributeAnonymousInnerClassHelper(NGramTokenFilter outerInstance)
+ {
+ this.outerInstance = outerInstance;
+ }
+
+ public override int PositionIncrement
+ {
+ set
+ {
+ }
+ get
+ {
+ return 0;
+ }
+ }
+ }
+
+ private class PositionLengthAttributeAnonymousInnerClassHelper : PositionLengthAttribute
+ {
+ private readonly NGramTokenFilter outerInstance;
+
+ public PositionLengthAttributeAnonymousInnerClassHelper(NGramTokenFilter outerInstance)
+ {
+ this.outerInstance = outerInstance;
+ }
+
+ public override int PositionLength
+ {
+ set
+ {
+ }
+ get
+ {
+ return 0;
+ }
+ }
+ }
+
+ /// <summary>
+ /// Creates <see cref="NGramTokenFilter"/> with default min and max n-grams. </summary>
+ /// <param name="version"> Lucene version to enable correct position increments.
+ /// See <see cref="NGramTokenFilter"/> for details. </param>
+ /// <param name="input"> <see cref="TokenStream"/> holding the input to be tokenized </param>
+ public NGramTokenFilter(LuceneVersion version, TokenStream input)
+ : this(version, input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE)
+ {
+ }
+
+ /// <summary>
+ /// Returns the next token in the stream, or null at EOS.
+ /// </summary>
+ public override sealed bool IncrementToken()
+ {
+ while (true)
+ {
+ if (curTermBuffer == null)
+ {
+ if (!m_input.IncrementToken())
+ {
+ return false;
+ }
+ else
+ {
+ curTermBuffer = (char[])termAtt.Buffer.Clone();
+ curTermLength = termAtt.Length;
+ curCodePointCount = charUtils.CodePointCount(termAtt.ToString());
+ curGramSize = minGram;
+ curPos = 0;
+ curPosInc = posIncAtt.PositionIncrement;
+ curPosLen = posLenAtt.PositionLength;
+ tokStart = offsetAtt.StartOffset;
+ tokEnd = offsetAtt.EndOffset;
+ // if length by start + end offsets doesn't match the term text then assume
+ // this is a synonym and don't adjust the offsets.
+ hasIllegalOffsets = (tokStart + curTermLength) != tokEnd;
+ }
+ }
+#pragma warning disable 612, 618
+ if (version.OnOrAfter(LuceneVersion.LUCENE_44))
+#pragma warning restore 612, 618
+ {
+ if (curGramSize > maxGram || (curPos + curGramSize) > curCodePointCount)
+ {
+ ++curPos;
+ curGramSize = minGram;
+ }
+ if ((curPos + curGramSize) <= curCodePointCount)
+ {
+ ClearAttributes();
+ int start = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
+ int end = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
+ termAtt.CopyBuffer(curTermBuffer, start, end - start);
+ posIncAtt.PositionIncrement = curPosInc;
+ curPosInc = 0;
+ posLenAtt.PositionLength = curPosLen;
+ offsetAtt.SetOffset(tokStart, tokEnd);
+ curGramSize++;
+ return true;
+ }
+ }
+ else
+ {
+ while (curGramSize <= maxGram)
+ {
+ while (curPos + curGramSize <= curTermLength) // while there is input
+ {
+ ClearAttributes();
+ termAtt.CopyBuffer(curTermBuffer, curPos, curGramSize);
+ if (hasIllegalOffsets)
+ {
+ offsetAtt.SetOffset(tokStart, tokEnd);
+ }
+ else
+ {
+ offsetAtt.SetOffset(tokStart + curPos, tokStart + curPos + curGramSize);
+ }
+ curPos++;
+ return true;
+ }
+ curGramSize++; // increase n-gram size
+ curPos = 0;
+ }
+ }
+ curTermBuffer = null;
+ }
+ }
+
+ public override void Reset()
+ {
+ base.Reset();
+ curTermBuffer = null;
+ }
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Analysis.Common/Analysis/NGram/NGramTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/NGram/NGramTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/NGram/NGramTokenizer.cs
new file mode 100644
index 0000000..b1845c8
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/NGram/NGramTokenizer.cs
@@ -0,0 +1,319 @@
+\ufeffusing Lucene.Net.Analysis.TokenAttributes;
+using Lucene.Net.Analysis.Util;
+using Lucene.Net.Support;
+using Lucene.Net.Util;
+using System;
+using System.Diagnostics;
+using System.IO;
+
+namespace Lucene.Net.Analysis.NGram
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Tokenizes the input into n-grams of the given size(s).
+ /// <para>On the contrary to <see cref="NGramTokenFilter"/>, this class sets offsets so
+ /// that characters between startOffset and endOffset in the original stream are
+ /// the same as the term chars.
+ /// </para>
+ /// <para>For example, "abcde" would be tokenized as (minGram=2, maxGram=3):
+ /// <list type="table">
+ /// <listheader>
+ /// <term>Term</term>
+ /// <term>Position increment</term>
+ /// <term>Position length</term>
+ /// <term>Offsets</term>
+ /// </listheader>
+ /// <item>
+ /// <term>ab</term>
+ /// <term>1</term>
+ /// <term>1</term>
+ /// <term>[0,2[</term>
+ /// </item>
+ /// <item>
+ /// <term>abc</term>
+ /// <term>1</term>
+ /// <term>1</term>
+ /// <term>[0,3[</term>
+ /// </item>
+ /// <item>
+ /// <term>bc</term>
+ /// <term>1</term>
+ /// <term>1</term>
+ /// <term>[1,3[</term>
+ /// </item>
+ /// <item>
+ /// <term>bcd</term>
+ /// <term>1</term>
+ /// <term>1</term>
+ /// <term>[1,4[</term>
+ /// </item>
+ /// <item>
+ /// <term>cd</term>
+ /// <term>1</term>
+ /// <term>1</term>
+ /// <term>[2,4[</term>
+ /// </item>
+ /// <item>
+ /// <term>cde</term>
+ /// <term>1</term>
+ /// <term>1</term>
+ /// <term>[2,5[</term>
+ /// </item>
+ /// <item>
+ /// <term>de</term>
+ /// <term>1</term>
+ /// <term>1</term>
+ /// <term>[3,5[</term>
+ /// </item>
+ /// </list>
+ /// </para>
+ /// <para>This tokenizer changed a lot in Lucene 4.4 in order to:
+ /// <list type="bullet">
+ /// <item>tokenize in a streaming fashion to support streams which are larger
+ /// than 1024 chars (limit of the previous version),</item>
+ /// <item>count grams based on unicode code points instead of java chars (and
+ /// never split in the middle of surrogate pairs),</item>
+ /// <item>give the ability to pre-tokenize the stream (<see cref="IsTokenChar(int)"/>)
+ /// before computing n-grams.</item>
+ /// </list>
+ /// </para>
+ /// <para>Additionally, this class doesn't trim trailing whitespaces and emits
+ /// tokens in a different order, tokens are now emitted by increasing start
+ /// offsets while they used to be emitted by increasing lengths (which prevented
+ /// from supporting large input streams).
+ /// </para>
+ /// <para>Although <b style="color:red">highly</b> discouraged, it is still possible
+ /// to use the old behavior through <see cref="Lucene43NGramTokenizer"/>.
+ /// </para>
+ /// </summary>
+ // non-sealed to allow for overriding IsTokenChar, but all other methods should be sealed
+ public class NGramTokenizer : Tokenizer
+ {
+ public const int DEFAULT_MIN_NGRAM_SIZE = 1;
+ public const int DEFAULT_MAX_NGRAM_SIZE = 2;
+
+ private CharacterUtils charUtils;
+ private CharacterUtils.CharacterBuffer charBuffer;
+ private int[] buffer; // like charBuffer, but converted to code points
+ private int bufferStart, bufferEnd; // remaining slice in buffer
+ private int offset;
+ private int gramSize;
+ private int minGram, maxGram;
+ private bool exhausted;
+ private int lastCheckedChar; // last offset in the buffer that we checked
+ private int lastNonTokenChar; // last offset that we found to not be a token char
+ private bool edgesOnly; // leading edges n-grams only
+
+ private ICharTermAttribute termAtt;
+ private IPositionIncrementAttribute posIncAtt;
+ private IPositionLengthAttribute posLenAtt;
+ private IOffsetAttribute offsetAtt;
+
+ internal NGramTokenizer(LuceneVersion version, TextReader input, int minGram, int maxGram, bool edgesOnly)
+ : base(input)
+ {
+ Init(version, minGram, maxGram, edgesOnly);
+ }
+
+ /// <summary>
+ /// Creates <see cref="NGramTokenizer"/> with given min and max n-grams. </summary>
+ /// <param name="version"> the lucene compatibility version </param>
+ /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
+ /// <param name="minGram"> the smallest n-gram to generate </param>
+ /// <param name="maxGram"> the largest n-gram to generate </param>
+ public NGramTokenizer(LuceneVersion version, TextReader input, int minGram, int maxGram)
+ : this(version, input, minGram, maxGram, false)
+ {
+ }
+
+ internal NGramTokenizer(LuceneVersion version, AttributeFactory factory, TextReader input, int minGram, int maxGram, bool edgesOnly)
+ : base(factory, input)
+ {
+ Init(version, minGram, maxGram, edgesOnly);
+ }
+
+ /// <summary>
+ /// Creates <see cref="NGramTokenizer"/> with given min and max n-grams. </summary>
+ /// <param name="version"> the lucene compatibility version </param>
+ /// <param name="factory"> <see cref="AttributeSource.AttributeFactory"/> to use </param>
+ /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
+ /// <param name="minGram"> the smallest n-gram to generate </param>
+ /// <param name="maxGram"> the largest n-gram to generate </param>
+ public NGramTokenizer(LuceneVersion version, AttributeFactory factory, TextReader input, int minGram, int maxGram)
+ : this(version, factory, input, minGram, maxGram, false)
+ {
+ }
+
+ /// <summary>
+ /// Creates <see cref="NGramTokenizer"/> with default min and max n-grams. </summary>
+ /// <param name="version"> the lucene compatibility version </param>
+ /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
+ public NGramTokenizer(LuceneVersion version, TextReader input)
+ : this(version, input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE)
+ {
+ }
+
+ private void Init(LuceneVersion version, int minGram, int maxGram, bool edgesOnly)
+ {
+#pragma warning disable 612, 618
+ if (!version.OnOrAfter(LuceneVersion.LUCENE_44))
+#pragma warning restore 612, 618
+ {
+ throw new System.ArgumentException("This class only works with Lucene 4.4+. To emulate the old (broken) behavior of NGramTokenizer, use Lucene43NGramTokenizer/Lucene43EdgeNGramTokenizer");
+ }
+#pragma warning disable 612, 618
+ charUtils = version.OnOrAfter(LuceneVersion.LUCENE_44) ?
+#pragma warning restore 612, 618
+ CharacterUtils.GetInstance(version) : CharacterUtils.Java4Instance;
+ if (minGram < 1)
+ {
+ throw new System.ArgumentException("minGram must be greater than zero");
+ }
+ if (minGram > maxGram)
+ {
+ throw new System.ArgumentException("minGram must not be greater than maxGram");
+ }
+ termAtt = AddAttribute<ICharTermAttribute>();
+ posIncAtt = AddAttribute<IPositionIncrementAttribute>();
+ posLenAtt = AddAttribute<IPositionLengthAttribute>();
+ offsetAtt = AddAttribute<IOffsetAttribute>();
+ this.minGram = minGram;
+ this.maxGram = maxGram;
+ this.edgesOnly = edgesOnly;
+ charBuffer = CharacterUtils.NewCharacterBuffer(2 * maxGram + 1024); // 2 * maxGram in case all code points require 2 chars and + 1024 for buffering to not keep polling the Reader
+ buffer = new int[charBuffer.Buffer.Length];
+
+ // Make the term att large enough
+ termAtt.ResizeBuffer(2 * maxGram);
+ }
+
+ public override sealed bool IncrementToken()
+ {
+ ClearAttributes();
+
+ // termination of this loop is guaranteed by the fact that every iteration
+ // either advances the buffer (calls consumes()) or increases gramSize
+ while (true)
+ {
+ // compact
+ if (bufferStart >= bufferEnd - maxGram - 1 && !exhausted)
+ {
+ Array.Copy(buffer, bufferStart, buffer, 0, bufferEnd - bufferStart);
+ bufferEnd -= bufferStart;
+ lastCheckedChar -= bufferStart;
+ lastNonTokenChar -= bufferStart;
+ bufferStart = 0;
+
+ // fill in remaining space
+ exhausted = !charUtils.Fill(charBuffer, m_input, buffer.Length - bufferEnd);
+ // convert to code points
+ bufferEnd += charUtils.ToCodePoints(charBuffer.Buffer, 0, charBuffer.Length, buffer, bufferEnd);
+ }
+
+ // should we go to the next offset?
+ if (gramSize > maxGram || (bufferStart + gramSize) > bufferEnd)
+ {
+ if (bufferStart + 1 + minGram > bufferEnd)
+ {
+ Debug.Assert(exhausted);
+ return false;
+ }
+ Consume();
+ gramSize = minGram;
+ }
+
+ UpdateLastNonTokenChar();
+
+ // retry if the token to be emitted was going to not only contain token chars
+ bool termContainsNonTokenChar = lastNonTokenChar >= bufferStart && lastNonTokenChar < (bufferStart + gramSize);
+ bool isEdgeAndPreviousCharIsTokenChar = edgesOnly && lastNonTokenChar != bufferStart - 1;
+ if (termContainsNonTokenChar || isEdgeAndPreviousCharIsTokenChar)
+ {
+ Consume();
+ gramSize = minGram;
+ continue;
+ }
+
+ int length = charUtils.ToChars(buffer, bufferStart, gramSize, termAtt.Buffer, 0);
+ termAtt.Length = length;
+ posIncAtt.PositionIncrement = 1;
+ posLenAtt.PositionLength = 1;
+ offsetAtt.SetOffset(CorrectOffset(offset), CorrectOffset(offset + length));
+ ++gramSize;
+ return true;
+ }
+ }
+
+ private void UpdateLastNonTokenChar()
+ {
+ int termEnd = bufferStart + gramSize - 1;
+ if (termEnd > lastCheckedChar)
+ {
+ for (int i = termEnd; i > lastCheckedChar; --i)
+ {
+ if (!IsTokenChar(buffer[i]))
+ {
+ lastNonTokenChar = i;
+ break;
+ }
+ }
+ lastCheckedChar = termEnd;
+ }
+ }
+
+ /// <summary>
+ /// Consume one code point. </summary>
+ private void Consume()
+ {
+ offset += Character.CharCount(buffer[bufferStart++]);
+ }
+
+ /// <summary>
+ /// Only collect characters which satisfy this condition. </summary>
+ protected virtual bool IsTokenChar(int chr)
+ {
+ return true;
+ }
+
+ public override sealed void End()
+ {
+ base.End();
+ Debug.Assert(bufferStart <= bufferEnd);
+ int endOffset = offset;
+ for (int i = bufferStart; i < bufferEnd; ++i)
+ {
+ endOffset += Character.CharCount(buffer[i]);
+ }
+ endOffset = CorrectOffset(endOffset);
+ // set final offset
+ offsetAtt.SetOffset(endOffset, endOffset);
+ }
+
+ public override sealed void Reset()
+ {
+ base.Reset();
+ bufferStart = bufferEnd = buffer.Length;
+ lastNonTokenChar = lastCheckedChar = bufferStart - 1;
+ offset = 0;
+ gramSize = minGram;
+ exhausted = false;
+ charBuffer.Reset();
+ }
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Analysis.Common/Analysis/NGram/NGramTokenizerFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/NGram/NGramTokenizerFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/NGram/NGramTokenizerFactory.cs
new file mode 100644
index 0000000..cf25b65
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/NGram/NGramTokenizerFactory.cs
@@ -0,0 +1,70 @@
+\ufeffusing Lucene.Net.Analysis.Util;
+using Lucene.Net.Util;
+using System.Collections.Generic;
+using System.IO;
+
+namespace Lucene.Net.Analysis.NGram
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Factory for <see cref="NGramTokenizer"/>.
+ /// <code>
+ /// <fieldType name="text_ngrm" class="solr.TextField" positionIncrementGap="100">
+ /// <analyzer>
+ /// <tokenizer class="solr.NGramTokenizerFactory" minGramSize="1" maxGramSize="2"/>
+ /// </analyzer>
+ /// </fieldType></code>
+ /// </summary>
+ public class NGramTokenizerFactory : TokenizerFactory
+ {
+ private readonly int maxGramSize;
+ private readonly int minGramSize;
+
+ /// <summary>
+ /// Creates a new <see cref="NGramTokenizerFactory"/> </summary>
+ public NGramTokenizerFactory(IDictionary<string, string> args)
+ : base(args)
+ {
+ minGramSize = GetInt(args, "minGramSize", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
+ maxGramSize = GetInt(args, "maxGramSize", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
+ if (args.Count > 0)
+ {
+ throw new System.ArgumentException("Unknown parameters: " + args);
+ }
+ }
+
+ /// <summary>
+ /// Creates the <see cref="TokenStream"/> of n-grams from the given <see cref="TextReader"/> and <see cref="AttributeSource.AttributeFactory"/>. </summary>
+ public override Tokenizer Create(AttributeSource.AttributeFactory factory, TextReader input)
+ {
+#pragma warning disable 612, 618
+ if (m_luceneMatchVersion.OnOrAfter(LuceneVersion.LUCENE_44))
+#pragma warning restore 612, 618
+ {
+ return new NGramTokenizer(m_luceneMatchVersion, factory, input, minGramSize, maxGramSize);
+ }
+ else
+ {
+#pragma warning disable 612, 618
+ return new Lucene43NGramTokenizer(factory, input, minGramSize, maxGramSize);
+#pragma warning restore 612, 618
+ }
+ }
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramFilterFactory.cs
deleted file mode 100644
index 70b44d3..0000000
--- a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramFilterFactory.cs
+++ /dev/null
@@ -1,60 +0,0 @@
-\ufeffusing Lucene.Net.Analysis.Util;
-using System.Collections.Generic;
-
-namespace Lucene.Net.Analysis.NGram
-{
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
- /// <summary>
- /// Creates new instances of <see cref="EdgeNGramTokenFilter"/>.
- /// <code>
- /// <fieldType name="text_edgngrm" class="solr.TextField" positionIncrementGap="100">
- /// <analyzer>
- /// <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- /// <filter class="solr.EdgeNGramFilterFactory" minGramSize="1" maxGramSize="1"/>
- /// </analyzer>
- /// </fieldType></code>
- /// </summary>
- public class EdgeNGramFilterFactory : TokenFilterFactory
- {
- private readonly int maxGramSize;
- private readonly int minGramSize;
- private readonly string side;
-
- /// <summary>
- /// Creates a new <see cref="EdgeNGramFilterFactory"/> </summary>
- public EdgeNGramFilterFactory(IDictionary<string, string> args)
- : base(args)
- {
- minGramSize = GetInt(args, "minGramSize", EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE);
- maxGramSize = GetInt(args, "maxGramSize", EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE);
- side = Get(args, "side", EdgeNGramTokenFilter.Side.FRONT.ToString());
- if (args.Count > 0)
- {
- throw new System.ArgumentException("Unknown parameters: " + args);
- }
- }
-
- public override TokenStream Create(TokenStream input)
- {
-#pragma warning disable 612, 618
- return new EdgeNGramTokenFilter(m_luceneMatchVersion, input, side, minGramSize, maxGramSize);
-#pragma warning restore 612, 618
- }
- }
-}
\ No newline at end of file