You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by ni...@apache.org on 2017/02/04 20:32:57 UTC
[38/39] lucenenet git commit: Lucene.Net.Analysis.Ngram - renamed
NGram in Git
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramTokenFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramTokenFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramTokenFilter.cs
deleted file mode 100644
index 8cf8172..0000000
--- a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramTokenFilter.cs
+++ /dev/null
@@ -1,245 +0,0 @@
-\ufeffusing Lucene.Net.Analysis.TokenAttributes;
-using Lucene.Net.Analysis.Util;
-using Lucene.Net.Util;
-using System;
-
-namespace Lucene.Net.Analysis.NGram
-{
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
- /// <summary>
- /// Tokenizes the given token into n-grams of given size(s).
- /// <para>
- /// This <see cref="TokenFilter"/> create n-grams from the beginning edge or ending edge of a input token.
- /// </para>
- /// <para>As of Lucene 4.4, this filter does not support
- /// <see cref="Side.BACK"/> (you can use <see cref="Reverse.ReverseStringFilter"/> up-front and
- /// afterward to get the same behavior), handles supplementary characters
- /// correctly and does not update offsets anymore.
- /// </para>
- /// </summary>
- public sealed class EdgeNGramTokenFilter : TokenFilter
- {
- public const Side DEFAULT_SIDE = Side.FRONT;
- public const int DEFAULT_MAX_GRAM_SIZE = 1;
- public const int DEFAULT_MIN_GRAM_SIZE = 1;
-
- /// <summary>
- /// Specifies which side of the input the n-gram should be generated from </summary>
- public enum Side
- {
- /// <summary>
- /// Get the n-gram from the front of the input </summary>
- FRONT,
-
- /// <summary>
- /// Get the n-gram from the end of the input </summary>
- [System.Obsolete]
- BACK,
- }
-
- /// <summary>
- /// Get the appropriate <see cref="Side"/> from a string
- /// </summary>
- public static Side GetSide(string sideName)
- {
- Side result;
- if (!Enum.TryParse(sideName, true, out result))
- {
- result = Side.FRONT;
- }
- return result;
- }
-
- private readonly LuceneVersion version;
- private readonly CharacterUtils charUtils;
- private readonly int minGram;
- private readonly int maxGram;
- private Side side;
- private char[] curTermBuffer;
- private int curTermLength;
- private int curCodePointCount;
- private int curGramSize;
- private int tokStart;
- private int tokEnd; // only used if the length changed before this filter
- private bool updateOffsets; // never if the length changed before this filter
- private int savePosIncr;
- private int savePosLen;
-
- private readonly ICharTermAttribute termAtt;
- private readonly IOffsetAttribute offsetAtt;
- private readonly IPositionIncrementAttribute posIncrAtt;
- private readonly IPositionLengthAttribute posLenAtt;
-
- /// <summary>
- /// Creates <see cref="EdgeNGramTokenFilter"/> that can generate n-grams in the sizes of the given range
- /// </summary>
- /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
- /// <param name="input"> <see cref="TokenStream"/> holding the input to be tokenized </param>
- /// <param name="side"> the <see cref="Side"/> from which to chop off an n-gram </param>
- /// <param name="minGram"> the smallest n-gram to generate </param>
- /// <param name="maxGram"> the largest n-gram to generate </param>
- [Obsolete]
- public EdgeNGramTokenFilter(LuceneVersion version, TokenStream input, Side side, int minGram, int maxGram)
- : base(input)
- {
-
- //if (version == null)
- //{
- // throw new System.ArgumentException("version must not be null");
- //}
-
- if (version.OnOrAfter(LuceneVersion.LUCENE_44) && side == Side.BACK)
- {
- throw new System.ArgumentException("Side.BACK is not supported anymore as of Lucene 4.4, use ReverseStringFilter up-front and afterward");
- }
-
- if (!Enum.IsDefined(typeof(Side), side))
- {
- throw new System.ArgumentException("sideLabel must be either front or back");
- }
-
- if (minGram < 1)
- {
- throw new System.ArgumentException("minGram must be greater than zero");
- }
-
- if (minGram > maxGram)
- {
- throw new System.ArgumentException("minGram must not be greater than maxGram");
- }
-
- this.version = version;
- this.charUtils = version.OnOrAfter(LuceneVersion.LUCENE_44) ? CharacterUtils.GetInstance(version) : CharacterUtils.Java4Instance;
- this.minGram = minGram;
- this.maxGram = maxGram;
- this.side = side;
-
- this.termAtt = AddAttribute<ICharTermAttribute>();
- this.offsetAtt = AddAttribute<IOffsetAttribute>();
- this.posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
- this.posLenAtt = AddAttribute<IPositionLengthAttribute>();
- }
-
- /// <summary>
- /// Creates <see cref="EdgeNGramTokenFilter"/> that can generate n-grams in the sizes of the given range
- /// </summary>
- /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
- /// <param name="input"> <see cref="TokenStream"/> holding the input to be tokenized </param>
- /// <param name="sideLabel"> the name of the <see cref="Side"/> from which to chop off an n-gram </param>
- /// <param name="minGram"> the smallest n-gram to generate </param>
- /// <param name="maxGram"> the largest n-gram to generate </param>
- [Obsolete]
- public EdgeNGramTokenFilter(LuceneVersion version, TokenStream input, string sideLabel, int minGram, int maxGram)
- : this(version, input, GetSide(sideLabel), minGram, maxGram)
- {
- }
-
- /// <summary>
- /// Creates <see cref="EdgeNGramTokenFilter"/> that can generate n-grams in the sizes of the given range
- /// </summary>
- /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
- /// <param name="input"> <see cref="TokenStream"/> holding the input to be tokenized </param>
- /// <param name="minGram"> the smallest n-gram to generate </param>
- /// <param name="maxGram"> the largest n-gram to generate </param>
- public EdgeNGramTokenFilter(LuceneVersion version, TokenStream input, int minGram, int maxGram)
-#pragma warning disable 612, 618
- : this(version, input, Side.FRONT, minGram, maxGram)
-#pragma warning restore 612, 618
- {
- }
-
- public override sealed bool IncrementToken()
- {
- while (true)
- {
- if (curTermBuffer == null)
- {
- if (!m_input.IncrementToken())
- {
- return false;
- }
- else
- {
- curTermBuffer = (char[])termAtt.Buffer.Clone();
- curTermLength = termAtt.Length;
- curCodePointCount = charUtils.CodePointCount(termAtt.ToString());
- curGramSize = minGram;
- tokStart = offsetAtt.StartOffset;
- tokEnd = offsetAtt.EndOffset;
-#pragma warning disable 612, 618
- if (version.OnOrAfter(LuceneVersion.LUCENE_44))
-#pragma warning restore 612, 618
- {
- // Never update offsets
- updateOffsets = false;
- }
- else
- {
- // if length by start + end offsets doesn't match the term text then assume
- // this is a synonym and don't adjust the offsets.
- updateOffsets = (tokStart + curTermLength) == tokEnd;
- }
- savePosIncr += posIncrAtt.PositionIncrement;
- savePosLen = posLenAtt.PositionLength;
- }
- }
- if (curGramSize <= maxGram) // if we have hit the end of our n-gram size range, quit
- {
- if (curGramSize <= curCodePointCount) // if the remaining input is too short, we can't generate any n-grams
- {
- // grab gramSize chars from front or back
- int start = side == Side.FRONT ? 0 : charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, curTermLength, -curGramSize);
- int end = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
- ClearAttributes();
- if (updateOffsets)
- {
- offsetAtt.SetOffset(tokStart + start, tokStart + end);
- }
- else
- {
- offsetAtt.SetOffset(tokStart, tokEnd);
- }
- // first ngram gets increment, others don't
- if (curGramSize == minGram)
- {
- posIncrAtt.PositionIncrement = savePosIncr;
- savePosIncr = 0;
- }
- else
- {
- posIncrAtt.PositionIncrement = 0;
- }
- posLenAtt.PositionLength = savePosLen;
- termAtt.CopyBuffer(curTermBuffer, start, end - start);
- curGramSize++;
- return true;
- }
- }
- curTermBuffer = null;
- }
- }
-
- public override void Reset()
- {
- base.Reset();
- curTermBuffer = null;
- savePosIncr = 0;
- }
- }
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramTokenizer.cs
deleted file mode 100644
index ed2cb3d..0000000
--- a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramTokenizer.cs
+++ /dev/null
@@ -1,72 +0,0 @@
-\ufeffusing Lucene.Net.Util;
-using System.IO;
-
-namespace Lucene.Net.Analysis.NGram
-{
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
- /// <summary>
- /// Tokenizes the input from an edge into n-grams of given size(s).
- /// <para>
- /// This <see cref="Tokenizer"/> create n-grams from the beginning edge or ending edge of a input token.
- /// </para>
- /// <para>As of Lucene 4.4, this tokenizer
- /// <list type="bullet">
- /// <item>can handle <code>maxGram</code> larger than 1024 chars, but beware that this will result in increased memory usage</item>
- /// <item>doesn't trim the input,</item>
- /// <item>sets position increments equal to 1 instead of 1 for the first token and 0 for all other ones</item>
- /// <item>doesn't support backward n-grams anymore.</item>
- /// <item>supports <see cref="Util.CharTokenizer.IsTokenChar(int)"/> pre-tokenization,</item>
- /// <item>correctly handles supplementary characters.</item>
- /// </list>
- /// </para>
- /// <para>Although <b style="color:red">highly</b> discouraged, it is still possible
- /// to use the old behavior through <see cref="Lucene43EdgeNGramTokenizer"/>.
- /// </para>
- /// </summary>
- public class EdgeNGramTokenizer : NGramTokenizer
- {
- public const int DEFAULT_MAX_GRAM_SIZE = 1;
- public const int DEFAULT_MIN_GRAM_SIZE = 1;
-
- /// <summary>
- /// Creates <see cref="EdgeNGramTokenizer"/> that can generate n-grams in the sizes of the given range
- /// </summary>
- /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
- /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
- /// <param name="minGram"> the smallest n-gram to generate </param>
- /// <param name="maxGram"> the largest n-gram to generate </param>
- public EdgeNGramTokenizer(LuceneVersion version, TextReader input, int minGram, int maxGram)
- : base(version, input, minGram, maxGram, true)
- {
- }
-
- /// <summary>
- /// Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
- /// </summary>
- /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
- /// <param name="factory"> <see cref="AttributeSource.AttributeFactory"/> to use </param>
- /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
- /// <param name="minGram"> the smallest n-gram to generate </param>
- /// <param name="maxGram"> the largest n-gram to generate </param>
- public EdgeNGramTokenizer(LuceneVersion version, AttributeSource.AttributeFactory factory, TextReader input, int minGram, int maxGram)
- : base(version, factory, input, minGram, maxGram, true)
- {
- }
- }
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramTokenizerFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramTokenizerFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramTokenizerFactory.cs
deleted file mode 100644
index 00325f5..0000000
--- a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramTokenizerFactory.cs
+++ /dev/null
@@ -1,75 +0,0 @@
-\ufeffusing Lucene.Net.Analysis.Util;
-using Lucene.Net.Util;
-using System;
-using System.Collections.Generic;
-using System.IO;
-
-namespace Lucene.Net.Analysis.NGram
-{
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
- /// <summary>
- /// Creates new instances of <see cref="EdgeNGramTokenizer"/>.
- /// <code>
- /// <fieldType name="text_edgngrm" class="solr.TextField" positionIncrementGap="100">
- /// <analyzer>
- /// <tokenizer class="solr.EdgeNGramTokenizerFactory" minGramSize="1" maxGramSize="1"/>
- /// </analyzer>
- /// </fieldType></code>
- /// </summary>
- public class EdgeNGramTokenizerFactory : TokenizerFactory
- {
- private readonly int maxGramSize;
- private readonly int minGramSize;
- private readonly string side;
-
- /// <summary>
- /// Creates a new <see cref="EdgeNGramTokenizerFactory"/> </summary>
- public EdgeNGramTokenizerFactory(IDictionary<string, string> args) : base(args)
- {
- minGramSize = GetInt(args, "minGramSize", EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE);
- maxGramSize = GetInt(args, "maxGramSize", EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE);
- side = Get(args, "side", EdgeNGramTokenFilter.Side.FRONT.ToString());
- if (args.Count > 0)
- {
- throw new System.ArgumentException("Unknown parameters: " + args);
- }
- }
-
- public override Tokenizer Create(AttributeSource.AttributeFactory factory, TextReader input)
- {
-#pragma warning disable 612, 618
- if (m_luceneMatchVersion.OnOrAfter(LuceneVersion.LUCENE_44))
-#pragma warning restore 612, 618
- {
- EdgeNGramTokenFilter.Side sideEnum;
- if (!Enum.TryParse(this.side, true, out sideEnum))
- {
- throw new System.ArgumentException(typeof(EdgeNGramTokenizer).Name + " does not support backward n-grams as of Lucene 4.4");
- }
- return new EdgeNGramTokenizer(m_luceneMatchVersion, input, minGramSize, maxGramSize);
- }
- else
- {
-#pragma warning disable 612, 618
- return new Lucene43EdgeNGramTokenizer(m_luceneMatchVersion, input, side, minGramSize, maxGramSize);
-#pragma warning restore 612, 618
- }
- }
- }
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Analysis.Common/Analysis/Ngram/Lucene43EdgeNGramTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/Lucene43EdgeNGramTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/Lucene43EdgeNGramTokenizer.cs
deleted file mode 100644
index 4dadbed..0000000
--- a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/Lucene43EdgeNGramTokenizer.cs
+++ /dev/null
@@ -1,297 +0,0 @@
-\ufeffusing Lucene.Net.Analysis.TokenAttributes;
-using Lucene.Net.Util;
-using System;
-using System.IO;
-
-namespace Lucene.Net.Analysis.NGram
-{
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
- /// <summary>
- /// Old version of <see cref="EdgeNGramTokenizer"/> which doesn't handle correctly
- /// supplementary characters.
- /// </summary>
- [Obsolete]
- public sealed class Lucene43EdgeNGramTokenizer : Tokenizer
- {
- public const Side DEFAULT_SIDE = Side.FRONT;
- public const int DEFAULT_MAX_GRAM_SIZE = 1;
- public const int DEFAULT_MIN_GRAM_SIZE = 1;
-
- private ICharTermAttribute termAtt;
- private IOffsetAttribute offsetAtt;
- private IPositionIncrementAttribute posIncrAtt;
-
- /// <summary>
- /// Specifies which side of the input the n-gram should be generated from </summary>
- public enum Side
- {
- /// <summary>
- /// Get the n-gram from the front of the input </summary>
- FRONT,
-
- /// <summary>
- /// Get the n-gram from the end of the input </summary>
- BACK,
- }
-
- // Get the appropriate Side from a string
- public static Side GetSide(string sideName)
- {
- Side result;
- if (!Enum.TryParse(sideName, true, out result))
- {
- result = Side.FRONT;
- }
- return result;
- }
-
- private int minGram;
- private int maxGram;
- private int gramSize;
- private Side side;
- private bool started;
- private int inLen; // length of the input AFTER trim()
- private int charsRead; // length of the input
- private string inStr;
-
-
- /// <summary>
- /// Creates <see cref="Lucene43EdgeNGramTokenizer"/> that can generate n-grams in the sizes of the given range
- /// </summary>
- /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
- /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
- /// <param name="side"> the <see cref="Side"/> from which to chop off an n-gram </param>
- /// <param name="minGram"> the smallest n-gram to generate </param>
- /// <param name="maxGram"> the largest n-gram to generate </param>
- [Obsolete]
- public Lucene43EdgeNGramTokenizer(LuceneVersion version, TextReader input, Side side, int minGram, int maxGram)
- : base(input)
- {
- Init(version, side, minGram, maxGram);
- }
-
- /// <summary>
- /// Creates <see cref="Lucene43EdgeNGramTokenizer"/> that can generate n-grams in the sizes of the given range
- /// </summary>
- /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
- /// <param name="factory"> <see cref="AttributeSource.AttributeFactory"/> to use </param>
- /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
- /// <param name="side"> the <see cref="Side"/> from which to chop off an n-gram </param>
- /// <param name="minGram"> the smallest n-gram to generate </param>
- /// <param name="maxGram"> the largest n-gram to generate </param>
- [Obsolete]
- public Lucene43EdgeNGramTokenizer(LuceneVersion version, AttributeFactory factory, TextReader input, Side side, int minGram, int maxGram)
- : base(factory, input)
- {
- Init(version, side, minGram, maxGram);
- }
-
- /// <summary>
- /// Creates <see cref="Lucene43EdgeNGramTokenizer"/> that can generate n-grams in the sizes of the given range
- /// </summary>
- /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
- /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
- /// <param name="sideLabel"> the name of the <see cref="Side"/> from which to chop off an n-gram </param>
- /// <param name="minGram"> the smallest n-gram to generate </param>
- /// <param name="maxGram"> the largest n-gram to generate </param>
- [Obsolete]
- public Lucene43EdgeNGramTokenizer(LuceneVersion version, TextReader input, string sideLabel, int minGram, int maxGram)
- : this(version, input, GetSide(sideLabel), minGram, maxGram)
- {
- }
-
- /// <summary>
- /// Creates <see cref="Lucene43EdgeNGramTokenizer"/> that can generate n-grams in the sizes of the given range
- /// </summary>
- /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
- /// <param name="factory"> <see cref="AttributeSource.AttributeFactory"/> to use </param>
- /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
- /// <param name="sideLabel"> the name of the <see cref="Side"/> from which to chop off an n-gram </param>
- /// <param name="minGram"> the smallest n-gram to generate </param>
- /// <param name="maxGram"> the largest n-gram to generate </param>
- [Obsolete]
- public Lucene43EdgeNGramTokenizer(LuceneVersion version, AttributeFactory factory, TextReader input, string sideLabel, int minGram, int maxGram)
- : this(version, factory, input, GetSide(sideLabel), minGram, maxGram)
- {
- }
-
- /// <summary>
- /// Creates <see cref="Lucene43EdgeNGramTokenizer"/> that can generate n-grams in the sizes of the given range
- /// </summary>
- /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
- /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
- /// <param name="minGram"> the smallest n-gram to generate </param>
- /// <param name="maxGram"> the largest n-gram to generate </param>
- public Lucene43EdgeNGramTokenizer(LuceneVersion version, TextReader input, int minGram, int maxGram)
- : this(version, input, Side.FRONT, minGram, maxGram)
- {
- }
-
- /// <summary>
- /// Creates <see cref="Lucene43EdgeNGramTokenizer"/> that can generate n-grams in the sizes of the given range
- /// </summary>
- /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
- /// <param name="factory"> <see cref="AttributeSource.AttributeFactory"/> to use </param>
- /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
- /// <param name="minGram"> the smallest n-gram to generate </param>
- /// <param name="maxGram"> the largest n-gram to generate </param>
- public Lucene43EdgeNGramTokenizer(LuceneVersion version, AttributeFactory factory, TextReader input, int minGram, int maxGram)
- : this(version, factory, input, Side.FRONT, minGram, maxGram)
- {
- }
-
- private void Init(LuceneVersion version, Side side, int minGram, int maxGram)
- {
- //if (version == null)
- //{
- // throw new System.ArgumentException("version must not be null");
- //}
-
- if (!Enum.IsDefined(typeof(Side), side))
- {
- throw new System.ArgumentException("sideLabel must be either front or back");
- }
-
- if (minGram < 1)
- {
- throw new System.ArgumentException("minGram must be greater than zero");
- }
-
- if (minGram > maxGram)
- {
- throw new System.ArgumentException("minGram must not be greater than maxGram");
- }
-
- if (version.OnOrAfter(LuceneVersion.LUCENE_44))
- {
- if (side == Side.BACK)
- {
- throw new System.ArgumentException("Side.BACK is not supported anymore as of Lucene 4.4");
- }
- }
- else
- {
- maxGram = Math.Min(maxGram, 1024);
- }
-
- this.minGram = minGram;
- this.maxGram = maxGram;
- this.side = side;
- this.termAtt = AddAttribute<ICharTermAttribute>();
- this.offsetAtt = AddAttribute<IOffsetAttribute>();
- this.posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
- }
-
- /// <summary>
- /// Returns the next token in the stream, or null at EOS. </summary>
- public override bool IncrementToken()
- {
- ClearAttributes();
- // if we are just starting, read the whole input
- if (!started)
- {
- started = true;
- gramSize = minGram;
- int limit = side == Side.FRONT ? maxGram : 1024;
- char[] chars = new char[Math.Min(1024, limit)];
- charsRead = 0;
- // TODO: refactor to a shared readFully somewhere:
- bool exhausted = false;
- while (charsRead < limit)
- {
- int inc = m_input.Read(chars, charsRead, chars.Length - charsRead);
- if (inc <= 0)
- {
- exhausted = true;
- break;
- }
- charsRead += inc;
- if (charsRead == chars.Length && charsRead < limit)
- {
- chars = ArrayUtil.Grow(chars);
- }
- }
-
- inStr = new string(chars, 0, charsRead);
- inStr = inStr.Trim();
-
- if (!exhausted)
- {
- // Read extra throwaway chars so that on end() we
- // report the correct offset:
- var throwaway = new char[1024];
- while (true)
- {
- int inc = m_input.Read(throwaway, 0, throwaway.Length);
- if (inc <= 0)
- {
- break;
- }
- charsRead += inc;
- }
- }
-
- inLen = inStr.Length;
- if (inLen == 0)
- {
- return false;
- }
- posIncrAtt.PositionIncrement = 1;
- }
- else
- {
- posIncrAtt.PositionIncrement = 0;
- }
-
- // if the remaining input is too short, we can't generate any n-grams
- if (gramSize > inLen)
- {
- return false;
- }
-
- // if we have hit the end of our n-gram size range, quit
- if (gramSize > maxGram || gramSize > inLen)
- {
- return false;
- }
-
- // grab gramSize chars from front or back
- int start = side == Side.FRONT ? 0 : inLen - gramSize;
- int end = start + gramSize;
- termAtt.SetEmpty().Append(inStr, start, end);
- offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(end));
- gramSize++;
- return true;
- }
-
- public override void End()
- {
- base.End();
- // set final offset
- int finalOffset = CorrectOffset(charsRead);
- this.offsetAtt.SetOffset(finalOffset, finalOffset);
- }
-
- public override void Reset()
- {
- base.Reset();
- started = false;
- }
- }
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Analysis.Common/Analysis/Ngram/Lucene43NGramTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/Lucene43NGramTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/Lucene43NGramTokenizer.cs
deleted file mode 100644
index b806345..0000000
--- a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/Lucene43NGramTokenizer.cs
+++ /dev/null
@@ -1,173 +0,0 @@
-\ufeffusing Lucene.Net.Analysis.TokenAttributes;
-using System;
-using System.IO;
-
-namespace Lucene.Net.Analysis.NGram
-{
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
- /// <summary>
- /// Old broken version of <see cref="NGramTokenizer"/>.
- /// </summary>
- [Obsolete]
- public sealed class Lucene43NGramTokenizer : Tokenizer
- {
- public const int DEFAULT_MIN_NGRAM_SIZE = 1;
- public const int DEFAULT_MAX_NGRAM_SIZE = 2;
-
- private int minGram, maxGram;
- private int gramSize;
- private int pos;
- private int inLen; // length of the input AFTER trim()
- private int charsRead; // length of the input
- private string inStr;
- private bool started;
-
- private ICharTermAttribute termAtt;
- private IOffsetAttribute offsetAtt;
-
- /// <summary>
- /// Creates <see cref="Lucene43NGramTokenizer"/> with given min and max n-grams. </summary>
- /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
- /// <param name="minGram"> the smallest n-gram to generate </param>
- /// <param name="maxGram"> the largest n-gram to generate </param>
- public Lucene43NGramTokenizer(TextReader input, int minGram, int maxGram)
- : base(input)
- {
- Init(minGram, maxGram);
- }
-
- /// <summary>
- /// Creates <see cref="Lucene43NGramTokenizer"/> with given min and max n-grams. </summary>
- /// <param name="factory"> <see cref="Lucene.Net.Util.AttributeSource.AttributeFactory"/> to use </param>
- /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
- /// <param name="minGram"> the smallest n-gram to generate </param>
- /// <param name="maxGram"> the largest n-gram to generate </param>
- public Lucene43NGramTokenizer(AttributeFactory factory, TextReader input, int minGram, int maxGram)
- : base(factory, input)
- {
- Init(minGram, maxGram);
- }
-
- /// <summary>
- /// Creates <see cref="Lucene43NGramTokenizer"/> with default min and max n-grams. </summary>
- /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
- public Lucene43NGramTokenizer(TextReader input)
- : this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE)
- {
- }
-
- private void Init(int minGram, int maxGram)
- {
- if (minGram < 1)
- {
- throw new System.ArgumentException("minGram must be greater than zero");
- }
- if (minGram > maxGram)
- {
- throw new System.ArgumentException("minGram must not be greater than maxGram");
- }
- this.minGram = minGram;
- this.maxGram = maxGram;
- termAtt = AddAttribute<ICharTermAttribute>();
- offsetAtt = AddAttribute<IOffsetAttribute>();
- }
-
- /// <summary>
- /// Returns the next token in the stream, or null at EOS. </summary>
- public override bool IncrementToken()
- {
- ClearAttributes();
- if (!started)
- {
- started = true;
- gramSize = minGram;
- char[] chars = new char[1024];
- charsRead = 0;
- // TODO: refactor to a shared readFully somewhere:
- while (charsRead < chars.Length)
- {
- int inc = m_input.Read(chars, charsRead, chars.Length - charsRead);
- if (inc == -1)
- {
- break;
- }
- charsRead += inc;
- }
- inStr = (new string(chars, 0, charsRead)).Trim(); // remove any trailing empty strings
-
- if (charsRead == chars.Length)
- {
- // Read extra throwaway chars so that on end() we
- // report the correct offset:
- var throwaway = new char[1024];
- while (true)
- {
- int inc = m_input.Read(throwaway, 0, throwaway.Length);
- if (inc == -1)
- {
- break;
- }
- charsRead += inc;
- }
- }
-
- inLen = inStr.Length;
- if (inLen == 0)
- {
- return false;
- }
- }
-
- if (pos + gramSize > inLen) // if we hit the end of the string
- {
- pos = 0; // reset to beginning of string
- gramSize++; // increase n-gram size
- if (gramSize > maxGram) // we are done
- {
- return false;
- }
- if (pos + gramSize > inLen)
- {
- return false;
- }
- }
-
- int oldPos = pos;
- pos++;
- termAtt.SetEmpty().Append(inStr, oldPos, oldPos + gramSize);
- offsetAtt.SetOffset(CorrectOffset(oldPos), CorrectOffset(oldPos + gramSize));
- return true;
- }
-
- public override void End()
- {
- base.End();
- // set final offset
- int finalOffset = CorrectOffset(charsRead);
- this.offsetAtt.SetOffset(finalOffset, finalOffset);
- }
-
- public override void Reset()
- {
- base.Reset();
- started = false;
- pos = 0;
- }
- }
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramFilterFactory.cs
deleted file mode 100644
index ca1d0bc..0000000
--- a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramFilterFactory.cs
+++ /dev/null
@@ -1,56 +0,0 @@
-\ufeffusing Lucene.Net.Analysis.Util;
-using System.Collections.Generic;
-
-namespace Lucene.Net.Analysis.NGram
-{
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
- /// <summary>
- /// Factory for <see cref="NGramTokenFilter"/>.
- /// <code>
- /// <fieldType name="text_ngrm" class="solr.TextField" positionIncrementGap="100">
- /// <analyzer>
- /// <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- /// <filter class="solr.NGramFilterFactory" minGramSize="1" maxGramSize="2"/>
- /// </analyzer>
- /// </fieldType></code>
- /// </summary>
- public class NGramFilterFactory : TokenFilterFactory
- {
- private readonly int maxGramSize;
- private readonly int minGramSize;
-
- /// <summary>
- /// Creates a new <see cref="NGramFilterFactory"/> </summary>
- public NGramFilterFactory(IDictionary<string, string> args)
- : base(args)
- {
- minGramSize = GetInt(args, "minGramSize", NGramTokenFilter.DEFAULT_MIN_NGRAM_SIZE);
- maxGramSize = GetInt(args, "maxGramSize", NGramTokenFilter.DEFAULT_MAX_NGRAM_SIZE);
- if (args.Count > 0)
- {
- throw new System.ArgumentException("Unknown parameters: " + args);
- }
- }
-
- public override TokenStream Create(TokenStream input)
- {
- return new NGramTokenFilter(m_luceneMatchVersion, input, minGramSize, maxGramSize);
- }
- }
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenFilter.cs
deleted file mode 100644
index f1c82c5..0000000
--- a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenFilter.cs
+++ /dev/null
@@ -1,252 +0,0 @@
-\ufeffusing Lucene.Net.Analysis.Miscellaneous;
-using Lucene.Net.Analysis.TokenAttributes;
-using Lucene.Net.Analysis.Util;
-using Lucene.Net.Util;
-
-namespace Lucene.Net.Analysis.NGram
-{
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
- /// <summary>
- /// Tokenizes the input into n-grams of the given size(s).
- /// <para>You must specify the required <see cref="LuceneVersion"/> compatibility when
- /// creating a <see cref="NGramTokenFilter"/>. As of Lucene 4.4, this token filters:
- /// <list type="bullet">
- /// <item>handles supplementary characters correctly,</item>
- /// <item>emits all n-grams for the same token at the same position,</item>
- /// <item>does not modify offsets,</item>
- /// <item>sorts n-grams by their offset in the original token first, then
- /// increasing length (meaning that "abc" will give "a", "ab", "abc", "b", "bc",
- /// "c").</item>
- /// </list>
- /// </para>
- /// <para>You can make this filter use the old behavior by providing a version <
- /// <see cref="LuceneVersion.LUCENE_44"/> in the constructor but this is not recommended as
- /// it will lead to broken <see cref="TokenStream"/>s that will cause highlighting
- /// bugs.
- /// </para>
- /// <para>If you were using this <see cref="TokenFilter"/> to perform partial highlighting,
- /// this won't work anymore since this filter doesn't update offsets. You should
- /// modify your analysis chain to use <see cref="NGramTokenizer"/>, and potentially
- /// override <see cref="NGramTokenizer.IsTokenChar(int)"/> to perform pre-tokenization.
- /// </para>
- /// </summary>
- public sealed class NGramTokenFilter : TokenFilter
- {
- public const int DEFAULT_MIN_NGRAM_SIZE = 1;
- public const int DEFAULT_MAX_NGRAM_SIZE = 2;
-
- private readonly int minGram, maxGram;
-
- private char[] curTermBuffer;
- private int curTermLength;
- private int curCodePointCount;
- private int curGramSize;
- private int curPos;
- private int curPosInc, curPosLen;
- private int tokStart;
- private int tokEnd;
- private bool hasIllegalOffsets; // only if the length changed before this filter
-
- private readonly LuceneVersion version;
- private readonly CharacterUtils charUtils;
- private readonly ICharTermAttribute termAtt;
- private readonly IPositionIncrementAttribute posIncAtt;
- private readonly IPositionLengthAttribute posLenAtt;
- private readonly IOffsetAttribute offsetAtt;
-
- /// <summary>
- /// Creates <see cref="NGramTokenFilter"/> with given min and max n-grams. </summary>
- /// <param name="version"> Lucene version to enable correct position increments.
- /// See <see cref="NGramTokenFilter"/> for details. </param>
- /// <param name="input"> <see cref="TokenStream"/> holding the input to be tokenized </param>
- /// <param name="minGram"> the smallest n-gram to generate </param>
- /// <param name="maxGram"> the largest n-gram to generate </param>
- public NGramTokenFilter(LuceneVersion version, TokenStream input, int minGram, int maxGram)
- : base(new CodepointCountFilter(version, input, minGram, int.MaxValue))
- {
- this.version = version;
- this.charUtils = version.OnOrAfter(
-#pragma warning disable 612, 618
- LuceneVersion.LUCENE_44) ?
-#pragma warning restore 612, 618
- CharacterUtils.GetInstance(version) : CharacterUtils.Java4Instance;
- if (minGram < 1)
- {
- throw new System.ArgumentException("minGram must be greater than zero");
- }
- if (minGram > maxGram)
- {
- throw new System.ArgumentException("minGram must not be greater than maxGram");
- }
- this.minGram = minGram;
- this.maxGram = maxGram;
-#pragma warning disable 612, 618
- if (version.OnOrAfter(LuceneVersion.LUCENE_44))
-#pragma warning restore 612, 618
- {
- posIncAtt = AddAttribute<IPositionIncrementAttribute>();
- posLenAtt = AddAttribute<IPositionLengthAttribute>();
- }
- else
- {
- posIncAtt = new PositionIncrementAttributeAnonymousInnerClassHelper(this);
- posLenAtt = new PositionLengthAttributeAnonymousInnerClassHelper(this);
- }
- termAtt = AddAttribute<ICharTermAttribute>();
- offsetAtt = AddAttribute<IOffsetAttribute>();
- }
-
- private class PositionIncrementAttributeAnonymousInnerClassHelper : PositionIncrementAttribute
- {
- private readonly NGramTokenFilter outerInstance;
-
- public PositionIncrementAttributeAnonymousInnerClassHelper(NGramTokenFilter outerInstance)
- {
- this.outerInstance = outerInstance;
- }
-
- public override int PositionIncrement
- {
- set
- {
- }
- get
- {
- return 0;
- }
- }
- }
-
- private class PositionLengthAttributeAnonymousInnerClassHelper : PositionLengthAttribute
- {
- private readonly NGramTokenFilter outerInstance;
-
- public PositionLengthAttributeAnonymousInnerClassHelper(NGramTokenFilter outerInstance)
- {
- this.outerInstance = outerInstance;
- }
-
- public override int PositionLength
- {
- set
- {
- }
- get
- {
- return 0;
- }
- }
- }
-
- /// <summary>
- /// Creates <see cref="NGramTokenFilter"/> with default min and max n-grams. </summary>
- /// <param name="version"> Lucene version to enable correct position increments.
- /// See <see cref="NGramTokenFilter"/> for details. </param>
- /// <param name="input"> <see cref="TokenStream"/> holding the input to be tokenized </param>
- public NGramTokenFilter(LuceneVersion version, TokenStream input)
- : this(version, input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE)
- {
- }
-
- /// <summary>
- /// Returns the next token in the stream, or null at EOS.
- /// </summary>
- public override sealed bool IncrementToken()
- {
- while (true)
- {
- if (curTermBuffer == null)
- {
- if (!m_input.IncrementToken())
- {
- return false;
- }
- else
- {
- curTermBuffer = (char[])termAtt.Buffer.Clone();
- curTermLength = termAtt.Length;
- curCodePointCount = charUtils.CodePointCount(termAtt.ToString());
- curGramSize = minGram;
- curPos = 0;
- curPosInc = posIncAtt.PositionIncrement;
- curPosLen = posLenAtt.PositionLength;
- tokStart = offsetAtt.StartOffset;
- tokEnd = offsetAtt.EndOffset;
- // if length by start + end offsets doesn't match the term text then assume
- // this is a synonym and don't adjust the offsets.
- hasIllegalOffsets = (tokStart + curTermLength) != tokEnd;
- }
- }
-#pragma warning disable 612, 618
- if (version.OnOrAfter(LuceneVersion.LUCENE_44))
-#pragma warning restore 612, 618
- {
- if (curGramSize > maxGram || (curPos + curGramSize) > curCodePointCount)
- {
- ++curPos;
- curGramSize = minGram;
- }
- if ((curPos + curGramSize) <= curCodePointCount)
- {
- ClearAttributes();
- int start = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
- int end = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
- termAtt.CopyBuffer(curTermBuffer, start, end - start);
- posIncAtt.PositionIncrement = curPosInc;
- curPosInc = 0;
- posLenAtt.PositionLength = curPosLen;
- offsetAtt.SetOffset(tokStart, tokEnd);
- curGramSize++;
- return true;
- }
- }
- else
- {
- while (curGramSize <= maxGram)
- {
- while (curPos + curGramSize <= curTermLength) // while there is input
- {
- ClearAttributes();
- termAtt.CopyBuffer(curTermBuffer, curPos, curGramSize);
- if (hasIllegalOffsets)
- {
- offsetAtt.SetOffset(tokStart, tokEnd);
- }
- else
- {
- offsetAtt.SetOffset(tokStart + curPos, tokStart + curPos + curGramSize);
- }
- curPos++;
- return true;
- }
- curGramSize++; // increase n-gram size
- curPos = 0;
- }
- }
- curTermBuffer = null;
- }
- }
-
- public override void Reset()
- {
- base.Reset();
- curTermBuffer = null;
- }
- }
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenizer.cs
deleted file mode 100644
index b1845c8..0000000
--- a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenizer.cs
+++ /dev/null
@@ -1,319 +0,0 @@
-\ufeffusing Lucene.Net.Analysis.TokenAttributes;
-using Lucene.Net.Analysis.Util;
-using Lucene.Net.Support;
-using Lucene.Net.Util;
-using System;
-using System.Diagnostics;
-using System.IO;
-
-namespace Lucene.Net.Analysis.NGram
-{
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
- /// <summary>
- /// Tokenizes the input into n-grams of the given size(s).
- /// <para>On the contrary to <see cref="NGramTokenFilter"/>, this class sets offsets so
- /// that characters between startOffset and endOffset in the original stream are
- /// the same as the term chars.
- /// </para>
- /// <para>For example, "abcde" would be tokenized as (minGram=2, maxGram=3):
- /// <list type="table">
- /// <listheader>
- /// <term>Term</term>
- /// <term>Position increment</term>
- /// <term>Position length</term>
- /// <term>Offsets</term>
- /// </listheader>
- /// <item>
- /// <term>ab</term>
- /// <term>1</term>
- /// <term>1</term>
- /// <term>[0,2[</term>
- /// </item>
- /// <item>
- /// <term>abc</term>
- /// <term>1</term>
- /// <term>1</term>
- /// <term>[0,3[</term>
- /// </item>
- /// <item>
- /// <term>bc</term>
- /// <term>1</term>
- /// <term>1</term>
- /// <term>[1,3[</term>
- /// </item>
- /// <item>
- /// <term>bcd</term>
- /// <term>1</term>
- /// <term>1</term>
- /// <term>[1,4[</term>
- /// </item>
- /// <item>
- /// <term>cd</term>
- /// <term>1</term>
- /// <term>1</term>
- /// <term>[2,4[</term>
- /// </item>
- /// <item>
- /// <term>cde</term>
- /// <term>1</term>
- /// <term>1</term>
- /// <term>[2,5[</term>
- /// </item>
- /// <item>
- /// <term>de</term>
- /// <term>1</term>
- /// <term>1</term>
- /// <term>[3,5[</term>
- /// </item>
- /// </list>
- /// </para>
- /// <para>This tokenizer changed a lot in Lucene 4.4 in order to:
- /// <list type="bullet">
- /// <item>tokenize in a streaming fashion to support streams which are larger
- /// than 1024 chars (limit of the previous version),</item>
- /// <item>count grams based on unicode code points instead of java chars (and
- /// never split in the middle of surrogate pairs),</item>
- /// <item>give the ability to pre-tokenize the stream (<see cref="IsTokenChar(int)"/>)
- /// before computing n-grams.</item>
- /// </list>
- /// </para>
- /// <para>Additionally, this class doesn't trim trailing whitespaces and emits
- /// tokens in a different order, tokens are now emitted by increasing start
- /// offsets while they used to be emitted by increasing lengths (which prevented
- /// from supporting large input streams).
- /// </para>
- /// <para>Although <b style="color:red">highly</b> discouraged, it is still possible
- /// to use the old behavior through <see cref="Lucene43NGramTokenizer"/>.
- /// </para>
- /// </summary>
- // non-sealed to allow for overriding IsTokenChar, but all other methods should be sealed
- public class NGramTokenizer : Tokenizer
- {
- public const int DEFAULT_MIN_NGRAM_SIZE = 1;
- public const int DEFAULT_MAX_NGRAM_SIZE = 2;
-
- private CharacterUtils charUtils;
- private CharacterUtils.CharacterBuffer charBuffer;
- private int[] buffer; // like charBuffer, but converted to code points
- private int bufferStart, bufferEnd; // remaining slice in buffer
- private int offset;
- private int gramSize;
- private int minGram, maxGram;
- private bool exhausted;
- private int lastCheckedChar; // last offset in the buffer that we checked
- private int lastNonTokenChar; // last offset that we found to not be a token char
- private bool edgesOnly; // leading edges n-grams only
-
- private ICharTermAttribute termAtt;
- private IPositionIncrementAttribute posIncAtt;
- private IPositionLengthAttribute posLenAtt;
- private IOffsetAttribute offsetAtt;
-
- internal NGramTokenizer(LuceneVersion version, TextReader input, int minGram, int maxGram, bool edgesOnly)
- : base(input)
- {
- Init(version, minGram, maxGram, edgesOnly);
- }
-
- /// <summary>
- /// Creates <see cref="NGramTokenizer"/> with given min and max n-grams. </summary>
- /// <param name="version"> the lucene compatibility version </param>
- /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
- /// <param name="minGram"> the smallest n-gram to generate </param>
- /// <param name="maxGram"> the largest n-gram to generate </param>
- public NGramTokenizer(LuceneVersion version, TextReader input, int minGram, int maxGram)
- : this(version, input, minGram, maxGram, false)
- {
- }
-
- internal NGramTokenizer(LuceneVersion version, AttributeFactory factory, TextReader input, int minGram, int maxGram, bool edgesOnly)
- : base(factory, input)
- {
- Init(version, minGram, maxGram, edgesOnly);
- }
-
- /// <summary>
- /// Creates <see cref="NGramTokenizer"/> with given min and max n-grams. </summary>
- /// <param name="version"> the lucene compatibility version </param>
- /// <param name="factory"> <see cref="AttributeSource.AttributeFactory"/> to use </param>
- /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
- /// <param name="minGram"> the smallest n-gram to generate </param>
- /// <param name="maxGram"> the largest n-gram to generate </param>
- public NGramTokenizer(LuceneVersion version, AttributeFactory factory, TextReader input, int minGram, int maxGram)
- : this(version, factory, input, minGram, maxGram, false)
- {
- }
-
- /// <summary>
- /// Creates <see cref="NGramTokenizer"/> with default min and max n-grams. </summary>
- /// <param name="version"> the lucene compatibility version </param>
- /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
- public NGramTokenizer(LuceneVersion version, TextReader input)
- : this(version, input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE)
- {
- }
-
- private void Init(LuceneVersion version, int minGram, int maxGram, bool edgesOnly)
- {
-#pragma warning disable 612, 618
- if (!version.OnOrAfter(LuceneVersion.LUCENE_44))
-#pragma warning restore 612, 618
- {
- throw new System.ArgumentException("This class only works with Lucene 4.4+. To emulate the old (broken) behavior of NGramTokenizer, use Lucene43NGramTokenizer/Lucene43EdgeNGramTokenizer");
- }
-#pragma warning disable 612, 618
- charUtils = version.OnOrAfter(LuceneVersion.LUCENE_44) ?
-#pragma warning restore 612, 618
- CharacterUtils.GetInstance(version) : CharacterUtils.Java4Instance;
- if (minGram < 1)
- {
- throw new System.ArgumentException("minGram must be greater than zero");
- }
- if (minGram > maxGram)
- {
- throw new System.ArgumentException("minGram must not be greater than maxGram");
- }
- termAtt = AddAttribute<ICharTermAttribute>();
- posIncAtt = AddAttribute<IPositionIncrementAttribute>();
- posLenAtt = AddAttribute<IPositionLengthAttribute>();
- offsetAtt = AddAttribute<IOffsetAttribute>();
- this.minGram = minGram;
- this.maxGram = maxGram;
- this.edgesOnly = edgesOnly;
- charBuffer = CharacterUtils.NewCharacterBuffer(2 * maxGram + 1024); // 2 * maxGram in case all code points require 2 chars and + 1024 for buffering to not keep polling the Reader
- buffer = new int[charBuffer.Buffer.Length];
-
- // Make the term att large enough
- termAtt.ResizeBuffer(2 * maxGram);
- }
-
- public override sealed bool IncrementToken()
- {
- ClearAttributes();
-
- // termination of this loop is guaranteed by the fact that every iteration
- // either advances the buffer (calls consumes()) or increases gramSize
- while (true)
- {
- // compact
- if (bufferStart >= bufferEnd - maxGram - 1 && !exhausted)
- {
- Array.Copy(buffer, bufferStart, buffer, 0, bufferEnd - bufferStart);
- bufferEnd -= bufferStart;
- lastCheckedChar -= bufferStart;
- lastNonTokenChar -= bufferStart;
- bufferStart = 0;
-
- // fill in remaining space
- exhausted = !charUtils.Fill(charBuffer, m_input, buffer.Length - bufferEnd);
- // convert to code points
- bufferEnd += charUtils.ToCodePoints(charBuffer.Buffer, 0, charBuffer.Length, buffer, bufferEnd);
- }
-
- // should we go to the next offset?
- if (gramSize > maxGram || (bufferStart + gramSize) > bufferEnd)
- {
- if (bufferStart + 1 + minGram > bufferEnd)
- {
- Debug.Assert(exhausted);
- return false;
- }
- Consume();
- gramSize = minGram;
- }
-
- UpdateLastNonTokenChar();
-
- // retry if the token to be emitted was going to not only contain token chars
- bool termContainsNonTokenChar = lastNonTokenChar >= bufferStart && lastNonTokenChar < (bufferStart + gramSize);
- bool isEdgeAndPreviousCharIsTokenChar = edgesOnly && lastNonTokenChar != bufferStart - 1;
- if (termContainsNonTokenChar || isEdgeAndPreviousCharIsTokenChar)
- {
- Consume();
- gramSize = minGram;
- continue;
- }
-
- int length = charUtils.ToChars(buffer, bufferStart, gramSize, termAtt.Buffer, 0);
- termAtt.Length = length;
- posIncAtt.PositionIncrement = 1;
- posLenAtt.PositionLength = 1;
- offsetAtt.SetOffset(CorrectOffset(offset), CorrectOffset(offset + length));
- ++gramSize;
- return true;
- }
- }
-
- private void UpdateLastNonTokenChar()
- {
- int termEnd = bufferStart + gramSize - 1;
- if (termEnd > lastCheckedChar)
- {
- for (int i = termEnd; i > lastCheckedChar; --i)
- {
- if (!IsTokenChar(buffer[i]))
- {
- lastNonTokenChar = i;
- break;
- }
- }
- lastCheckedChar = termEnd;
- }
- }
-
- /// <summary>
- /// Consume one code point. </summary>
- private void Consume()
- {
- offset += Character.CharCount(buffer[bufferStart++]);
- }
-
- /// <summary>
- /// Only collect characters which satisfy this condition. </summary>
- protected virtual bool IsTokenChar(int chr)
- {
- return true;
- }
-
- public override sealed void End()
- {
- base.End();
- Debug.Assert(bufferStart <= bufferEnd);
- int endOffset = offset;
- for (int i = bufferStart; i < bufferEnd; ++i)
- {
- endOffset += Character.CharCount(buffer[i]);
- }
- endOffset = CorrectOffset(endOffset);
- // set final offset
- offsetAtt.SetOffset(endOffset, endOffset);
- }
-
- public override sealed void Reset()
- {
- base.Reset();
- bufferStart = bufferEnd = buffer.Length;
- lastNonTokenChar = lastCheckedChar = bufferStart - 1;
- offset = 0;
- gramSize = minGram;
- exhausted = false;
- charBuffer.Reset();
- }
- }
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenizerFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenizerFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenizerFactory.cs
deleted file mode 100644
index cf25b65..0000000
--- a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenizerFactory.cs
+++ /dev/null
@@ -1,70 +0,0 @@
-\ufeffusing Lucene.Net.Analysis.Util;
-using Lucene.Net.Util;
-using System.Collections.Generic;
-using System.IO;
-
-namespace Lucene.Net.Analysis.NGram
-{
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
- /// <summary>
- /// Factory for <see cref="NGramTokenizer"/>.
- /// <code>
- /// <fieldType name="text_ngrm" class="solr.TextField" positionIncrementGap="100">
- /// <analyzer>
- /// <tokenizer class="solr.NGramTokenizerFactory" minGramSize="1" maxGramSize="2"/>
- /// </analyzer>
- /// </fieldType></code>
- /// </summary>
- public class NGramTokenizerFactory : TokenizerFactory
- {
- private readonly int maxGramSize;
- private readonly int minGramSize;
-
- /// <summary>
- /// Creates a new <see cref="NGramTokenizerFactory"/> </summary>
- public NGramTokenizerFactory(IDictionary<string, string> args)
- : base(args)
- {
- minGramSize = GetInt(args, "minGramSize", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
- maxGramSize = GetInt(args, "maxGramSize", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
- if (args.Count > 0)
- {
- throw new System.ArgumentException("Unknown parameters: " + args);
- }
- }
-
- /// <summary>
- /// Creates the <see cref="TokenStream"/> of n-grams from the given <see cref="TextReader"/> and <see cref="AttributeSource.AttributeFactory"/>. </summary>
- public override Tokenizer Create(AttributeSource.AttributeFactory factory, TextReader input)
- {
-#pragma warning disable 612, 618
- if (m_luceneMatchVersion.OnOrAfter(LuceneVersion.LUCENE_44))
-#pragma warning restore 612, 618
- {
- return new NGramTokenizer(m_luceneMatchVersion, factory, input, minGramSize, maxGramSize);
- }
- else
- {
-#pragma warning disable 612, 618
- return new Lucene43NGramTokenizer(factory, input, minGramSize, maxGramSize);
-#pragma warning restore 612, 618
- }
- }
- }
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Tests.Analysis.Common/Analysis/NGram/EdgeNGramTokenFilterTest.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/NGram/EdgeNGramTokenFilterTest.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/NGram/EdgeNGramTokenFilterTest.cs
new file mode 100644
index 0000000..ea6fbd7
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/NGram/EdgeNGramTokenFilterTest.cs
@@ -0,0 +1,390 @@
+\ufeffusing Lucene.Net.Analysis.Core;
+using Lucene.Net.Analysis.Miscellaneous;
+using Lucene.Net.Analysis.Shingle;
+using Lucene.Net.Analysis.TokenAttributes;
+using Lucene.Net.Support;
+using Lucene.Net.Util;
+using NUnit.Framework;
+using System;
+using System.IO;
+
+namespace Lucene.Net.Analysis.NGram
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Tests <seealso cref="EdgeNGramTokenFilter"/> for correctness.
+ /// </summary>
+ public class EdgeNGramTokenFilterTest : BaseTokenStreamTestCase
+ {
+ private TokenStream input;
+
+ public override void SetUp()
+ {
+ base.SetUp();
+ input = new MockTokenizer(new StringReader("abcde"), MockTokenizer.WHITESPACE, false);
+ }
+
+ [Test]
+ public virtual void TestInvalidInput()
+ {
+ bool gotException = false;
+ try
+ {
+#pragma warning disable 612, 618
+ new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, EdgeNGramTokenFilter.Side.FRONT, 0, 0);
+#pragma warning restore 612, 618
+ }
+ catch (System.ArgumentException)
+ {
+ gotException = true;
+ }
+ assertTrue(gotException);
+ }
+
+ [Test]
+ public virtual void TestInvalidInput2()
+ {
+ bool gotException = false;
+ try
+ {
+#pragma warning disable 612, 618
+ new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, EdgeNGramTokenFilter.Side.FRONT, 2, 1);
+#pragma warning restore 612, 618
+ }
+ catch (System.ArgumentException)
+ {
+ gotException = true;
+ }
+ assertTrue(gotException);
+ }
+
+ [Test]
+ public virtual void TestInvalidInput3()
+ {
+ bool gotException = false;
+ try
+ {
+#pragma warning disable 612, 618
+ new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, EdgeNGramTokenFilter.Side.FRONT, -1, 2);
+#pragma warning restore 612, 618
+ }
+ catch (System.ArgumentException)
+ {
+ gotException = true;
+ }
+ assertTrue(gotException);
+ }
+
+ [Test]
+ public virtual void TestFrontUnigram()
+ {
+#pragma warning disable 612, 618
+ EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, EdgeNGramTokenFilter.Side.FRONT, 1, 1);
+#pragma warning restore 612, 618
+ AssertTokenStreamContents(tokenizer, new string[] { "a" }, new int[] { 0 }, new int[] { 5 });
+ }
+
+ [Test]
+ public virtual void TestBackUnigram()
+ {
+#pragma warning disable 612, 618
+ EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(LuceneVersion.LUCENE_43, input, EdgeNGramTokenFilter.Side.BACK, 1, 1);
+#pragma warning restore 612, 618
+ AssertTokenStreamContents(tokenizer, new string[] { "e" }, new int[] { 4 }, new int[] { 5 });
+ }
+
+ [Test]
+ public virtual void TestOversizedNgrams()
+ {
+#pragma warning disable 612, 618
+ EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, EdgeNGramTokenFilter.Side.FRONT, 6, 6);
+#pragma warning restore 612, 618
+ AssertTokenStreamContents(tokenizer, new string[0], new int[0], new int[0]);
+ }
+
+ [Test]
+ public virtual void TestFrontRangeOfNgrams()
+ {
+#pragma warning disable 612, 618
+ EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, EdgeNGramTokenFilter.Side.FRONT, 1, 3);
+#pragma warning restore 612, 618
+ AssertTokenStreamContents(tokenizer, new string[] { "a", "ab", "abc" }, new int[] { 0, 0, 0 }, new int[] { 5, 5, 5 });
+ }
+
+ [Test]
+ public virtual void TestBackRangeOfNgrams()
+ {
+#pragma warning disable 612, 618
+ EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(LuceneVersion.LUCENE_43, input, EdgeNGramTokenFilter.Side.BACK, 1, 3);
+#pragma warning restore 612, 618
+ AssertTokenStreamContents(tokenizer, new string[] { "e", "de", "cde" }, new int[] { 4, 3, 2 }, new int[] { 5, 5, 5 }, null, null, null, null, false);
+ }
+
+ [Test]
+ public virtual void TestFilterPositions()
+ {
+ TokenStream ts = new MockTokenizer(new StringReader("abcde vwxyz"), MockTokenizer.WHITESPACE, false);
+#pragma warning disable 612, 618
+ EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, ts, EdgeNGramTokenFilter.Side.FRONT, 1, 3);
+#pragma warning restore 612, 618
+ AssertTokenStreamContents(tokenizer, new string[] { "a", "ab", "abc", "v", "vw", "vwx" }, new int[] { 0, 0, 0, 6, 6, 6 }, new int[] { 5, 5, 5, 11, 11, 11 }, null, new int[] { 1, 0, 0, 1, 0, 0 }, null, null, false);
+ }
+
+ private class PositionFilter : TokenFilter
+ {
+
+ internal readonly IPositionIncrementAttribute posIncrAtt;
+ internal bool started;
+
+ internal PositionFilter(TokenStream input) : base(input)
+ {
+ posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
+ }
+
+ public override sealed bool IncrementToken()
+ {
+ if (m_input.IncrementToken())
+ {
+ if (started)
+ {
+ posIncrAtt.PositionIncrement = 0;
+ }
+ else
+ {
+ started = true;
+ }
+ return true;
+ }
+ else
+ {
+ return false;
+ }
+ }
+
+ public override void Reset()
+ {
+ base.Reset();
+ started = false;
+ }
+ }
+
+ [Test]
+ public virtual void TestFirstTokenPositionIncrement()
+ {
+ TokenStream ts = new MockTokenizer(new StringReader("a abc"), MockTokenizer.WHITESPACE, false);
+ ts = new PositionFilter(ts); // All but first token will get 0 position increment
+#pragma warning disable 612, 618
+ EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, ts, EdgeNGramTokenFilter.Side.FRONT, 2, 3);
+#pragma warning restore 612, 618
+ // The first token "a" will not be output, since it's smaller than the mingram size of 2.
+ // The second token on input to EdgeNGramTokenFilter will have position increment of 0,
+ // which should be increased to 1, since this is the first output token in the stream.
+ AssertTokenStreamContents(filter, new string[] { "ab", "abc" }, new int[] { 2, 2 }, new int[] { 5, 5 }, new int[] { 1, 0 });
+ }
+
+ [Test]
+ public virtual void TestSmallTokenInStream()
+ {
+ input = new MockTokenizer(new StringReader("abc de fgh"), MockTokenizer.WHITESPACE, false);
+#pragma warning disable 612, 618
+ EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, EdgeNGramTokenFilter.Side.FRONT, 3, 3);
+#pragma warning restore 612, 618
+ AssertTokenStreamContents(tokenizer, new string[] { "abc", "fgh" }, new int[] { 0, 7 }, new int[] { 3, 10 });
+ }
+
+ [Test]
+ public virtual void TestReset()
+ {
+ WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abcde"));
+#pragma warning disable 612, 618
+ EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, EdgeNGramTokenFilter.Side.FRONT, 1, 3);
+#pragma warning restore 612, 618
+ AssertTokenStreamContents(filter, new string[] { "a", "ab", "abc" }, new int[] { 0, 0, 0 }, new int[] { 5, 5, 5 });
+ tokenizer.SetReader(new StringReader("abcde"));
+ AssertTokenStreamContents(filter, new string[] { "a", "ab", "abc" }, new int[] { 0, 0, 0 }, new int[] { 5, 5, 5 });
+ }
+
+ // LUCENE-3642
+ // EdgeNgram blindly adds term length to offset, but this can take things out of bounds
+ // wrt original text if a previous filter increases the length of the word (in this case æ -> ae)
+ // so in this case we behave like WDF, and preserve any modified offsets
+ [Test]
+ public virtual void TestInvalidOffsets()
+ {
+ Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper(this);
+ AssertAnalyzesTo(analyzer, "mosfellsbær", new string[] { "mo", "mos", "mosf", "mosfe", "mosfel", "mosfell", "mosfells", "mosfellsb", "mosfellsba", "mosfellsbae", "mosfellsbaer" }, new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, new int[] { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 });
+ }
+
+ private class AnalyzerAnonymousInnerClassHelper : Analyzer
+ {
+ private readonly EdgeNGramTokenFilterTest outerInstance;
+
+ public AnalyzerAnonymousInnerClassHelper(EdgeNGramTokenFilterTest outerInstance)
+ {
+ this.outerInstance = outerInstance;
+ }
+
+ protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
+ {
+ Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
+#pragma warning disable 612, 618
+ filters = new EdgeNGramTokenFilter(LuceneVersion.LUCENE_43, filters, EdgeNGramTokenFilter.Side.FRONT, 2, 15);
+#pragma warning restore 612, 618
+ return new TokenStreamComponents(tokenizer, filters);
+ }
+ }
+
+ /// <summary>
+ /// blast some random strings through the analyzer </summary>
+ [Test]
+ public virtual void TestRandomStrings()
+ {
+ for (int i = 0; i < 10; i++)
+ {
+ int min = TestUtil.NextInt(Random(), 2, 10);
+ int max = TestUtil.NextInt(Random(), min, 20);
+
+ Analyzer a = new AnalyzerAnonymousInnerClassHelper2(this, min, max);
+ CheckRandomData(Random(), a, 100 * RANDOM_MULTIPLIER);
+ }
+
+ Analyzer b = new AnalyzerAnonymousInnerClassHelper3(this);
+ CheckRandomData(Random(), b, 1000 * RANDOM_MULTIPLIER, 20, false, false);
+ }
+
+ private class AnalyzerAnonymousInnerClassHelper2 : Analyzer
+ {
+ private readonly EdgeNGramTokenFilterTest outerInstance;
+
+ private int min;
+ private int max;
+
+ public AnalyzerAnonymousInnerClassHelper2(EdgeNGramTokenFilterTest outerInstance, int min, int max)
+ {
+ this.outerInstance = outerInstance;
+ this.min = min;
+ this.max = max;
+ }
+
+ protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
+ {
+ Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ return new TokenStreamComponents(tokenizer, new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, min, max));
+ }
+ }
+
+ private class AnalyzerAnonymousInnerClassHelper3 : Analyzer
+ {
+ private readonly EdgeNGramTokenFilterTest outerInstance;
+
+ public AnalyzerAnonymousInnerClassHelper3(EdgeNGramTokenFilterTest outerInstance)
+ {
+ this.outerInstance = outerInstance;
+ }
+
+ protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
+ {
+ Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+#pragma warning disable 612, 618
+ return new TokenStreamComponents(tokenizer, new EdgeNGramTokenFilter(LuceneVersion.LUCENE_43, tokenizer, EdgeNGramTokenFilter.Side.BACK, 2, 4));
+#pragma warning restore 612, 618
+ }
+ }
+
+ [Test]
+ public virtual void TestEmptyTerm()
+ {
+ Random random = Random();
+ Analyzer a = new AnalyzerAnonymousInnerClassHelper4(this);
+ CheckAnalysisConsistency(random, a, random.nextBoolean(), "");
+
+ Analyzer b = new AnalyzerAnonymousInnerClassHelper5(this);
+ CheckAnalysisConsistency(random, b, random.nextBoolean(), "");
+ }
+
+ private class AnalyzerAnonymousInnerClassHelper4 : Analyzer
+ {
+ private readonly EdgeNGramTokenFilterTest outerInstance;
+
+ public AnalyzerAnonymousInnerClassHelper4(EdgeNGramTokenFilterTest outerInstance)
+ {
+ this.outerInstance = outerInstance;
+ }
+
+ protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
+ {
+ Tokenizer tokenizer = new KeywordTokenizer(reader);
+#pragma warning disable 612, 618
+ return new TokenStreamComponents(tokenizer, new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, EdgeNGramTokenFilter.Side.FRONT, 2, 15));
+#pragma warning restore 612, 618
+ }
+ }
+
+ private class AnalyzerAnonymousInnerClassHelper5 : Analyzer
+ {
+ private readonly EdgeNGramTokenFilterTest outerInstance;
+
+ public AnalyzerAnonymousInnerClassHelper5(EdgeNGramTokenFilterTest outerInstance)
+ {
+ this.outerInstance = outerInstance;
+ }
+
+ protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
+ {
+ Tokenizer tokenizer = new KeywordTokenizer(reader);
+#pragma warning disable 612, 618
+ return new TokenStreamComponents(tokenizer, new EdgeNGramTokenFilter(LuceneVersion.LUCENE_43, tokenizer, EdgeNGramTokenFilter.Side.BACK, 2, 15));
+#pragma warning restore 612, 618
+ }
+ }
+
+ [Test]
+ public virtual void TestGraphs()
+ {
+ TokenStream tk = new LetterTokenizer(TEST_VERSION_CURRENT, new StringReader("abc d efgh ij klmno p q"));
+ tk = new ShingleFilter(tk);
+ tk = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tk, 7, 10);
+ AssertTokenStreamContents(tk, new string[] { "efgh ij", "ij klmn", "ij klmno", "klmno p" }, new int[] { 6, 11, 11, 14 }, new int[] { 13, 19, 19, 21 }, new int[] { 3, 1, 0, 1 }, new int[] { 2, 2, 2, 2 }, 23);
+ }
+
+ [Test]
+ public virtual void TestSupplementaryCharacters()
+ {
+ string s = TestUtil.RandomUnicodeString(Random(), 10);
+ int codePointCount = s.CodePointCount(0, s.Length);
+ int minGram = TestUtil.NextInt(Random(), 1, 3);
+ int maxGram = TestUtil.NextInt(Random(), minGram, 10);
+ TokenStream tk = new KeywordTokenizer(new StringReader(s));
+ tk = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tk, minGram, maxGram);
+ ICharTermAttribute termAtt = tk.AddAttribute<ICharTermAttribute>();
+ IOffsetAttribute offsetAtt = tk.AddAttribute<IOffsetAttribute>();
+ tk.Reset();
+ for (int i = minGram; i <= Math.Min(codePointCount, maxGram); ++i)
+ {
+ assertTrue(tk.IncrementToken());
+ assertEquals(0, offsetAtt.StartOffset);
+ assertEquals(s.Length, offsetAtt.EndOffset);
+ int end = Character.OffsetByCodePoints(s, 0, i);
+ assertEquals(s.Substring(0, end), termAtt.ToString());
+ }
+ assertFalse(tk.IncrementToken());
+ }
+ }
+}
\ No newline at end of file