You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by di...@apache.org on 2011/05/15 19:12:25 UTC
[Lucene.Net] svn commit: r1103463 - in /incubator/lucene.net/trunk:
src/contrib/Analyzers/ src/contrib/Analyzers/NGram/ test/contrib/Analyzers/
test/contrib/Analyzers/NGram/
Author: digy
Date: Sun May 15 17:12:24 2011
New Revision: 1103463
URL: http://svn.apache.org/viewvc?rev=1103463&view=rev
Log:
[LUCENENET-405] contrib/Analysis.NGram
Added:
incubator/lucene.net/trunk/src/contrib/Analyzers/NGram/
incubator/lucene.net/trunk/src/contrib/Analyzers/NGram/EdgeNGramTokenFilter.cs
incubator/lucene.net/trunk/src/contrib/Analyzers/NGram/EdgeNGramTokenizer.cs
incubator/lucene.net/trunk/src/contrib/Analyzers/NGram/NGramTokenFilter.cs
incubator/lucene.net/trunk/src/contrib/Analyzers/NGram/NGramTokenizer.cs
incubator/lucene.net/trunk/test/contrib/Analyzers/NGram/
incubator/lucene.net/trunk/test/contrib/Analyzers/NGram/TestEdgeNGramTokenFilter.cs
incubator/lucene.net/trunk/test/contrib/Analyzers/NGram/TestEdgeNGramTokenizer.cs
incubator/lucene.net/trunk/test/contrib/Analyzers/NGram/TestNGramTokenFilter.cs
incubator/lucene.net/trunk/test/contrib/Analyzers/NGram/TestNGramTokenizer.cs
Modified:
incubator/lucene.net/trunk/src/contrib/Analyzers/Contrib.Analyzers.csproj
incubator/lucene.net/trunk/test/contrib/Analyzers/Contrib.Analyzers.Test.csproj
Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/Contrib.Analyzers.csproj
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/Contrib.Analyzers.csproj?rev=1103463&r1=1103462&r2=1103463&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/Contrib.Analyzers.csproj (original)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/Contrib.Analyzers.csproj Sun May 15 17:12:24 2011
@@ -63,6 +63,10 @@
<Compile Include="Fr\FrenchAnalyzer.cs" />
<Compile Include="Fr\FrenchStemFilter.cs" />
<Compile Include="Fr\FrenchStemmer.cs" />
+ <Compile Include="NGram\EdgeNGramTokenFilter.cs" />
+ <Compile Include="NGram\EdgeNGramTokenizer.cs" />
+ <Compile Include="NGram\NGramTokenFilter.cs" />
+ <Compile Include="NGram\NGramTokenizer.cs" />
<Compile Include="Nl\DutchAnalyzer.cs" />
<Compile Include="Nl\DutchStemFilter.cs" />
<Compile Include="Nl\DutchStemmer.cs" />
Added: incubator/lucene.net/trunk/src/contrib/Analyzers/NGram/EdgeNGramTokenFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/NGram/EdgeNGramTokenFilter.cs?rev=1103463&view=auto
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/NGram/EdgeNGramTokenFilter.cs (added)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/NGram/EdgeNGramTokenFilter.cs Sun May 15 17:12:24 2011
@@ -0,0 +1,198 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System.IO;
+using System.Collections;
+
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
+
+namespace Lucene.Net.Analysis.NGram
+{
+
+ /**
+ * Tokenizes the given token into n-grams of given size(s).
+ * <p>
+ * This {@link TokenFilter} creates n-grams from the beginning edge or ending edge of an input token.
+ * </p>
+ */
+ public class EdgeNGramTokenFilter : TokenFilter
+ {
+ // Defaults: a single 1-character gram taken from the front of each token.
+ public static Side DEFAULT_SIDE = Side.FRONT;
+ public static int DEFAULT_MAX_GRAM_SIZE = 1;
+ public static int DEFAULT_MIN_GRAM_SIZE = 1;
+
+ // Replace this with an enum when the Java 1.5 upgrade is made, the impl will be simplified
+ /** Specifies which side of the input the n-gram should be generated from */
+ public class Side
+ {
+ private string label;
+
+ /** Get the n-gram from the front of the input */
+ public static Side FRONT = new Side("front");
+
+ /** Get the n-gram from the end of the input */
+ public static Side BACK = new Side("back");
+
+ // Private ctor
+ private Side(string label) { this.label = label; }
+
+ public string getLabel() { return label; }
+
+ // Get the appropriate Side from a string; returns null for an unrecognized label
+ // (callers treat null as "invalid side" and throw ArgumentException)
+ public static Side getSide(string sideName)
+ {
+ if (FRONT.getLabel().Equals(sideName))
+ {
+ return FRONT;
+ }
+ else if (BACK.getLabel().Equals(sideName))
+ {
+ return BACK;
+ }
+ return null;
+ }
+ }
+
+ private int minGram;
+ private int maxGram;
+ private Side side;
+ // Iteration state for the token currently being expanded into n-grams;
+ // curTermBuffer == null means "pull the next token from the input stream".
+ private char[] curTermBuffer;
+ private int curTermLength;
+ private int curGramSize;
+ private int tokStart;
+
+ private TermAttribute termAtt;
+ private OffsetAttribute offsetAtt;
+
+
+ // NOTE(review): this protected overload performs no validation and leaves
+ // minGram/maxGram/side unset (0/0/null) — presumably intended for subclass
+ // use only; confirm no caller relies on it directly.
+ protected EdgeNGramTokenFilter(TokenStream input) : base(input)
+ {
+ this.termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute));
+ this.offsetAtt = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
+ }
+
+ /**
+ * Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
+ *
+ * @param input {@link TokenStream} holding the input to be tokenized
+ * @param side the {@link Side} from which to chop off an n-gram
+ * @param minGram the smallest n-gram to generate
+ * @param maxGram the largest n-gram to generate
+ * @exception System.ArgumentException if side is null, minGram &lt; 1, or minGram &gt; maxGram
+ */
+ public EdgeNGramTokenFilter(TokenStream input, Side side, int minGram, int maxGram)
+ : base(input)
+ {
+
+
+ if (side == null)
+ {
+ throw new System.ArgumentException("sideLabel must be either front or back");
+ }
+
+ if (minGram < 1)
+ {
+ throw new System.ArgumentException("minGram must be greater than zero");
+ }
+
+ if (minGram > maxGram)
+ {
+ throw new System.ArgumentException("minGram must not be greater than maxGram");
+ }
+
+ this.minGram = minGram;
+ this.maxGram = maxGram;
+ this.side = side;
+ this.termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute));
+ this.offsetAtt = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
+ }
+
+ /**
+ * Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
+ *
+ * @param input {@link TokenStream} holding the input to be tokenized
+ * @param sideLabel the name of the {@link Side} from which to chop off an n-gram
+ * @param minGram the smallest n-gram to generate
+ * @param maxGram the largest n-gram to generate
+ */
+ public EdgeNGramTokenFilter(TokenStream input, string sideLabel, int minGram, int maxGram)
+ : this(input, Side.getSide(sideLabel), minGram, maxGram)
+ {
+
+ }
+
+ /** Emits the edge n-grams (sizes minGram..maxGram) of each input token, one per call. */
+ public override bool IncrementToken()
+ {
+ while (true)
+ {
+ if (curTermBuffer == null)
+ {
+ // No token in progress: pull the next one and snapshot its term/offset state
+ if (!input.IncrementToken())
+ {
+ return false;
+ }
+ else
+ {
+ curTermBuffer = (char[])termAtt.TermBuffer().Clone();
+ curTermLength = termAtt.TermLength();
+ curGramSize = minGram;
+ tokStart = offsetAtt.StartOffset();
+ }
+ }
+ if (curGramSize <= maxGram)
+ {
+ if (!(curGramSize > curTermLength // if the remaining input is too short, we can't generate any n-grams
+ || curGramSize > maxGram))
+ { // if we have hit the end of our n-gram size range, quit
+ // grab gramSize chars from front or back
+ int start = side == Side.FRONT ? 0 : curTermLength - curGramSize;
+ int end = start + curGramSize;
+ ClearAttributes();
+ offsetAtt.SetOffset(tokStart + start, tokStart + end);
+ termAtt.SetTermBuffer(curTermBuffer, start, curGramSize);
+ curGramSize++;
+ return true;
+ }
+ }
+ // Current token exhausted: loop around to fetch the next input token
+ curTermBuffer = null;
+ }
+ }
+
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ [System.Obsolete("Will be removed in Lucene 3.0. This method is final, as it should not be overridden. Delegates to the backwards compatibility layer.")]
+ public override Token Next(Token reusableToken)
+ {
+ return base.Next(reusableToken);
+ }
+
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ [System.Obsolete("Will be removed in Lucene 3.0. This method is final, as it should not be overridden. Delegates to the backwards compatibility layer.")]
+ public override Token Next()
+ {
+ return base.Next();
+ }
+
+ public override void Reset()
+ {
+ base.Reset();
+ // Drop any partially-emitted token so the stream can be consumed again from the start
+ curTermBuffer = null;
+ }
+ }
+}
\ No newline at end of file
Added: incubator/lucene.net/trunk/src/contrib/Analyzers/NGram/EdgeNGramTokenizer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/NGram/EdgeNGramTokenizer.cs?rev=1103463&view=auto
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/NGram/EdgeNGramTokenizer.cs (added)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/NGram/EdgeNGramTokenizer.cs Sun May 15 17:12:24 2011
@@ -0,0 +1,271 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System.IO;
+using System.Collections;
+
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
+
+namespace Lucene.Net.Analysis.NGram
+{
+
+ /**
+ * Tokenizes the input from an edge into n-grams of given size(s).
+ * <p>
+ * This {@link Tokenizer} creates n-grams from the beginning edge or ending edge of an input token.
+ * MaxGram can't be larger than 1024 because of an implementation limitation.
+ * </p>
+ */
+ public class EdgeNGramTokenizer : Tokenizer
+ {
+ // Defaults: a single 1-character gram taken from the front of the input.
+ public static Side DEFAULT_SIDE = Side.FRONT;
+ public static int DEFAULT_MAX_GRAM_SIZE = 1;
+ public static int DEFAULT_MIN_GRAM_SIZE = 1;
+
+ private TermAttribute termAtt;
+ private OffsetAttribute offsetAtt;
+
+ // Replace this with an enum when the Java 1.5 upgrade is made, the impl will be simplified
+ /** Specifies which side of the input the n-gram should be generated from */
+ public class Side
+ {
+ private string label;
+
+ /** Get the n-gram from the front of the input */
+ public static Side FRONT = new Side("front");
+
+ /** Get the n-gram from the end of the input */
+ public static Side BACK = new Side("back");
+
+ // Private ctor
+ private Side(string label) { this.label = label; }
+
+
+ public string getLabel() { return label; }
+
+ // Get the appropriate Side from a string; returns null for an unrecognized label
+ // (init() treats null as "invalid side" and throws ArgumentException)
+ public static Side getSide(string sideName)
+ {
+ if (FRONT.getLabel().Equals(sideName))
+ {
+ return FRONT;
+ }
+ else if (BACK.getLabel().Equals(sideName))
+ {
+ return BACK;
+ }
+ return null;
+ }
+ }
+
+ private int minGram;
+ private int maxGram;
+ // Size of the gram to emit on the next IncrementToken call (grows from minGram to maxGram)
+ private int gramSize;
+ private Side side;
+ private bool started = false;
+ private int inLen;
+ private string inStr;
+
+
+ /**
+ * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+ *
+ * @param input {@link Reader} holding the input to be tokenized
+ * @param side the {@link Side} from which to chop off an n-gram
+ * @param minGram the smallest n-gram to generate
+ * @param maxGram the largest n-gram to generate
+ */
+ public EdgeNGramTokenizer(TextReader input, Side side, int minGram, int maxGram)
+ : base(input)
+ {
+ init(side, minGram, maxGram);
+ }
+
+ /**
+ * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+ *
+ * @param source {@link AttributeSource} to use
+ * @param input {@link Reader} holding the input to be tokenized
+ * @param side the {@link Side} from which to chop off an n-gram
+ * @param minGram the smallest n-gram to generate
+ * @param maxGram the largest n-gram to generate
+ */
+ public EdgeNGramTokenizer(AttributeSource source, TextReader input, Side side, int minGram, int maxGram)
+ : base(source, input)
+ {
+
+ init(side, minGram, maxGram);
+ }
+
+ /**
+ * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+ *
+ * @param factory {@link org.apache.lucene.util.AttributeSource.AttributeFactory} to use
+ * @param input {@link Reader} holding the input to be tokenized
+ * @param side the {@link Side} from which to chop off an n-gram
+ * @param minGram the smallest n-gram to generate
+ * @param maxGram the largest n-gram to generate
+ */
+ public EdgeNGramTokenizer(AttributeFactory factory, TextReader input, Side side, int minGram, int maxGram)
+ : base(factory, input)
+ {
+
+ init(side, minGram, maxGram);
+ }
+
+ /**
+ * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+ *
+ * @param input {@link Reader} holding the input to be tokenized
+ * @param sideLabel the name of the {@link Side} from which to chop off an n-gram
+ * @param minGram the smallest n-gram to generate
+ * @param maxGram the largest n-gram to generate
+ */
+ public EdgeNGramTokenizer(TextReader input, string sideLabel, int minGram, int maxGram)
+ : this(input, Side.getSide(sideLabel), minGram, maxGram)
+ {
+
+ }
+
+ /**
+ * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+ *
+ * @param source {@link AttributeSource} to use
+ * @param input {@link Reader} holding the input to be tokenized
+ * @param sideLabel the name of the {@link Side} from which to chop off an n-gram
+ * @param minGram the smallest n-gram to generate
+ * @param maxGram the largest n-gram to generate
+ */
+ public EdgeNGramTokenizer(AttributeSource source, TextReader input, string sideLabel, int minGram, int maxGram)
+ : this(source, input, Side.getSide(sideLabel), minGram, maxGram)
+ {
+
+ }
+
+ /**
+ * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+ *
+ * @param factory {@link org.apache.lucene.util.AttributeSource.AttributeFactory} to use
+ * @param input {@link Reader} holding the input to be tokenized
+ * @param sideLabel the name of the {@link Side} from which to chop off an n-gram
+ * @param minGram the smallest n-gram to generate
+ * @param maxGram the largest n-gram to generate
+ */
+ public EdgeNGramTokenizer(AttributeFactory factory, TextReader input, string sideLabel, int minGram, int maxGram) :
+ this(factory, input, Side.getSide(sideLabel), minGram, maxGram)
+ {
+ }
+
+ // Shared ctor validation/initialization; throws System.ArgumentException if
+ // side is null, minGram < 1, or minGram > maxGram.
+ private void init(Side side, int minGram, int maxGram)
+ {
+ if (side == null)
+ {
+ throw new System.ArgumentException("sideLabel must be either front or back");
+ }
+
+ if (minGram < 1)
+ {
+ throw new System.ArgumentException("minGram must be greater than zero");
+ }
+
+ if (minGram > maxGram)
+ {
+ throw new System.ArgumentException("minGram must not be greater than maxGram");
+ }
+
+ this.minGram = minGram;
+ this.maxGram = maxGram;
+ this.side = side;
+
+ this.termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute));
+ this.offsetAtt = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
+
+ }
+
+ /** Advances to the next edge n-gram; returns false at end of stream. */
+ public override bool IncrementToken()
+ {
+ ClearAttributes();
+ // if we are just starting, read the whole input
+ if (!started)
+ {
+ started = true;
+ // NOTE(review): 'chars' is never used — leftover from the Java version, which
+ // read at most 1024 chars; ReadToEnd below consumes the entire input instead.
+ char[] chars = new char[1024];
+ inStr = input.ReadToEnd().Trim(); // remove any leading or trailing spaces
+ inLen = inStr.Length;
+ gramSize = minGram;
+ }
+
+ // if the remaining input is too short, we can't generate any n-grams
+ if (gramSize > inLen)
+ {
+ return false;
+ }
+
+ // if we have hit the end of our n-gram size range, quit
+ if (gramSize > maxGram)
+ {
+ return false;
+ }
+
+ // grab gramSize chars from front or back
+ int start = side == Side.FRONT ? 0 : inLen - gramSize;
+ int end = start + gramSize;
+ termAtt.SetTermBuffer(inStr, start, gramSize);
+ offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(end));
+ gramSize++;
+ return true;
+ }
+
+ public override void End()
+ {
+ // set final offset to the end of the (trimmed) input
+ int finalOffset = inLen;
+ this.offsetAtt.SetOffset(finalOffset, finalOffset);
+ }
+
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ [System.Obsolete("Will be removed in Lucene 3.0. This method is final, as it should not be overridden. Delegates to the backwards compatibility layer.")]
+ public override Token Next(Token reusableToken)
+ {
+ return base.Next(reusableToken);
+ }
+
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ [System.Obsolete("Will be removed in Lucene 3.0. This method is final, as it should not be overridden. Delegates to the backwards compatibility layer.")]
+ public override Token Next()
+ {
+ return base.Next();
+ }
+
+ public override void Reset(TextReader input)
+ {
+ base.Reset(input);
+ Reset();
+ }
+
+ public override void Reset()
+ {
+ base.Reset();
+ // Re-read the input and restart gram generation on the next IncrementToken call
+ started = false;
+ }
+ }
+}
\ No newline at end of file
Added: incubator/lucene.net/trunk/src/contrib/Analyzers/NGram/NGramTokenFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/NGram/NGramTokenFilter.cs?rev=1103463&view=auto
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/NGram/NGramTokenFilter.cs (added)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/NGram/NGramTokenFilter.cs Sun May 15 17:12:24 2011
@@ -0,0 +1,141 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System.IO;
+using System.Collections;
+
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
+
+namespace Lucene.Net.Analysis.NGram
+{
+
+ /**
+ * Tokenizes the input into n-grams of the given size(s).
+ */
+ public class NGramTokenFilter : TokenFilter
+ {
+ // Defaults: unigrams and bigrams.
+ public static int DEFAULT_MIN_NGRAM_SIZE = 1;
+ public static int DEFAULT_MAX_NGRAM_SIZE = 2;
+
+ private int minGram, maxGram;
+
+ // Iteration state for the token currently being expanded into n-grams;
+ // curTermBuffer == null means "pull the next token from the input stream".
+ private char[] curTermBuffer;
+ private int curTermLength;
+ private int curGramSize;
+ private int curPos;
+ private int tokStart;
+
+ private TermAttribute termAtt;
+ private OffsetAttribute offsetAtt;
+
+ /**
+ * Creates NGramTokenFilter with given min and max n-grams.
+ * @param input {@link TokenStream} holding the input to be tokenized
+ * @param minGram the smallest n-gram to generate
+ * @param maxGram the largest n-gram to generate
+ * @exception System.ArgumentException if minGram &lt; 1 or minGram &gt; maxGram
+ */
+ public NGramTokenFilter(TokenStream input, int minGram, int maxGram)
+ : base(input)
+ {
+
+ if (minGram < 1)
+ {
+ throw new System.ArgumentException("minGram must be greater than zero");
+ }
+ if (minGram > maxGram)
+ {
+ throw new System.ArgumentException("minGram must not be greater than maxGram");
+ }
+ this.minGram = minGram;
+ this.maxGram = maxGram;
+
+ this.termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute));
+ this.offsetAtt = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
+ }
+
+ /**
+ * Creates NGramTokenFilter with default min and max n-grams.
+ * @param input {@link TokenStream} holding the input to be tokenized
+ */
+ public NGramTokenFilter(TokenStream input)
+ : this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE)
+ {
+
+ }
+
+ /** Advances to the next n-gram token; returns false at end of stream. */
+ public override bool IncrementToken()
+ {
+ while (true)
+ {
+ if (curTermBuffer == null)
+ {
+ if (!input.IncrementToken())
+ {
+ return false;
+ }
+ else
+ {
+ // Snapshot the new token; grams are emitted smallest size first,
+ // sliding the window across the term before growing the size.
+ curTermBuffer = (char[])termAtt.TermBuffer().Clone();
+ curTermLength = termAtt.TermLength();
+ curGramSize = minGram;
+ curPos = 0;
+ tokStart = offsetAtt.StartOffset();
+ }
+ }
+ while (curGramSize <= maxGram)
+ {
+ while (curPos + curGramSize <= curTermLength)
+ { // while there is input
+ ClearAttributes();
+ termAtt.SetTermBuffer(curTermBuffer, curPos, curGramSize);
+ offsetAtt.SetOffset(tokStart + curPos, tokStart + curPos + curGramSize);
+ curPos++;
+ return true;
+ }
+ curGramSize++; // increase n-gram size
+ curPos = 0;
+ }
+ // Current token exhausted: loop around to fetch the next input token
+ curTermBuffer = null;
+ }
+ }
+
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ [System.Obsolete("Will be removed in Lucene 3.0. This method is final, as it should not be overridden. Delegates to the backwards compatibility layer.")]
+ public override Token Next(Token reusableToken)
+ {
+ return base.Next(reusableToken);
+ }
+
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ [System.Obsolete("Will be removed in Lucene 3.0. This method is final, as it should not be overridden. Delegates to the backwards compatibility layer.")]
+ public override Token Next()
+ {
+ return base.Next();
+ }
+
+ public override void Reset()
+ {
+ base.Reset();
+ // Drop any partially-emitted token so the stream can be consumed again from the start
+ curTermBuffer = null;
+ }
+ }
+}
\ No newline at end of file
Added: incubator/lucene.net/trunk/src/contrib/Analyzers/NGram/NGramTokenizer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/NGram/NGramTokenizer.cs?rev=1103463&view=auto
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/NGram/NGramTokenizer.cs (added)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/NGram/NGramTokenizer.cs Sun May 15 17:12:24 2011
@@ -0,0 +1,177 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System.IO;
+using System.Collections;
+
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
+
+namespace Lucene.Net.Analysis.NGram
+{
+
+ /**
+ * Tokenizes the input into n-grams of the given size(s).
+ */
+ public class NGramTokenizer : Tokenizer
+ {
+ // Defaults: unigrams and bigrams.
+ public static int DEFAULT_MIN_NGRAM_SIZE = 1;
+ public static int DEFAULT_MAX_NGRAM_SIZE = 2;
+
+ private int minGram, maxGram;
+ // Current gram size and sliding-window start position within inStr
+ private int gramSize;
+ private int pos = 0;
+ private int inLen;
+ private string inStr;
+ private bool started = false;
+
+ private TermAttribute termAtt;
+ private OffsetAttribute offsetAtt;
+
+ /**
+ * Creates NGramTokenizer with given min and max n-grams.
+ * @param input {@link Reader} holding the input to be tokenized
+ * @param minGram the smallest n-gram to generate
+ * @param maxGram the largest n-gram to generate
+ */
+ public NGramTokenizer(TextReader input, int minGram, int maxGram)
+ : base(input)
+ {
+ init(minGram, maxGram);
+ }
+
+ /**
+ * Creates NGramTokenizer with given min and max n-grams.
+ * @param source {@link AttributeSource} to use
+ * @param input {@link Reader} holding the input to be tokenized
+ * @param minGram the smallest n-gram to generate
+ * @param maxGram the largest n-gram to generate
+ */
+ public NGramTokenizer(AttributeSource source, TextReader input, int minGram, int maxGram)
+ : base(source, input)
+ {
+ init(minGram, maxGram);
+ }
+
+ /**
+ * Creates NGramTokenizer with given min and max n-grams.
+ * @param factory {@link org.apache.lucene.util.AttributeSource.AttributeFactory} to use
+ * @param input {@link Reader} holding the input to be tokenized
+ * @param minGram the smallest n-gram to generate
+ * @param maxGram the largest n-gram to generate
+ */
+ public NGramTokenizer(AttributeFactory factory, TextReader input, int minGram, int maxGram)
+ : base(factory, input)
+ {
+ init(minGram, maxGram);
+ }
+
+ /**
+ * Creates NGramTokenizer with default min and max n-grams.
+ * @param input {@link Reader} holding the input to be tokenized
+ */
+ public NGramTokenizer(TextReader input)
+ : this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE)
+ {
+
+ }
+
+ // Shared ctor validation/initialization; throws System.ArgumentException if
+ // minGram < 1 or minGram > maxGram.
+ private void init(int minGram, int maxGram)
+ {
+ if (minGram < 1)
+ {
+ throw new System.ArgumentException("minGram must be greater than zero");
+ }
+ if (minGram > maxGram)
+ {
+ throw new System.ArgumentException("minGram must not be greater than maxGram");
+ }
+ this.minGram = minGram;
+ this.maxGram = maxGram;
+
+ this.termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute));
+ this.offsetAtt = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
+ }
+
+ /** Advances to the next n-gram token; returns false at end of stream. */
+ public override bool IncrementToken()
+ {
+ ClearAttributes();
+ if (!started)
+ {
+ started = true;
+ gramSize = minGram;
+ // NOTE(review): 'chars' is never used — leftover from the Java version, which
+ // read at most 1024 chars; ReadToEnd below consumes the entire input instead.
+ char[] chars = new char[1024];
+ inStr = input.ReadToEnd(); // read the whole input up front (no trimming here, unlike EdgeNGramTokenizer)
+ inLen = inStr.Length;
+ }
+
+ if (pos + gramSize > inLen)
+ { // if we hit the end of the string
+ pos = 0; // reset to beginning of string
+ gramSize++; // increase n-gram size
+ if (gramSize > maxGram) // we are done
+ return false;
+ if (pos + gramSize > inLen)
+ return false;
+ }
+
+ // grab gramSize chars starting at pos, then slide the window one char to the right
+ int oldPos = pos;
+ pos++;
+ termAtt.SetTermBuffer(inStr, oldPos, gramSize);
+ offsetAtt.SetOffset(CorrectOffset(oldPos), CorrectOffset(oldPos + gramSize));
+ return true;
+ }
+
+ public override void End()
+ {
+ // set final offset to the end of the input
+ int finalOffset = inLen;
+ this.offsetAtt.SetOffset(finalOffset, finalOffset);
+ }
+
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ [System.Obsolete("Will be removed in Lucene 3.0. This method is final, as it should not be overridden. Delegates to the backwards compatibility layer.")]
+ public override Token Next(Token reusableToken)
+ {
+ return base.Next(reusableToken);
+ }
+
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ [System.Obsolete("Will be removed in Lucene 3.0. This method is final, as it should not be overridden. Delegates to the backwards compatibility layer.")]
+ public override Token Next()
+ {
+ return base.Next();
+ }
+
+ public override void Reset(TextReader input)
+ {
+ base.Reset(input);
+ Reset();
+ }
+
+ public override void Reset()
+ {
+ base.Reset();
+ // Restart gram generation from the beginning of the (new) input
+ started = false;
+ pos = 0;
+ }
+ }
+}
\ No newline at end of file
Modified: incubator/lucene.net/trunk/test/contrib/Analyzers/Contrib.Analyzers.Test.csproj
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/test/contrib/Analyzers/Contrib.Analyzers.Test.csproj?rev=1103463&r1=1103462&r2=1103463&view=diff
==============================================================================
--- incubator/lucene.net/trunk/test/contrib/Analyzers/Contrib.Analyzers.Test.csproj (original)
+++ incubator/lucene.net/trunk/test/contrib/Analyzers/Contrib.Analyzers.Test.csproj Sun May 15 17:12:24 2011
@@ -60,6 +60,10 @@
<Compile Include="AR\TestArabicAnalyzer.cs" />
<Compile Include="AR\TestArabicNormalizationFilter.cs" />
<Compile Include="AR\TestArabicStemFilter.cs" />
+ <Compile Include="NGram\TestEdgeNGramTokenFilter.cs" />
+ <Compile Include="NGram\TestEdgeNGramTokenizer.cs" />
+ <Compile Include="NGram\TestNGramTokenFilter.cs" />
+ <Compile Include="NGram\TestNGramTokenizer.cs" />
<Compile Include="Properties\AssemblyInfo.cs" />
</ItemGroup>
<ItemGroup>
Added: incubator/lucene.net/trunk/test/contrib/Analyzers/NGram/TestEdgeNGramTokenFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/test/contrib/Analyzers/NGram/TestEdgeNGramTokenFilter.cs?rev=1103463&view=auto
==============================================================================
--- incubator/lucene.net/trunk/test/contrib/Analyzers/NGram/TestEdgeNGramTokenFilter.cs (added)
+++ incubator/lucene.net/trunk/test/contrib/Analyzers/NGram/TestEdgeNGramTokenFilter.cs Sun May 15 17:12:24 2011
@@ -0,0 +1,143 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.IO;
+using System.Collections;
+
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
+using NUnit.Framework;
+
+namespace Lucene.Net.Analysis.NGram
+{
+
+ /**
+ * Tests {@link EdgeNGramTokenFilter} for correctness.
+ */
+ [TestFixture]
+ public class TestEdgeNGramTokenFilter : BaseTokenStreamTestCase
+ {
+ // Default fixture input: a single whitespace token "abcde"
+ private TokenStream input;
+
+ // NOTE(review): declared without 'override'/'new'; NUnit invokes it via [SetUp] and
+ // base.SetUp() is called explicitly, so hiding a base SetUp (if any) looks intended — confirm.
+ [SetUp]
+ public void SetUp()
+ {
+ base.SetUp();
+ input = new WhitespaceTokenizer(new StringReader("abcde"));
+ }
+
+ [Test]
+ public void TestInvalidInput()
+ {
+ bool gotException = false;
+ try
+ {
+ // minGram/maxGram of 0 are invalid (minGram must be >= 1)
+ new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.FRONT, 0, 0);
+ }
+ catch (System.ArgumentException e)
+ {
+ gotException = true;
+ }
+ Assert.IsTrue(gotException);
+ }
+
+ [Test]
+ public void TestInvalidInput2()
+ {
+ bool gotException = false;
+ try
+ {
+ // minGram > maxGram is invalid
+ new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.FRONT, 2, 1);
+ }
+ catch (System.ArgumentException e)
+ {
+ gotException = true;
+ }
+ Assert.IsTrue(gotException);
+ }
+
+ [Test]
+ public void TestInvalidInput3()
+ {
+ bool gotException = false;
+ try
+ {
+ // negative minGram is invalid
+ new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.FRONT, -1, 2);
+ }
+ catch (System.ArgumentException e)
+ {
+ gotException = true;
+ }
+ Assert.IsTrue(gotException);
+ }
+
+ [Test]
+ public void TestFrontUnigram()
+ {
+ EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.FRONT, 1, 1);
+ AssertTokenStreamContents(tokenizer, new String[] { "a" }, new int[] { 0 }, new int[] { 1 });
+ }
+
+ [Test]
+ public void TestBackUnigram()
+ {
+ EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.BACK, 1, 1);
+ AssertTokenStreamContents(tokenizer, new String[] { "e" }, new int[] { 4 }, new int[] { 5 });
+ }
+
+ [Test]
+ public void TestOversizedNgrams()
+ {
+ // Grams larger than the token ("abcde", length 5) must produce no output
+ EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.FRONT, 6, 6);
+ AssertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0]);
+ }
+
+ [Test]
+ public void TestFrontRangeOfNgrams()
+ {
+ EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.FRONT, 1, 3);
+ AssertTokenStreamContents(tokenizer, new String[] { "a", "ab", "abc" }, new int[] { 0, 0, 0 }, new int[] { 1, 2, 3 });
+ }
+
+ [Test]
+ public void TestBackRangeOfNgrams()
+ {
+ // Back grams share the token's end offset (5) and walk the start offset backwards
+ EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.BACK, 1, 3);
+ AssertTokenStreamContents(tokenizer, new String[] { "e", "de", "cde" }, new int[] { 4, 3, 2 }, new int[] { 5, 5, 5 });
+ }
+
+ [Test]
+ public void TestSmallTokenInStream()
+ {
+ // "de" (length 2) is shorter than gram size 3 and must be skipped entirely
+ input = new WhitespaceTokenizer(new StringReader("abc de fgh"));
+ EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.FRONT, 3, 3);
+ AssertTokenStreamContents(tokenizer, new String[] { "abc", "fgh" }, new int[] { 0, 7 }, new int[] { 3, 10 });
+ }
+
+ [Test]
+ public void TestReset()
+ {
+ WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(new StringReader("abcde"));
+ EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(tokenizer, EdgeNGramTokenFilter.Side.FRONT, 1, 3);
+ AssertTokenStreamContents(filter, new String[] { "a", "ab", "abc" }, new int[] { 0, 0, 0 }, new int[] { 1, 2, 3 });
+ // Reusing the same filter after resetting the underlying tokenizer must yield identical output
+ tokenizer.Reset(new StringReader("abcde"));
+ AssertTokenStreamContents(filter, new String[] { "a", "ab", "abc" }, new int[] { 0, 0, 0 }, new int[] { 1, 2, 3 });
+ }
+ }
+}
\ No newline at end of file
Added: incubator/lucene.net/trunk/test/contrib/Analyzers/NGram/TestEdgeNGramTokenizer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/test/contrib/Analyzers/NGram/TestEdgeNGramTokenizer.cs?rev=1103463&view=auto
==============================================================================
--- incubator/lucene.net/trunk/test/contrib/Analyzers/NGram/TestEdgeNGramTokenizer.cs (added)
+++ incubator/lucene.net/trunk/test/contrib/Analyzers/NGram/TestEdgeNGramTokenizer.cs Sun May 15 17:12:24 2011
@@ -0,0 +1,134 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.IO;
+using System.Collections;
+
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
+using NUnit.Framework;
+
+namespace Lucene.Net.Analysis.NGram
+{
+
+    /**
+     * Tests {@link EdgeNGramTokenizer} for correctness: constructor argument
+     * validation, front/back unigrams, oversized and ranged n-grams, and
+     * reuse after Reset().
+     */
+    [TestFixture]
+    public class TestEdgeNGramTokenizer : BaseTokenStreamTestCase
+    {
+        // Reader over the fixture string "abcde"; rebuilt before each test.
+        private StringReader input;
+
+        // NOTE(review): a non-virtual SetUp hides the base member of the same
+        // name (CS0108) — confirm BaseTokenStreamTestCase expects this pattern.
+        [SetUp]
+        public void SetUp()
+        {
+            base.SetUp();
+            input = new StringReader("abcde");
+        }
+
+        [Test]
+        public void TestInvalidInput()
+        {
+            // A gram size of 0 must be rejected by the constructor.
+            bool gotException = false;
+            try
+            {
+                new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 0, 0);
+            }
+            catch (System.ArgumentException) // was "e": unused variable removed (CS0168)
+            {
+                gotException = true;
+            }
+            Assert.IsTrue(gotException);
+        }
+
+        [Test]
+        public void TestInvalidInput2()
+        {
+            // minGram greater than maxGram must be rejected.
+            bool gotException = false;
+            try
+            {
+                new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 2, 1);
+            }
+            catch (System.ArgumentException) // was "e": unused variable removed (CS0168)
+            {
+                gotException = true;
+            }
+            Assert.IsTrue(gotException);
+        }
+
+        [Test]
+        public void TestInvalidInput3()
+        {
+            // A negative minGram must be rejected.
+            bool gotException = false;
+            try
+            {
+                new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, -1, 2);
+            }
+            catch (System.ArgumentException) // was "e": unused variable removed (CS0168)
+            {
+                gotException = true;
+            }
+            Assert.IsTrue(gotException);
+        }
+
+        [Test]
+        public void TestFrontUnigram()
+        {
+            // Single leading character of "abcde".
+            EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 1, 1);
+            AssertTokenStreamContents(tokenizer, new String[] { "a" }, new int[] { 0 }, new int[] { 1 }, 5 /* abcde */);
+        }
+
+        [Test]
+        public void TestBackUnigram()
+        {
+            // Single trailing character of "abcde".
+            EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.BACK, 1, 1);
+            AssertTokenStreamContents(tokenizer, new String[] { "e" }, new int[] { 4 }, new int[] { 5 }, 5 /* abcde */);
+        }
+
+        [Test]
+        public void TestOversizedNgrams()
+        {
+            // Grams longer than the input yield an empty stream (no exception).
+            EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 6, 6);
+            AssertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0], 5 /* abcde */);
+        }
+
+        [Test]
+        public void TestFrontRangeOfNgrams()
+        {
+            // Leading-edge grams of lengths 1..3, all anchored at offset 0.
+            EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 1, 3);
+            AssertTokenStreamContents(tokenizer, new String[] { "a", "ab", "abc" }, new int[] { 0, 0, 0 }, new int[] { 1, 2, 3 }, 5 /* abcde */);
+        }
+
+        [Test]
+        public void TestBackRangeOfNgrams()
+        {
+            // Trailing-edge grams of lengths 1..3, all ending at offset 5.
+            EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.BACK, 1, 3);
+            AssertTokenStreamContents(tokenizer, new String[] { "e", "de", "cde" }, new int[] { 4, 3, 2 }, new int[] { 5, 5, 5 }, 5 /* abcde */);
+        }
+
+        [Test]
+        public void TestReset()
+        {
+            // Reset() with a fresh reader must replay identical output.
+            EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 1, 3);
+            AssertTokenStreamContents(tokenizer, new String[] { "a", "ab", "abc" }, new int[] { 0, 0, 0 }, new int[] { 1, 2, 3 }, 5 /* abcde */);
+            tokenizer.Reset(new StringReader("abcde"));
+            AssertTokenStreamContents(tokenizer, new String[] { "a", "ab", "abc" }, new int[] { 0, 0, 0 }, new int[] { 1, 2, 3 }, 5 /* abcde */);
+        }
+    }
+}
\ No newline at end of file
Added: incubator/lucene.net/trunk/test/contrib/Analyzers/NGram/TestNGramTokenFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/test/contrib/Analyzers/NGram/TestNGramTokenFilter.cs?rev=1103463&view=auto
==============================================================================
--- incubator/lucene.net/trunk/test/contrib/Analyzers/NGram/TestNGramTokenFilter.cs (added)
+++ incubator/lucene.net/trunk/test/contrib/Analyzers/NGram/TestNGramTokenFilter.cs Sun May 15 17:12:24 2011
@@ -0,0 +1,125 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.IO;
+using System.Collections;
+
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
+using NUnit.Framework;
+
+namespace Lucene.Net.Analysis.NGram
+{
+
+    /**
+     * Tests {@link NGramTokenFilter} for correctness: constructor argument
+     * validation, unigrams/bigrams/ranged grams, oversized grams, undersized
+     * tokens, and reuse after Reset().
+     */
+    [TestFixture]
+    public class TestNGramTokenFilter : BaseTokenStreamTestCase
+    {
+        // Upstream stream producing the single token "abcde"; rebuilt per test.
+        private TokenStream input;
+
+        // NOTE(review): a non-virtual SetUp hides the base member of the same
+        // name (CS0108) — confirm BaseTokenStreamTestCase expects this pattern.
+        [SetUp]
+        public void SetUp()
+        {
+            base.SetUp();
+            input = new WhitespaceTokenizer(new StringReader("abcde"));
+        }
+
+        [Test]
+        public void TestInvalidInput()
+        {
+            // minGram greater than maxGram must be rejected.
+            bool gotException = false;
+            try
+            {
+                new NGramTokenFilter(input, 2, 1);
+            }
+            catch (System.ArgumentException) // was "e": unused variable removed (CS0168)
+            {
+                gotException = true;
+            }
+            Assert.IsTrue(gotException);
+        }
+
+        [Test]
+        public void TestInvalidInput2()
+        {
+            // A gram size of 0 must be rejected.
+            bool gotException = false;
+            try
+            {
+                new NGramTokenFilter(input, 0, 1);
+            }
+            catch (System.ArgumentException) // was "e": unused variable removed (CS0168)
+            {
+                gotException = true;
+            }
+            Assert.IsTrue(gotException);
+        }
+
+        [Test]
+        public void TestUnigrams()
+        {
+            NGramTokenFilter filter = new NGramTokenFilter(input, 1, 1);
+            AssertTokenStreamContents(filter, new String[] { "a", "b", "c", "d", "e" }, new int[] { 0, 1, 2, 3, 4 }, new int[] { 1, 2, 3, 4, 5 });
+        }
+
+        [Test]
+        public void TestBigrams()
+        {
+            NGramTokenFilter filter = new NGramTokenFilter(input, 2, 2);
+            AssertTokenStreamContents(filter, new String[] { "ab", "bc", "cd", "de" }, new int[] { 0, 1, 2, 3 }, new int[] { 2, 3, 4, 5 });
+        }
+
+        [Test]
+        public void TestNgrams()
+        {
+            // Grams are emitted grouped by length: all 1-grams, then 2-grams,
+            // then 3-grams.
+            NGramTokenFilter filter = new NGramTokenFilter(input, 1, 3);
+            AssertTokenStreamContents(filter,
+                new String[] { "a", "b", "c", "d", "e", "ab", "bc", "cd", "de", "abc", "bcd", "cde" },
+                new int[] { 0, 1, 2, 3, 4, 0, 1, 2, 3, 0, 1, 2 },
+                new int[] { 1, 2, 3, 4, 5, 2, 3, 4, 5, 3, 4, 5 }
+            );
+        }
+
+        [Test]
+        public void TestOversizedNgrams()
+        {
+            // Grams longer than the token yield an empty stream (no exception).
+            NGramTokenFilter filter = new NGramTokenFilter(input, 6, 7);
+            AssertTokenStreamContents(filter, new String[0], new int[0], new int[0]);
+        }
+
+        [Test]
+        public void TestSmallTokenInStream()
+        {
+            // "de" is shorter than minGram (3) and must be dropped entirely.
+            input = new WhitespaceTokenizer(new StringReader("abc de fgh"));
+            NGramTokenFilter filter = new NGramTokenFilter(input, 3, 3);
+            AssertTokenStreamContents(filter, new String[] { "abc", "fgh" }, new int[] { 0, 7 }, new int[] { 3, 10 });
+        }
+
+        [Test]
+        public void TestReset()
+        {
+            // Resetting the source tokenizer must replay identical output.
+            WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(new StringReader("abcde"));
+            NGramTokenFilter filter = new NGramTokenFilter(tokenizer, 1, 1);
+            AssertTokenStreamContents(filter, new String[] { "a", "b", "c", "d", "e" }, new int[] { 0, 1, 2, 3, 4 }, new int[] { 1, 2, 3, 4, 5 });
+            tokenizer.Reset(new StringReader("abcde"));
+            AssertTokenStreamContents(filter, new String[] { "a", "b", "c", "d", "e" }, new int[] { 0, 1, 2, 3, 4 }, new int[] { 1, 2, 3, 4, 5 });
+        }
+    }
+}
\ No newline at end of file
Added: incubator/lucene.net/trunk/test/contrib/Analyzers/NGram/TestNGramTokenizer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/test/contrib/Analyzers/NGram/TestNGramTokenizer.cs?rev=1103463&view=auto
==============================================================================
--- incubator/lucene.net/trunk/test/contrib/Analyzers/NGram/TestNGramTokenizer.cs (added)
+++ incubator/lucene.net/trunk/test/contrib/Analyzers/NGram/TestNGramTokenizer.cs Sun May 15 17:12:24 2011
@@ -0,0 +1,117 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.IO;
+using System.Collections;
+
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
+using NUnit.Framework;
+
+namespace Lucene.Net.Analysis.NGram
+{
+
+    /**
+     * Tests {@link NGramTokenizer} for correctness: constructor argument
+     * validation, unigrams/bigrams/ranged grams, oversized grams, and reuse
+     * after Reset().
+     */
+    [TestFixture]
+    public class TestNGramTokenizer : BaseTokenStreamTestCase
+    {
+        // Reader over the fixture string "abcde"; rebuilt before each test.
+        private StringReader input;
+
+        // NOTE(review): a non-virtual SetUp hides the base member of the same
+        // name (CS0108) — confirm BaseTokenStreamTestCase expects this pattern.
+        [SetUp]
+        public void SetUp()
+        {
+            base.SetUp();
+            input = new StringReader("abcde");
+        }
+
+        [Test]
+        public void TestInvalidInput()
+        {
+            // minGram greater than maxGram must be rejected.
+            bool gotException = false;
+            try
+            {
+                new NGramTokenizer(input, 2, 1);
+            }
+            catch (System.ArgumentException) // was "e": unused variable removed (CS0168)
+            {
+                gotException = true;
+            }
+            Assert.IsTrue(gotException);
+        }
+
+        [Test]
+        public void TestInvalidInput2()
+        {
+            // A gram size of 0 must be rejected.
+            bool gotException = false;
+            try
+            {
+                new NGramTokenizer(input, 0, 1);
+            }
+            catch (System.ArgumentException) // was "e": unused variable removed (CS0168)
+            {
+                gotException = true;
+            }
+            Assert.IsTrue(gotException);
+        }
+
+        [Test]
+        public void TestUnigrams()
+        {
+            NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 1);
+            AssertTokenStreamContents(tokenizer, new String[] { "a", "b", "c", "d", "e" }, new int[] { 0, 1, 2, 3, 4 }, new int[] { 1, 2, 3, 4, 5 }, 5 /* abcde */);
+        }
+
+        [Test]
+        public void TestBigrams()
+        {
+            NGramTokenizer tokenizer = new NGramTokenizer(input, 2, 2);
+            AssertTokenStreamContents(tokenizer, new String[] { "ab", "bc", "cd", "de" }, new int[] { 0, 1, 2, 3 }, new int[] { 2, 3, 4, 5 }, 5 /* abcde */);
+        }
+
+        [Test]
+        public void TestNgrams()
+        {
+            // Grams are emitted grouped by length: all 1-grams, then 2-grams,
+            // then 3-grams.
+            NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 3);
+            AssertTokenStreamContents(tokenizer,
+                new String[] { "a", "b", "c", "d", "e", "ab", "bc", "cd", "de", "abc", "bcd", "cde" },
+                new int[] { 0, 1, 2, 3, 4, 0, 1, 2, 3, 0, 1, 2 },
+                new int[] { 1, 2, 3, 4, 5, 2, 3, 4, 5, 3, 4, 5 },
+                5 /* abcde */
+            );
+        }
+
+        [Test]
+        public void TestOversizedNgrams()
+        {
+            // Grams longer than the input yield an empty stream (no exception).
+            NGramTokenizer tokenizer = new NGramTokenizer(input, 6, 7);
+            AssertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0], 5 /* abcde */);
+        }
+
+        [Test]
+        public void TestReset()
+        {
+            // Reset() with a fresh reader must replay identical output.
+            NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 1);
+            AssertTokenStreamContents(tokenizer, new String[] { "a", "b", "c", "d", "e" }, new int[] { 0, 1, 2, 3, 4 }, new int[] { 1, 2, 3, 4, 5 }, 5 /* abcde */);
+            tokenizer.Reset(new StringReader("abcde"));
+            AssertTokenStreamContents(tokenizer, new String[] { "a", "b", "c", "d", "e" }, new int[] { 0, 1, 2, 3, 4 }, new int[] { 1, 2, 3, 4, 5 }, 5 /* abcde */);
+        }
+    }
+}
\ No newline at end of file