You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by ni...@apache.org on 2017/02/04 20:32:56 UTC
[37/39] lucenenet git commit: Lucene.Net.Analysis.Ngram - renamed folder
to NGram in Git
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Tests.Analysis.Common/Analysis/NGram/EdgeNGramTokenizerTest.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/NGram/EdgeNGramTokenizerTest.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/NGram/EdgeNGramTokenizerTest.cs
new file mode 100644
index 0000000..4ccecfa
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/NGram/EdgeNGramTokenizerTest.cs
@@ -0,0 +1,278 @@
+\ufeffusing Lucene.Net.Support;
+using Lucene.Net.Util;
+using NUnit.Framework;
+using System.IO;
+using Reader = System.IO.TextReader;
+using Version = Lucene.Net.Util.LuceneVersion;
+
+namespace Lucene.Net.Analysis.NGram
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Tests <seealso cref="EdgeNGramTokenizer"/> for correctness.
+ /// </summary>
+ public class EdgeNGramTokenizerTest : BaseTokenStreamTestCase
+ {
+ private StringReader input;
+
+ public override void SetUp()
+ {
+ base.SetUp();
+ input = new StringReader("abcde");
+ }
+
+ [Test]
+ public virtual void TestInvalidInput()
+ {
+ bool gotException = false;
+ try
+ {
+ new EdgeNGramTokenizer(TEST_VERSION_CURRENT, input, 0, 0);
+ }
+ catch (System.ArgumentException)
+ {
+ gotException = true;
+ }
+ assertTrue(gotException);
+ }
+
+ [Test]
+ public virtual void TestInvalidInput2()
+ {
+ bool gotException = false;
+ try
+ {
+ new EdgeNGramTokenizer(TEST_VERSION_CURRENT, input, 2, 1);
+ }
+ catch (System.ArgumentException)
+ {
+ gotException = true;
+ }
+ assertTrue(gotException);
+ }
+
+ [Test]
+ public virtual void TestInvalidInput3()
+ {
+ bool gotException = false;
+ try
+ {
+ new EdgeNGramTokenizer(TEST_VERSION_CURRENT, input, -1, 2);
+ }
+ catch (System.ArgumentException)
+ {
+ gotException = true;
+ }
+ assertTrue(gotException);
+ }
+
+ [Test]
+ public virtual void TestFrontUnigram()
+ {
+ EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, input, 1, 1);
+ AssertTokenStreamContents(tokenizer, new string[] { "a" }, new int[] { 0 }, new int[] { 1 }, 5); // abcde
+ }
+
+ [Test]
+ public virtual void TestBackUnigram()
+ {
+#pragma warning disable 612, 618
+ Tokenizer tokenizer = new Lucene43EdgeNGramTokenizer(Version.LUCENE_43, input, Lucene43EdgeNGramTokenizer.Side.BACK, 1, 1);
+#pragma warning restore 612, 618
+ AssertTokenStreamContents(tokenizer, new string[] { "e" }, new int[] { 4 }, new int[] { 5 }, 5); // abcde
+ }
+
+ [Test]
+ public virtual void TestOversizedNgrams()
+ {
+ EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, input, 6, 6);
+ AssertTokenStreamContents(tokenizer, new string[0], new int[0], new int[0], 5); // abcde
+ }
+
+ [Test]
+ public virtual void TestFrontRangeOfNgrams()
+ {
+ EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, input, 1, 3);
+ AssertTokenStreamContents(tokenizer, new string[] { "a", "ab", "abc" }, new int[] { 0, 0, 0 }, new int[] { 1, 2, 3 }, 5); // abcde
+ }
+
+ [Test]
+ public virtual void TestBackRangeOfNgrams()
+ {
+#pragma warning disable 612, 618
+ Tokenizer tokenizer = new Lucene43EdgeNGramTokenizer(Version.LUCENE_43, input, Lucene43EdgeNGramTokenizer.Side.BACK, 1, 3);
+#pragma warning restore 612, 618
+ AssertTokenStreamContents(tokenizer, new string[] { "e", "de", "cde" }, new int[] { 4, 3, 2 }, new int[] { 5, 5, 5 }, null, null, null, 5, false); // abcde
+ }
+
+ [Test]
+ public virtual void TestReset()
+ {
+ EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, input, 1, 3);
+ AssertTokenStreamContents(tokenizer, new string[] { "a", "ab", "abc" }, new int[] { 0, 0, 0 }, new int[] { 1, 2, 3 }, 5); // abcde
+ tokenizer.SetReader(new StringReader("abcde"));
+ AssertTokenStreamContents(tokenizer, new string[] { "a", "ab", "abc" }, new int[] { 0, 0, 0 }, new int[] { 1, 2, 3 }, 5); // abcde
+ }
+
+ /// <summary>
+ /// blast some random strings through the analyzer </summary>
+ [Test]
+ public virtual void TestRandomStrings()
+ {
+ for (int i = 0; i < 10; i++)
+ {
+ int min = TestUtil.NextInt(Random(), 2, 10);
+ int max = TestUtil.NextInt(Random(), min, 20);
+
+ Analyzer a = new AnalyzerAnonymousInnerClassHelper(this, min, max);
+ CheckRandomData(Random(), a, 100 * RANDOM_MULTIPLIER, 20);
+ CheckRandomData(Random(), a, 10 * RANDOM_MULTIPLIER, 8192);
+ }
+
+ Analyzer b = new AnalyzerAnonymousInnerClassHelper2(this);
+ CheckRandomData(Random(), b, 1000 * RANDOM_MULTIPLIER, 20, false, false);
+ CheckRandomData(Random(), b, 100 * RANDOM_MULTIPLIER, 8192, false, false);
+ }
+
+ private class AnalyzerAnonymousInnerClassHelper : Analyzer
+ {
+ private readonly EdgeNGramTokenizerTest outerInstance;
+
+ private int min;
+ private int max;
+
+ public AnalyzerAnonymousInnerClassHelper(EdgeNGramTokenizerTest outerInstance, int min, int max)
+ {
+ this.outerInstance = outerInstance;
+ this.min = min;
+ this.max = max;
+ }
+
+ protected internal override TokenStreamComponents CreateComponents(string fieldName, Reader reader)
+ {
+ Tokenizer tokenizer = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, reader, min, max);
+ return new TokenStreamComponents(tokenizer, tokenizer);
+ }
+ }
+
+ private class AnalyzerAnonymousInnerClassHelper2 : Analyzer
+ {
+ private readonly EdgeNGramTokenizerTest outerInstance;
+
+ public AnalyzerAnonymousInnerClassHelper2(EdgeNGramTokenizerTest outerInstance)
+ {
+ this.outerInstance = outerInstance;
+ }
+
+ protected internal override TokenStreamComponents CreateComponents(string fieldName, Reader reader)
+ {
+#pragma warning disable 612, 618
+ Tokenizer tokenizer = new Lucene43EdgeNGramTokenizer(Version.LUCENE_43, reader, Lucene43EdgeNGramTokenizer.Side.BACK, 2, 4);
+#pragma warning restore 612, 618
+ return new TokenStreamComponents(tokenizer, tokenizer);
+ }
+ }
+
+ [Test]
+ public virtual void TestTokenizerPositions()
+ {
+#pragma warning disable 612, 618
+ Tokenizer tokenizer = new Lucene43EdgeNGramTokenizer(Version.LUCENE_43, input, Lucene43EdgeNGramTokenizer.Side.FRONT, 1, 3);
+#pragma warning restore 612, 618
+ AssertTokenStreamContents(tokenizer, new string[] { "a", "ab", "abc" }, new int[] { 0, 0, 0 }, new int[] { 1, 2, 3 }, null, new int[] { 1, 0, 0 }, null, null, false);
+
+ tokenizer = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, new StringReader("abcde"), 1, 3);
+ AssertTokenStreamContents(tokenizer, new string[] { "a", "ab", "abc" }, new int[] { 0, 0, 0 }, new int[] { 1, 2, 3 }, null, new int[] { 1, 1, 1 }, null, null, false);
+ }
+
+ private static void TestNGrams(int minGram, int maxGram, int length, string nonTokenChars)
+ {
+ // LUCENENET TODO: Changed randomizing strategy - not sure if this is right...
+ //string s = RandomStrings.randomAsciiOfLength(Random(), length);
+ string s = TestUtil.RandomAnalysisString(Random(), length, true);
+ TestNGrams(minGram, maxGram, s, nonTokenChars);
+ }
+
+ private static void TestNGrams(int minGram, int maxGram, string s, string nonTokenChars)
+ {
+ NGramTokenizerTest.TestNGrams(minGram, maxGram, s, nonTokenChars, true);
+ }
+
+ [Test]
+ public virtual void TestLargeInput()
+ {
+ // test sliding
+ int minGram = TestUtil.NextInt(Random(), 1, 100);
+ int maxGram = TestUtil.NextInt(Random(), minGram, 100);
+ TestNGrams(minGram, maxGram, TestUtil.NextInt(Random(), 3 * 1024, 4 * 1024), "");
+ }
+
+ [Test]
+ public virtual void TestLargeMaxGram()
+ {
+ // test sliding with maxGram > 1024
+ int minGram = TestUtil.NextInt(Random(), 1290, 1300);
+ int maxGram = TestUtil.NextInt(Random(), minGram, 1300);
+ TestNGrams(minGram, maxGram, TestUtil.NextInt(Random(), 3 * 1024, 4 * 1024), "");
+ }
+
+ [Test]
+ public virtual void TestPreTokenization()
+ {
+ int minGram = TestUtil.NextInt(Random(), 1, 100);
+ int maxGram = TestUtil.NextInt(Random(), minGram, 100);
+ TestNGrams(minGram, maxGram, TestUtil.NextInt(Random(), 0, 4 * 1024), "a");
+ }
+
+ [Test]
+ public virtual void TestHeavyPreTokenization()
+ {
+ int minGram = TestUtil.NextInt(Random(), 1, 100);
+ int maxGram = TestUtil.NextInt(Random(), minGram, 100);
+ TestNGrams(minGram, maxGram, TestUtil.NextInt(Random(), 0, 4 * 1024), "abcdef");
+ }
+
+ [Test]
+ public virtual void TestFewTokenChars()
+ {
+ char[] chrs = new char[TestUtil.NextInt(Random(), 4000, 5000)];
+ Arrays.Fill(chrs, ' ');
+ for (int i = 0; i < chrs.Length; ++i)
+ {
+ if (Random().NextDouble() < 0.1)
+ {
+ chrs[i] = 'a';
+ }
+ }
+ int minGram = TestUtil.NextInt(Random(), 1, 2);
+ int maxGram = TestUtil.NextInt(Random(), minGram, 2);
+ TestNGrams(minGram, maxGram, new string(chrs), " ");
+ }
+
+ [Test]
+ public virtual void TestFullUTF8Range()
+ {
+ int minGram = TestUtil.NextInt(Random(), 1, 100);
+ int maxGram = TestUtil.NextInt(Random(), minGram, 100);
+ string s = TestUtil.RandomUnicodeString(Random(), 4 * 1024);
+ TestNGrams(minGram, maxGram, s, "");
+ TestNGrams(minGram, maxGram, s, "abcdef");
+ }
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Tests.Analysis.Common/Analysis/NGram/NGramTokenFilterTest.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/NGram/NGramTokenFilterTest.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/NGram/NGramTokenFilterTest.cs
new file mode 100644
index 0000000..e485fc0
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/NGram/NGramTokenFilterTest.cs
@@ -0,0 +1,249 @@
+\ufeffusing Lucene.Net.Analysis.Core;
+using Lucene.Net.Analysis.Miscellaneous;
+using Lucene.Net.Analysis.TokenAttributes;
+using Lucene.Net.Support;
+using Lucene.Net.Util;
+using NUnit.Framework;
+using System;
+using System.IO;
+
+namespace Lucene.Net.Analysis.NGram
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Tests <seealso cref="NGramTokenFilter"/> for correctness.
+ /// </summary>
+ public class NGramTokenFilterTest : BaseTokenStreamTestCase
+ {
+ private TokenStream input;
+
+ public override void SetUp()
+ {
+ base.SetUp();
+ input = new MockTokenizer(new StringReader("abcde"), MockTokenizer.WHITESPACE, false);
+ }
+
+ [Test]
+ public virtual void TestInvalidInput()
+ {
+ bool gotException = false;
+ try
+ {
+ new NGramTokenFilter(TEST_VERSION_CURRENT, input, 2, 1);
+ }
+ catch (System.ArgumentException)
+ {
+ gotException = true;
+ }
+ assertTrue(gotException);
+ }
+
+ [Test]
+ public virtual void TestInvalidInput2()
+ {
+ bool gotException = false;
+ try
+ {
+ new NGramTokenFilter(TEST_VERSION_CURRENT, input, 0, 1);
+ }
+ catch (System.ArgumentException)
+ {
+ gotException = true;
+ }
+ assertTrue(gotException);
+ }
+
+ [Test]
+ public virtual void TestUnigrams()
+ {
+ NGramTokenFilter filter = new NGramTokenFilter(TEST_VERSION_CURRENT, input, 1, 1);
+ AssertTokenStreamContents(filter, new string[] { "a", "b", "c", "d", "e" }, new int[] { 0, 0, 0, 0, 0 }, new int[] { 5, 5, 5, 5, 5 }, new int[] { 1, 0, 0, 0, 0 });
+ }
+
+ [Test]
+ public virtual void TestBigrams()
+ {
+ NGramTokenFilter filter = new NGramTokenFilter(TEST_VERSION_CURRENT, input, 2, 2);
+ AssertTokenStreamContents(filter, new string[] { "ab", "bc", "cd", "de" }, new int[] { 0, 0, 0, 0 }, new int[] { 5, 5, 5, 5 }, new int[] { 1, 0, 0, 0 });
+ }
+
+ [Test]
+ public virtual void TestNgrams()
+ {
+ NGramTokenFilter filter = new NGramTokenFilter(TEST_VERSION_CURRENT, input, 1, 3);
+ AssertTokenStreamContents(filter, new string[] { "a", "ab", "abc", "b", "bc", "bcd", "c", "cd", "cde", "d", "de", "e" }, new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, new int[] { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5 }, null, new int[] { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, null, null, false);
+ }
+
+ [Test]
+ public virtual void TestNgramsNoIncrement()
+ {
+ NGramTokenFilter filter = new NGramTokenFilter(TEST_VERSION_CURRENT, input, 1, 3);
+ AssertTokenStreamContents(filter, new string[] { "a", "ab", "abc", "b", "bc", "bcd", "c", "cd", "cde", "d", "de", "e" }, new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, new int[] { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5 }, null, new int[] { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, null, null, false);
+ }
+
+ [Test]
+ public virtual void TestOversizedNgrams()
+ {
+ NGramTokenFilter filter = new NGramTokenFilter(TEST_VERSION_CURRENT, input, 6, 7);
+ AssertTokenStreamContents(filter, new string[0], new int[0], new int[0]);
+ }
+
+ [Test]
+ public virtual void TestSmallTokenInStream()
+ {
+ input = new MockTokenizer(new StringReader("abc de fgh"), MockTokenizer.WHITESPACE, false);
+ NGramTokenFilter filter = new NGramTokenFilter(TEST_VERSION_CURRENT, input, 3, 3);
+ AssertTokenStreamContents(filter, new string[] { "abc", "fgh" }, new int[] { 0, 7 }, new int[] { 3, 10 }, new int[] { 1, 2 });
+ }
+
+ [Test]
+ public virtual void TestReset()
+ {
+ WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abcde"));
+ NGramTokenFilter filter = new NGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, 1, 1);
+ AssertTokenStreamContents(filter, new string[] { "a", "b", "c", "d", "e" }, new int[] { 0, 0, 0, 0, 0 }, new int[] { 5, 5, 5, 5, 5 }, new int[] { 1, 0, 0, 0, 0 });
+ tokenizer.SetReader(new StringReader("abcde"));
+ AssertTokenStreamContents(filter, new string[] { "a", "b", "c", "d", "e" }, new int[] { 0, 0, 0, 0, 0 }, new int[] { 5, 5, 5, 5, 5 }, new int[] { 1, 0, 0, 0, 0 });
+ }
+
+ // LUCENE-3642
+ // EdgeNgram blindly adds term length to offset, but this can take things out of bounds
+ // wrt original text if a previous filter increases the length of the word (in this case æ -> ae)
+ // so in this case we behave like WDF, and preserve any modified offsets
+ [Test]
+ public virtual void TestInvalidOffsets()
+ {
+ Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper(this);
+ AssertAnalyzesTo(analyzer, "mosfellsbær", new string[] { "mo", "os", "sf", "fe", "el", "ll", "ls", "sb", "ba", "ae", "er" }, new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, new int[] { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 }, new int[] { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 });
+ }
+
+ private class AnalyzerAnonymousInnerClassHelper : Analyzer
+ {
+ private readonly NGramTokenFilterTest outerInstance;
+
+ public AnalyzerAnonymousInnerClassHelper(NGramTokenFilterTest outerInstance)
+ {
+ this.outerInstance = outerInstance;
+ }
+
+ protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
+ {
+ Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
+ filters = new NGramTokenFilter(TEST_VERSION_CURRENT, filters, 2, 2);
+ return new TokenStreamComponents(tokenizer, filters);
+ }
+ }
+
+ /// <summary>
+ /// blast some random strings through the analyzer </summary>
+ [Test]
+ public virtual void TestRandomStrings()
+ {
+ for (int i = 0; i < 10; i++)
+ {
+ int min = TestUtil.NextInt(Random(), 2, 10);
+ int max = TestUtil.NextInt(Random(), min, 20);
+ Analyzer a = new AnalyzerAnonymousInnerClassHelper2(this, min, max);
+ CheckRandomData(Random(), a, 200 * RANDOM_MULTIPLIER, 20);
+ }
+ }
+
+ private class AnalyzerAnonymousInnerClassHelper2 : Analyzer
+ {
+ private readonly NGramTokenFilterTest outerInstance;
+
+ private int min;
+ private int max;
+
+ public AnalyzerAnonymousInnerClassHelper2(NGramTokenFilterTest outerInstance, int min, int max)
+ {
+ this.outerInstance = outerInstance;
+ this.min = min;
+ this.max = max;
+ }
+
+ protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
+ {
+ Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ return new TokenStreamComponents(tokenizer, new NGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, min, max));
+ }
+ }
+
+ [Test]
+ public virtual void TestEmptyTerm()
+ {
+ Random random = Random();
+ Analyzer a = new AnalyzerAnonymousInnerClassHelper3(this);
+ CheckAnalysisConsistency(random, a, random.nextBoolean(), "");
+ }
+
+ private class AnalyzerAnonymousInnerClassHelper3 : Analyzer
+ {
+ private readonly NGramTokenFilterTest outerInstance;
+
+ public AnalyzerAnonymousInnerClassHelper3(NGramTokenFilterTest outerInstance)
+ {
+ this.outerInstance = outerInstance;
+ }
+
+ protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
+ {
+ Tokenizer tokenizer = new KeywordTokenizer(reader);
+ return new TokenStreamComponents(tokenizer, new NGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, 2, 15));
+ }
+ }
+
+ [Test]
+ public virtual void TestLucene43()
+ {
+#pragma warning disable 612, 618
+ NGramTokenFilter filter = new NGramTokenFilter(LuceneVersion.LUCENE_43, input, 2, 3);
+#pragma warning restore 612, 618
+ AssertTokenStreamContents(filter, new string[] { "ab", "bc", "cd", "de", "abc", "bcd", "cde" }, new int[] { 0, 1, 2, 3, 0, 1, 2 }, new int[] { 2, 3, 4, 5, 3, 4, 5 }, null, new int[] { 1, 1, 1, 1, 1, 1, 1 }, null, null, false);
+ }
+
+ [Test]
+ public virtual void TestSupplementaryCharacters()
+ {
+ string s = TestUtil.RandomUnicodeString(Random(), 10);
+ int codePointCount = s.CodePointCount(0, s.Length);
+ int minGram = TestUtil.NextInt(Random(), 1, 3);
+ int maxGram = TestUtil.NextInt(Random(), minGram, 10);
+ TokenStream tk = new KeywordTokenizer(new StringReader(s));
+ tk = new NGramTokenFilter(TEST_VERSION_CURRENT, tk, minGram, maxGram);
+ ICharTermAttribute termAtt = tk.AddAttribute<ICharTermAttribute>();
+ IOffsetAttribute offsetAtt = tk.AddAttribute<IOffsetAttribute>();
+ tk.Reset();
+ for (int start = 0; start < codePointCount; ++start)
+ {
+ for (int end = start + minGram; end <= Math.Min(codePointCount, start + maxGram); ++end)
+ {
+ assertTrue(tk.IncrementToken());
+ assertEquals(0, offsetAtt.StartOffset);
+ assertEquals(s.Length, offsetAtt.EndOffset);
+ int startIndex = Character.OffsetByCodePoints(s, 0, start);
+ int endIndex = Character.OffsetByCodePoints(s, 0, end);
+ assertEquals(s.Substring(startIndex, endIndex - startIndex), termAtt.ToString());
+ }
+ }
+ assertFalse(tk.IncrementToken());
+ }
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Tests.Analysis.Common/Analysis/NGram/NGramTokenizerTest.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/NGram/NGramTokenizerTest.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/NGram/NGramTokenizerTest.cs
new file mode 100644
index 0000000..2fc1356
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/NGram/NGramTokenizerTest.cs
@@ -0,0 +1,303 @@
+\ufeffusing Lucene.Net.Analysis.TokenAttributes;
+using Lucene.Net.Support;
+using Lucene.Net.Util;
+using NUnit.Framework;
+using System.IO;
+
+namespace Lucene.Net.Analysis.NGram
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Tests <seealso cref="NGramTokenizer"/> for correctness.
+ /// </summary>
+ public class NGramTokenizerTest : BaseTokenStreamTestCase
+ {
+ private StringReader input;
+
+ public override void SetUp()
+ {
+ base.SetUp();
+ input = new StringReader("abcde");
+ }
+
+ [Test]
+ public virtual void TestInvalidInput()
+ {
+ bool gotException = false;
+ try
+ {
+ new NGramTokenizer(TEST_VERSION_CURRENT, input, 2, 1);
+ }
+ catch (System.ArgumentException)
+ {
+ gotException = true;
+ }
+ assertTrue(gotException);
+ }
+
+ [Test]
+ public virtual void TestInvalidInput2()
+ {
+ bool gotException = false;
+ try
+ {
+ new NGramTokenizer(TEST_VERSION_CURRENT, input, 0, 1);
+ }
+ catch (System.ArgumentException)
+ {
+ gotException = true;
+ }
+ assertTrue(gotException);
+ }
+
+ [Test]
+ public virtual void TestUnigrams()
+ {
+ NGramTokenizer tokenizer = new NGramTokenizer(TEST_VERSION_CURRENT, input, 1, 1);
+ AssertTokenStreamContents(tokenizer, new string[] { "a", "b", "c", "d", "e" }, new int[] { 0, 1, 2, 3, 4 }, new int[] { 1, 2, 3, 4, 5 }, 5); // abcde
+ }
+
+ [Test]
+ public virtual void TestBigrams()
+ {
+ NGramTokenizer tokenizer = new NGramTokenizer(TEST_VERSION_CURRENT, input, 2, 2);
+ AssertTokenStreamContents(tokenizer, new string[] { "ab", "bc", "cd", "de" }, new int[] { 0, 1, 2, 3 }, new int[] { 2, 3, 4, 5 }, 5); // abcde
+ }
+
+ [Test]
+ public virtual void TestNgrams()
+ {
+ NGramTokenizer tokenizer = new NGramTokenizer(TEST_VERSION_CURRENT, input, 1, 3);
+ AssertTokenStreamContents(tokenizer, new string[] { "a", "ab", "abc", "b", "bc", "bcd", "c", "cd", "cde", "d", "de", "e" }, new int[] { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4 }, new int[] { 1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5, 5 }, null, null, null, 5, false); // abcde
+ }
+
+ [Test]
+ public virtual void TestOversizedNgrams()
+ {
+ NGramTokenizer tokenizer = new NGramTokenizer(TEST_VERSION_CURRENT, input, 6, 7);
+ AssertTokenStreamContents(tokenizer, new string[0], new int[0], new int[0], 5); // abcde
+ }
+
+ [Test]
+ public virtual void TestReset()
+ {
+ NGramTokenizer tokenizer = new NGramTokenizer(TEST_VERSION_CURRENT, input, 1, 1);
+ AssertTokenStreamContents(tokenizer, new string[] { "a", "b", "c", "d", "e" }, new int[] { 0, 1, 2, 3, 4 }, new int[] { 1, 2, 3, 4, 5 }, 5); // abcde
+ tokenizer.SetReader(new StringReader("abcde"));
+ AssertTokenStreamContents(tokenizer, new string[] { "a", "b", "c", "d", "e" }, new int[] { 0, 1, 2, 3, 4 }, new int[] { 1, 2, 3, 4, 5 }, 5); // abcde
+ }
+
+ /// <summary>
+ /// blast some random strings through the analyzer </summary>
+ [Test]
+ public virtual void TestRandomStrings()
+ {
+ for (int i = 0; i < 10; i++)
+ {
+ int min = TestUtil.NextInt(Random(), 2, 10);
+ int max = TestUtil.NextInt(Random(), min, 20);
+ Analyzer a = new AnalyzerAnonymousInnerClassHelper(this, min, max);
+ CheckRandomData(Random(), a, 200 * RANDOM_MULTIPLIER, 20);
+ CheckRandomData(Random(), a, 10 * RANDOM_MULTIPLIER, 1027);
+ }
+ }
+
+ private class AnalyzerAnonymousInnerClassHelper : Analyzer
+ {
+ private readonly NGramTokenizerTest outerInstance;
+
+ private int min;
+ private int max;
+
+ public AnalyzerAnonymousInnerClassHelper(NGramTokenizerTest outerInstance, int min, int max)
+ {
+ this.outerInstance = outerInstance;
+ this.min = min;
+ this.max = max;
+ }
+
+ protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
+ {
+ Tokenizer tokenizer = new NGramTokenizer(TEST_VERSION_CURRENT, reader, min, max);
+ return new TokenStreamComponents(tokenizer, tokenizer);
+ }
+ }
+
+ private static void TestNGrams(int minGram, int maxGram, int length, string nonTokenChars)
+ {
+ //string s = RandomStrings.randomAsciiOfLength(Random(), length);
+ string s = TestUtil.RandomAnalysisString(Random(), length, true);
+ TestNGrams(minGram, maxGram, s, nonTokenChars);
+ }
+
+ private static void TestNGrams(int minGram, int maxGram, string s, string nonTokenChars)
+ {
+ TestNGrams(minGram, maxGram, s, nonTokenChars, false);
+ }
+
+ internal static int[] toCodePoints(string s)
+ {
+ int[] codePoints = new int[Character.CodePointCount(s, 0, s.Length)];
+ for (int i = 0, j = 0; i < s.Length; ++j)
+ {
+ codePoints[j] = Character.CodePointAt(s, i);
+ i += Character.CharCount(codePoints[j]);
+ }
+ return codePoints;
+ }
+
+ internal static bool isTokenChar(string nonTokenChars, int codePoint)
+ {
+ for (int i = 0; i < nonTokenChars.Length;)
+ {
+ int cp = char.ConvertToUtf32(nonTokenChars, i);
+ if (cp == codePoint)
+ {
+ return false;
+ }
+ i += Character.CharCount(cp);
+ }
+ return true;
+ }
+
+ internal static void TestNGrams(int minGram, int maxGram, string s, string nonTokenChars, bool edgesOnly)
+ {
+ // convert the string to code points
+ int[] codePoints = toCodePoints(s);
+ int[] offsets = new int[codePoints.Length + 1];
+ for (int i = 0; i < codePoints.Length; ++i)
+ {
+ offsets[i + 1] = offsets[i] + Character.CharCount(codePoints[i]);
+ }
+ TokenStream grams = new NGramTokenizerAnonymousInnerClassHelper(TEST_VERSION_CURRENT, new StringReader(s), minGram, maxGram, edgesOnly, nonTokenChars);
+ ICharTermAttribute termAtt = grams.AddAttribute<ICharTermAttribute>();
+ IPositionIncrementAttribute posIncAtt = grams.AddAttribute<IPositionIncrementAttribute>();
+ IPositionLengthAttribute posLenAtt = grams.AddAttribute<IPositionLengthAttribute>();
+ IOffsetAttribute offsetAtt = grams.AddAttribute<IOffsetAttribute>();
+ grams.Reset();
+ for (int start = 0; start < codePoints.Length; ++start)
+ {
+ for (int end = start + minGram; end <= start + maxGram && end <= codePoints.Length; ++end)
+ {
+ if (edgesOnly && start > 0 && isTokenChar(nonTokenChars, codePoints[start - 1]))
+ {
+ // not on an edge
+ goto nextGramContinue;
+ }
+ for (int j = start; j < end; ++j)
+ {
+ if (!isTokenChar(nonTokenChars, codePoints[j]))
+ {
+ goto nextGramContinue;
+ }
+ }
+ assertTrue(grams.IncrementToken());
+ assertArrayEquals(Arrays.CopyOfRange(codePoints, start, end), toCodePoints(termAtt.ToString()));
+ assertEquals(1, posIncAtt.PositionIncrement);
+ assertEquals(1, posLenAtt.PositionLength);
+ assertEquals(offsets[start], offsetAtt.StartOffset);
+ assertEquals(offsets[end], offsetAtt.EndOffset);
+ nextGramContinue:;
+ }
+ //nextGramBreak:;
+ }
+ assertFalse(grams.IncrementToken());
+ grams.End();
+ assertEquals(s.Length, offsetAtt.StartOffset);
+ assertEquals(s.Length, offsetAtt.EndOffset);
+ }
+
+ private class NGramTokenizerAnonymousInnerClassHelper : NGramTokenizer
+ {
+ private string nonTokenChars;
+
+ public NGramTokenizerAnonymousInnerClassHelper(LuceneVersion TEST_VERSION_CURRENT, StringReader java, int minGram, int maxGram, bool edgesOnly, string nonTokenChars)
+ : base(TEST_VERSION_CURRENT, java, minGram, maxGram, edgesOnly)
+ {
+ this.nonTokenChars = nonTokenChars;
+ }
+
+ protected override bool IsTokenChar(int chr)
+ {
+ return nonTokenChars.IndexOf((char)chr) < 0;
+ }
+ }
+
+ [Test]
+ public virtual void TestLargeInput()
+ {
+ // test sliding
+ int minGram = TestUtil.NextInt(Random(), 1, 100);
+ int maxGram = TestUtil.NextInt(Random(), minGram, 100);
+ TestNGrams(minGram, maxGram, TestUtil.NextInt(Random(), 3 * 1024, 4 * 1024), "");
+ }
+
+ [Test]
+ public virtual void TestLargeMaxGram()
+ {
+ // test sliding with maxGram > 1024
+ int minGram = TestUtil.NextInt(Random(), 1290, 1300);
+ int maxGram = TestUtil.NextInt(Random(), minGram, 1300);
+ TestNGrams(minGram, maxGram, TestUtil.NextInt(Random(), 3 * 1024, 4 * 1024), "");
+ }
+
+ [Test]
+ public virtual void TestPreTokenization()
+ {
+ int minGram = TestUtil.NextInt(Random(), 1, 100);
+ int maxGram = TestUtil.NextInt(Random(), minGram, 100);
+ TestNGrams(minGram, maxGram, TestUtil.NextInt(Random(), 0, 4 * 1024), "a");
+ }
+
+ [Test]
+ public virtual void TestHeavyPreTokenization()
+ {
+ int minGram = TestUtil.NextInt(Random(), 1, 100);
+ int maxGram = TestUtil.NextInt(Random(), minGram, 100);
+ TestNGrams(minGram, maxGram, TestUtil.NextInt(Random(), 0, 4 * 1024), "abcdef");
+ }
+
+ [Test]
+ public virtual void TestFewTokenChars()
+ {
+ char[] chrs = new char[TestUtil.NextInt(Random(), 4000, 5000)];
+ Arrays.Fill(chrs, ' ');
+ for (int i = 0; i < chrs.Length; ++i)
+ {
+ if (Random().NextDouble() < 0.1)
+ {
+ chrs[i] = 'a';
+ }
+ }
+ int minGram = TestUtil.NextInt(Random(), 1, 2);
+ int maxGram = TestUtil.NextInt(Random(), minGram, 2);
+ TestNGrams(minGram, maxGram, new string(chrs), " ");
+ }
+
+ [Test]
+ public virtual void TestFullUTF8Range()
+ {
+ int minGram = TestUtil.NextInt(Random(), 1, 100);
+ int maxGram = TestUtil.NextInt(Random(), minGram, 100);
+ string s = TestUtil.RandomUnicodeString(Random(), 4 * 1024);
+ TestNGrams(minGram, maxGram, s, "");
+ TestNGrams(minGram, maxGram, s, "abcdef");
+ }
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Tests.Analysis.Common/Analysis/NGram/TestNGramFilters.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/NGram/TestNGramFilters.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/NGram/TestNGramFilters.cs
new file mode 100644
index 0000000..c0683a6
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/NGram/TestNGramFilters.cs
@@ -0,0 +1,196 @@
+\ufeffusing Lucene.Net.Analysis.Util;
+using Lucene.Net.Util;
+using NUnit.Framework;
+using System.IO;
+using Reader = System.IO.TextReader;
+
+namespace Lucene.Net.Analysis.NGram
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Simple tests to ensure the NGram filter factories are working.
+ /// </summary>
+ public class TestNGramFilters : BaseTokenStreamFactoryTestCase
+ {
+ /// <summary>
+ /// Test NGramTokenizerFactory
+ /// </summary>
+ [Test]
+ public virtual void TestNGramTokenizer()
+ {
+ Reader reader = new StringReader("test");
+ TokenStream stream = TokenizerFactory("NGram").Create(reader);
+ AssertTokenStreamContents(stream, new string[] { "t", "te", "e", "es", "s", "st", "t" });
+ }
+
+ /// <summary>
+ /// Test NGramTokenizerFactory with min and max gram options
+ /// </summary>
+ [Test]
+ public virtual void TestNGramTokenizer2()
+ {
+ Reader reader = new StringReader("test");
+ TokenStream stream = TokenizerFactory("NGram", "minGramSize", "2", "maxGramSize", "3").Create(reader);
+ AssertTokenStreamContents(stream, new string[] { "te", "tes", "es", "est", "st" });
+ }
+
+ /// <summary>
+ /// Test the NGramFilterFactory
+ /// </summary>
+ [Test]
+ public virtual void TestNGramFilter()
+ {
+ Reader reader = new StringReader("test");
+ TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ stream = TokenFilterFactory("NGram").Create(stream);
+ AssertTokenStreamContents(stream, new string[] { "t", "te", "e", "es", "s", "st", "t" });
+ }
+
+ /// <summary>
+ /// Test the NGramFilterFactory with min and max gram options
+ /// </summary>
+ [Test]
+ public virtual void TestNGramFilter2()
+ {
+ Reader reader = new StringReader("test");
+ TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ stream = TokenFilterFactory("NGram", "minGramSize", "2", "maxGramSize", "3").Create(stream);
+ AssertTokenStreamContents(stream, new string[] { "te", "tes", "es", "est", "st" });
+ }
+
+ /// <summary>
+ /// Test EdgeNGramTokenizerFactory
+ /// </summary>
+ [Test]
+ public virtual void TestEdgeNGramTokenizer()
+ {
+ Reader reader = new StringReader("test");
+ TokenStream stream = TokenizerFactory("EdgeNGram").Create(reader);
+ AssertTokenStreamContents(stream, new string[] { "t" });
+ }
+
+ /// <summary>
+ /// Test EdgeNGramTokenizerFactory with min and max gram size
+ /// </summary>
+ [Test]
+ public virtual void TestEdgeNGramTokenizer2()
+ {
+ Reader reader = new StringReader("test");
+ TokenStream stream = TokenizerFactory("EdgeNGram", "minGramSize", "1", "maxGramSize", "2").Create(reader);
+ AssertTokenStreamContents(stream, new string[] { "t", "te" });
+ }
+
+ /// <summary>
+ /// Test EdgeNGramTokenizerFactory with side option
+ /// </summary>
+ [Test]
+ public virtual void TestEdgeNGramTokenizer3()
+ {
+ Reader reader = new StringReader("ready");
+#pragma warning disable 612, 618
+ TokenStream stream = TokenizerFactory("EdgeNGram", LuceneVersion.LUCENE_43, "side", "back").Create(reader);
+#pragma warning restore 612, 618
+ AssertTokenStreamContents(stream, new string[] { "y" });
+ }
+
+ /// <summary>
+ /// Test EdgeNGramFilterFactory
+ /// </summary>
+ [Test]
+ public virtual void TestEdgeNGramFilter()
+ {
+ Reader reader = new StringReader("test");
+ TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ stream = TokenFilterFactory("EdgeNGram").Create(stream);
+ AssertTokenStreamContents(stream, new string[] { "t" });
+ }
+
+ /// <summary>
+ /// Test EdgeNGramFilterFactory with min and max gram size
+ /// </summary>
+ [Test]
+ public virtual void TestEdgeNGramFilter2()
+ {
+ Reader reader = new StringReader("test");
+ TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ stream = TokenFilterFactory("EdgeNGram", "minGramSize", "1", "maxGramSize", "2").Create(stream);
+ AssertTokenStreamContents(stream, new string[] { "t", "te" });
+ }
+
+ /// <summary>
+ /// Test EdgeNGramFilterFactory with side option
+ /// </summary>
+ [Test]
+ public virtual void TestEdgeNGramFilter3()
+ {
+ Reader reader = new StringReader("ready");
+ TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+#pragma warning disable 612, 618
+ stream = TokenFilterFactory("EdgeNGram", LuceneVersion.LUCENE_43, "side", "back").Create(stream);
+#pragma warning restore 612, 618
+ AssertTokenStreamContents(stream, new string[] { "y" });
+ }
+
+ /// <summary>
+ /// Test that bogus arguments result in exception </summary>
+ [Test]
+ public virtual void TestBogusArguments()
+ {
+ try
+ {
+ TokenizerFactory("NGram", "bogusArg", "bogusValue");
+ fail();
+ }
+ catch (System.ArgumentException expected)
+ {
+ assertTrue(expected.Message.Contains("Unknown parameters"));
+ }
+
+ try
+ {
+ TokenizerFactory("EdgeNGram", "bogusArg", "bogusValue");
+ fail();
+ }
+ catch (System.ArgumentException expected)
+ {
+ assertTrue(expected.Message.Contains("Unknown parameters"));
+ }
+
+ try
+ {
+ TokenFilterFactory("NGram", "bogusArg", "bogusValue");
+ fail();
+ }
+ catch (System.ArgumentException expected)
+ {
+ assertTrue(expected.Message.Contains("Unknown parameters"));
+ }
+
+ try
+ {
+ TokenFilterFactory("EdgeNGram", "bogusArg", "bogusValue");
+ fail();
+ }
+ catch (System.ArgumentException expected)
+ {
+ assertTrue(expected.Message.Contains("Unknown parameters"));
+ }
+ }
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Tests.Analysis.Common/Analysis/Ngram/EdgeNGramTokenFilterTest.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Ngram/EdgeNGramTokenFilterTest.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Ngram/EdgeNGramTokenFilterTest.cs
deleted file mode 100644
index ea6fbd7..0000000
--- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Ngram/EdgeNGramTokenFilterTest.cs
+++ /dev/null
@@ -1,390 +0,0 @@
-\ufeffusing Lucene.Net.Analysis.Core;
-using Lucene.Net.Analysis.Miscellaneous;
-using Lucene.Net.Analysis.Shingle;
-using Lucene.Net.Analysis.TokenAttributes;
-using Lucene.Net.Support;
-using Lucene.Net.Util;
-using NUnit.Framework;
-using System;
-using System.IO;
-
-namespace Lucene.Net.Analysis.NGram
-{
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
- /// <summary>
- /// Tests <seealso cref="EdgeNGramTokenFilter"/> for correctness.
- /// </summary>
- public class EdgeNGramTokenFilterTest : BaseTokenStreamTestCase
- {
- private TokenStream input;
-
- public override void SetUp()
- {
- base.SetUp();
- input = new MockTokenizer(new StringReader("abcde"), MockTokenizer.WHITESPACE, false);
- }
-
- [Test]
- public virtual void TestInvalidInput()
- {
- bool gotException = false;
- try
- {
-#pragma warning disable 612, 618
- new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, EdgeNGramTokenFilter.Side.FRONT, 0, 0);
-#pragma warning restore 612, 618
- }
- catch (System.ArgumentException)
- {
- gotException = true;
- }
- assertTrue(gotException);
- }
-
- [Test]
- public virtual void TestInvalidInput2()
- {
- bool gotException = false;
- try
- {
-#pragma warning disable 612, 618
- new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, EdgeNGramTokenFilter.Side.FRONT, 2, 1);
-#pragma warning restore 612, 618
- }
- catch (System.ArgumentException)
- {
- gotException = true;
- }
- assertTrue(gotException);
- }
-
- [Test]
- public virtual void TestInvalidInput3()
- {
- bool gotException = false;
- try
- {
-#pragma warning disable 612, 618
- new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, EdgeNGramTokenFilter.Side.FRONT, -1, 2);
-#pragma warning restore 612, 618
- }
- catch (System.ArgumentException)
- {
- gotException = true;
- }
- assertTrue(gotException);
- }
-
- [Test]
- public virtual void TestFrontUnigram()
- {
-#pragma warning disable 612, 618
- EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, EdgeNGramTokenFilter.Side.FRONT, 1, 1);
-#pragma warning restore 612, 618
- AssertTokenStreamContents(tokenizer, new string[] { "a" }, new int[] { 0 }, new int[] { 5 });
- }
-
- [Test]
- public virtual void TestBackUnigram()
- {
-#pragma warning disable 612, 618
- EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(LuceneVersion.LUCENE_43, input, EdgeNGramTokenFilter.Side.BACK, 1, 1);
-#pragma warning restore 612, 618
- AssertTokenStreamContents(tokenizer, new string[] { "e" }, new int[] { 4 }, new int[] { 5 });
- }
-
- [Test]
- public virtual void TestOversizedNgrams()
- {
-#pragma warning disable 612, 618
- EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, EdgeNGramTokenFilter.Side.FRONT, 6, 6);
-#pragma warning restore 612, 618
- AssertTokenStreamContents(tokenizer, new string[0], new int[0], new int[0]);
- }
-
- [Test]
- public virtual void TestFrontRangeOfNgrams()
- {
-#pragma warning disable 612, 618
- EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, EdgeNGramTokenFilter.Side.FRONT, 1, 3);
-#pragma warning restore 612, 618
- AssertTokenStreamContents(tokenizer, new string[] { "a", "ab", "abc" }, new int[] { 0, 0, 0 }, new int[] { 5, 5, 5 });
- }
-
- [Test]
- public virtual void TestBackRangeOfNgrams()
- {
-#pragma warning disable 612, 618
- EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(LuceneVersion.LUCENE_43, input, EdgeNGramTokenFilter.Side.BACK, 1, 3);
-#pragma warning restore 612, 618
- AssertTokenStreamContents(tokenizer, new string[] { "e", "de", "cde" }, new int[] { 4, 3, 2 }, new int[] { 5, 5, 5 }, null, null, null, null, false);
- }
-
- [Test]
- public virtual void TestFilterPositions()
- {
- TokenStream ts = new MockTokenizer(new StringReader("abcde vwxyz"), MockTokenizer.WHITESPACE, false);
-#pragma warning disable 612, 618
- EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, ts, EdgeNGramTokenFilter.Side.FRONT, 1, 3);
-#pragma warning restore 612, 618
- AssertTokenStreamContents(tokenizer, new string[] { "a", "ab", "abc", "v", "vw", "vwx" }, new int[] { 0, 0, 0, 6, 6, 6 }, new int[] { 5, 5, 5, 11, 11, 11 }, null, new int[] { 1, 0, 0, 1, 0, 0 }, null, null, false);
- }
-
- private class PositionFilter : TokenFilter
- {
-
- internal readonly IPositionIncrementAttribute posIncrAtt;
- internal bool started;
-
- internal PositionFilter(TokenStream input) : base(input)
- {
- posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
- }
-
- public override sealed bool IncrementToken()
- {
- if (m_input.IncrementToken())
- {
- if (started)
- {
- posIncrAtt.PositionIncrement = 0;
- }
- else
- {
- started = true;
- }
- return true;
- }
- else
- {
- return false;
- }
- }
-
- public override void Reset()
- {
- base.Reset();
- started = false;
- }
- }
-
- [Test]
- public virtual void TestFirstTokenPositionIncrement()
- {
- TokenStream ts = new MockTokenizer(new StringReader("a abc"), MockTokenizer.WHITESPACE, false);
- ts = new PositionFilter(ts); // All but first token will get 0 position increment
-#pragma warning disable 612, 618
- EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, ts, EdgeNGramTokenFilter.Side.FRONT, 2, 3);
-#pragma warning restore 612, 618
- // The first token "a" will not be output, since it's smaller than the mingram size of 2.
- // The second token on input to EdgeNGramTokenFilter will have position increment of 0,
- // which should be increased to 1, since this is the first output token in the stream.
- AssertTokenStreamContents(filter, new string[] { "ab", "abc" }, new int[] { 2, 2 }, new int[] { 5, 5 }, new int[] { 1, 0 });
- }
-
- [Test]
- public virtual void TestSmallTokenInStream()
- {
- input = new MockTokenizer(new StringReader("abc de fgh"), MockTokenizer.WHITESPACE, false);
-#pragma warning disable 612, 618
- EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, EdgeNGramTokenFilter.Side.FRONT, 3, 3);
-#pragma warning restore 612, 618
- AssertTokenStreamContents(tokenizer, new string[] { "abc", "fgh" }, new int[] { 0, 7 }, new int[] { 3, 10 });
- }
-
- [Test]
- public virtual void TestReset()
- {
- WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abcde"));
-#pragma warning disable 612, 618
- EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, EdgeNGramTokenFilter.Side.FRONT, 1, 3);
-#pragma warning restore 612, 618
- AssertTokenStreamContents(filter, new string[] { "a", "ab", "abc" }, new int[] { 0, 0, 0 }, new int[] { 5, 5, 5 });
- tokenizer.SetReader(new StringReader("abcde"));
- AssertTokenStreamContents(filter, new string[] { "a", "ab", "abc" }, new int[] { 0, 0, 0 }, new int[] { 5, 5, 5 });
- }
-
- // LUCENE-3642
- // EdgeNgram blindly adds term length to offset, but this can take things out of bounds
 - // wrt original text if a previous filter increases the length of the word (in this case æ -> ae)
- // so in this case we behave like WDF, and preserve any modified offsets
- [Test]
- public virtual void TestInvalidOffsets()
- {
- Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper(this);
 - AssertAnalyzesTo(analyzer, "mosfellsbær", new string[] { "mo", "mos", "mosf", "mosfe", "mosfel", "mosfell", "mosfells", "mosfellsb", "mosfellsba", "mosfellsbae", "mosfellsbaer" }, new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, new int[] { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 });
- }
-
- private class AnalyzerAnonymousInnerClassHelper : Analyzer
- {
- private readonly EdgeNGramTokenFilterTest outerInstance;
-
- public AnalyzerAnonymousInnerClassHelper(EdgeNGramTokenFilterTest outerInstance)
- {
- this.outerInstance = outerInstance;
- }
-
- protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
- {
- Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
- TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
-#pragma warning disable 612, 618
- filters = new EdgeNGramTokenFilter(LuceneVersion.LUCENE_43, filters, EdgeNGramTokenFilter.Side.FRONT, 2, 15);
-#pragma warning restore 612, 618
- return new TokenStreamComponents(tokenizer, filters);
- }
- }
-
- /// <summary>
- /// blast some random strings through the analyzer </summary>
- [Test]
- public virtual void TestRandomStrings()
- {
- for (int i = 0; i < 10; i++)
- {
- int min = TestUtil.NextInt(Random(), 2, 10);
- int max = TestUtil.NextInt(Random(), min, 20);
-
- Analyzer a = new AnalyzerAnonymousInnerClassHelper2(this, min, max);
- CheckRandomData(Random(), a, 100 * RANDOM_MULTIPLIER);
- }
-
- Analyzer b = new AnalyzerAnonymousInnerClassHelper3(this);
- CheckRandomData(Random(), b, 1000 * RANDOM_MULTIPLIER, 20, false, false);
- }
-
- private class AnalyzerAnonymousInnerClassHelper2 : Analyzer
- {
- private readonly EdgeNGramTokenFilterTest outerInstance;
-
- private int min;
- private int max;
-
- public AnalyzerAnonymousInnerClassHelper2(EdgeNGramTokenFilterTest outerInstance, int min, int max)
- {
- this.outerInstance = outerInstance;
- this.min = min;
- this.max = max;
- }
-
- protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
- {
- Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
- return new TokenStreamComponents(tokenizer, new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, min, max));
- }
- }
-
- private class AnalyzerAnonymousInnerClassHelper3 : Analyzer
- {
- private readonly EdgeNGramTokenFilterTest outerInstance;
-
- public AnalyzerAnonymousInnerClassHelper3(EdgeNGramTokenFilterTest outerInstance)
- {
- this.outerInstance = outerInstance;
- }
-
- protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
- {
- Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
-#pragma warning disable 612, 618
- return new TokenStreamComponents(tokenizer, new EdgeNGramTokenFilter(LuceneVersion.LUCENE_43, tokenizer, EdgeNGramTokenFilter.Side.BACK, 2, 4));
-#pragma warning restore 612, 618
- }
- }
-
- [Test]
- public virtual void TestEmptyTerm()
- {
- Random random = Random();
- Analyzer a = new AnalyzerAnonymousInnerClassHelper4(this);
- CheckAnalysisConsistency(random, a, random.nextBoolean(), "");
-
- Analyzer b = new AnalyzerAnonymousInnerClassHelper5(this);
- CheckAnalysisConsistency(random, b, random.nextBoolean(), "");
- }
-
- private class AnalyzerAnonymousInnerClassHelper4 : Analyzer
- {
- private readonly EdgeNGramTokenFilterTest outerInstance;
-
- public AnalyzerAnonymousInnerClassHelper4(EdgeNGramTokenFilterTest outerInstance)
- {
- this.outerInstance = outerInstance;
- }
-
- protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
- {
- Tokenizer tokenizer = new KeywordTokenizer(reader);
-#pragma warning disable 612, 618
- return new TokenStreamComponents(tokenizer, new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, EdgeNGramTokenFilter.Side.FRONT, 2, 15));
-#pragma warning restore 612, 618
- }
- }
-
- private class AnalyzerAnonymousInnerClassHelper5 : Analyzer
- {
- private readonly EdgeNGramTokenFilterTest outerInstance;
-
- public AnalyzerAnonymousInnerClassHelper5(EdgeNGramTokenFilterTest outerInstance)
- {
- this.outerInstance = outerInstance;
- }
-
- protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
- {
- Tokenizer tokenizer = new KeywordTokenizer(reader);
-#pragma warning disable 612, 618
- return new TokenStreamComponents(tokenizer, new EdgeNGramTokenFilter(LuceneVersion.LUCENE_43, tokenizer, EdgeNGramTokenFilter.Side.BACK, 2, 15));
-#pragma warning restore 612, 618
- }
- }
-
- [Test]
- public virtual void TestGraphs()
- {
- TokenStream tk = new LetterTokenizer(TEST_VERSION_CURRENT, new StringReader("abc d efgh ij klmno p q"));
- tk = new ShingleFilter(tk);
- tk = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tk, 7, 10);
- AssertTokenStreamContents(tk, new string[] { "efgh ij", "ij klmn", "ij klmno", "klmno p" }, new int[] { 6, 11, 11, 14 }, new int[] { 13, 19, 19, 21 }, new int[] { 3, 1, 0, 1 }, new int[] { 2, 2, 2, 2 }, 23);
- }
-
- [Test]
- public virtual void TestSupplementaryCharacters()
- {
- string s = TestUtil.RandomUnicodeString(Random(), 10);
- int codePointCount = s.CodePointCount(0, s.Length);
- int minGram = TestUtil.NextInt(Random(), 1, 3);
- int maxGram = TestUtil.NextInt(Random(), minGram, 10);
- TokenStream tk = new KeywordTokenizer(new StringReader(s));
- tk = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tk, minGram, maxGram);
- ICharTermAttribute termAtt = tk.AddAttribute<ICharTermAttribute>();
- IOffsetAttribute offsetAtt = tk.AddAttribute<IOffsetAttribute>();
- tk.Reset();
- for (int i = minGram; i <= Math.Min(codePointCount, maxGram); ++i)
- {
- assertTrue(tk.IncrementToken());
- assertEquals(0, offsetAtt.StartOffset);
- assertEquals(s.Length, offsetAtt.EndOffset);
- int end = Character.OffsetByCodePoints(s, 0, i);
- assertEquals(s.Substring(0, end), termAtt.ToString());
- }
- assertFalse(tk.IncrementToken());
- }
- }
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Tests.Analysis.Common/Analysis/Ngram/EdgeNGramTokenizerTest.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Ngram/EdgeNGramTokenizerTest.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Ngram/EdgeNGramTokenizerTest.cs
deleted file mode 100644
index 4ccecfa..0000000
--- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Ngram/EdgeNGramTokenizerTest.cs
+++ /dev/null
@@ -1,278 +0,0 @@
-\ufeffusing Lucene.Net.Support;
-using Lucene.Net.Util;
-using NUnit.Framework;
-using System.IO;
-using Reader = System.IO.TextReader;
-using Version = Lucene.Net.Util.LuceneVersion;
-
-namespace Lucene.Net.Analysis.NGram
-{
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
- /// <summary>
- /// Tests <seealso cref="EdgeNGramTokenizer"/> for correctness.
- /// </summary>
- public class EdgeNGramTokenizerTest : BaseTokenStreamTestCase
- {
- private StringReader input;
-
- public override void SetUp()
- {
- base.SetUp();
- input = new StringReader("abcde");
- }
-
- [Test]
- public virtual void TestInvalidInput()
- {
- bool gotException = false;
- try
- {
- new EdgeNGramTokenizer(TEST_VERSION_CURRENT, input, 0, 0);
- }
- catch (System.ArgumentException)
- {
- gotException = true;
- }
- assertTrue(gotException);
- }
-
- [Test]
- public virtual void TestInvalidInput2()
- {
- bool gotException = false;
- try
- {
- new EdgeNGramTokenizer(TEST_VERSION_CURRENT, input, 2, 1);
- }
- catch (System.ArgumentException)
- {
- gotException = true;
- }
- assertTrue(gotException);
- }
-
- [Test]
- public virtual void TestInvalidInput3()
- {
- bool gotException = false;
- try
- {
- new EdgeNGramTokenizer(TEST_VERSION_CURRENT, input, -1, 2);
- }
- catch (System.ArgumentException)
- {
- gotException = true;
- }
- assertTrue(gotException);
- }
-
- [Test]
- public virtual void TestFrontUnigram()
- {
- EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, input, 1, 1);
- AssertTokenStreamContents(tokenizer, new string[] { "a" }, new int[] { 0 }, new int[] { 1 }, 5); // abcde
- }
-
- [Test]
- public virtual void TestBackUnigram()
- {
-#pragma warning disable 612, 618
- Tokenizer tokenizer = new Lucene43EdgeNGramTokenizer(Version.LUCENE_43, input, Lucene43EdgeNGramTokenizer.Side.BACK, 1, 1);
-#pragma warning restore 612, 618
- AssertTokenStreamContents(tokenizer, new string[] { "e" }, new int[] { 4 }, new int[] { 5 }, 5); // abcde
- }
-
- [Test]
- public virtual void TestOversizedNgrams()
- {
- EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, input, 6, 6);
- AssertTokenStreamContents(tokenizer, new string[0], new int[0], new int[0], 5); // abcde
- }
-
- [Test]
- public virtual void TestFrontRangeOfNgrams()
- {
- EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, input, 1, 3);
- AssertTokenStreamContents(tokenizer, new string[] { "a", "ab", "abc" }, new int[] { 0, 0, 0 }, new int[] { 1, 2, 3 }, 5); // abcde
- }
-
- [Test]
- public virtual void TestBackRangeOfNgrams()
- {
-#pragma warning disable 612, 618
- Tokenizer tokenizer = new Lucene43EdgeNGramTokenizer(Version.LUCENE_43, input, Lucene43EdgeNGramTokenizer.Side.BACK, 1, 3);
-#pragma warning restore 612, 618
- AssertTokenStreamContents(tokenizer, new string[] { "e", "de", "cde" }, new int[] { 4, 3, 2 }, new int[] { 5, 5, 5 }, null, null, null, 5, false); // abcde
- }
-
- [Test]
- public virtual void TestReset()
- {
- EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, input, 1, 3);
- AssertTokenStreamContents(tokenizer, new string[] { "a", "ab", "abc" }, new int[] { 0, 0, 0 }, new int[] { 1, 2, 3 }, 5); // abcde
- tokenizer.SetReader(new StringReader("abcde"));
- AssertTokenStreamContents(tokenizer, new string[] { "a", "ab", "abc" }, new int[] { 0, 0, 0 }, new int[] { 1, 2, 3 }, 5); // abcde
- }
-
- /// <summary>
- /// blast some random strings through the analyzer </summary>
- [Test]
- public virtual void TestRandomStrings()
- {
- for (int i = 0; i < 10; i++)
- {
- int min = TestUtil.NextInt(Random(), 2, 10);
- int max = TestUtil.NextInt(Random(), min, 20);
-
- Analyzer a = new AnalyzerAnonymousInnerClassHelper(this, min, max);
- CheckRandomData(Random(), a, 100 * RANDOM_MULTIPLIER, 20);
- CheckRandomData(Random(), a, 10 * RANDOM_MULTIPLIER, 8192);
- }
-
- Analyzer b = new AnalyzerAnonymousInnerClassHelper2(this);
- CheckRandomData(Random(), b, 1000 * RANDOM_MULTIPLIER, 20, false, false);
- CheckRandomData(Random(), b, 100 * RANDOM_MULTIPLIER, 8192, false, false);
- }
-
- private class AnalyzerAnonymousInnerClassHelper : Analyzer
- {
- private readonly EdgeNGramTokenizerTest outerInstance;
-
- private int min;
- private int max;
-
- public AnalyzerAnonymousInnerClassHelper(EdgeNGramTokenizerTest outerInstance, int min, int max)
- {
- this.outerInstance = outerInstance;
- this.min = min;
- this.max = max;
- }
-
- protected internal override TokenStreamComponents CreateComponents(string fieldName, Reader reader)
- {
- Tokenizer tokenizer = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, reader, min, max);
- return new TokenStreamComponents(tokenizer, tokenizer);
- }
- }
-
- private class AnalyzerAnonymousInnerClassHelper2 : Analyzer
- {
- private readonly EdgeNGramTokenizerTest outerInstance;
-
- public AnalyzerAnonymousInnerClassHelper2(EdgeNGramTokenizerTest outerInstance)
- {
- this.outerInstance = outerInstance;
- }
-
- protected internal override TokenStreamComponents CreateComponents(string fieldName, Reader reader)
- {
-#pragma warning disable 612, 618
- Tokenizer tokenizer = new Lucene43EdgeNGramTokenizer(Version.LUCENE_43, reader, Lucene43EdgeNGramTokenizer.Side.BACK, 2, 4);
-#pragma warning restore 612, 618
- return new TokenStreamComponents(tokenizer, tokenizer);
- }
- }
-
- [Test]
- public virtual void TestTokenizerPositions()
- {
-#pragma warning disable 612, 618
- Tokenizer tokenizer = new Lucene43EdgeNGramTokenizer(Version.LUCENE_43, input, Lucene43EdgeNGramTokenizer.Side.FRONT, 1, 3);
-#pragma warning restore 612, 618
- AssertTokenStreamContents(tokenizer, new string[] { "a", "ab", "abc" }, new int[] { 0, 0, 0 }, new int[] { 1, 2, 3 }, null, new int[] { 1, 0, 0 }, null, null, false);
-
- tokenizer = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, new StringReader("abcde"), 1, 3);
- AssertTokenStreamContents(tokenizer, new string[] { "a", "ab", "abc" }, new int[] { 0, 0, 0 }, new int[] { 1, 2, 3 }, null, new int[] { 1, 1, 1 }, null, null, false);
- }
-
- private static void TestNGrams(int minGram, int maxGram, int length, string nonTokenChars)
- {
- // LUCENENET TODO: Changed randomizing strategy - not sure if this is right...
- //string s = RandomStrings.randomAsciiOfLength(Random(), length);
- string s = TestUtil.RandomAnalysisString(Random(), length, true);
- TestNGrams(minGram, maxGram, s, nonTokenChars);
- }
-
- private static void TestNGrams(int minGram, int maxGram, string s, string nonTokenChars)
- {
- NGramTokenizerTest.TestNGrams(minGram, maxGram, s, nonTokenChars, true);
- }
-
- [Test]
- public virtual void TestLargeInput()
- {
- // test sliding
- int minGram = TestUtil.NextInt(Random(), 1, 100);
- int maxGram = TestUtil.NextInt(Random(), minGram, 100);
- TestNGrams(minGram, maxGram, TestUtil.NextInt(Random(), 3 * 1024, 4 * 1024), "");
- }
-
- [Test]
- public virtual void TestLargeMaxGram()
- {
- // test sliding with maxGram > 1024
- int minGram = TestUtil.NextInt(Random(), 1290, 1300);
- int maxGram = TestUtil.NextInt(Random(), minGram, 1300);
- TestNGrams(minGram, maxGram, TestUtil.NextInt(Random(), 3 * 1024, 4 * 1024), "");
- }
-
- [Test]
- public virtual void TestPreTokenization()
- {
- int minGram = TestUtil.NextInt(Random(), 1, 100);
- int maxGram = TestUtil.NextInt(Random(), minGram, 100);
- TestNGrams(minGram, maxGram, TestUtil.NextInt(Random(), 0, 4 * 1024), "a");
- }
-
- [Test]
- public virtual void TestHeavyPreTokenization()
- {
- int minGram = TestUtil.NextInt(Random(), 1, 100);
- int maxGram = TestUtil.NextInt(Random(), minGram, 100);
- TestNGrams(minGram, maxGram, TestUtil.NextInt(Random(), 0, 4 * 1024), "abcdef");
- }
-
- [Test]
- public virtual void TestFewTokenChars()
- {
- char[] chrs = new char[TestUtil.NextInt(Random(), 4000, 5000)];
- Arrays.Fill(chrs, ' ');
- for (int i = 0; i < chrs.Length; ++i)
- {
- if (Random().NextDouble() < 0.1)
- {
- chrs[i] = 'a';
- }
- }
- int minGram = TestUtil.NextInt(Random(), 1, 2);
- int maxGram = TestUtil.NextInt(Random(), minGram, 2);
- TestNGrams(minGram, maxGram, new string(chrs), " ");
- }
-
- [Test]
- public virtual void TestFullUTF8Range()
- {
- int minGram = TestUtil.NextInt(Random(), 1, 100);
- int maxGram = TestUtil.NextInt(Random(), minGram, 100);
- string s = TestUtil.RandomUnicodeString(Random(), 4 * 1024);
- TestNGrams(minGram, maxGram, s, "");
- TestNGrams(minGram, maxGram, s, "abcdef");
- }
- }
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Tests.Analysis.Common/Analysis/Ngram/NGramTokenFilterTest.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Ngram/NGramTokenFilterTest.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Ngram/NGramTokenFilterTest.cs
deleted file mode 100644
index e485fc0..0000000
--- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Ngram/NGramTokenFilterTest.cs
+++ /dev/null
@@ -1,249 +0,0 @@
-\ufeffusing Lucene.Net.Analysis.Core;
-using Lucene.Net.Analysis.Miscellaneous;
-using Lucene.Net.Analysis.TokenAttributes;
-using Lucene.Net.Support;
-using Lucene.Net.Util;
-using NUnit.Framework;
-using System;
-using System.IO;
-
-namespace Lucene.Net.Analysis.NGram
-{
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
- /// <summary>
- /// Tests <seealso cref="NGramTokenFilter"/> for correctness.
- /// </summary>
- public class NGramTokenFilterTest : BaseTokenStreamTestCase
- {
- private TokenStream input;
-
- public override void SetUp()
- {
- base.SetUp();
- input = new MockTokenizer(new StringReader("abcde"), MockTokenizer.WHITESPACE, false);
- }
-
- [Test]
- public virtual void TestInvalidInput()
- {
- bool gotException = false;
- try
- {
- new NGramTokenFilter(TEST_VERSION_CURRENT, input, 2, 1);
- }
- catch (System.ArgumentException)
- {
- gotException = true;
- }
- assertTrue(gotException);
- }
-
- [Test]
- public virtual void TestInvalidInput2()
- {
- bool gotException = false;
- try
- {
- new NGramTokenFilter(TEST_VERSION_CURRENT, input, 0, 1);
- }
- catch (System.ArgumentException)
- {
- gotException = true;
- }
- assertTrue(gotException);
- }
-
- [Test]
- public virtual void TestUnigrams()
- {
- NGramTokenFilter filter = new NGramTokenFilter(TEST_VERSION_CURRENT, input, 1, 1);
- AssertTokenStreamContents(filter, new string[] { "a", "b", "c", "d", "e" }, new int[] { 0, 0, 0, 0, 0 }, new int[] { 5, 5, 5, 5, 5 }, new int[] { 1, 0, 0, 0, 0 });
- }
-
- [Test]
- public virtual void TestBigrams()
- {
- NGramTokenFilter filter = new NGramTokenFilter(TEST_VERSION_CURRENT, input, 2, 2);
- AssertTokenStreamContents(filter, new string[] { "ab", "bc", "cd", "de" }, new int[] { 0, 0, 0, 0 }, new int[] { 5, 5, 5, 5 }, new int[] { 1, 0, 0, 0 });
- }
-
- [Test]
- public virtual void TestNgrams()
- {
- NGramTokenFilter filter = new NGramTokenFilter(TEST_VERSION_CURRENT, input, 1, 3);
- AssertTokenStreamContents(filter, new string[] { "a", "ab", "abc", "b", "bc", "bcd", "c", "cd", "cde", "d", "de", "e" }, new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, new int[] { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5 }, null, new int[] { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, null, null, false);
- }
-
- [Test]
- public virtual void TestNgramsNoIncrement()
- {
- NGramTokenFilter filter = new NGramTokenFilter(TEST_VERSION_CURRENT, input, 1, 3);
- AssertTokenStreamContents(filter, new string[] { "a", "ab", "abc", "b", "bc", "bcd", "c", "cd", "cde", "d", "de", "e" }, new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, new int[] { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5 }, null, new int[] { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, null, null, false);
- }
-
- [Test]
- public virtual void TestOversizedNgrams()
- {
- NGramTokenFilter filter = new NGramTokenFilter(TEST_VERSION_CURRENT, input, 6, 7);
- AssertTokenStreamContents(filter, new string[0], new int[0], new int[0]);
- }
-
- [Test]
- public virtual void TestSmallTokenInStream()
- {
- input = new MockTokenizer(new StringReader("abc de fgh"), MockTokenizer.WHITESPACE, false);
- NGramTokenFilter filter = new NGramTokenFilter(TEST_VERSION_CURRENT, input, 3, 3);
- AssertTokenStreamContents(filter, new string[] { "abc", "fgh" }, new int[] { 0, 7 }, new int[] { 3, 10 }, new int[] { 1, 2 });
- }
-
- [Test]
- public virtual void TestReset()
- {
- WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abcde"));
- NGramTokenFilter filter = new NGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, 1, 1);
- AssertTokenStreamContents(filter, new string[] { "a", "b", "c", "d", "e" }, new int[] { 0, 0, 0, 0, 0 }, new int[] { 5, 5, 5, 5, 5 }, new int[] { 1, 0, 0, 0, 0 });
- tokenizer.SetReader(new StringReader("abcde"));
- AssertTokenStreamContents(filter, new string[] { "a", "b", "c", "d", "e" }, new int[] { 0, 0, 0, 0, 0 }, new int[] { 5, 5, 5, 5, 5 }, new int[] { 1, 0, 0, 0, 0 });
- }
-
- // LUCENE-3642
- // EdgeNgram blindly adds term length to offset, but this can take things out of bounds
- // wrt original text if a previous filter increases the length of the word (in this case æ -> ae)
- // so in this case we behave like WDF, and preserve any modified offsets
- [Test]
- public virtual void TestInvalidOffsets()
- {
- Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper(this);
- AssertAnalyzesTo(analyzer, "mosfellsbær", new string[] { "mo", "os", "sf", "fe", "el", "ll", "ls", "sb", "ba", "ae", "er" }, new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, new int[] { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 }, new int[] { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 });
- }
-
- private class AnalyzerAnonymousInnerClassHelper : Analyzer
- {
- private readonly NGramTokenFilterTest outerInstance;
-
- public AnalyzerAnonymousInnerClassHelper(NGramTokenFilterTest outerInstance)
- {
- this.outerInstance = outerInstance;
- }
-
- protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
- {
- Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
- TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
- filters = new NGramTokenFilter(TEST_VERSION_CURRENT, filters, 2, 2);
- return new TokenStreamComponents(tokenizer, filters);
- }
- }
-
- /// <summary>
- /// blast some random strings through the analyzer </summary>
- [Test]
- public virtual void TestRandomStrings()
- {
- for (int i = 0; i < 10; i++)
- {
- int min = TestUtil.NextInt(Random(), 2, 10);
- int max = TestUtil.NextInt(Random(), min, 20);
- Analyzer a = new AnalyzerAnonymousInnerClassHelper2(this, min, max);
- CheckRandomData(Random(), a, 200 * RANDOM_MULTIPLIER, 20);
- }
- }
-
- private class AnalyzerAnonymousInnerClassHelper2 : Analyzer
- {
- private readonly NGramTokenFilterTest outerInstance;
-
- private int min;
- private int max;
-
- public AnalyzerAnonymousInnerClassHelper2(NGramTokenFilterTest outerInstance, int min, int max)
- {
- this.outerInstance = outerInstance;
- this.min = min;
- this.max = max;
- }
-
- protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
- {
- Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
- return new TokenStreamComponents(tokenizer, new NGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, min, max));
- }
- }
-
- [Test]
- public virtual void TestEmptyTerm()
- {
- Random random = Random();
- Analyzer a = new AnalyzerAnonymousInnerClassHelper3(this);
- CheckAnalysisConsistency(random, a, random.nextBoolean(), "");
- }
-
- private class AnalyzerAnonymousInnerClassHelper3 : Analyzer
- {
- private readonly NGramTokenFilterTest outerInstance;
-
- public AnalyzerAnonymousInnerClassHelper3(NGramTokenFilterTest outerInstance)
- {
- this.outerInstance = outerInstance;
- }
-
- protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
- {
- Tokenizer tokenizer = new KeywordTokenizer(reader);
- return new TokenStreamComponents(tokenizer, new NGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, 2, 15));
- }
- }
-
- [Test]
- public virtual void TestLucene43()
- {
-#pragma warning disable 612, 618
- NGramTokenFilter filter = new NGramTokenFilter(LuceneVersion.LUCENE_43, input, 2, 3);
-#pragma warning restore 612, 618
- AssertTokenStreamContents(filter, new string[] { "ab", "bc", "cd", "de", "abc", "bcd", "cde" }, new int[] { 0, 1, 2, 3, 0, 1, 2 }, new int[] { 2, 3, 4, 5, 3, 4, 5 }, null, new int[] { 1, 1, 1, 1, 1, 1, 1 }, null, null, false);
- }
-
- [Test]
- public virtual void TestSupplementaryCharacters()
- {
- string s = TestUtil.RandomUnicodeString(Random(), 10);
- int codePointCount = s.CodePointCount(0, s.Length);
- int minGram = TestUtil.NextInt(Random(), 1, 3);
- int maxGram = TestUtil.NextInt(Random(), minGram, 10);
- TokenStream tk = new KeywordTokenizer(new StringReader(s));
- tk = new NGramTokenFilter(TEST_VERSION_CURRENT, tk, minGram, maxGram);
- ICharTermAttribute termAtt = tk.AddAttribute<ICharTermAttribute>();
- IOffsetAttribute offsetAtt = tk.AddAttribute<IOffsetAttribute>();
- tk.Reset();
- for (int start = 0; start < codePointCount; ++start)
- {
- for (int end = start + minGram; end <= Math.Min(codePointCount, start + maxGram); ++end)
- {
- assertTrue(tk.IncrementToken());
- assertEquals(0, offsetAtt.StartOffset);
- assertEquals(s.Length, offsetAtt.EndOffset);
- int startIndex = Character.OffsetByCodePoints(s, 0, start);
- int endIndex = Character.OffsetByCodePoints(s, 0, end);
- assertEquals(s.Substring(startIndex, endIndex - startIndex), termAtt.ToString());
- }
- }
- assertFalse(tk.IncrementToken());
- }
- }
-}
\ No newline at end of file