You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by th...@apache.org on 2011/07/17 04:46:03 UTC
[Lucene.Net] svn commit: r1147514 [3/3] - in /incubator/lucene.net/trunk:
src/contrib/Analyzers/ src/contrib/Analyzers/Miscellaneous/
src/contrib/Analyzers/Payloads/ src/contrib/Analyzers/Shingle/
src/contrib/Analyzers/Shingle/Codec/ src/contrib/Analyzers/Shingle/M...
Added: incubator/lucene.net/trunk/test/contrib/Analyzers/Shingle/ShingleFilterTest.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/test/contrib/Analyzers/Shingle/ShingleFilterTest.cs?rev=1147514&view=auto
==============================================================================
--- incubator/lucene.net/trunk/test/contrib/Analyzers/Shingle/ShingleFilterTest.cs (added)
+++ incubator/lucene.net/trunk/test/contrib/Analyzers/Shingle/ShingleFilterTest.cs Sun Jul 17 02:46:00 2011
@@ -0,0 +1,530 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.IO;
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using NUnit.Framework;
+
+namespace Lucene.Net.Analyzers.Shingle
+{
+ public class ShingleFilterTests : BaseTokenStreamTestCase
+ {
+ // Input tokens shared by most tests: "please divide this sentence into shingles".
+ // NOTE(review): "shingles" is 8 chars but keeps end offset 39 — this matches the
+ // upstream Java ShingleFilterTest; confirm intentional before "fixing".
+ public static readonly Token[] TestToken = new[]
+ {
+ CreateToken("please", 0, 6),
+ CreateToken("divide", 7, 13),
+ CreateToken("this", 14, 18),
+ CreateToken("sentence", 19, 27),
+ CreateToken("into", 28, 32),
+ CreateToken("shingles", 33, 39),
+ };
+
+ // Rebuilt in SetUp() for every test because its tokens' position increments are mutated.
+ public static Token[] TestTokenWithHoles;
+
+ // Expected bi-gram output (unigrams interleaved with shingles).
+ public static readonly Token[] BiGramTokens = new[]
+ {
+ CreateToken("please", 0, 6),
+ CreateToken("please divide", 0, 13),
+ CreateToken("divide", 7, 13),
+ CreateToken("divide this", 7, 18),
+ CreateToken("this", 14, 18),
+ CreateToken("this sentence", 14, 27),
+ CreateToken("sentence", 19, 27),
+ CreateToken("sentence into", 19, 32),
+ CreateToken("into", 28, 32),
+ CreateToken("into shingles", 28, 39),
+ CreateToken("shingles", 33, 39),
+ };
+
+ // Each shingle shares the position of the unigram it starts at (increment 0).
+ public static readonly int[] BiGramPositionIncrements = new[]
+ {
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+ };
+
+ public static readonly String[] BiGramTypes = new[]
+ {
+ "word", "shingle", "word", "shingle", "word", "shingle",
+ "word",
+ "shingle", "word", "shingle", "word"
+ };
+
+ // Expected bi-grams when the input has holes (removed tokens show up as "_" filler).
+ public static readonly Token[] BiGramTokensWithHoles = new[]
+ {
+ CreateToken("please", 0, 6),
+ CreateToken("please divide", 0, 13),
+ CreateToken("divide", 7, 13),
+ CreateToken("divide _", 7, 19),
+ CreateToken("_", 19, 19),
+ CreateToken("_ sentence", 19, 27),
+ CreateToken("sentence", 19, 27),
+ CreateToken("sentence _", 19, 33),
+ CreateToken("_", 33, 33),
+ CreateToken("_ shingles", 33, 39),
+ CreateToken("shingles", 33, 39),
+ };
+
+ // NOTE(review): declared but not referenced by TestBiGramFilterWithHoles, which
+ // passes BiGramPositionIncrements instead. The values are identical, so the test
+ // behaves the same either way; the upstream Java test uses the WITH_HOLES array.
+ public static readonly int[] BiGramPositionIncrementsWithHoles = new[]
+ {
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+ };
+
+ public static readonly Token[] BiGramTokensWithoutUnigrams = new[]
+ {
+ CreateToken("please divide", 0, 13),
+ CreateToken("divide this", 7, 18),
+ CreateToken("this sentence", 14, 27),
+ CreateToken("sentence into", 19, 32),
+ CreateToken("into shingles", 28, 39),
+ };
+
+ public static readonly int[] BiGramPositionIncrementsWithoutUnigrams = new[]
+ {
+ 1, 1, 1, 1, 1
+ };
+
+ public static readonly String[] BiGramTypesWithoutUnigrams = new[]
+ {
+ "shingle", "shingle", "shingle",
+ "shingle", "shingle"
+ };
+
+ public static readonly Token[] BiGramTokensWithHolesWithoutUnigrams = new[]
+ {
+ CreateToken(
+ "please divide", 0, 13),
+ CreateToken("divide _", 7,
+ 19),
+ CreateToken("_ sentence", 19,
+ 27),
+ CreateToken("sentence _", 19,
+ 33),
+ CreateToken("_ shingles", 33,
+ 39),
+ };
+
+ // NOTE(review): six entries but only five expected tokens above; the sixth value
+ // is never read by ShingleFilterTest (it indexes by output-token count). Harmless,
+ // but worth trimming to five for consistency.
+ public static readonly int[] BiGramPositionIncrementsWithHolesWithoutUnigrams = new[]
+ {
+ 1, 1, 1, 1, 1, 1
+ };
+
+
+ // Single-token and empty-stream fixtures.
+ public static readonly Token[] TestSingleToken = new[] { CreateToken("please", 0, 6) };
+
+ public static readonly Token[] SingleToken = new[] { CreateToken("please", 0, 6) };
+
+ public static readonly int[] SingleTokenIncrements = new[] { 1 };
+
+ public static readonly String[] SingleTokenTypes = new[] { "word" };
+
+ public static readonly Token[] EmptyTokenArray = new Token[] { };
+
+ public static readonly int[] EmptyTokenIncrementsArray = new int[] { };
+
+ public static readonly String[] EmptyTokenTypesArray = new String[] { };
+
+ // Expected tri-gram output (unigrams, bi-grams and tri-grams interleaved).
+ public static readonly Token[] TriGramTokens = new[]
+ {
+ CreateToken("please", 0, 6),
+ CreateToken("please divide", 0, 13),
+ CreateToken("please divide this", 0, 18),
+ CreateToken("divide", 7, 13),
+ CreateToken("divide this", 7, 18),
+ CreateToken("divide this sentence", 7, 27),
+ CreateToken("this", 14, 18),
+ CreateToken("this sentence", 14, 27),
+ CreateToken("this sentence into", 14, 32),
+ CreateToken("sentence", 19, 27),
+ CreateToken("sentence into", 19, 32),
+ CreateToken("sentence into shingles", 19, 39),
+ CreateToken("into", 28, 32),
+ CreateToken("into shingles", 28, 39),
+ CreateToken("shingles", 33, 39)
+ };
+
+ public static readonly int[] TriGramPositionIncrements = new[]
+ {
+ 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1
+ };
+
+ public static readonly String[] TriGramTypes = new[]
+ {
+ "word", "shingle", "shingle",
+ "word", "shingle", "shingle",
+ "word", "shingle", "shingle",
+ "word", "shingle", "shingle",
+ "word", "shingle",
+ "word"
+ };
+
+ public static readonly Token[] TriGramTokensWithoutUnigrams = new[]
+ {
+ CreateToken("please divide", 0, 13),
+ CreateToken("please divide this", 0,
+ 18),
+ CreateToken("divide this", 7, 18),
+ CreateToken("divide this sentence", 7,
+ 27),
+ CreateToken("this sentence", 14, 27),
+ CreateToken("this sentence into", 14,
+ 32),
+ CreateToken("sentence into", 19, 32),
+ CreateToken("sentence into shingles",
+ 19, 39),
+ CreateToken("into shingles", 28, 39),
+ };
+
+ public static readonly int[] TriGramPositionIncrementsWithoutUnigrams = new[]
+ {
+ 1, 0, 1, 0, 1, 0, 1, 0, 1
+ };
+
+ public static readonly String[] TriGramTypesWithoutUnigrams = new[]
+ {
+ "shingle", "shingle",
+ "shingle", "shingle",
+ "shingle", "shingle",
+ "shingle", "shingle",
+ "shingle",
+ };
+
+ // Expected four-gram output (unigrams through 4-grams interleaved).
+ public static readonly Token[] FourGramTokens = new[]
+ {
+ CreateToken("please", 0, 6),
+ CreateToken("please divide", 0, 13),
+ CreateToken("please divide this", 0, 18),
+ CreateToken("please divide this sentence", 0, 27),
+ CreateToken("divide", 7, 13),
+ CreateToken("divide this", 7, 18),
+ CreateToken("divide this sentence", 7, 27),
+ CreateToken("divide this sentence into", 7, 32),
+ CreateToken("this", 14, 18),
+ CreateToken("this sentence", 14, 27),
+ CreateToken("this sentence into", 14, 32),
+ CreateToken("this sentence into shingles", 14, 39),
+ CreateToken("sentence", 19, 27),
+ CreateToken("sentence into", 19, 32),
+ CreateToken("sentence into shingles", 19, 39),
+ CreateToken("into", 28, 32),
+ CreateToken("into shingles", 28, 39),
+ CreateToken("shingles", 33, 39)
+ };
+
+ public static readonly int[] FourGramPositionIncrements = new[]
+ {
+ 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0
+ , 1, 0, 1
+ };
+
+ public static readonly String[] FourGramTypes = new[]
+ {
+ "word", "shingle", "shingle", "shingle",
+ "word", "shingle", "shingle", "shingle",
+ "word", "shingle", "shingle", "shingle",
+ "word", "shingle", "shingle",
+ "word", "shingle",
+ "word"
+ };
+
+ public static readonly Token[] FourGramTokensWithoutUnigrams = new[]
+ {
+ CreateToken("please divide", 0, 13),
+ CreateToken("please divide this", 0,
+ 18),
+ CreateToken(
+ "please divide this sentence", 0,
+ 27),
+ CreateToken("divide this", 7, 18),
+ CreateToken("divide this sentence", 7,
+ 27),
+ CreateToken(
+ "divide this sentence into", 7,
+ 32),
+ CreateToken("this sentence", 14, 27),
+ CreateToken("this sentence into", 14,
+ 32),
+ CreateToken(
+ "this sentence into shingles", 14,
+ 39),
+ CreateToken("sentence into", 19, 32),
+ CreateToken(
+ "sentence into shingles", 19, 39)
+ ,
+ CreateToken("into shingles", 28, 39),
+ };
+
+ public static readonly int[] FourGramPositionIncrementsWithoutUnigrams = new[]
+ {
+ 1, 0, 0, 1, 0, 0, 1, 0, 0,
+ 1, 0, 1
+ };
+
+ public static readonly String[] FourGramTypesWithoutUnigrams = new[]
+ {
+ "shingle", "shingle",
+ "shingle", "shingle",
+ "shingle", "shingle",
+ "shingle", "shingle",
+ "shingle", "shingle",
+ "shingle", "shingle",
+ };
+
+ /// <summary>
+ /// Builds a Token carrying the given term text and character offsets.
+ /// </summary>
+ /// <param name="term">Term text stored into the token's term buffer.</param>
+ /// <param name="start">Start character offset.</param>
+ /// <param name="offset">End character offset (name kept from the original port — it is an absolute end offset, not a length).</param>
+ private static Token CreateToken(String term, int start, int offset)
+ {
+ var token = new Token(start, offset);
+ token.SetTermBuffer(term);
+ return token;
+ }
+
+ [SetUp]
+ public override void SetUp()
+ {
+ base.SetUp();
+ // Rebuilt per test because SetPositionIncrement below mutates the shared tokens;
+ // a static readonly array would leak state between tests.
+ TestTokenWithHoles = new[]
+ {
+ CreateToken("please", 0, 6),
+ CreateToken("divide", 7, 13),
+ CreateToken("sentence", 19, 27),
+ CreateToken("shingles", 33, 39),
+ };
+
+ // Increment 2 simulates a removed token ("hole") before "sentence" and "shingles".
+ TestTokenWithHoles[2].SetPositionIncrement(2);
+ TestTokenWithHoles[3].SetPositionIncrement(2);
+ }
+
+
+ /// <summary>
+ /// Class under test for void ShingleFilter(TokenStream, int).
+ /// Bi-grams with unigrams included in the output.
+ /// </summary>
+ [Test]
+ public void TestBiGramFilter()
+ {
+ ShingleFilterTest(2, TestToken, BiGramTokens,
+ BiGramPositionIncrements, BiGramTypes,
+ true);
+ }
+
+ /// <summary>
+ /// Bi-grams over input containing holes (removed tokens); unigrams included.
+ /// Uses the dedicated WithHoles increments array (values match BiGramPositionIncrements,
+ /// so behavior is unchanged), as in the upstream Java ShingleFilterTest — previously
+ /// this referenced BiGramPositionIncrements, leaving the WithHoles array unused.
+ /// </summary>
+ [Test]
+ public void TestBiGramFilterWithHoles()
+ {
+ ShingleFilterTest(2, TestTokenWithHoles, BiGramTokensWithHoles,
+ BiGramPositionIncrementsWithHoles, BiGramTypes,
+ true);
+ }
+
+ // Bi-grams only (no unigram passthrough).
+ [Test]
+ public void TestBiGramFilterWithoutUnigrams()
+ {
+ ShingleFilterTest(2, TestToken, BiGramTokensWithoutUnigrams,
+ BiGramPositionIncrementsWithoutUnigrams, BiGramTypesWithoutUnigrams,
+ false);
+ }
+
+ // Bi-grams only, over input containing holes.
+ [Test]
+ public void TestBiGramFilterWithHolesWithoutUnigrams()
+ {
+ ShingleFilterTest(2, TestTokenWithHoles, BiGramTokensWithHolesWithoutUnigrams,
+ BiGramPositionIncrementsWithHolesWithoutUnigrams, BiGramTypesWithoutUnigrams,
+ false);
+ }
+
+ // A single input token yields just that token when unigrams are on.
+ [Test]
+ public void TestBiGramFilterWithSingleToken()
+ {
+ ShingleFilterTest(2, TestSingleToken, SingleToken,
+ SingleTokenIncrements, SingleTokenTypes,
+ true);
+ }
+
+ // A single input token yields nothing when unigrams are off (no pair to shingle).
+ [Test]
+ public void TestBiGramFilterWithSingleTokenWithoutUnigrams()
+ {
+ ShingleFilterTest(2, TestSingleToken, EmptyTokenArray,
+ EmptyTokenIncrementsArray, EmptyTokenTypesArray,
+ false);
+ }
+
+ // An empty stream stays empty, with or without unigram output.
+ [Test]
+ public void TestBiGramFilterWithEmptyTokenStream()
+ {
+ ShingleFilterTest(2, EmptyTokenArray, EmptyTokenArray,
+ EmptyTokenIncrementsArray, EmptyTokenTypesArray,
+ true);
+ }
+
+ [Test]
+ public void TestBiGramFilterWithEmptyTokenStreamWithoutUnigrams()
+ {
+ ShingleFilterTest(2, EmptyTokenArray, EmptyTokenArray,
+ EmptyTokenIncrementsArray, EmptyTokenTypesArray,
+ false);
+ }
+
+ // Tri-grams with and without unigrams.
+ [Test]
+ public void TestTriGramFilter()
+ {
+ ShingleFilterTest(3, TestToken, TriGramTokens,
+ TriGramPositionIncrements, TriGramTypes,
+ true);
+ }
+
+ [Test]
+ public void TestTriGramFilterWithoutUnigrams()
+ {
+ ShingleFilterTest(3, TestToken, TriGramTokensWithoutUnigrams,
+ TriGramPositionIncrementsWithoutUnigrams, TriGramTypesWithoutUnigrams,
+ false);
+ }
+
+ // Four-grams with and without unigrams.
+ [Test]
+ public void TestFourGramFilter()
+ {
+ ShingleFilterTest(4, TestToken, FourGramTokens,
+ FourGramPositionIncrements, FourGramTypes,
+ true);
+ }
+
+ [Test]
+ public void TestFourGramFilterWithoutUnigrams()
+ {
+ ShingleFilterTest(4, TestToken, FourGramTokensWithoutUnigrams,
+ FourGramPositionIncrementsWithoutUnigrams,
+ FourGramTypesWithoutUnigrams, false);
+ }
+
+ /// <summary>
+ /// Verifies that after resetting the underlying tokenizer with a fresh reader,
+ /// the ShingleFilter produces exactly the same output a second time.
+ /// </summary>
+ [Test]
+ public void TestReset()
+ {
+ Tokenizer wsTokenizer = new WhitespaceTokenizer(new StringReader("please divide this sentence"));
+ TokenStream filter = new ShingleFilter(wsTokenizer, 2);
+
+ AssertTokenStreamContents(filter,
+ new[]
+ {
+ "please", "please divide", "divide", "divide this", "this",
+ "this sentence",
+ "sentence"
+ },
+ new[] {0, 0, 7, 7, 14, 14, 19}, new[] {6, 13, 13, 18, 18, 27, 27},
+ new[]
+ {
+ TypeAttributeImpl.DEFAULT_TYPE, "shingle", TypeAttributeImpl.DEFAULT_TYPE,
+ "shingle", TypeAttributeImpl.DEFAULT_TYPE, "shingle",
+ TypeAttributeImpl.DEFAULT_TYPE
+ },
+ new[] {1, 0, 1, 0, 1, 0, 1}
+ );
+
+ // Reuse the same filter instance over a fresh reader; output must be identical.
+ wsTokenizer.Reset(new StringReader("please divide this sentence"));
+
+ AssertTokenStreamContents(filter,
+ new[]
+ {
+ "please", "please divide", "divide", "divide this", "this",
+ "this sentence",
+ "sentence"
+ },
+ new[] {0, 0, 7, 7, 14, 14, 19}, new[] {6, 13, 13, 18, 18, 27, 27},
+ new[]
+ {
+ TypeAttributeImpl.DEFAULT_TYPE, "shingle", TypeAttributeImpl.DEFAULT_TYPE,
+ "shingle", TypeAttributeImpl.DEFAULT_TYPE, "shingle",
+ TypeAttributeImpl.DEFAULT_TYPE
+ },
+ new[] {1, 0, 1, 0, 1, 0, 1}
+ );
+ }
+
+ /// <summary>
+ /// Drives a ShingleFilter over <paramref name="tokensToShingle"/> and asserts that
+ /// every emitted token matches the expected term text, offsets, position increment
+ /// and type, and that the total token count is exactly tokensToCompare.Length.
+ /// </summary>
+ /// <param name="maxSize">Maximum shingle size passed to the filter.</param>
+ /// <param name="tokensToShingle">Input tokens fed through a TestTokenStream.</param>
+ /// <param name="tokensToCompare">Expected output tokens (term + offsets).</param>
+ /// <param name="positionIncrements">Expected position increment per output token.</param>
+ /// <param name="types">Expected type attribute per output token.</param>
+ /// <param name="outputUnigrams">Whether the filter should also emit the input unigrams.</param>
+ protected void ShingleFilterTest(int maxSize, Token[] tokensToShingle, Token[] tokensToCompare,
+ int[] positionIncrements, String[] types, bool outputUnigrams)
+ {
+ var filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize);
+ filter.SetOutputUnigrams(outputUnigrams);
+
+ var termAtt = (TermAttribute) filter.AddAttribute(typeof (TermAttribute));
+ var offsetAtt = (OffsetAttribute) filter.AddAttribute(typeof (OffsetAttribute));
+ var posIncrAtt = (PositionIncrementAttribute) filter.AddAttribute(typeof (PositionIncrementAttribute));
+ var typeAtt = (TypeAttribute) filter.AddAttribute(typeof (TypeAttribute));
+
+ int i = 0;
+ while (filter.IncrementToken())
+ {
+ // Guard before indexing so an over-producing filter fails cleanly
+ // instead of throwing IndexOutOfRangeException.
+ Assert.IsTrue(i < tokensToCompare.Length, "ShingleFilter outputted more tokens than expected");
+
+ String termText = termAtt.Term();
+ String goldText = tokensToCompare[i].Term();
+
+ Assert.AreEqual(goldText, termText, "Wrong termText");
+ Assert.AreEqual(tokensToCompare[i].StartOffset(), offsetAtt.StartOffset(),
+ "Wrong startOffset for token \"" + termText + "\"");
+ Assert.AreEqual(tokensToCompare[i].EndOffset(), offsetAtt.EndOffset(),
+ "Wrong endOffset for token \"" + termText + "\"");
+ Assert.AreEqual(positionIncrements[i], posIncrAtt.GetPositionIncrement(),
+ "Wrong positionIncrement for token \"" + termText + "\"");
+ Assert.AreEqual(types[i], typeAtt.Type(), "Wrong type for token \"" + termText + "\"");
+
+ i++;
+ }
+
+ Assert.AreEqual(tokensToCompare.Length, i,
+ "ShingleFilter outputted wrong # of tokens. (# output = " + i + "; # expected =" +
+ tokensToCompare.Length + ")");
+ }
+
+ #region Nested type: TestTokenStream
+
+ /// <summary>
+ /// A TokenStream that replays a fixed array of Tokens, copying each token's
+ /// term buffer, offsets and position increment into the stream's attributes.
+ /// The type attribute is always reset to TypeAttributeImpl.DEFAULT_TYPE,
+ /// regardless of the source token's type.
+ /// </summary>
+ public sealed class TestTokenStream : TokenStream
+ {
+ private readonly OffsetAttribute _offsetAtt;
+ private readonly PositionIncrementAttribute _posIncrAtt;
+ private readonly TermAttribute _termAtt;
+ private readonly Token[] _testToken;
+ private readonly TypeAttribute _typeAtt;
+ // Index of the next token to emit; never reset, so the stream is single-use.
+ private int _index;
+
+ public TestTokenStream(Token[] testToken)
+ {
+ _testToken = testToken;
+
+ _termAtt = (TermAttribute) AddAttribute(typeof (TermAttribute));
+ _offsetAtt = (OffsetAttribute) AddAttribute(typeof (OffsetAttribute));
+ _posIncrAtt = (PositionIncrementAttribute) AddAttribute(typeof (PositionIncrementAttribute));
+ _typeAtt = (TypeAttribute) AddAttribute(typeof (TypeAttribute));
+ }
+
+ public override bool IncrementToken()
+ {
+ ClearAttributes();
+
+ if (_index >= _testToken.Length)
+ return false;
+
+ Token t = _testToken[_index++];
+
+ _termAtt.SetTermBuffer(t.TermBuffer(), 0, t.TermLength());
+ _offsetAtt.SetOffset(t.StartOffset(), t.EndOffset());
+ _posIncrAtt.SetPositionIncrement(t.GetPositionIncrement());
+ _typeAtt.SetType(TypeAttributeImpl.DEFAULT_TYPE);
+
+ return true;
+ }
+ }
+
+ #endregion
+ }
+}
\ No newline at end of file
Added: incubator/lucene.net/trunk/test/contrib/Analyzers/Shingle/TestShingleMatrixFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/test/contrib/Analyzers/Shingle/TestShingleMatrixFilter.cs?rev=1147514&view=auto
==============================================================================
--- incubator/lucene.net/trunk/test/contrib/Analyzers/Shingle/TestShingleMatrixFilter.cs (added)
+++ incubator/lucene.net/trunk/test/contrib/Analyzers/Shingle/TestShingleMatrixFilter.cs Sun Jul 17 02:46:00 2011
@@ -0,0 +1,594 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.Collections;
+using System.Collections.Generic;
+using System.IO;
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Analyzers.Miscellaneous;
+using Lucene.Net.Analyzers.Payloads;
+using Lucene.Net.Analyzers.Shingle.Codec;
+using Lucene.Net.Analyzers.Shingle.Matrix;
+using NUnit.Framework;
+using FlagsAttribute = Lucene.Net.Analysis.Tokenattributes.FlagsAttribute;
+
+namespace Lucene.Net.Analyzers.Shingle
+{
+ public class TestShingleMatrixFilter : BaseTokenStreamTestCase
+ {
+ public TestShingleMatrixFilter() : this(typeof (TestShingleMatrixFilter).Name)
+ {
+ }
+
+ // use this ctor, because SingleTokenTokenStream only uses next(Token), so exclude it
+ // NOTE(review): the base-class overload presumably takes a map of test names to run
+ // with the legacy next(Token) API excluded — confirm against BaseTokenStreamTestCase.
+ public TestShingleMatrixFilter(String name) :
+ base(
+ name,
+ new Hashtable(
+ new Dictionary<string, string[]>
+ {
+ {
+ "TestShingleMatrixFilter",
+ new[]
+ {
+ "testBehavingAsShingleFilter",
+ "testMatrix",
+ "testIterator"
+ }
+ }
+ }
+ ))
+ {
+ }
+
+ /// <summary>
+ /// Five whitespace tokens shingled 2-2 must yield exactly four bi-grams, and a
+ /// further IncrementToken() after exhaustion must return false (LUCENE-1939).
+ /// </summary>
+ [Test]
+ public void TestIterator()
+ {
+ var wst = new WhitespaceTokenizer(new StringReader("one two three four five"));
+ var smf = new ShingleMatrixFilter(wst, 2, 2, '_', false,
+ new OneDimensionalNonWeightedTokenSettingsCodec());
+
+ int i;
+ for (i = 0; smf.IncrementToken(); i++) { }
+
+ Assert.AreEqual(4, i);
+
+ // call next once more. this should return false again rather than throwing an exception (LUCENE-1939)
+ Assert.IsFalse(smf.IncrementToken());
+
+ //System.DateTime.Now;
+ }
+
+ /// <summary>
+ /// With a one-dimensional codec and sizes 1-2, ShingleMatrixFilter should behave
+ /// like a plain ShingleFilter: an empty stream yields nothing, and a flat token
+ /// stream yields unigrams interleaved with space-joined bi-grams.
+ /// </summary>
+ [Test]
+ public void TestBehavingAsShingleFilter()
+ {
+ ShingleMatrixFilter.DefaultSettingsCodec = null;
+
+ TokenStream ts = new ShingleMatrixFilter(new EmptyTokenStream(), 1, 2, ' ', false,
+ new OneDimensionalNonWeightedTokenSettingsCodec
+ ());
+ Assert.IsFalse(ts.IncrementToken());
+
+ // test a plain old token stream with synonyms translated to rows.
+
+ var tokens = new LinkedList<Token>();
+ tokens.AddLast(CreateToken("please", 0, 6));
+ tokens.AddLast(CreateToken("divide", 7, 13));
+ tokens.AddLast(CreateToken("this", 14, 18));
+ tokens.AddLast(CreateToken("sentence", 19, 27));
+ tokens.AddLast(CreateToken("into", 28, 32));
+ tokens.AddLast(CreateToken("shingles", 33, 39));
+
+ var tls = new TokenListStream(tokens);
+
+ // bi-grams
+
+ ts = new ShingleMatrixFilter(tls, 1, 2, ' ', false, new OneDimensionalNonWeightedTokenSettingsCodec());
+
+ //for (Token token = ts.Next(new Token()); token != null; token = ts.Next(token))
+ //{
+ // Console.Out.WriteLine("AssertNext(ts, \"" + token.Term() + "\", " + token.GetPositionIncrement() + ", " + (token.GetPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.GetPayload().GetData()).ToString()) + "f, " + token.StartOffset() + ", " + token.EndOffset() + ");");
+ // token.Clear();
+ //}
+
+ AssertTokenStreamContents(ts,
+ new[]
+ {
+ "please", "please divide", "divide", "divide this",
+ "this", "this sentence", "sentence", "sentence into", "into",
+ "into shingles", "shingles"
+ },
+ new[] {0, 0, 7, 7, 14, 14, 19, 19, 28, 28, 33},
+ new[] {6, 13, 13, 18, 18, 27, 27, 32, 32, 39, 39});
+ }
+
+
+ /// <summary>
+ /// Extracts a matrix from a token stream and exercises the matrix-based
+ /// shingling: synonym rows, spacer/no-spacer joining, weighted prefix/suffix
+ /// boundary tokens, unlimited shingle sizes, and multi-token synonyms.
+ /// </summary>
+ [Test]
+ public void TestTokenStream()
+ {
+ ShingleMatrixFilter.DefaultSettingsCodec = null;
+ //new ShingleMatrixFilter.SimpleThreeDimensionalTokenSettingsCodec();
+
+ // test a plain old token stream with synonyms translated to rows.
+
+ var tokens = new LinkedList<Token>();
+ tokens.AddLast(TokenFactory("hello", 1, 0, 4));
+ tokens.AddLast(TokenFactory("greetings", 0, 0, 4));
+ tokens.AddLast(TokenFactory("world", 1, 5, 10));
+ tokens.AddLast(TokenFactory("earth", 0, 5, 10));
+ tokens.AddLast(TokenFactory("tellus", 0, 5, 10));
+
+ TokenStream tls = new TokenListStream(tokens);
+
+ // bi-grams
+
+ TokenStream ts = new ShingleMatrixFilter(tls, 2, 2, '_', false,
+ new TwoDimensionalNonWeightedSynonymTokenSettingsCodec());
+
+ AssertNext(ts, "hello_world");
+ AssertNext(ts, "greetings_world");
+ AssertNext(ts, "hello_earth");
+ AssertNext(ts, "greetings_earth");
+ AssertNext(ts, "hello_tellus");
+ AssertNext(ts, "greetings_tellus");
+ Assert.IsFalse(ts.IncrementToken());
+
+ // bi-grams with no spacer character, start offset, end offset
+
+ tls.Reset();
+ ts = new ShingleMatrixFilter(tls, 2, 2, null, false,
+ new TwoDimensionalNonWeightedSynonymTokenSettingsCodec());
+ AssertNext(ts, "helloworld", 0, 10);
+ AssertNext(ts, "greetingsworld", 0, 10);
+ AssertNext(ts, "helloearth", 0, 10);
+ AssertNext(ts, "greetingsearth", 0, 10);
+ AssertNext(ts, "hellotellus", 0, 10);
+ AssertNext(ts, "greetingstellus", 0, 10);
+ Assert.IsFalse(ts.IncrementToken());
+
+
+ // add ^_prefix_and_suffix_$
+ //
+ // using 3d codec as it supports weights
+
+ ShingleMatrixFilter.DefaultSettingsCodec =
+ new SimpleThreeDimensionalTokenSettingsCodec();
+
+ tokens = new LinkedList<Token>();
+ tokens.AddLast(TokenFactory("hello", 1, 1f, 0, 4, TokenPositioner.NewColumn));
+ tokens.AddLast(TokenFactory("greetings", 0, 1f, 0, 4, TokenPositioner.NewRow));
+ tokens.AddLast(TokenFactory("world", 1, 1f, 5, 10, TokenPositioner.NewColumn));
+ tokens.AddLast(TokenFactory("earth", 0, 1f, 5, 10, TokenPositioner.NewRow));
+ tokens.AddLast(TokenFactory("tellus", 0, 1f, 5, 10, TokenPositioner.NewRow));
+
+ tls = new TokenListStream(tokens);
+
+ // bi-grams, position increment, weight, start offset, end offset
+
+ ts = new PrefixAndSuffixAwareTokenFilter(
+ new SingleTokenTokenStream(TokenFactory("^", 1, 100f, 0, 0)),
+ tls,
+ new SingleTokenTokenStream(TokenFactory("$", 1, 50f, 0, 0))
+ );
+ // Cache so the same (prefix + body + suffix) stream can be Reset() and replayed below.
+ tls = new CachingTokenFilter(ts);
+
+ ts = new ShingleMatrixFilter(tls, 2, 2, '_', false);
+
+ //for (Token token = ts.Next(new Token()); token != null; token = ts.Next(token)) {
+ // Console.Out.WriteLine("AssertNext(ts, \"" + token.Term() + "\", " + token.GetPositionIncrement() + ", " + (token.GetPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.GetPayload().GetData()).ToString()) + "f, " + token.StartOffset() + ", " + token.EndOffset() + ");");
+ // token.Clear();
+ //}
+
+ AssertNext(ts, "^_hello", 1, 10.049875f, 0, 4);
+ AssertNext(ts, "^_greetings", 1, 10.049875f, 0, 4);
+ AssertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
+ AssertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10);
+ AssertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
+ AssertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10);
+ AssertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
+ AssertNext(ts, "greetings_tellus", 1, 1.4142135f, 0, 10);
+ AssertNext(ts, "world_$", 1, 7.1414285f, 5, 10);
+ AssertNext(ts, "earth_$", 1, 7.1414285f, 5, 10);
+ AssertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10);
+ Assert.IsFalse(ts.IncrementToken());
+
+ // test unlimited size and allow single boundary token as shingle
+ tls.Reset();
+
+ ts = new ShingleMatrixFilter(tls, 1, Int32.MaxValue, '_', false);
+
+
+ //for (Token token = ts.Next(new Token()); token != null; token = ts.Next(token))
+ //{
+ // Console.Out.WriteLine("AssertNext(ts, \"" + token.Term() + "\", " + token.GetPositionIncrement() + ", " + (token.GetPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.GetPayload().GetData()).ToString()) + "f, " + token.StartOffset() + ", " + token.EndOffset() + ");");
+ // token.Clear();
+ //}
+
+ AssertNext(ts, "^", 1, 10.0f, 0, 0);
+ AssertNext(ts, "^_hello", 1, 10.049875f, 0, 4);
+ AssertNext(ts, "^_hello_world", 1, 10.099504f, 0, 10);
+ AssertNext(ts, "^_hello_world_$", 1, 12.328828f, 0, 10);
+ AssertNext(ts, "hello", 1, 1.0f, 0, 4);
+ AssertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
+ AssertNext(ts, "hello_world_$", 1, 7.2111025f, 0, 10);
+ AssertNext(ts, "world", 1, 1.0f, 5, 10);
+ AssertNext(ts, "world_$", 1, 7.1414285f, 5, 10);
+ AssertNext(ts, "$", 1, 7.071068f, 10, 10);
+ AssertNext(ts, "^_greetings", 1, 10.049875f, 0, 4);
+ AssertNext(ts, "^_greetings_world", 1, 10.099504f, 0, 10);
+ AssertNext(ts, "^_greetings_world_$", 1, 12.328828f, 0, 10);
+ AssertNext(ts, "greetings", 1, 1.0f, 0, 4);
+ AssertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10);
+ AssertNext(ts, "greetings_world_$", 1, 7.2111025f, 0, 10);
+ AssertNext(ts, "^_hello_earth", 1, 10.099504f, 0, 10);
+ AssertNext(ts, "^_hello_earth_$", 1, 12.328828f, 0, 10);
+ AssertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
+ AssertNext(ts, "hello_earth_$", 1, 7.2111025f, 0, 10);
+ AssertNext(ts, "earth", 1, 1.0f, 5, 10);
+ AssertNext(ts, "earth_$", 1, 7.1414285f, 5, 10);
+ AssertNext(ts, "^_greetings_earth", 1, 10.099504f, 0, 10);
+ AssertNext(ts, "^_greetings_earth_$", 1, 12.328828f, 0, 10);
+ AssertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10);
+ AssertNext(ts, "greetings_earth_$", 1, 7.2111025f, 0, 10);
+ AssertNext(ts, "^_hello_tellus", 1, 10.099504f, 0, 10);
+ AssertNext(ts, "^_hello_tellus_$", 1, 12.328828f, 0, 10);
+ AssertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
+ AssertNext(ts, "hello_tellus_$", 1, 7.2111025f, 0, 10);
+ AssertNext(ts, "tellus", 1, 1.0f, 5, 10);
+ AssertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10);
+ AssertNext(ts, "^_greetings_tellus", 1, 10.099504f, 0, 10);
+ AssertNext(ts, "^_greetings_tellus_$", 1, 12.328828f, 0, 10);
+ AssertNext(ts, "greetings_tellus", 1, 1.4142135f, 0, 10);
+ AssertNext(ts, "greetings_tellus_$", 1, 7.2111025f, 0, 10);
+
+ Assert.IsFalse(ts.IncrementToken());
+
+ // test unlimited size but don't allow single boundary token as shingle
+
+ tls.Reset();
+ ts = new ShingleMatrixFilter(tls, 1, Int32.MaxValue, '_', true);
+
+ // for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) {
+ // System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
+ // token.clear();
+ // }
+
+ AssertNext(ts, "^_hello", 1, 10.049875f, 0, 4);
+ AssertNext(ts, "^_hello_world", 1, 10.099504f, 0, 10);
+ AssertNext(ts, "^_hello_world_$", 1, 12.328828f, 0, 10);
+ AssertNext(ts, "hello", 1, 1.0f, 0, 4);
+ AssertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
+ AssertNext(ts, "hello_world_$", 1, 7.2111025f, 0, 10);
+ AssertNext(ts, "world", 1, 1.0f, 5, 10);
+ AssertNext(ts, "world_$", 1, 7.1414285f, 5, 10);
+ AssertNext(ts, "^_greetings", 1, 10.049875f, 0, 4);
+ AssertNext(ts, "^_greetings_world", 1, 10.099504f, 0, 10);
+ AssertNext(ts, "^_greetings_world_$", 1, 12.328828f, 0, 10);
+ AssertNext(ts, "greetings", 1, 1.0f, 0, 4);
+ AssertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10);
+ AssertNext(ts, "greetings_world_$", 1, 7.2111025f, 0, 10);
+ AssertNext(ts, "^_hello_earth", 1, 10.099504f, 0, 10);
+ AssertNext(ts, "^_hello_earth_$", 1, 12.328828f, 0, 10);
+ AssertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
+ AssertNext(ts, "hello_earth_$", 1, 7.2111025f, 0, 10);
+ AssertNext(ts, "earth", 1, 1.0f, 5, 10);
+ AssertNext(ts, "earth_$", 1, 7.1414285f, 5, 10);
+ AssertNext(ts, "^_greetings_earth", 1, 10.099504f, 0, 10);
+ AssertNext(ts, "^_greetings_earth_$", 1, 12.328828f, 0, 10);
+ AssertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10);
+ AssertNext(ts, "greetings_earth_$", 1, 7.2111025f, 0, 10);
+ AssertNext(ts, "^_hello_tellus", 1, 10.099504f, 0, 10);
+ AssertNext(ts, "^_hello_tellus_$", 1, 12.328828f, 0, 10);
+ AssertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
+ AssertNext(ts, "hello_tellus_$", 1, 7.2111025f, 0, 10);
+ AssertNext(ts, "tellus", 1, 1.0f, 5, 10);
+ AssertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10);
+ AssertNext(ts, "^_greetings_tellus", 1, 10.099504f, 0, 10);
+ AssertNext(ts, "^_greetings_tellus_$", 1, 12.328828f, 0, 10);
+ AssertNext(ts, "greetings_tellus", 1, 1.4142135f, 0, 10);
+ AssertNext(ts, "greetings_tellus_$", 1, 7.2111025f, 0, 10);
+
+
+ Assert.IsFalse(ts.IncrementToken());
+
+ //System.currentTimeMillis();
+
+ // multi-token synonyms
+ //
+ // Token[][][] {
+ // {{hello}, {greetings, and, salutations},
+ // {{world}, {earth}, {tellus}}
+ // }
+ //
+
+
+ tokens = new LinkedList<Token>();
+ tokens.AddLast(TokenFactory("hello", 1, 1f, 0, 4, TokenPositioner.NewColumn));
+ tokens.AddLast(TokenFactory("greetings", 1, 1f, 0, 4, TokenPositioner.NewRow));
+ tokens.AddLast(TokenFactory("and", 1, 1f, 0, 4, TokenPositioner.SameRow));
+ tokens.AddLast(TokenFactory("salutations", 1, 1f, 0, 4, TokenPositioner.SameRow));
+ tokens.AddLast(TokenFactory("world", 1, 1f, 5, 10, TokenPositioner.NewColumn));
+ tokens.AddLast(TokenFactory("earth", 1, 1f, 5, 10, TokenPositioner.NewRow));
+ tokens.AddLast(TokenFactory("tellus", 1, 1f, 5, 10, TokenPositioner.NewRow));
+
+ tls = new TokenListStream(tokens);
+
+ // 2-3 grams
+
+ ts = new ShingleMatrixFilter(tls, 2, 3, '_', false);
+
+ // for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) {
+ // System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
+ // token.clear();
+ // }
+
+ // shingle, position increment, weight, start offset, end offset
+
+ AssertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
+ AssertNext(ts, "greetings_and", 1, 1.4142135f, 0, 4);
+ AssertNext(ts, "greetings_and_salutations", 1, 1.7320508f, 0, 4);
+ AssertNext(ts, "and_salutations", 1, 1.4142135f, 0, 4);
+ AssertNext(ts, "and_salutations_world", 1, 1.7320508f, 0, 10);
+ AssertNext(ts, "salutations_world", 1, 1.4142135f, 0, 10);
+ AssertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
+ AssertNext(ts, "and_salutations_earth", 1, 1.7320508f, 0, 10);
+ AssertNext(ts, "salutations_earth", 1, 1.4142135f, 0, 10);
+ AssertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
+ AssertNext(ts, "and_salutations_tellus", 1, 1.7320508f, 0, 10);
+ AssertNext(ts, "salutations_tellus", 1, 1.4142135f, 0, 10);
+
+ Assert.IsFalse(ts.IncrementToken());
+
+ //System.currentTimeMillis();
+ }
+
+ /// <summary>
+ /// Tests creating shingles from a pre-assembled matrix.
+ ///
+ /// Tests the row token z-axis, multi token synonyms.
+ /// </summary>
+ [Test]
+ public void TestMatrix()
+ {
+ // some other tests set this to null.
+ // set it here in case tests are run out of the usual order.
+ ShingleMatrixFilter.DefaultSettingsCodec = new SimpleThreeDimensionalTokenSettingsCodec();
+
+ var matrix = new Matrix.Matrix();
+
+ new Column(TokenFactory("no", 1), matrix);
+ new Column(TokenFactory("surprise", 1), matrix);
+ new Column(TokenFactory("to", 1), matrix);
+ new Column(TokenFactory("see", 1), matrix);
+ new Column(TokenFactory("england", 1), matrix);
+ new Column(TokenFactory("manager", 1), matrix);
+
+ var col = new Column(matrix);
+
+ // sven göran eriksson is a multi token synonym to svennis
+ new Row(col).Tokens.AddLast(TokenFactory("svennis", 1));
+
+ var row = new Row(col);
+ row.Tokens.AddLast(TokenFactory("sven", 1));
+ row.Tokens.AddLast(TokenFactory("göran", 1));
+ row.Tokens.AddLast(TokenFactory("eriksson", 1));
+
+ new Column(TokenFactory("in", 1), matrix);
+ new Column(TokenFactory("the", 1), matrix);
+ new Column(TokenFactory("croud", 1), matrix);
+
+ TokenStream ts = new ShingleMatrixFilter(matrix, 2, 4, '_', true,
+ new SimpleThreeDimensionalTokenSettingsCodec());
+
+ // for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) {
+ // System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
+ // token.clear();
+ // }
+
+ AssertNext(ts, "no_surprise", 1, 1.4142135f, 0, 0);
+ AssertNext(ts, "no_surprise_to", 1, 1.7320508f, 0, 0);
+ AssertNext(ts, "no_surprise_to_see", 1, 2.0f, 0, 0);
+ AssertNext(ts, "surprise_to", 1, 1.4142135f, 0, 0);
+ AssertNext(ts, "surprise_to_see", 1, 1.7320508f, 0, 0);
+ AssertNext(ts, "surprise_to_see_england", 1, 2.0f, 0, 0);
+ AssertNext(ts, "to_see", 1, 1.4142135f, 0, 0);
+ AssertNext(ts, "to_see_england", 1, 1.7320508f, 0, 0);
+ AssertNext(ts, "to_see_england_manager", 1, 2.0f, 0, 0);
+ AssertNext(ts, "see_england", 1, 1.4142135f, 0, 0);
+ AssertNext(ts, "see_england_manager", 1, 1.7320508f, 0, 0);
+ AssertNext(ts, "see_england_manager_svennis", 1, 2.0f, 0, 0);
+ AssertNext(ts, "england_manager", 1, 1.4142135f, 0, 0);
+ AssertNext(ts, "england_manager_svennis", 1, 1.7320508f, 0, 0);
+ AssertNext(ts, "england_manager_svennis_in", 1, 2.0f, 0, 0);
+ AssertNext(ts, "manager_svennis", 1, 1.4142135f, 0, 0);
+ AssertNext(ts, "manager_svennis_in", 1, 1.7320508f, 0, 0);
+ AssertNext(ts, "manager_svennis_in_the", 1, 2.0f, 0, 0);
+ AssertNext(ts, "svennis_in", 1, 1.4142135f, 0, 0);
+ AssertNext(ts, "svennis_in_the", 1, 1.7320508f, 0, 0);
+ AssertNext(ts, "svennis_in_the_croud", 1, 2.0f, 0, 0);
+ AssertNext(ts, "in_the", 1, 1.4142135f, 0, 0);
+ AssertNext(ts, "in_the_croud", 1, 1.7320508f, 0, 0);
+ AssertNext(ts, "the_croud", 1, 1.4142135f, 0, 0);
+ AssertNext(ts, "see_england_manager_sven", 1, 2.0f, 0, 0);
+ AssertNext(ts, "england_manager_sven", 1, 1.7320508f, 0, 0);
+ AssertNext(ts, "england_manager_sven_göran", 1, 2.0f, 0, 0);
+ AssertNext(ts, "manager_sven", 1, 1.4142135f, 0, 0);
+ AssertNext(ts, "manager_sven_göran", 1, 1.7320508f, 0, 0);
+ AssertNext(ts, "manager_sven_göran_eriksson", 1, 2.0f, 0, 0);
+ AssertNext(ts, "sven_göran", 1, 1.4142135f, 0, 0);
+ AssertNext(ts, "sven_göran_eriksson", 1, 1.7320508f, 0, 0);
+ AssertNext(ts, "sven_göran_eriksson_in", 1, 2.0f, 0, 0);
+ AssertNext(ts, "göran_eriksson", 1, 1.4142135f, 0, 0);
+ AssertNext(ts, "göran_eriksson_in", 1, 1.7320508f, 0, 0);
+ AssertNext(ts, "göran_eriksson_in_the", 1, 2.0f, 0, 0);
+ AssertNext(ts, "eriksson_in", 1, 1.4142135f, 0, 0);
+ AssertNext(ts, "eriksson_in_the", 1, 1.7320508f, 0, 0);
+ AssertNext(ts, "eriksson_in_the_croud", 1, 2.0f, 0, 0);
+
+ Assert.IsFalse(ts.IncrementToken());
+ }
+
+ // Builds a token with the given term text, position increment and offsets.
+ // NOTE: unlike the float-weight overloads below, no weight is encoded.
+ private static Token TokenFactory(String text, int posIncr, int startOffset, int endOffset)
+ {
+ var t = new Token(startOffset, endOffset);
+ t.SetPositionIncrement(posIncr);
+ t.SetTermBuffer(text);
+ return t;
+ }
+
+ // Convenience overload: unit weight (1.0f) and zero start/end offsets.
+ private static Token TokenFactory(String text, int posIncr)
+ {
+ const float unitWeight = 1f;
+ return TokenFactory(text, posIncr, unitWeight, 0, 0);
+ }
+
+ // Builds a token and encodes the given weight into it through the
+ // currently installed default settings codec.
+ private static Token TokenFactory(String text, int posIncr, float weight, int startOffset, int endOffset)
+ {
+ var t = new Token(startOffset, endOffset);
+ t.SetPositionIncrement(posIncr);
+ t.SetTermBuffer(text);
+
+ ShingleMatrixFilter.DefaultSettingsCodec.SetWeight(t, weight);
+
+ return t;
+ }
+
+ // Builds a token carrying both a codec-encoded weight and an explicit
+ // token positioner (used by the matrix z-axis tests).
+ private static Token TokenFactory(String text, int posIncr, float weight, int startOffset, int endOffset,
+ TokenPositioner positioner)
+ {
+ var t = new Token(startOffset, endOffset);
+ t.SetPositionIncrement(posIncr);
+ t.SetTermBuffer(text);
+
+ var codec = ShingleMatrixFilter.DefaultSettingsCodec;
+ codec.SetWeight(t, weight);
+ codec.SetTokenPositioner(t, positioner);
+
+ return t;
+ }
+
+ // Assert helper methods start here.
+
+ // Advances the stream by one token and checks only its term text.
+ private static void AssertNext(TokenStream ts, String text)
+ {
+ var term = (TermAttribute) ts.AddAttribute(typeof (TermAttribute));
+
+ Assert.IsTrue(ts.IncrementToken());
+ Assert.AreEqual(text, term.Term());
+ }
+
+ // Advances the stream by one token and checks term text, position
+ // increment, payload-encoded boost and offsets.
+ private static void AssertNext(TokenStream ts, String text, int positionIncrement, float boost, int startOffset,
+ int endOffset)
+ {
+ var term = (TermAttribute) ts.AddAttribute(typeof (TermAttribute));
+ var posIncr = (PositionIncrementAttribute) ts.AddAttribute(typeof (PositionIncrementAttribute));
+ var payload = (PayloadAttribute) ts.AddAttribute(typeof (PayloadAttribute));
+ var offset = (OffsetAttribute) ts.AddAttribute(typeof (OffsetAttribute));
+
+ Assert.IsTrue(ts.IncrementToken());
+ Assert.AreEqual(text, term.Term());
+ Assert.AreEqual(positionIncrement, posIncr.GetPositionIncrement());
+
+ // A missing payload represents the default weight of 1.0f.
+ var actualBoost = payload.GetPayload() == null
+ ? 1f
+ : PayloadHelper.DecodeFloat(payload.GetPayload().GetData());
+ Assert.AreEqual(boost, actualBoost, 0);
+
+ Assert.AreEqual(startOffset, offset.StartOffset());
+ Assert.AreEqual(endOffset, offset.EndOffset());
+ }
+
+ // Advances the stream by one token and checks term text plus offsets.
+ private static void AssertNext(TokenStream ts, String text, int startOffset, int endOffset)
+ {
+ var term = (TermAttribute) ts.AddAttribute(typeof (TermAttribute));
+ var offset = (OffsetAttribute) ts.AddAttribute(typeof (OffsetAttribute));
+
+ Assert.IsTrue(ts.IncrementToken());
+ Assert.AreEqual(text, term.Term());
+ Assert.AreEqual(startOffset, offset.StartOffset());
+ Assert.AreEqual(endOffset, offset.EndOffset());
+ }
+
+ // Builds a Token with the given term text; 'start' is the start offset
+ // and 'offset' the end offset (parameter names kept for compatibility).
+ private static Token CreateToken(String term, int start, int offset)
+ {
+ var t = new Token(start, offset);
+ t.SetTermBuffer(term);
+ return t;
+ }
+
+ #region Nested type: TokenListStream
+
+ /// <summary>
+ /// A TokenStream that replays a fixed collection of Token prototypes,
+ /// copying each prototype's state into the stream's attributes on every
+ /// call to IncrementToken.
+ /// </summary>
+ /// NOTE(review): relies on ICollection&lt;Token&gt;/IEnumerator&lt;Token&gt;;
+ /// confirm the file imports System.Collections.Generic at the top.
+ public sealed class TokenListStream : TokenStream
+ {
+ private readonly FlagsAttribute _flagsAtt;
+ private readonly OffsetAttribute _offsetAtt;
+ private readonly PayloadAttribute _payloadAtt;
+ private readonly PositionIncrementAttribute _posIncrAtt;
+ private readonly TermAttribute _termAtt;
+ private readonly ICollection<Token> _tokens;
+ private readonly TypeAttribute _typeAtt;
+
+ // Lazily created on the first IncrementToken call; Reset() drops it.
+ private IEnumerator<Token> _enumerator;
+
+ public TokenListStream(ICollection<Token> tokens)
+ {
+ _tokens = tokens;
+ _termAtt = (TermAttribute) AddAttribute(typeof (TermAttribute));
+ _posIncrAtt = (PositionIncrementAttribute) AddAttribute(typeof (PositionIncrementAttribute));
+ _payloadAtt = (PayloadAttribute) AddAttribute(typeof (PayloadAttribute));
+ _offsetAtt = (OffsetAttribute) AddAttribute(typeof (OffsetAttribute));
+ _typeAtt = (TypeAttribute) AddAttribute(typeof (TypeAttribute));
+ _flagsAtt = (FlagsAttribute) AddAttribute(typeof (FlagsAttribute));
+ }
+
+ public override bool IncrementToken()
+ {
+ if (_enumerator == null)
+ {
+ _enumerator = _tokens.GetEnumerator();
+ }
+
+ if (!_enumerator.MoveNext())
+ {
+ return false;
+ }
+
+ var source = _enumerator.Current;
+
+ // Copy the prototype token's full state into this stream's attributes.
+ ClearAttributes();
+ _termAtt.SetTermBuffer(source.TermBuffer(), 0, source.TermLength());
+ _posIncrAtt.SetPositionIncrement(source.GetPositionIncrement());
+ _flagsAtt.SetFlags(source.GetFlags());
+ _offsetAtt.SetOffset(source.StartOffset(), source.EndOffset());
+ _typeAtt.SetType(source.Type());
+ _payloadAtt.SetPayload(source.GetPayload());
+
+ return true;
+ }
+
+ public override void Reset()
+ {
+ // Dropping the enumerator restarts iteration from the beginning.
+ _enumerator = null;
+ }
+ }
+
+ #endregion
+ }
+}
\ No newline at end of file