Posted to commits@lucenenet.apache.org by sy...@apache.org on 2014/04/08 00:08:29 UTC
[2/3] git commit: Ported tests/MockTokenizer
Ported tests/MockTokenizer
Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/6e9d73f4
Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/6e9d73f4
Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/6e9d73f4
Branch: refs/heads/branch_4x
Commit: 6e9d73f4ac8bcbc1b0ae23dc4c32e5ca249c5be8
Parents: 5ecbe92
Author: synhershko <it...@code972.com>
Authored: Tue Apr 8 01:07:44 2014 +0300
Committer: synhershko <it...@code972.com>
Committed: Tue Apr 8 01:07:44 2014 +0300
----------------------------------------------------------------------
test/test-framework/Analysis/MockTokenizer.cs | 285 +++++++++++++++++++
.../Lucene.Net.TestFramework.csproj | 3 +-
2 files changed, 287 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/6e9d73f4/test/test-framework/Analysis/MockTokenizer.cs
----------------------------------------------------------------------
diff --git a/test/test-framework/Analysis/MockTokenizer.cs b/test/test-framework/Analysis/MockTokenizer.cs
new file mode 100644
index 0000000..0cd2942
--- /dev/null
+++ b/test/test-framework/Analysis/MockTokenizer.cs
@@ -0,0 +1,285 @@
+using System;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Randomized;
+using Lucene.Net.Support;
+using Lucene.Net.Util;
+using Lucene.Net.Util.Automaton;
+
+namespace Lucene.Net.Analysis
+{
+ /**
+ * Tokenizer for testing.
+ * <p>
+ * This tokenizer is a replacement for {@link #WHITESPACE}, {@link #SIMPLE}, and {@link #KEYWORD}
+ * tokenizers. If you are writing a component such as a TokenFilter, it's a great idea to test
+ * it by wrapping this tokenizer instead, for the extra checks. This tokenizer has the following behavior:
+ * <ul>
+ * <li>An internal state-machine is used for checking consumer consistency. These checks can
+ * be disabled with {@link #setEnableChecks(boolean)}.
+ * <li>For convenience, optionally lowercases terms that it outputs.
+ * </ul>
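+ * <p>
+ * Typical usage (an illustrative sketch, not taken from the original Java class):
+ * <pre>
+ * var ts = new MockTokenizer(new StringReader("foo bar"), MockTokenizer.WHITESPACE, false);
+ * var term = ts.AddAttribute&lt;CharTermAttribute&gt;();
+ * ts.Reset();
+ * while (ts.IncrementToken())
+ *     Console.WriteLine(term.ToString());
+ * ts.End();
+ * ts.Dispose();
+ * </pre>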
+ */
+ public class MockTokenizer : Tokenizer
+ {
+ /** Acts like WhitespaceTokenizer. */
+ public static readonly CharacterRunAutomaton WHITESPACE =
+ new CharacterRunAutomaton(new RegExp("[^ \t\r\n]+").ToAutomaton());
+ /** Acts like KeywordTokenizer.
+ * TODO: Keyword returns an "empty" token for an empty reader...
+ */
+ public static readonly CharacterRunAutomaton KEYWORD =
+ new CharacterRunAutomaton(new RegExp(".*").ToAutomaton());
+ /** Acts like LetterTokenizer. */
+ // the ugly regex below is an incomplete approximation of Unicode 5.2 [:Letter:]
+ public static readonly CharacterRunAutomaton SIMPLE =
+ new CharacterRunAutomaton(new RegExp("[A-Za-zªµºÀ-ÖØ-öø-ˁ一-鿌]+").ToAutomaton());
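+ // A test can define its own token shape the same way (illustrative sketch, not part
+ // of the original Java class; 'reader' stands in for any open System.IO.TextReader):
+ //
+ //   var digitsOnly = new CharacterRunAutomaton(new RegExp("[0-9]+").ToAutomaton());
+ //   var tokenizer = new MockTokenizer(reader, digitsOnly, false);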
+
+ private readonly CharacterRunAutomaton runAutomaton;
+ private readonly bool lowerCase;
+ private readonly int maxTokenLength;
+ public const int DEFAULT_MAX_TOKEN_LENGTH = int.MaxValue;
+ private int state;
+
+ private readonly CharTermAttribute termAtt;
+ private readonly OffsetAttribute offsetAtt;
+ private int off = 0;
+
+ // TODO: "register" with LuceneTestCase to ensure all streams are closed() ?
+ // currently, we can only check that the lifecycle is correct if someone is reusing,
+ // but not for "one-offs".
+ private enum State
+ {
+ SETREADER, // consumer set a reader input either via ctor or via reset(Reader)
+ RESET, // consumer has called reset()
+ INCREMENT, // consumer is consuming, has called incrementToken() == true
+ INCREMENT_FALSE, // consumer has called incrementToken() which returned false
+ END, // consumer has called end() to perform end of stream operations
+ CLOSE // consumer has called close() to release any resources
+ };
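+ // For a single pass over the input the expected order is
+ // SETREADER -> RESET -> INCREMENT* -> INCREMENT_FALSE -> END -> CLOSE;
+ // a reused stream then starts over at SETREADER. The (currently commented-out)
+ // asserts below verify this order whenever enableChecks is true.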
+
+ private State streamState = State.CLOSE;
+ private int lastOffset = 0; // only for asserting
+ private bool enableChecks = true;
+
+ // evil, but this random doesn't change behavior: it only switches up how we read (see ReadChar)
+ private Random random = new Random(/*RandomizedContext.Current.getRandom().nextLong()*/);
+
+ public MockTokenizer(AttributeSource.AttributeFactory factory, System.IO.TextReader input, CharacterRunAutomaton runAutomaton, bool lowerCase, int maxTokenLength)
+ : base(factory, input)
+ {
+ this.runAutomaton = runAutomaton;
+ this.lowerCase = lowerCase;
+ this.state = runAutomaton.InitialState;
+ this.streamState = State.SETREADER;
+ this.maxTokenLength = maxTokenLength;
+
+ termAtt = AddAttribute<CharTermAttribute>();
+ offsetAtt = AddAttribute<OffsetAttribute>();
+ }
+
+ public MockTokenizer(System.IO.TextReader input, CharacterRunAutomaton runAutomaton, bool lowerCase, int maxTokenLength) :
+ this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, input, runAutomaton, lowerCase, maxTokenLength)
+ {
+ }
+
+ public MockTokenizer(System.IO.TextReader input, CharacterRunAutomaton runAutomaton, bool lowerCase) :
+ this(input, runAutomaton, lowerCase, DEFAULT_MAX_TOKEN_LENGTH)
+ {
+ }
+ /** Calls {@link #MockTokenizer(Reader, CharacterRunAutomaton, boolean) MockTokenizer(Reader, WHITESPACE, true)} */
+ public MockTokenizer(System.IO.TextReader input) :
+ this(input, WHITESPACE, true)
+ {
+ }
+
+ public MockTokenizer(AttributeFactory factory, System.IO.TextReader input, CharacterRunAutomaton runAutomaton, bool lowerCase) :
+ this(factory, input, runAutomaton, lowerCase, DEFAULT_MAX_TOKEN_LENGTH)
+ {
+ }
+
+ /** Calls {@link #MockTokenizer(org.apache.lucene.util.AttributeSource.AttributeFactory,Reader,CharacterRunAutomaton,boolean)
+ * MockTokenizer(AttributeFactory, Reader, WHITESPACE, true)} */
+ public MockTokenizer(AttributeFactory factory, System.IO.TextReader input) :
+ this(factory, input, WHITESPACE, true) // forward the factory; the original port dropped it here
+ {
+ }
+
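+ // Scans the input one code point at a time: characters the automaton rejects are
+ // skipped, and each maximal run of accepted characters (normalized, and capped at
+ // maxTokenLength) becomes the next token, with corrected start/end offsets.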
+ public override bool IncrementToken()
+ {
+ // assert !enableChecks || (streamState == State.RESET || streamState == State.INCREMENT)
+ // : "incrementToken() called while in wrong state: " + streamState;
+ ClearAttributes();
+ for (; ; )
+ {
+ int startOffset = off;
+ int cp = readCodePoint();
+ if (cp < 0)
+ {
+ break;
+ }
+ else if (isTokenChar(cp))
+ {
+ int endOffset;
+ do
+ {
+ char[] chars = Character.ToChars(Normalize(cp));
+ for (int i = 0; i < chars.Length; i++)
+ termAtt.Append(chars[i]);
+ endOffset = off;
+ if (termAtt.Length >= maxTokenLength)
+ {
+ break;
+ }
+ cp = readCodePoint();
+ } while (cp >= 0 && isTokenChar(cp));
+
+ int correctedStartOffset = CorrectOffset(startOffset);
+ int correctedEndOffset = CorrectOffset(endOffset);
+ // assert correctedStartOffset >= 0;
+ // assert correctedEndOffset >= 0;
+ // assert correctedStartOffset >= lastOffset;
+ lastOffset = correctedStartOffset;
+ // assert correctedEndOffset >= correctedStartOffset;
+ offsetAtt.SetOffset(correctedStartOffset, correctedEndOffset);
+ streamState = State.INCREMENT;
+ return true;
+ }
+ }
+ streamState = State.INCREMENT_FALSE;
+ return false;
+ }
+
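+ // Reads one code point, reassembling surrogate pairs: e.g. U+1F600 arrives from
+ // the reader as the two chars '\uD83D' and '\uDE00' and is returned as a single int.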
+ protected int readCodePoint()
+ {
+ int ch = ReadChar();
+ if (ch < 0)
+ {
+ return ch;
+ }
+ else
+ {
+ //assert !Character.isLowSurrogate((char) ch) : "unpaired low surrogate: " + Integer.toHexString(ch);
+ off++;
+ if (Character.IsHighSurrogate((char)ch))
+ {
+ int ch2 = ReadChar();
+ if (ch2 >= 0)
+ {
+ off++;
+ //assert Character.isLowSurrogate((char) ch2) : "unpaired high surrogate: " + Integer.toHexString(ch) + ", followed by: " + Integer.toHexString(ch2);
+ return Character.ToCodePoint((char)ch, (char)ch2);
+ }
+ else
+ {
+ //assert false : "stream ends with unpaired high surrogate: " + Integer.toHexString(ch);
+ }
+ }
+ return ch;
+ }
+ }
+
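+ // Randomly varies which TextReader overload fetches each character, so wrapped
+ // readers are exercised through more than one code path; the characters returned
+ // are identical no matter which branch runs.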
+ protected int ReadChar()
+ {
+ switch (random.Next(0, 10))
+ {
+ case 0:
+ {
+ // read(char[])
+ char[] c = new char[1];
+ // NOTE: unlike Java's Reader, .NET's TextReader.Read(char[], int, int)
+ // returns 0 (not -1) at end of stream, so map that case to -1 here
+ int ret = input.Read(c, 0, c.Length);
+ return ret <= 0 ? -1 : c[0];
+ }
+ case 1:
+ {
+ // read(char[], int, int)
+ char[] c = new char[2];
+ int ret = input.Read(c, 1, 1);
+ return ret <= 0 ? -1 : c[1];
+ }
+ // case 2: {
+ // // read(CharBuffer)
+ // char[] c = new char[1];
+ // CharBuffer cb = CharBuffer.wrap(c);
+ // int ret = input.Read(cb);
+ // return ret < 0 ? ret : c[0];
+ // }
+ default:
+ // read()
+ return input.Read();
+ }
+ }
+
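+ // Steps the run automaton by one code point; a rejecting (negative) state marks a
+ // token break and resets the automaton to its initial state for the next token.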
+ protected bool isTokenChar(int c)
+ {
+ state = runAutomaton.Step(state, c);
+ if (state < 0)
+ {
+ state = runAutomaton.InitialState;
+ return false;
+ }
+ else
+ {
+ return true;
+ }
+ }
+
+ protected int Normalize(int c)
+ {
+ return lowerCase ? Character.ToLowerCase(c) : c;
+ }
+
+ public override void Reset()
+ {
+ base.Reset();
+ state = runAutomaton.InitialState;
+ lastOffset = off = 0;
+ //assert !enableChecks || streamState != State.RESET : "double reset()";
+ streamState = State.RESET;
+ }
+
+ protected override void Dispose(bool disposing) // override, not virtual: base.Dispose(bool) is the virtual member
+ {
+ base.Dispose(disposing);
+ // in some exceptional cases (e.g. TestIndexWriterExceptions) a test can prematurely close()
+ // these tests should disable this check, by default we check the normal workflow.
+ // TODO: investigate the CachingTokenFilter "double-close"... for now we ignore this
+ //assert !enableChecks || streamState == State.END || streamState == State.CLOSE : "close() called in wrong state: " + streamState;
+ streamState = State.CLOSE;
+ }
+
+ bool setReaderTestPoint()
+ {
+ //assert !enableChecks || streamState == State.CLOSE : "setReader() called in wrong state: " + streamState;
+ streamState = State.SETREADER;
+ return true;
+ }
+
+ public override void End()
+ {
+ int finalOffset = CorrectOffset(off);
+ offsetAtt.SetOffset(finalOffset, finalOffset);
+ // some tokenizers, such as limiting tokenizers, call end() before incrementToken() returns false.
+ // these tests should disable this check (in general you should consume the entire stream)
+ try
+ {
+ //assert !enableChecks || streamState == State.INCREMENT_FALSE : "end() called before incrementToken() returned false!";
+ }
+ finally
+ {
+ streamState = State.END;
+ }
+ }
+
+ /**
+ * Toggle consumer workflow checking: if your test consumes tokenstreams normally you
+ * should leave this enabled.
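+ * <p>
+ * For example, a test that deliberately closes or ends the stream early (see the
+ * notes in Dispose(bool) and End()) would call setEnableChecks(false) first.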
+ */
+ public void setEnableChecks(bool enableChecks)
+ {
+ this.enableChecks = enableChecks;
+ }
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/6e9d73f4/test/test-framework/Lucene.Net.TestFramework.csproj
----------------------------------------------------------------------
diff --git a/test/test-framework/Lucene.Net.TestFramework.csproj b/test/test-framework/Lucene.Net.TestFramework.csproj
index 14d381f..6b31aaa 100644
--- a/test/test-framework/Lucene.Net.TestFramework.csproj
+++ b/test/test-framework/Lucene.Net.TestFramework.csproj
@@ -56,6 +56,7 @@
<Reference Include="System.Xml" />
</ItemGroup>
<ItemGroup>
+ <Compile Include="Analysis\MockTokenizer.cs" />
<Compile Include="JavaCompatibility\LuceneTestCase.cs" />
<Compile Include="JavaCompatibility\LuceneTypesHelpers.cs" />
<Compile Include="JavaCompatibility\SystemTypesHelpers.cs" />
@@ -85,7 +86,7 @@
</ProjectReference>
</ItemGroup>
<ItemGroup>
- <Folder Include="Analysis\" />
+ <Folder Include="Index\" />
</ItemGroup>
<ItemGroup>
<None Include="Lucene.Net.snk" />