Posted to commits@lucenenet.apache.org by ar...@apache.org on 2008/06/25 04:53:12 UTC
svn commit: r671406 [2/3] - in
/incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis: ./ Standard/
Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/StandardTokenizer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/Standard/StandardTokenizer.cs?rev=671406&r1=671405&r2=671406&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/StandardTokenizer.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/StandardTokenizer.cs Tue Jun 24 19:53:11 2008
@@ -15,273 +15,194 @@
* limitations under the License.
*/
-/* Generated By:JavaCC: Do not edit this line. StandardTokenizer.java */
using System;
+using Token = Lucene.Net.Analysis.Token;
+using Tokenizer = Lucene.Net.Analysis.Tokenizer;
+
namespace Lucene.Net.Analysis.Standard
{
- /// <summary>A grammar-based tokenizer constructed with JavaCC.
- ///
- /// <p> This should be a good tokenizer for most European-language documents:
- ///
- /// <ul>
- /// <li>Splits words at punctuation characters, removing punctuation. However, a
- /// dot that's not followed by whitespace is considered part of a token.
- /// <li>Splits words at hyphens, unless there's a number in the token, in which case
- /// the whole token is interpreted as a product number and is not split.
- /// <li>Recognizes email addresses and internet hostnames as one token.
- /// </ul>
- ///
- /// <p>Many applications have specific tokenizer needs. If this tokenizer does
- /// not suit your application, please consider copying this source code
- /// directory to your project and maintaining your own grammar-based tokenizer.
- /// </summary>
- public class StandardTokenizer : Lucene.Net.Analysis.Tokenizer
- {
-
- /// <summary>Constructs a tokenizer for this Reader. </summary>
- public StandardTokenizer(System.IO.TextReader reader) : this(new FastCharStream(reader))
- {
- this.input = reader;
- }
-
- /// <summary>Returns the next token in the stream, or null at EOS.
- /// <p>The returned token's type is set to an element of {@link
- /// StandardTokenizerConstants#tokenImage}.
- /// </summary>
- public override Lucene.Net.Analysis.Token Next()
- {
- Token token = null;
- switch ((jj_ntk == - 1) ? Jj_ntk() : jj_ntk)
- {
-
- case Lucene.Net.Analysis.Standard.StandardTokenizerConstants.ALPHANUM:
- token = Jj_consume_token(Lucene.Net.Analysis.Standard.StandardTokenizerConstants.ALPHANUM);
- break;
-
- case Lucene.Net.Analysis.Standard.StandardTokenizerConstants.APOSTROPHE:
- token = Jj_consume_token(Lucene.Net.Analysis.Standard.StandardTokenizerConstants.APOSTROPHE);
- break;
-
- case Lucene.Net.Analysis.Standard.StandardTokenizerConstants.ACRONYM:
- token = Jj_consume_token(Lucene.Net.Analysis.Standard.StandardTokenizerConstants.ACRONYM);
- break;
-
- case Lucene.Net.Analysis.Standard.StandardTokenizerConstants.COMPANY:
- token = Jj_consume_token(Lucene.Net.Analysis.Standard.StandardTokenizerConstants.COMPANY);
- break;
-
- case Lucene.Net.Analysis.Standard.StandardTokenizerConstants.EMAIL:
- token = Jj_consume_token(Lucene.Net.Analysis.Standard.StandardTokenizerConstants.EMAIL);
- break;
-
- case Lucene.Net.Analysis.Standard.StandardTokenizerConstants.HOST:
- token = Jj_consume_token(Lucene.Net.Analysis.Standard.StandardTokenizerConstants.HOST);
- break;
-
- case Lucene.Net.Analysis.Standard.StandardTokenizerConstants.NUM:
- token = Jj_consume_token(Lucene.Net.Analysis.Standard.StandardTokenizerConstants.NUM);
- break;
-
- case Lucene.Net.Analysis.Standard.StandardTokenizerConstants.CJ:
- token = Jj_consume_token(Lucene.Net.Analysis.Standard.StandardTokenizerConstants.CJ);
- break;
-
- case 0:
- token = Jj_consume_token(0);
- break;
-
- default:
- jj_la1[0] = jj_gen;
- Jj_consume_token(- 1);
- throw new ParseException();
-
- }
- if (token.kind == Lucene.Net.Analysis.Standard.StandardTokenizerConstants.EOF)
- {
- {
- if (true)
- return null;
- }
- }
- else
- {
- {
- if (true)
- return new Lucene.Net.Analysis.Token(token.image, token.beginColumn, token.endColumn, Lucene.Net.Analysis.Standard.StandardTokenizerConstants.tokenImage[token.kind]);
- }
- }
- throw new System.ApplicationException("Missing return statement in function");
- }
-
- /// <summary>By default, closes the input Reader. </summary>
- public override void Close()
- {
- token_source.Close();
- base.Close();
- }
-
- public StandardTokenizerTokenManager token_source;
- public Token token, jj_nt;
- private int jj_ntk;
- private int jj_gen;
- private int[] jj_la1 = new int[1];
- private static int[] jj_la1_0_Renamed_Field;
- private static void jj_la1_0()
- {
- jj_la1_0_Renamed_Field = new int[]{0x10ff};
- }
-
- public StandardTokenizer(CharStream stream)
- {
- token_source = new StandardTokenizerTokenManager(stream);
- token = new Token();
- jj_ntk = - 1;
- jj_gen = 0;
- for (int i = 0; i < 1; i++)
- jj_la1[i] = - 1;
- }
-
- public virtual void ReInit(CharStream stream)
- {
- token_source.ReInit(stream);
- token = new Token();
- jj_ntk = - 1;
- jj_gen = 0;
- for (int i = 0; i < 1; i++)
- jj_la1[i] = - 1;
- }
-
- public StandardTokenizer(StandardTokenizerTokenManager tm)
- {
- token_source = tm;
- token = new Token();
- jj_ntk = - 1;
- jj_gen = 0;
- for (int i = 0; i < 1; i++)
- jj_la1[i] = - 1;
- }
-
- public virtual void ReInit(StandardTokenizerTokenManager tm)
- {
- token_source = tm;
- token = new Token();
- jj_ntk = - 1;
- jj_gen = 0;
- for (int i = 0; i < 1; i++)
- jj_la1[i] = - 1;
- }
-
- private Token Jj_consume_token(int kind)
- {
- Token oldToken;
- if ((oldToken = token).next != null)
- token = token.next;
- else
- token = token.next = token_source.GetNextToken();
- jj_ntk = - 1;
- if (token.kind == kind)
- {
- jj_gen++;
- return token;
- }
- token = oldToken;
- jj_kind = kind;
- throw GenerateParseException();
- }
-
- public Token GetNextToken()
- {
- if (token.next != null)
- token = token.next;
- else
- token = token.next = token_source.GetNextToken();
- jj_ntk = - 1;
- jj_gen++;
- return token;
- }
-
- public Token GetToken(int index)
- {
- Token t = token;
- for (int i = 0; i < index; i++)
- {
- if (t.next != null)
- t = t.next;
- else
- t = t.next = token_source.GetNextToken();
- }
- return t;
- }
-
- private int Jj_ntk()
- {
- if ((jj_nt = token.next) == null)
- return (jj_ntk = (token.next = token_source.GetNextToken()).kind);
- else
- return (jj_ntk = jj_nt.kind);
- }
-
- private System.Collections.ArrayList jj_expentries = System.Collections.ArrayList.Synchronized(new System.Collections.ArrayList(10));
- private int[] jj_expentry;
- private int jj_kind = - 1;
-
- public virtual ParseException GenerateParseException()
- {
- jj_expentries.Clear();
- bool[] la1tokens = new bool[16];
- for (int i = 0; i < 16; i++)
- {
- la1tokens[i] = false;
- }
- if (jj_kind >= 0)
- {
- la1tokens[jj_kind] = true;
- jj_kind = - 1;
- }
- for (int i = 0; i < 1; i++)
- {
- if (jj_la1[i] == jj_gen)
- {
- for (int j = 0; j < 32; j++)
- {
- if ((jj_la1_0_Renamed_Field[i] & (1 << j)) != 0)
- {
- la1tokens[j] = true;
- }
- }
- }
- }
- for (int i = 0; i < 16; i++)
- {
- if (la1tokens[i])
- {
- jj_expentry = new int[1];
- jj_expentry[0] = i;
- jj_expentries.Add(jj_expentry);
- }
- }
- int[][] exptokseq = new int[jj_expentries.Count][];
- for (int i = 0; i < jj_expentries.Count; i++)
- {
- exptokseq[i] = (int[]) jj_expentries[i];
- }
- return new ParseException(token, exptokseq, Lucene.Net.Analysis.Standard.StandardTokenizerConstants.tokenImage);
- }
-
- public void Enable_tracing()
- {
- }
-
- public void Disable_tracing()
- {
- }
- static StandardTokenizer()
- {
- {
- jj_la1_0();
- }
- }
- }
+	/// <summary>A grammar-based tokenizer constructed with JFlex.
+ ///
+ /// <p> This should be a good tokenizer for most European-language documents:
+ ///
+ /// <ul>
+ /// <li>Splits words at punctuation characters, removing punctuation. However, a
+ /// dot that's not followed by whitespace is considered part of a token.
+ /// <li>Splits words at hyphens, unless there's a number in the token, in which case
+ /// the whole token is interpreted as a product number and is not split.
+ /// <li>Recognizes email addresses and internet hostnames as one token.
+ /// </ul>
+ ///
+ /// <p>Many applications have specific tokenizer needs. If this tokenizer does
+ /// not suit your application, please consider copying this source code
+ /// directory to your project and maintaining your own grammar-based tokenizer.
+ /// </summary>
+
+ public class StandardTokenizer : Tokenizer
+ {
+ private void InitBlock()
+ {
+ maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
+ }
+ /// <summary>A private instance of the JFlex-constructed scanner </summary>
+ private StandardTokenizerImpl scanner;
+
+ /// <summary> Specifies whether deprecated acronyms should be replaced with HOST type.
+ /// This is false by default to support backward compatibility.
+ /// <p/>
+ /// See http://issues.apache.org/jira/browse/LUCENE-1068
+ ///
+ /// </summary>
+ /// <deprecated> this should be removed in the next release (3.0).
+ /// </deprecated>
+ private bool replaceInvalidAcronym = false;
+
+ internal virtual void SetInput(System.IO.TextReader reader)
+ {
+ this.input = reader;
+ }
+
+ private int maxTokenLength;
+
+ /// <summary>Set the max allowed token length. Any token longer
+ /// than this is skipped.
+ /// </summary>
+ public virtual void SetMaxTokenLength(int length)
+ {
+ this.maxTokenLength = length;
+ }
+
+ /// <seealso cref="setMaxTokenLength">
+ /// </seealso>
+ public virtual int GetMaxTokenLength()
+ {
+ return maxTokenLength;
+ }
+
+ /// <summary> Creates a new instance of the {@link StandardTokenizer}. Attaches the
+ /// <code>input</code> to a newly created JFlex scanner.
+ /// </summary>
+ public StandardTokenizer(System.IO.TextReader input)
+ {
+ InitBlock();
+ this.input = input;
+ this.scanner = new StandardTokenizerImpl(input);
+ }
+
+ /// <summary> Creates a new instance of the {@link Lucene.Net.Analysis.Standard.StandardTokenizer}. Attaches
+ /// the <code>input</code> to the newly created JFlex scanner.
+ ///
+ /// </summary>
+ /// <param name="input">The input reader
+ /// </param>
+ /// <param name="replaceInvalidAcronym">Set to true to replace mischaracterized acronyms with HOST.
+ ///
+ /// See http://issues.apache.org/jira/browse/LUCENE-1068
+ /// </param>
+ public StandardTokenizer(System.IO.TextReader input, bool replaceInvalidAcronym)
+ {
+ InitBlock();
+ this.replaceInvalidAcronym = replaceInvalidAcronym;
+ this.input = input;
+ this.scanner = new StandardTokenizerImpl(input);
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see Lucene.Net.Analysis.TokenStream#next()
+ */
+ public override Token Next(Token result)
+ {
+ int posIncr = 1;
+
+ while (true)
+ {
+ int tokenType = scanner.GetNextToken();
+
+ if (tokenType == StandardTokenizerImpl.YYEOF)
+ {
+ return null;
+ }
+
+ if (scanner.Yylength() <= maxTokenLength)
+ {
+ result.Clear();
+ result.SetPositionIncrement(posIncr);
+ scanner.GetText(result);
+ int start = scanner.Yychar();
+ result.SetStartOffset(start);
+ result.SetEndOffset(start + result.TermLength());
+ // This 'if' should be removed in the next release. For now, it converts
+ // invalid acronyms to HOST. When removed, only the 'else' part should
+ // remain.
+ if (tokenType == StandardTokenizerImpl.ACRONYM_DEP)
+ {
+ if (replaceInvalidAcronym)
+ {
+ result.SetType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]);
+ result.SetTermLength(result.TermLength() - 1); // remove extra '.'
+ }
+ else
+ {
+ result.SetType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]);
+ }
+ }
+ else
+ {
+ result.SetType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
+ }
+ return result;
+ }
+ // When we skip a too-long term, we still increment the
+ // position increment
+ else
+ posIncr++;
+ }
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see Lucene.Net.Analysis.TokenStream#reset()
+ */
+ public override void Reset()
+ {
+ base.Reset();
+ scanner.Yyreset(input);
+ }
+
+ public override void Reset(System.IO.TextReader reader)
+ {
+ input = reader;
+ Reset();
+ }
+
+		/// <summary> Prior to https://issues.apache.org/jira/browse/LUCENE-1068, StandardTokenizer mischaracterized tokens like www.abc.com as acronyms
+ /// when they should have been labeled as hosts instead.
+ /// </summary>
+ /// <returns> true if StandardTokenizer now returns these tokens as Hosts, otherwise false
+ ///
+ /// </returns>
+ /// <deprecated> Remove in 3.X and make true the only valid value
+ /// </deprecated>
+ public virtual bool IsReplaceInvalidAcronym()
+ {
+ return replaceInvalidAcronym;
+ }
+
+ /// <summary> </summary>
+ /// <param name="replaceInvalidAcronym">Set to true to replace mischaracterized acronyms as HOST.
+ /// </param>
+ /// <deprecated> Remove in 3.X and make true the only valid value
+ ///
+ /// See https://issues.apache.org/jira/browse/LUCENE-1068
+ /// </deprecated>
+ public virtual void SetReplaceInvalidAcronym(bool replaceInvalidAcronym)
+ {
+ this.replaceInvalidAcronym = replaceInvalidAcronym;
+ }
+ }
}
\ No newline at end of file
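The change above replaces the JavaCC-based parser with a JFlex-generated scanner and moves to the reusable Next(Token) API. A minimal usage sketch follows; it is not part of this commit, and it assumes only the Lucene.Net 2.3-era API visible in the diff (StandardTokenizer(TextReader, bool), Next(Token), Token.TermBuffer()/TermLength()) plus the standard Token.Type() accessor:

    using System;
    using Lucene.Net.Analysis;
    using Lucene.Net.Analysis.Standard;

    class StandardTokenizerDemo
    {
        static void Main()
        {
            // 'true' opts in to the LUCENE-1068 behavior (invalid acronyms reported as hosts).
            System.IO.TextReader reader = new System.IO.StringReader(
                "XY&Z Corp's bob@example.com bought excite.com on 2008-06-24");
            StandardTokenizer tokenizer = new StandardTokenizer(reader, true);

            // The new Next(Token) contract lets a caller reuse a single Token instance.
            Token token = new Token();
            while ((token = tokenizer.Next(token)) != null)
            {
                System.String text = new System.String(token.TermBuffer(), 0, token.TermLength());
                Console.WriteLine("{0,-25} {1}", text, token.Type());
            }
            tokenizer.Close();
        }
    }

Each line prints the token text and its type, e.g. <COMPANY> for "XY&Z", <APOSTROPHE> for "Corp's", <EMAIL> for the address and <HOST> for "excite.com".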
Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/StandardTokenizer.jj
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/Standard/StandardTokenizer.jj?rev=671406&r1=671405&r2=671406&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/StandardTokenizer.jj (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/StandardTokenizer.jj Tue Jun 24 19:53:11 2008
@@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-
+// {{Aroush-2.3.1}} remove this file from SVN
options {
STATIC = false;
//IGNORE_CASE = true;
Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/StandardTokenizerConstants.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/Standard/StandardTokenizerConstants.cs?rev=671406&r1=671405&r2=671406&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/StandardTokenizerConstants.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/StandardTokenizerConstants.cs Tue Jun 24 19:53:11 2008
@@ -14,10 +14,10 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-
+// {{Aroush-2.3.1}} remove this file from SVN
/* Generated By:JavaCC: Do not edit this line. StandardTokenizerConstants.java */
using System;
-
+/*
namespace Lucene.Net.Analysis.Standard
{
@@ -42,4 +42,5 @@
public const int DEFAULT = 0;
public static System.String[] tokenImage = new System.String[]{"<EOF>", "<ALPHANUM>", "<APOSTROPHE>", "<ACRONYM>", "<COMPANY>", "<EMAIL>", "<HOST>", "<NUM>", "<P>", "<HAS_DIGIT>", "<ALPHA>", "<LETTER>", "<CJ>", "<KOREAN>", "<DIGIT>", "<NOISE>"};
}
-}
\ No newline at end of file
+}
+*/
\ No newline at end of file
Added: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/StandardTokenizerImpl.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/Standard/StandardTokenizerImpl.cs?rev=671406&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/StandardTokenizerImpl.cs (added)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/StandardTokenizerImpl.cs Tue Jun 24 19:53:11 2008
@@ -0,0 +1,662 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* The following code was generated by JFlex 1.4.1 on 12/18/07 9:22 PM */
+using System;
+
+using Token = Lucene.Net.Analysis.Token;
+
+namespace Lucene.Net.Analysis.Standard
+{
+
+
+ /// <summary> This class is a scanner generated by
+ /// <a href="http://www.jflex.de/">JFlex</a> 1.4.1
+ /// on 12/18/07 9:22 PM from the specification file
+ /// <tt>/Volumes/User/grantingersoll/projects/lucene/java/lucene-clean/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex</tt>
+ /// </summary>
+ class StandardTokenizerImpl
+ {
+
+ /// <summary>This character denotes the end of file </summary>
+ public const int YYEOF = - 1;
+
+ /// <summary>initial size of the lookahead buffer </summary>
+ private const int ZZ_BUFFERSIZE = 16384;
+
+ /// <summary>lexical states </summary>
+ public const int YYINITIAL = 0;
+
+ /// <summary> Translates characters to character classes</summary>
+		private const System.String ZZ_CMAP_PACKED = "\x0009\x0000\x0001\x0000\x0001\x000E\x0001\x0000\x0001\x0000\x0001\x000D\x0012\x0000\x0001\x0000\x0005\x0000\x0001\x0003" + "\x0001\x0001\x0004\x0000\x0001\x0007\x0001\x0005\x0001\x0002\x0001\x0007\x000A\x0009\x0006\x0000\x0001\x0004\x001A\x0008" + "\x0004\x0000\x0001\x0006\x0001\x0000\x001A\x0008\x0045\x0000\x0017\x0008\x0001\x0000\x001F\x0008\x0001\x0000\u0568\x0008" + "\x000A\x000A\x0086\x0008\x000A\x000A\u026c\x0008\x000A\x000A\x0076\x0008\x000A\x000A\x0076\x0008\x000A\x000A\x0076\x0008" + "\x000A\x000A\x0076\x0008\x000A\x000A\x0077\x0008\x0009\x000A\x0076\x0008\x000A\x000A\x0076\x0008\x000A\x000A\x0076\x0008" + "\x000A\x000A\x00E0\x0008\x000A\x000A\x0076\x0008\x000A\x000A\u0166\x0008\x000A\x000A\x00B6\x0008\u0100\x0008\u0e00\x0008" + "\u1040\x0000\u0150\x000C\x0060\x0000\x0010\x000C\u0100\x0000\x0080\x000C\x0080\x0000\u19c0\x000C\x0040\x0000\u5200\x000C" + "\u0c00\x0000\u2bb0\x000B\u2150\x0000\u0200\x000C\u0465\x0000\x003B\x000C\x003D\x0008\x0023\x0000";
+
+ /// <summary> Translates characters to character classes</summary>
+ private static readonly char[] ZZ_CMAP = ZzUnpackCMap(ZZ_CMAP_PACKED);
+
+ /// <summary> Translates DFA states to action switch labels.</summary>
+ private static readonly int[] ZZ_ACTION = ZzUnpackAction();
+
+ private const System.String ZZ_ACTION_PACKED_0 = "\x0001\x0000\x0001\x0001\x0004\x0002\x0001\x0003\x0001\x0001\x0006\x0000\x0002\x0002\x0006\x0000" + "\x0001\x0004\x0004\x0005\x0002\x0006\x0002\x0000\x0001\x0007\x0001\x0000\x0001\x0007\x0003\x0005" + "\x0006\x0007\x0003\x0005\x0001\x0008\x0001\x0000\x0001\x0009\x0002\x0000\x0001\x0008\x0001\x0009" + "\x0001\x0000\x0002\x0009\x0002\x0008\x0002\x0005\x0001\x000A";
+
+ private static int[] ZzUnpackAction()
+ {
+ int[] result = new int[61];
+ int offset = 0;
+ offset = ZzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int ZzUnpackAction(System.String packed, int offset, int[] result)
+ {
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.Length;
+ while (i < l)
+ {
+ int count = packed[i++];
+ int value_Renamed = packed[i++];
+ do
+ result[j++] = value_Renamed;
+ while (--count > 0);
+ }
+ return j;
+ }
+
+
+ /// <summary> Translates a state to a row index in the transition table</summary>
+ private static readonly int[] ZZ_ROWMAP = ZzUnpackRowMap();
+
+ private const System.String ZZ_ROWMAP_PACKED_0 = "\x0000\x0000\x0000\x000F\x0000\x001E\x0000\x002D\x0000\x003C\x0000\x004B\x0000\x000F\x0000\x005A" + "\x0000\x0069\x0000\x0078\x0000\x0087\x0000\x0096\x0000\x00A5\x0000\x00B4\x0000\x00C3\x0000\x00D2" + "\x0000\x00E1\x0000\x00F0\x0000\x00FF\x0000\u010e\x0000\u011d\x0000\u012c\x0000\u013b\x0000\u014a" + "\x0000\u0159\x0000\u0168\x0000\u0177\x0000\x0087\x0000\u0186\x0000\u0195\x0000\u01a4\x0000\u01b3" + "\x0000\u01c2\x0000\u01d1\x0000\u01e0\x0000\u01ef\x0000\u01fe\x0000\u020d\x0000\u021c\x0000\u022b" + "\x0000\u023a\x0000\u0249\x0000\u0258\x0000\u0267\x0000\u0276\x0000\u0285\x0000\u0294\x0000\u02a3" + "\x0000\u02b2\x0000\u02c1\x0000\u02d0\x0000\u02df\x0000\u02ee\x0000\u02fd\x0000\u012c\x0000\x00E1" + "\x0000\x0078\x0000\u011d\x0000\u030c\x0000\u031b\x0000\u032a";
+
+ private static int[] ZzUnpackRowMap()
+ {
+ int[] result = new int[61];
+ int offset = 0;
+ offset = ZzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int ZzUnpackRowMap(System.String packed, int offset, int[] result)
+ {
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.Length;
+ while (i < l)
+ {
+ int high = packed[i++] << 16;
+ result[j++] = high | packed[i++];
+ }
+ return j;
+ }
+
+ /// <summary> The transition table of the DFA</summary>
+ private static readonly int[] ZZ_TRANS = ZzUnpackTrans();
+
+		private const System.String ZZ_TRANS_PACKED_0 = "\x0008\x0002\x0001\x0003\x0001\x0004\x0001\x0005\x0001\x0006\x0001\x0007\x0001\x0008\x0001\x0002" + "\x0010\x0000\x0001\x0009\x0001\x000A\x0001\x000B\x0001\x000C\x0002\x000D\x0001\x000E\x0001\x000F" + "\x0001\x0004\x0001\x0010\x0001\x0006\x0005\x0000\x0001\x0011\x0001\x0000\x0001\x0012\x0002\x0013" + "\x0001\x0014\x0003\x0004\x0001\x0006\x0004\x0000\x0001\x0009\x0001\x0015\x0001\x000B\x0001\x000C" + "\x0002\x0013\x0001\x0014\x0001\x0010\x0001\x0004\x0001\x0010\x0001\x0006\x0005\x0000\x0001\x0016" + "\x0001\x0000\x0001\x0012\x0002\x000D\x0001\x000E\x0004\x0006\x0011\x0000\x0001\x0002\x0008\x0000" + "\x0001\x0017\x0001\x0000\x0001\x0017\x000C\x0000\x0001\x0018\x0001\x0019\x0001\x001A\x0001\x001B" + "\x000B\x0000\x0001\x001C\x0001\x0000\x0001\x001C\x000C\x0000\x0001\x001D\x0001\x001E\x0001\x001D" + "\x0001\x001E\x000B\x0000\x0001\x001F\x0002\x0020\x0001\x0021\x000B\x0000\x0001\x000E\x0002\x0022" + "\x0005\x0000\x0001\x0009\x0001\x0016\x0001\x000B\x0001\x000C\x0002\x000D\x0001\x000E\x0001\x000F" + "\x0001\x0004\x0001\x0010\x0001\x0006\x0004\x0000\x0001\x0009\x0001\x0011\x0001\x000B\x0001\x000C" + "\x0002\x0013\x0001\x0014\x0001\x0010\x0001\x0004\x0001\x0010\x0001\x0006\x000B\x0000\x0001\x0023" + "\x0002\x0024\x0001\x0025\x000B\x0000\x0004\x001E\x000B\x0000\x0001\x0026\x0002\x0027\x0001\x0028" + "\x000B\x0000\x0001\x0029\x0002\x002A\x0001\x002B\x000B\x0000\x0001\x002C\x0001\x0024\x0001\x002D" + "\x0001\x0025\x000B\x0000\x0001\x002E\x0002\x0019\x0001\x001B\x0004\x0000\x0001\x0009\x0006\x0000" + "\x0001\x0017\x0001\x0000\x0001\x0017\x0006\x0000\x0001\x002F\x0001\x0000\x0001\x0012\x0002\x0030" + "\x0001\x0000\x0001\x002E\x0002\x0019\x0001\x001B\x0005\x0000\x0001\x0031\x0001\x0000\x0001\x0012" + "\x0002\x0032\x0001\x0033\x0003\x0019\x0001\x001B\x0005\x0000\x0001\x0034\x0001\x0000\x0001\x0012" + "\x0002\x0032\x0001\x0033\x0003\x0019\x0001\x001B\x0005\x0000\x0001\x0035\x0001\x0000\x0001\x0012" +
+ "\x0002\x0030\x0001\x0000\x0004\x001B\x0005\x0000\x0001\x0036\x0002\x0000\x0001\x0036\x0002\x0000" + "\x0001\x001D\x0001\x001E\x0001\x001D\x0001\x001E\x0005\x0000\x0001\x0036\x0002\x0000\x0001\x0036" + "\x0002\x0000\x0004\x001E\x0005\x0000\x0001\x0030\x0001\x0000\x0001\x0012\x0002\x0030\x0001\x0000" + "\x0001\x001F\x0002\x0020\x0001\x0021\x0005\x0000\x0001\x0032\x0001\x0000\x0001\x0012\x0002\x0032" + "\x0001\x0033\x0003\x0020\x0001\x0021\x0005\x0000\x0001\x0030\x0001\x0000\x0001\x0012\x0002\x0030" + "\x0001\x0000\x0004\x0021\x0005\x0000\x0001\x0033\x0002\x0000\x0003\x0033\x0003\x0022\x0006\x0000" + "\x0001\x0037\x0001\x0000\x0001\x0012\x0002\x000D\x0001\x000E\x0001\x0023\x0002\x0024\x0001\x0025" + "\x0005\x0000\x0001\x0038\x0001\x0000\x0001\x0012\x0002\x0013\x0001\x0014\x0003\x0024\x0001\x0025" + "\x0005\x0000\x0001\x0037\x0001\x0000\x0001\x0012\x0002\x000D\x0001\x000E\x0004\x0025\x0005\x0000" + "\x0001\x000D\x0001\x0000\x0001\x0012\x0002\x000D\x0001\x000E\x0001\x0026\x00
02\x0027\x0001\x0028" + "\x0005\x0000\x0001\x0013\x0001\x0000\x0001\x0012\x0002\x0013\x0001\x0014\x0003\x0027\x0001\x0028" + "\x0005\x0000\x0001\x000D\x0001\x0000\x0001\x0012\x0002\x000D\x0001\x000E\x0004\x0028\x0005\x0000" + "\x0001\x000E\x0002\x0000\x0003\x000E\x0001\x0029\x0002\x002A\x0001\x002B\x0005\x0000\x0001\x0014" + "\x0002\x0000\x0003\x0014\x0003\x002A\x0001\x002B\x0005\x0000\x0001\x000E\x0002\x0000\x0003\x000E" + "\x0004\x002B\x0005\x0000\x0001\x0039\x0001\x0000\x0001\x0012\x0002\x000D\x0001\x000E\x0001\x0023" + "\x0002\x0024\x0001\x0025\x0005\x0000\x0001\x003A\x0001\x0000\x0001\x0012\x0002\x0013\x0001\x0014" + "\x0003\x0024\x0001\x0025\x0005\x0000\x0001\x0035\x0001\x0000\x0001\x0012\x0002\x0030\x0001\x0000" + "\x0001\x002E\x0002\x0019\x0001\x001B\x000B\x0000\x0001\x003B\x0001\x001B\x0001\x003B\x0001\x001B" + "\x000B\x0000\x0004\x0021\x000B\x0000\x0004\x0025\x000B\x0000\x0004\x0028\x000B\x0000\x0004\x002B" + "\x000B\x0000\x0001\x003C\x0001\x0025\x0001\x003C\x0001\
x0025\x000B\x0000\x0004\x001B\x000B\x0000" +
+ "\x0004\x003D\x0005\x0000\x0001\x002F\x0001\x0000\x0001\x0012\x0002\x0030\x0001\x0000\x0004\x001B" + "\x0005\x0000\x0001\x0039\x0001\x0000\x0001\x0012\x0002\x000D\x0001\x000E\x0004\x0025\x0005\x0000" + "\x0001\x0036\x0002\x0000\x0001\x0036\x0002\x0000\x0004\x003D\x0003\x0000";
+
+ private static int[] ZzUnpackTrans()
+ {
+ int[] result = new int[825];
+ int offset = 0;
+ offset = ZzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int ZzUnpackTrans(System.String packed, int offset, int[] result)
+ {
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.Length;
+ while (i < l)
+ {
+ int count = packed[i++];
+ int value_Renamed = packed[i++];
+ value_Renamed--;
+ do
+ result[j++] = value_Renamed;
+ while (--count > 0);
+ }
+ return j;
+ }
+
+
+ /* error codes */
+ private const int ZZ_UNKNOWN_ERROR = 0;
+ private const int ZZ_NO_MATCH = 1;
+ private const int ZZ_PUSHBACK_2BIG = 2;
+
+ /* error messages for the codes above */
+		private static readonly System.String[] ZZ_ERROR_MSG = new System.String[]{"Unknown internal scanner error", "Error: could not match input", "Error: pushback value was too large"};
+
+ /// <summary> ZZ_ATTRIBUTE[aState] contains the attributes of state <code>aState</code></summary>
+ private static readonly int[] ZZ_ATTRIBUTE = ZzUnpackAttribute();
+
+ private const System.String ZZ_ATTRIBUTE_PACKED_0 = "\x0001\x0000\x0001\x0009\x0004\x0001\x0001\x0009\x0001\x0001\x0006\x0000\x0002\x0001\x0006\x0000" + "\x0007\x0001\x0002\x0000\x0001\x0001\x0001\x0000\x000E\x0001\x0001\x0000\x0001\x0001\x0002\x0000" + "\x0002\x0001\x0001\x0000\x0007\x0001";
+
+ private static int[] ZzUnpackAttribute()
+ {
+ int[] result = new int[61];
+ int offset = 0;
+ offset = ZzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int ZzUnpackAttribute(System.String packed, int offset, int[] result)
+ {
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.Length;
+ while (i < l)
+ {
+ int count = packed[i++];
+ int value_Renamed = packed[i++];
+ do
+ result[j++] = value_Renamed;
+ while (--count > 0);
+ }
+ return j;
+ }
+
+ /// <summary>the input device </summary>
+ private System.IO.TextReader zzReader;
+
+ /// <summary>the current state of the DFA </summary>
+ private int zzState;
+
+ /// <summary>the current lexical state </summary>
+ private int zzLexicalState = YYINITIAL;
+
+ /// <summary>this buffer contains the current text to be matched and is
+ /// the source of the yytext() string
+ /// </summary>
+ private char[] zzBuffer = new char[ZZ_BUFFERSIZE];
+
+ /// <summary>the textposition at the last accepting state </summary>
+ private int zzMarkedPos;
+
+ /// <summary>the textposition at the last state to be included in yytext </summary>
+ private int zzPushbackPos;
+
+ /// <summary>the current text position in the buffer </summary>
+ private int zzCurrentPos;
+
+ /// <summary>startRead marks the beginning of the yytext() string in the buffer </summary>
+ private int zzStartRead;
+
+ /// <summary>endRead marks the last character in the buffer, that has been read
+ /// from input
+ /// </summary>
+ private int zzEndRead;
+
+ /// <summary>number of newlines encountered up to the start of the matched text </summary>
+ private int yyline;
+
+ /// <summary>the number of characters up to the start of the matched text </summary>
+ private int yychar;
+
+ /// <summary> the number of characters from the last newline up to the start of the
+ /// matched text
+ /// </summary>
+ private int yycolumn;
+
+ /// <summary> zzAtBOL == true <=> the scanner is currently at the beginning of a line</summary>
+ private bool zzAtBOL = true;
+
+ /// <summary>zzAtEOF == true <=> the scanner is at the EOF </summary>
+ private bool zzAtEOF;
+
+ /* user code: */
+
+ public const int ALPHANUM = 0;
+ public const int APOSTROPHE = 1;
+ public const int ACRONYM = 2;
+ public const int COMPANY = 3;
+ public const int EMAIL = 4;
+ public const int HOST = 5;
+ public const int NUM = 6;
+ public const int CJ = 7;
+ /// <deprecated> this solves a bug where HOSTs that end with '.' are identified
+ /// as ACRONYMs. It is deprecated and will be removed in the next
+ /// release.
+ /// </deprecated>
+ public const int ACRONYM_DEP = 8;
+
+ public static readonly System.String[] TOKEN_TYPES = new System.String[]{"<ALPHANUM>", "<APOSTROPHE>", "<ACRONYM>", "<COMPANY>", "<EMAIL>", "<HOST>", "<NUM>", "<CJ>", "<ACRONYM_DEP>"};
+
+ public int Yychar()
+ {
+ return yychar;
+ }
+
+ /// <summary> Fills Lucene token with the current token text.</summary>
+ internal void GetText(Token t)
+ {
+ t.SetTermBuffer(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead);
+ }
+
+
+ /// <summary> Creates a new scanner
+ /// There is also a java.io.InputStream version of this constructor.
+ ///
+ /// </summary>
+ /// <param name="in"> the java.io.Reader to read input from.
+ /// </param>
+ internal StandardTokenizerImpl(System.IO.TextReader in_Renamed)
+ {
+ this.zzReader = in_Renamed;
+ }
+
+ /// <summary> Creates a new scanner.
+ /// There is also java.io.Reader version of this constructor.
+ ///
+ /// </summary>
+ /// <param name="in"> the java.io.Inputstream to read input from.
+ /// </param>
+ internal StandardTokenizerImpl(System.IO.Stream in_Renamed) : this(new System.IO.StreamReader(in_Renamed, System.Text.Encoding.Default))
+ {
+ }
+
+ /// <summary> Unpacks the compressed character translation table.
+ ///
+ /// </summary>
+ /// <param name="packed"> the packed character translation table
+ /// </param>
+ /// <returns> the unpacked character translation table
+ /// </returns>
+ private static char[] ZzUnpackCMap(System.String packed)
+ {
+ char[] map = new char[0x10000];
+ int i = 0; /* index in packed string */
+ int j = 0; /* index in unpacked array */
+ while (i < 156)
+ {
+ int count = packed[i++];
+ char value_Renamed = packed[i++];
+ do
+ map[j++] = value_Renamed;
+ while (--count > 0);
+ }
+ return map;
+ }
+
+
+ /// <summary> Refills the input buffer.
+ ///
+ /// </summary>
+ /// <returns> <code>false</code>, iff there was new input.
+ ///
+ /// </returns>
+ /// <exception cref="java.io.IOException"> if any I/O-Error occurs
+ /// </exception>
+ private bool ZzRefill()
+ {
+
+ /* first: make room (if you can) */
+ if (zzStartRead > 0)
+ {
+ Array.Copy(zzBuffer, zzStartRead, zzBuffer, 0, zzEndRead - zzStartRead);
+
+ /* translate stored positions */
+ zzEndRead -= zzStartRead;
+ zzCurrentPos -= zzStartRead;
+ zzMarkedPos -= zzStartRead;
+ zzPushbackPos -= zzStartRead;
+ zzStartRead = 0;
+ }
+
+ /* is the buffer big enough? */
+ if (zzCurrentPos >= zzBuffer.Length)
+ {
+ /* if not: blow it up */
+ char[] newBuffer = new char[zzCurrentPos * 2];
+ Array.Copy(zzBuffer, 0, newBuffer, 0, zzBuffer.Length);
+ zzBuffer = newBuffer;
+ }
+
+ /* finally: fill the buffer with new input */
+ int numRead = zzReader.Read(zzBuffer, zzEndRead, zzBuffer.Length - zzEndRead);
+
+ if (numRead < 1)
+ {
+ return true;
+ }
+ else
+ {
+ zzEndRead += numRead;
+ return false;
+ }
+ }
+
+
+ /// <summary> Closes the input stream.</summary>
+ public void Yyclose()
+ {
+ zzAtEOF = true; /* indicate end of file */
+ zzEndRead = zzStartRead; /* invalidate buffer */
+
+ if (zzReader != null)
+ zzReader.Close();
+ }
+
+
+ /// <summary> Resets the scanner to read from a new input stream.
+ /// Does not close the old reader.
+ ///
+ /// All internal variables are reset, the old input stream
+ /// <b>cannot</b> be reused (internal buffer is discarded and lost).
+ /// Lexical state is set to <tt>ZZ_INITIAL</tt>.
+ ///
+ /// </summary>
+ /// <param name="reader"> the new input stream
+ /// </param>
+ public void Yyreset(System.IO.TextReader reader)
+ {
+ zzReader = reader;
+ zzAtBOL = true;
+ zzAtEOF = false;
+ zzEndRead = zzStartRead = 0;
+ zzCurrentPos = zzMarkedPos = zzPushbackPos = 0;
+ yyline = yychar = yycolumn = 0;
+ zzLexicalState = YYINITIAL;
+ }
+
+
+ /// <summary> Returns the current lexical state.</summary>
+ public int Yystate()
+ {
+ return zzLexicalState;
+ }
+
+
+ /// <summary> Enters a new lexical state
+ ///
+ /// </summary>
+ /// <param name="newState">the new lexical state
+ /// </param>
+ public void Yybegin(int newState)
+ {
+ zzLexicalState = newState;
+ }
+
+
+ /// <summary> Returns the text matched by the current regular expression.</summary>
+ public System.String Yytext()
+ {
+ return new System.String(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead);
+ }
+
+
+ /// <summary> Returns the character at position <tt>pos</tt> from the
+ /// matched text.
+ ///
+ /// It is equivalent to yytext().charAt(pos), but faster
+ ///
+ /// </summary>
+ /// <param name="pos">the position of the character to fetch.
+ /// A value from 0 to yylength()-1.
+ ///
+ /// </param>
+ /// <returns> the character at position pos
+ /// </returns>
+ public char Yycharat(int pos)
+ {
+ return zzBuffer[zzStartRead + pos];
+ }
+
+
+ /// <summary> Returns the length of the matched text region.</summary>
+ public int Yylength()
+ {
+ return zzMarkedPos - zzStartRead;
+ }
+
+
+		/// <summary> Reports an error that occurred while scanning.
+ ///
+		/// In a well-formed scanner (no or only correct usage of
+ /// yypushback(int) and a match-all fallback rule) this method
+ /// will only be called with things that "Can't Possibly Happen".
+ /// If this method is called, something is seriously wrong
+ /// (e.g. a JFlex bug producing a faulty scanner etc.).
+ ///
+ /// Usual syntax/scanner level error handling should be done
+ /// in error fallback rules.
+ ///
+ /// </summary>
+ /// <param name="errorCode"> the code of the errormessage to display
+ /// </param>
+ private void ZzScanError(int errorCode)
+ {
+ System.String message;
+ try
+ {
+ message = ZZ_ERROR_MSG[errorCode];
+ }
+ catch (System.IndexOutOfRangeException)
+ {
+ message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];
+ }
+
+ throw new System.ApplicationException(message);
+ }
+
+
+ /// <summary> Pushes the specified amount of characters back into the input stream.
+ ///
+		/// They will be read again by the next call of the scanning method
+ ///
+ /// </summary>
+ /// <param name="number"> the number of characters to be read again.
+ /// This number must not be greater than yylength()!
+ /// </param>
+ public virtual void Yypushback(int number)
+ {
+ if (number > Yylength())
+ ZzScanError(ZZ_PUSHBACK_2BIG);
+
+ zzMarkedPos -= number;
+ }
+
+
+ /// <summary> Resumes scanning until the next regular expression is matched,
+ /// the end of input is encountered or an I/O-Error occurs.
+ ///
+ /// </summary>
+ /// <returns> the next token
+ /// </returns>
+ /// <exception cref="java.io.IOException"> if any I/O-Error occurs
+ /// </exception>
+ public virtual int GetNextToken()
+ {
+ int zzInput;
+ int zzAction;
+
+ // cached fields:
+ int zzCurrentPosL;
+ int zzMarkedPosL;
+ int zzEndReadL = zzEndRead;
+ char[] zzBufferL = zzBuffer;
+ char[] zzCMapL = ZZ_CMAP;
+
+ int[] zzTransL = ZZ_TRANS;
+ int[] zzRowMapL = ZZ_ROWMAP;
+ int[] zzAttrL = ZZ_ATTRIBUTE;
+
+ while (true)
+ {
+ zzMarkedPosL = zzMarkedPos;
+
+ yychar += zzMarkedPosL - zzStartRead;
+
+ zzAction = - 1;
+
+ zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;
+
+ zzState = zzLexicalState;
+
+
+ {
+ while (true)
+ {
+
+ if (zzCurrentPosL < zzEndReadL)
+ zzInput = zzBufferL[zzCurrentPosL++];
+ else if (zzAtEOF)
+ {
+ zzInput = YYEOF;
+ goto zzForAction_brk;
+ }
+ else
+ {
+ // store back cached positions
+ zzCurrentPos = zzCurrentPosL;
+ zzMarkedPos = zzMarkedPosL;
+ bool eof = ZzRefill();
+ // get translated positions and possibly new buffer
+ zzCurrentPosL = zzCurrentPos;
+ zzMarkedPosL = zzMarkedPos;
+ zzBufferL = zzBuffer;
+ zzEndReadL = zzEndRead;
+ if (eof)
+ {
+ zzInput = YYEOF;
+ goto zzForAction_brk;
+ }
+ else
+ {
+ zzInput = zzBufferL[zzCurrentPosL++];
+ }
+ }
+ int zzNext = zzTransL[zzRowMapL[zzState] + zzCMapL[zzInput]];
+ if (zzNext == - 1)
+ {
+ goto zzForAction_brk;
+ }
+ zzState = zzNext;
+
+ int zzAttributes = zzAttrL[zzState];
+ if ((zzAttributes & 1) == 1)
+ {
+ zzAction = zzState;
+ zzMarkedPosL = zzCurrentPosL;
+ if ((zzAttributes & 8) == 8)
+ {
+ goto zzForAction_brk;
+ }
+ }
+ }
+ }
+
+zzForAction_brk: ;
+
+
+ // store back cached position
+ zzMarkedPos = zzMarkedPosL;
+
+ switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction])
+ {
+
+ case 5:
+ {
+ return HOST;
+ }
+
+ case 11: break;
+
+ case 9:
+ {
+ return ACRONYM_DEP;
+ }
+
+ case 12: break;
+
+ case 8:
+ {
+ return ACRONYM;
+ }
+
+ case 13: break;
+
+ case 1:
+ {
+ /* ignore */
+ }
+ goto case 14;
+
+ case 14: break;
+
+ case 7:
+ {
+ return NUM;
+ }
+
+ case 15: break;
+
+ case 3:
+ {
+ return CJ;
+ }
+
+ case 16: break;
+
+ case 2:
+ {
+ return ALPHANUM;
+ }
+
+ case 17: break;
+
+ case 6:
+ {
+ return COMPANY;
+ }
+
+ case 18: break;
+
+ case 4:
+ {
+ return APOSTROPHE;
+ }
+
+ case 19: break;
+
+ case 10:
+ {
+ return EMAIL;
+ }
+
+ case 20: break;
+
+ default:
+ if (zzInput == YYEOF && zzStartRead == zzCurrentPos)
+ {
+ zzAtEOF = true;
+ return YYEOF;
+ }
+ else
+ {
+ ZzScanError(ZZ_NO_MATCH);
+ }
+ break;
+
+ }
+ }
+ }
+ }
+}
\ No newline at end of file
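The ZZ_ACTION, ZZ_TRANS and ZZ_ATTRIBUTE tables above are decoded from run-length (count, value) character pairs, each value being written into the table count times; ZZ_ROWMAP is the exception, packing each 32-bit entry as two 16-bit halves. A standalone sketch of the run-length idea, with illustrative names that are not part of the commit:

    using System;

    class PackedTableDemo
    {
        // Restates the loop used by ZzUnpackAction and ZzUnpackAttribute
        // (ZzUnpackTrans additionally subtracts one from each value).
        static int[] Unpack(System.String packed, int unpackedLength)
        {
            int[] result = new int[unpackedLength];
            int i = 0; // position in the packed string
            int j = 0; // position in the unpacked table
            while (i < packed.Length)
            {
                int count = packed[i++]; // how many entries this pair expands to
                int value = packed[i++]; // the value to repeat
                do
                    result[j++] = value;
                while (--count > 0);
            }
            return result;
        }

        static void Main()
        {
            // "\x0003\x0007\x0002\x0001" expands to: 7 7 7 1 1
            foreach (int entry in Unpack("\x0003\x0007\x0002\x0001", 5))
                Console.Write(entry + " ");
            Console.WriteLine();
        }
    }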
Added: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/StandardTokenizerImpl.jflex
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/Standard/StandardTokenizerImpl.jflex?rev=671406&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/StandardTokenizerImpl.jflex (added)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/StandardTokenizerImpl.jflex Tue Jun 24 19:53:11 2008
@@ -0,0 +1,140 @@
+package org.apache.lucene.analysis.standard;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.Token;
+
+%%
+
+%class StandardTokenizerImpl
+%unicode
+%integer
+%function getNextToken
+%pack
+%char
+
+%{
+
+public static final int ALPHANUM = 0;
+public static final int APOSTROPHE = 1;
+public static final int ACRONYM = 2;
+public static final int COMPANY = 3;
+public static final int EMAIL = 4;
+public static final int HOST = 5;
+public static final int NUM = 6;
+public static final int CJ = 7;
+/**
+ * @deprecated this solves a bug where HOSTs that end with '.' are identified
+ * as ACRONYMs. It is deprecated and will be removed in the next
+ * release.
+ */
+public static final int ACRONYM_DEP = 8;
+
+public static final String [] TOKEN_TYPES = new String [] {
+ "<ALPHANUM>",
+ "<APOSTROPHE>",
+ "<ACRONYM>",
+ "<COMPANY>",
+ "<EMAIL>",
+ "<HOST>",
+ "<NUM>",
+ "<CJ>",
+ "<ACRONYM_DEP>"
+};
+
+public final int yychar()
+{
+ return yychar;
+}
+
+/**
+ * Fills Lucene token with the current token text.
+ */
+final void getText(Token t) {
+ t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
+}
+%}
+
+// basic word: a sequence of digits & letters
+ALPHANUM = ({LETTER}|{DIGIT}|{KOREAN})+
+
+// internal apostrophes: O'Reilly, you're, O'Reilly's
+// use a post-filter to remove possessives
+APOSTROPHE = {ALPHA} ("'" {ALPHA})+
+
+// acronyms: U.S.A., I.B.M., etc.
+// use a post-filter to remove dots
+ACRONYM = {LETTER} "." ({LETTER} ".")+
+
+ACRONYM_DEP = {ALPHANUM} "." ({ALPHANUM} ".")+
+
+// company names like AT&T and Excite@Home.
+COMPANY = {ALPHA} ("&"|"@") {ALPHA}
+
+// email addresses
+EMAIL = {ALPHANUM} (("."|"-"|"_") {ALPHANUM})* "@" {ALPHANUM} (("."|"-") {ALPHANUM})+
+
+// hostname
+HOST = {ALPHANUM} ((".") {ALPHANUM})+
+
+// floating point, serial, model numbers, ip addresses, etc.
+// every other segment must have at least one digit
+NUM = ({ALPHANUM} {P} {HAS_DIGIT}
+ | {HAS_DIGIT} {P} {ALPHANUM}
+ | {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+
+ | {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
+ | {ALPHANUM} {P} {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
+ | {HAS_DIGIT} {P} {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+)
+
+// punctuation
+P = ("_"|"-"|"/"|"."|",")
+
+// at least one digit
+HAS_DIGIT =
+ ({LETTER}|{DIGIT})*
+ {DIGIT}
+ ({LETTER}|{DIGIT})*
+
+ALPHA = ({LETTER})+
+
+
+LETTER = [\u0041-\u005a\u0061-\u007a\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u00ff\u0100-\u1fff\uffa0-\uffdc]
+
+DIGIT = [\u0030-\u0039\u0660-\u0669\u06f0-\u06f9\u0966-\u096f\u09e6-\u09ef\u0a66-\u0a6f\u0ae6-\u0aef\u0b66-\u0b6f\u0be7-\u0bef\u0c66-\u0c6f\u0ce6-\u0cef\u0d66-\u0d6f\u0e50-\u0e59\u0ed0-\u0ed9\u1040-\u1049]
+
+KOREAN = [\uac00-\ud7af\u1100-\u11ff]
+
+// Chinese, Japanese
+CJ = [\u3040-\u318f\u3100-\u312f\u3040-\u309F\u30A0-\u30FF\u31F0-\u31FF\u3300-\u337f\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff65-\uff9f]
+
+WHITESPACE = \r\n | [ \r\n\t\f]
+
+%%
+
+{ALPHANUM} { return ALPHANUM; }
+{APOSTROPHE} { return APOSTROPHE; }
+{ACRONYM} { return ACRONYM; }
+{COMPANY} { return COMPANY; }
+{EMAIL} { return EMAIL; }
+{HOST} { return HOST; }
+{NUM} { return NUM; }
+{CJ} { return CJ; }
+{ACRONYM_DEP} { return ACRONYM_DEP; }
+
+/** Ignore the rest */
+. | {WHITESPACE} { /* ignore */ }
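The ACRONYM_DEP production above is what drives the replaceInvalidAcronym branch in StandardTokenizer.Next(Token): a host-like token with a trailing dot matches ACRONYM_DEP rather than HOST. A hedged before/after fragment, not part of the commit, using the API shown earlier in this diff (assumes using Lucene.Net.Analysis and Lucene.Net.Analysis.Standard inside a method body):

    // Legacy behaviour (flag omitted or false): the token keeps its trailing '.'
    // and is reported with type "<ACRONYM>".
    StandardTokenizer legacy = new StandardTokenizer(new System.IO.StringReader("www.abc.com."));

    // Opt-in behaviour (flag true): the trailing '.' is stripped and the type becomes "<HOST>".
    StandardTokenizer corrected = new StandardTokenizer(new System.IO.StringReader("www.abc.com."), true);

    Token t = new Token();
    while ((t = corrected.Next(t)) != null)
    {
        // expected output: www.abc.com [<HOST>]
        Console.WriteLine("{0} [{1}]", new System.String(t.TermBuffer(), 0, t.TermLength()), t.Type());
    }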
Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/StandardTokenizerTokenManager.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/Standard/StandardTokenizerTokenManager.cs?rev=671406&r1=671405&r2=671406&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/StandardTokenizerTokenManager.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/StandardTokenizerTokenManager.cs Tue Jun 24 19:53:11 2008
@@ -14,8 +14,9 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-
+// {{Aroush-2.3.1}} remove this file from SVN
/* Generated By:JavaCC: Do not edit this line. StandardTokenizerTokenManager.java */
+/*
using System;
namespace Lucene.Net.Analysis.Standard
@@ -113,7 +114,8 @@
ReInitRounds();
if (curChar < 64)
{
- ulong l = ((ulong) 1L) << curChar;
+ ulong l = ((ulong) 1L) << curChar;
+
MatchLoop:
do
{
@@ -463,7 +465,8 @@
}
else if (curChar < 128)
{
- ulong l = ((ulong) 1L) << (curChar & 63);
+ ulong l = ((ulong) 1L) << (curChar & 63);
+
MatchLoop1:
do
{
@@ -821,7 +824,8 @@
int i1 = hiByte >> 6;
ulong l1 = ((ulong) 1L) << (hiByte & 63);
int i2 = (curChar & 0xff) >> 6;
- ulong l2 = ((ulong) 1L) << (curChar & 63);
+ ulong l2 = ((ulong) 1L) << (curChar & 63);
+
MatchLoop1:
do
{
@@ -1421,7 +1425,7 @@
catch (System.IO.IOException)
{
EOFSeen = true;
- error_after = curPos <= 1?"":input_stream.GetImage();
+ error_after = curPos <= 1 ? "" : input_stream.GetImage();
if (curChar == '\n' || curChar == '\r')
{
error_line++;
@@ -1433,7 +1437,7 @@
if (!EOFSeen)
{
input_stream.Backup(1);
- error_after = curPos <= 1?"":input_stream.GetImage();
+ error_after = curPos <= 1 ? "" : input_stream.GetImage();
}
throw new TokenMgrError(EOFSeen, curLexState, error_line, error_column, error_after, curChar, TokenMgrError.LEXICAL_ERROR);
@@ -1441,4 +1445,4 @@
}
}
}
-}
\ No newline at end of file
+}*/
\ No newline at end of file
Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/Token.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/Standard/Token.cs?rev=671406&r1=671405&r2=671406&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/Token.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/Token.cs Tue Jun 24 19:53:11 2008
@@ -14,8 +14,9 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-
+// {{Aroush-2.3.1}} remove this file from SVN
/* Generated By:JavaCC: Do not edit this line. Token.java Version 3.0 */
+/*
using System;
namespace Lucene.Net.Analysis.Standard
@@ -90,4 +91,5 @@
}
}
}
-}
\ No newline at end of file
+}
+*/
\ No newline at end of file
Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/TokenMgrError.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/Standard/TokenMgrError.cs?rev=671406&r1=671405&r2=671406&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/TokenMgrError.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/TokenMgrError.cs Tue Jun 24 19:53:11 2008
@@ -14,8 +14,9 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-
+// {{Aroush-2.3.1}} remove this file from SVN
/* Generated By:JavaCC: Do not edit this line. TokenMgrError.java Version 3.0 */
+/*
using System;
namespace Lucene.Net.Analysis.Standard
@@ -138,7 +139,7 @@
/// </summary>
protected internal static System.String LexicalError(bool EOFSeen, int lexState, int errorLine, int errorColumn, System.String errorAfter, char curChar)
{
- return ("Lexical error at line " + errorLine + ", column " + errorColumn + ". Encountered: " + (EOFSeen?"<EOF> ":("\"" + addEscapes(System.Convert.ToString(curChar)) + "\"") + " (" + (int) curChar + "), ") + "after : \"" + addEscapes(errorAfter) + "\"");
+ return ("Lexical error at line " + errorLine + ", column " + errorColumn + ". Encountered: " + (EOFSeen ? "<EOF> " : ("\"" + addEscapes(System.Convert.ToString(curChar)) + "\"") + " (" + (int) curChar + "), ") + "after : \"" + addEscapes(errorAfter) + "\"");
}
/*
@@ -158,4 +159,5 @@
{
}
}
-}
\ No newline at end of file
+}
+*/
\ No newline at end of file
Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/StopAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/StopAnalyzer.cs?rev=671406&r1=671405&r2=671406&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/StopAnalyzer.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/StopAnalyzer.cs Tue Jun 24 19:53:11 2008
@@ -70,5 +70,44 @@
{
return new StopFilter(new LowerCaseTokenizer(reader), stopWords);
}
+
+ /// <summary>Filters LowerCaseTokenizer with StopFilter. </summary>
+ private class SavedStreams
+ {
+ public SavedStreams(StopAnalyzer enclosingInstance)
+ {
+ InitBlock(enclosingInstance);
+ }
+ private void InitBlock(StopAnalyzer enclosingInstance)
+ {
+ this.enclosingInstance = enclosingInstance;
+ }
+ private StopAnalyzer enclosingInstance;
+ public StopAnalyzer Enclosing_Instance
+ {
+ get
+ {
+ return enclosingInstance;
+ }
+
+ }
+ internal Tokenizer source;
+ internal TokenStream result;
+ }
+
+ public override TokenStream ReusableTokenStream(System.String fieldName, System.IO.TextReader reader)
+ {
+ SavedStreams streams = (SavedStreams) GetPreviousTokenStream();
+ if (streams == null)
+ {
+ streams = new SavedStreams(this);
+ streams.source = new LowerCaseTokenizer(reader);
+ streams.result = new StopFilter(streams.source, stopWords);
+ SetPreviousTokenStream(streams);
+ }
+ else
+ streams.source.Reset(reader);
+ return streams.result;
+ }
}
}
\ No newline at end of file
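The new ReusableTokenStream override above caches a LowerCaseTokenizer/StopFilter pair per analyzer (and, via GetPreviousTokenStream, per thread) and resets it onto each new reader instead of building a fresh chain for every field. A hedged usage sketch, not part of the commit (fragment; assumes using Lucene.Net.Analysis):

    StopAnalyzer analyzer = new StopAnalyzer();

    // First call: builds the LowerCaseTokenizer + StopFilter chain and caches it.
    TokenStream ts1 = analyzer.ReusableTokenStream("body",
        new System.IO.StringReader("The quick brown fox"));
    // ... consume ts1 ...

    // Second call: the cached chain is handed back, reset onto the new reader.
    TokenStream ts2 = analyzer.ReusableTokenStream("body",
        new System.IO.StringReader("jumps over the lazy dog"));
    // ts1 and ts2 are expected to be the same StopFilter instance.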
Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/StopFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/StopFilter.cs?rev=671406&r1=671405&r2=671406&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/StopFilter.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/StopFilter.cs Tue Jun 24 19:53:11 2008
@@ -25,92 +25,159 @@
public sealed class StopFilter : TokenFilter
{
- private System.Collections.Hashtable stopWords;
- private bool ignoreCase;
+ private static bool ENABLE_POSITION_INCREMENTS_DEFAULT = false;
+
+ private CharArraySet stopWords;
+ private bool enablePositionIncrements = ENABLE_POSITION_INCREMENTS_DEFAULT;
/// <summary> Construct a token stream filtering the given input.</summary>
public StopFilter(TokenStream input, System.String[] stopWords) : this(input, stopWords, false)
{
}
-
- /// <summary> Constructs a filter which removes words from the input
- /// TokenStream that are named in the array of words.
- /// </summary>
- public StopFilter(TokenStream in_Renamed, System.String[] stopWords, bool ignoreCase) : base(in_Renamed)
- {
- this.ignoreCase = ignoreCase;
- this.stopWords = MakeStopSet(stopWords, ignoreCase);
- }
-
- /// <summary> Construct a token stream filtering the given input.</summary>
- /// <param name="input">
- /// </param>
- /// <param name="stopWords">The set of Stop Words, as Strings. If ignoreCase is true, all strings should be lower cased
- /// </param>
- /// <param name="ignoreCase">-Ignore case when stopping. The stopWords set must be setup to contain only lower case words
- /// </param>
- public StopFilter(TokenStream input, System.Collections.Hashtable stopWords, bool ignoreCase) : base(input)
- {
- this.ignoreCase = ignoreCase;
- this.stopWords = stopWords;
- }
-
/// <summary> Constructs a filter which removes words from the input
- /// TokenStream that are named in the Set.
- /// It is crucial that an efficient Set implementation is used
- /// for maximum performance.
- ///
- /// </summary>
- /// <seealso cref="MakeStopSet(String[])">
- /// </seealso>
- public StopFilter(TokenStream in_Renamed, System.Collections.Hashtable stopWords) : this(in_Renamed, stopWords, false)
- {
- }
-
- /// <summary> Builds a Set from an array of stop words,
- /// appropriate for passing into the StopFilter constructor.
- /// This permits this stopWords construction to be cached once when
- /// an Analyzer is constructed.
- ///
+ /// TokenStream that are named in the array of words.
/// </summary>
- /// <seealso cref="MakeStopSet(String[], boolean) passing false to ignoreCase">
- /// </seealso>
- public static System.Collections.Hashtable MakeStopSet(System.String[] stopWords)
- {
- return MakeStopSet(stopWords, false);
- }
-
- /// <summary> </summary>
- /// <param name="stopWords">
- /// </param>
- /// <param name="ignoreCase">If true, all words are lower cased first.
- /// </param>
- /// <returns> a Set containing the words
- /// </returns>
- public static System.Collections.Hashtable MakeStopSet(System.String[] stopWords, bool ignoreCase)
- {
- System.Collections.Hashtable stopTable = new System.Collections.Hashtable(stopWords.Length);
- for (int i = 0; i < stopWords.Length; i++)
- {
- System.String tmp = ignoreCase ? stopWords[i].ToLower() : stopWords[i];
- stopTable.Add(tmp, tmp);
- }
- return stopTable;
- }
-
- /// <summary> Returns the next input Token whose termText() is not a stop word.</summary>
- public override Token Next()
- {
- // return the first non-stop word found
- for (Token token = input.Next(); token != null; token = input.Next())
- {
- System.String termText = ignoreCase ? token.termText.ToLower() : token.termText;
- if (!stopWords.Contains(termText))
- return token;
- }
- // reached EOS -- return null
- return null;
- }
- }
+ public StopFilter(TokenStream in_Renamed, System.String[] stopWords, bool ignoreCase) : base(in_Renamed)
+ {
+ this.stopWords = (CharArraySet) MakeStopSet(stopWords, ignoreCase);
+ }
+
+
+ /// <summary> Construct a token stream filtering the given input.
+ /// If <code>stopWords</code> is an instance of {@link CharArraySet} (true if
+ /// <code>makeStopSet()</code> was used to construct the set) it will be directly used
+ /// and <code>ignoreCase</code> will be ignored since <code>CharArraySet</code>
+ /// directly controls case sensitivity.
+ /// <p/>
+ /// If <code>stopWords</code> is not an instance of {@link CharArraySet},
+ /// a new CharArraySet will be constructed and <code>ignoreCase</code> will be
+ /// used to specify the case sensitivity of that set.
+ ///
+ /// </summary>
+ /// <param name="input">
+ /// </param>
+ /// <param name="stopWords">The set of Stop Words.
+ /// </param>
+ /// <param name="ignoreCase">-Ignore case when stopping.
+ /// </param>
+ public StopFilter(TokenStream input, System.Collections.Hashtable stopWords, bool ignoreCase) : base(input)
+ {
+ if (stopWords is CharArraySet)
+ {
+ this.stopWords = (CharArraySet) stopWords;
+ }
+ else
+ {
+ this.stopWords = new CharArraySet(stopWords.Count, ignoreCase);
+ for (int i = 0; i < stopWords.Count; i++)
+ {
+ this.stopWords.Add(stopWords[i]);
+ }
+ }
+ }
+
+ /// <summary> Constructs a filter which removes words from the input
+ /// TokenStream that are named in the Set.
+ ///
+ /// </summary>
+ /// <seealso cref="MakeStopSet(java.lang.String[])">
+ /// </seealso>
+ public StopFilter(TokenStream in_Renamed, System.Collections.Hashtable stopWords) : this(in_Renamed, stopWords, false)
+ {
+ }
+
+ /// <summary> Builds a Set from an array of stop words,
+ /// appropriate for passing into the StopFilter constructor.
+ /// This permits this stopWords construction to be cached once when
+ /// an Analyzer is constructed.
+ ///
+ /// </summary>
+ /// <seealso cref="MakeStopSet(java.lang.String[], boolean) passing false to ignoreCase">
+ /// </seealso>
+ public static System.Collections.Hashtable MakeStopSet(System.String[] stopWords)
+ {
+ return MakeStopSet(stopWords, false);
+ }
+
+ /// <summary> </summary>
+ /// <param name="stopWords">
+ /// </param>
+ /// <param name="ignoreCase">If true, all words are lower cased first.
+ /// </param>
+ /// <returns> a Set containing the words
+ /// </returns>
+ public static System.Collections.Hashtable MakeStopSet(System.String[] stopWords, bool ignoreCase)
+ {
+ CharArraySet stopSet = new CharArraySet(stopWords.Length, ignoreCase);
+ for (int i = 0; i < stopWords.Length; i++)
+ {
+ stopSet.Add(stopWords[i]);
+ }
+ return stopSet;
+ }
+
+ /// <summary> Returns the next input Token whose termText() is not a stop word.</summary>
+ public override Token Next(Token result)
+ {
+ // return the first non-stop word found
+ int skippedPositions = 0;
+ while ((result = input.Next(result)) != null)
+ {
+ if (!stopWords.Contains(result.TermBuffer(), 0, result.termLength))
+ {
+ if (enablePositionIncrements)
+ {
+ result.SetPositionIncrement(result.GetPositionIncrement() + skippedPositions);
+ }
+ return result;
+ }
+ skippedPositions += result.GetPositionIncrement();
+ }
+ // reached EOS -- return null
+ return null;
+ }
+
+ /// <seealso cref="setEnablePositionIncrementsDefault(boolean).">
+ /// </seealso>
+ public static bool GetEnablePositionIncrementsDefault()
+ {
+ return ENABLE_POSITION_INCREMENTS_DEFAULT;
+ }
+
+ /// <summary> Set the default position increments behavior of every StopFilter created from now on.
+ /// <p>
+ /// Note: behavior of a single StopFilter instance can be modified
+ /// with {@link #SetEnablePositionIncrements(boolean)}.
+ /// This static method allows control over behavior of classes using StopFilters internally,
+ /// for example {@link Lucene.Net.Analysis.Standard.StandardAnalyzer StandardAnalyzer}.
+ /// <p>
+ /// Default : false.
+ /// </summary>
+ /// <seealso cref="setEnablePositionIncrements(boolean).">
+ /// </seealso>
+ public static void SetEnablePositionIncrementsDefault(bool defaultValue)
+ {
+ ENABLE_POSITION_INCREMENTS_DEFAULT = defaultValue;
+ }
+
+ /// <seealso cref="setEnablePositionIncrements(boolean).">
+ /// </seealso>
+ public bool GetEnablePositionIncrements()
+ {
+ return enablePositionIncrements;
+ }
+
+ /// <summary> Set to <code>true</code> to make <b>this</b> StopFilter enable position increments to result tokens.
+ /// <p>
+ /// When set, when a token is stopped (omitted), the position increment of
+ /// the following token is incremented.
+ /// <p>
+ /// Default: see {@link #SetEnablePositionIncrementsDefault(boolean)}.
+ /// </summary>
+ public void SetEnablePositionIncrements(bool enable)
+ {
+ this.enablePositionIncrements = enable;
+ }
+ }
}
\ No newline at end of file
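The rewritten StopFilter above switches to CharArraySet lookups and adds the enablePositionIncrements option: when a stop word is dropped, the gap it leaves is added to the next surviving token's position increment, so phrase and span queries can still account for it. A hedged sketch, not part of the commit (fragment; WhitespaceTokenizer is the existing Lucene.Net tokenizer, and using Lucene.Net.Analysis is assumed):

    System.String[] stops = new System.String[] { "over", "the" };
    StopFilter filtered = new StopFilter(
        new WhitespaceTokenizer(new System.IO.StringReader("jumped over the lazy dog")), stops);
    filtered.SetEnablePositionIncrements(true);

    Token t = new Token();
    while ((t = filtered.Next(t)) != null)
    {
        // expected: jumped (+1), lazy (+3, because "over" and "the" were skipped), dog (+1)
        Console.WriteLine("{0} (+{1})",
            new System.String(t.TermBuffer(), 0, t.TermLength()), t.GetPositionIncrement());
    }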
Added: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/TeeTokenFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/TeeTokenFilter.cs?rev=671406&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/TeeTokenFilter.cs (added)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/TeeTokenFilter.cs Tue Jun 24 19:53:11 2008
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+
+namespace Lucene.Net.Analysis
+{
+
+
+ /// <summary> Works in conjunction with the SinkTokenizer to provide the ability to set aside tokens
+ /// that have already been analyzed. This is useful in situations where multiple fields share
+ /// many common analysis steps and then go their separate ways.
+ /// <p/>
+ /// It is also useful for doing things like entity extraction or proper noun analysis as
+ /// part of the analysis workflow and saving off those tokens for use in another field.
+ ///
+ /// <pre>
+ /// SinkTokenizer sink1 = new SinkTokenizer(null);
+ /// SinkTokenizer sink2 = new SinkTokenizer(null);
+ /// TokenStream source1 = new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(reader1), sink1), sink2);
+ /// TokenStream source2 = new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(reader2), sink1), sink2);
+ /// TokenStream final1 = new LowerCaseFilter(source1);
+ /// TokenStream final2 = source2;
+ /// TokenStream final3 = new EntityDetect(sink1);
+ /// TokenStream final4 = new URLDetect(sink2);
+ /// d.add(new Field("f1", final1));
+ /// d.add(new Field("f2", final2));
+ /// d.add(new Field("f3", final3));
+ /// d.add(new Field("f4", final4));
+ /// </pre>
+	/// In this example, sink1 and sink2 will both get tokens from both reader1 and reader2 after the whitespace tokenizer,
+	/// and we can further wrap any of these in extra analysis; more "sources" can be inserted if desired.
+	/// Note: the EntityDetect and URLDetect TokenStreams are for the example only and do not currently exist in Lucene.
+ /// <p/>
+ ///
+ /// See http://issues.apache.org/jira/browse/LUCENE-1058
+ /// </summary>
+ /// <seealso cref="SinkTokenizer">
+ ///
+ ///
+ /// </seealso>
+ public class TeeTokenFilter : TokenFilter
+ {
+ internal SinkTokenizer sink;
+
+ public TeeTokenFilter(TokenStream input, SinkTokenizer sink) : base(input)
+ {
+ this.sink = sink;
+ }
+
+ public override Token Next(Token result)
+ {
+ Token t = input.Next(result);
+ sink.Add(t);
+ return t;
+ }
+ }
+}
\ No newline at end of file
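To complement the setup shown in the class comment above, a hedged consumption sketch, not part of the commit; it assumes the usual SinkTokenizer contract that the cached tokens are replayed once the tee'd source has been drained (fragment; assumes using Lucene.Net.Analysis):

    SinkTokenizer sink = new SinkTokenizer(null);
    TokenStream primary = new TeeTokenFilter(
        new WhitespaceTokenizer(new System.IO.StringReader("alpha beta gamma")), sink);

    // Drain the primary stream; TeeTokenFilter.Next() hands each token to the sink as a side effect.
    Token t = new Token();
    while ((t = primary.Next(t)) != null)
    {
        // index or inspect the primary field's tokens here
    }

    // The sink can now serve the same tokens, e.g. as the TokenStream of a second field.
    Token cached;
    while ((cached = sink.Next()) != null)
    {
        Console.WriteLine(new System.String(cached.TermBuffer(), 0, cached.TermLength()));
    }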