You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by mh...@apache.org on 2013/09/24 20:33:17 UTC
[41/50] [abbrv] git commit: Implement Standard and Classic Analyzers
Implement Standard and Classic Analyzers
Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/7a4b442f
Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/7a4b442f
Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/7a4b442f
Branch: refs/heads/branch_4x
Commit: 7a4b442f13ad71b5094c6058e1746c04d97ac34c
Parents: 98e877d
Author: Paul Irwin <pa...@gmail.com>
Authored: Thu Aug 8 14:58:40 2013 -0400
Committer: Paul Irwin <pa...@gmail.com>
Committed: Thu Aug 8 14:58:40 2013 -0400
----------------------------------------------------------------------
src/contrib/Analyzers/Contrib.Analyzers.csproj | 15 +
.../Analyzers/Standard/ClassicAnalyzer.cs | 70 +
src/contrib/Analyzers/Standard/ClassicFilter.cs | 59 +
.../Analyzers/Standard/ClassicFilterFactory.cs | 25 +
.../Analyzers/Standard/ClassicTokenizer.cs | 131 ++
.../Standard/ClassicTokenizerFactory.cs | 31 +
.../Analyzers/Standard/ClassicTokenizerImpl.cs | 657 ++++++++++
.../Standard/IStandardTokenizerInterface.cs | 27 +
.../Analyzers/Standard/StandardAnalyzer.cs | 70 +
.../Analyzers/Standard/StandardFilter.cs | 73 ++
.../Analyzers/Standard/StandardFilterFactory.cs | 26 +
.../Analyzers/Standard/StandardTokenizer.cs | 167 +++
.../Standard/StandardTokenizerFactory.cs | 31 +
.../Analyzers/Standard/StandardTokenizerImpl.cs | 1241 ++++++++++++++++++
.../Standard/Std31/StandardTokenizerImpl31.cs | 1116 ++++++++++++++++
.../Standard/Std34/StandardTokenizerImpl34.cs | 1134 ++++++++++++++++
16 files changed, 4873 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/7a4b442f/src/contrib/Analyzers/Contrib.Analyzers.csproj
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Contrib.Analyzers.csproj b/src/contrib/Analyzers/Contrib.Analyzers.csproj
index 74b0f63..e13f118 100644
--- a/src/contrib/Analyzers/Contrib.Analyzers.csproj
+++ b/src/contrib/Analyzers/Contrib.Analyzers.csproj
@@ -122,6 +122,21 @@
<Compile Include="Core\WhitespaceTokenizer.cs" />
<Compile Include="Core\WhitespaceTokenizerFactory.cs" />
<Compile Include="Properties\AssemblyInfo.cs" />
+ <Compile Include="Standard\ClassicAnalyzer.cs" />
+ <Compile Include="Standard\ClassicFilter.cs" />
+ <Compile Include="Standard\ClassicFilterFactory.cs" />
+ <Compile Include="Standard\ClassicTokenizer.cs" />
+ <Compile Include="Standard\ClassicTokenizerFactory.cs" />
+ <Compile Include="Standard\ClassicTokenizerImpl.cs" />
+ <Compile Include="Standard\IStandardTokenizerInterface.cs" />
+ <Compile Include="Standard\StandardAnalyzer.cs" />
+ <Compile Include="Standard\StandardFilter.cs" />
+ <Compile Include="Standard\StandardFilterFactory.cs" />
+ <Compile Include="Standard\StandardTokenizer.cs" />
+ <Compile Include="Standard\StandardTokenizerFactory.cs" />
+ <Compile Include="Standard\StandardTokenizerImpl.cs" />
+ <Compile Include="Standard\Std31\StandardTokenizerImpl31.cs" />
+ <Compile Include="Standard\Std34\StandardTokenizerImpl34.cs" />
<Compile Include="Support\AbstractSet.cs" />
<Compile Include="Support\StringExtensions.cs" />
<Compile Include="Util\AbstractAnalysisFactory.cs" />
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/7a4b442f/src/contrib/Analyzers/Standard/ClassicAnalyzer.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Standard/ClassicAnalyzer.cs b/src/contrib/Analyzers/Standard/ClassicAnalyzer.cs
new file mode 100644
index 0000000..193f111
--- /dev/null
+++ b/src/contrib/Analyzers/Standard/ClassicAnalyzer.cs
@@ -0,0 +1,70 @@
+using Lucene.Net.Analysis.Core;
+using Lucene.Net.Analysis.Util;
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Text;
+using Version = Lucene.Net.Util.Version;
+
namespace Lucene.Net.Analysis.Standard
{
    /// <summary>
    /// Filters <see cref="ClassicTokenizer"/> with <see cref="ClassicFilter"/>,
    /// a LowerCaseFilter and a StopFilter, using a configurable stop word list.
    /// </summary>
    public sealed class ClassicAnalyzer : StopwordAnalyzerBase
    {
        /// <summary>Default maximum allowed token length.</summary>
        public const int DEFAULT_MAX_TOKEN_LENGTH = 255;

        private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;

        /// <summary>Stop words used by default: the English stop word set.</summary>
        public static readonly CharArraySet STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;

        /// <summary>Builds an analyzer with the given stop words.</summary>
        public ClassicAnalyzer(Version? matchVersion, CharArraySet stopWords)
            : base(matchVersion, stopWords)
        {
        }

        /// <summary>Builds an analyzer with the default stop words (<see cref="STOP_WORDS_SET"/>).</summary>
        public ClassicAnalyzer(Version? matchVersion)
            : this(matchVersion, STOP_WORDS_SET)
        {
        }

        /// <summary>Builds an analyzer with stop words loaded from the given reader.</summary>
        public ClassicAnalyzer(Version? matchVersion, TextReader stopwords)
            : this(matchVersion, LoadStopwordSet(stopwords, matchVersion))
        {
        }

        /// <summary>
        /// Maximum allowed token length; tokens longer than this are discarded
        /// by the tokenizer.
        /// </summary>
        public int MaxTokenLength
        {
            get { return maxTokenLength; }
            set { maxTokenLength = value; }
        }

        public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            ClassicTokenizer tokenizer = new ClassicTokenizer(matchVersion, reader);
            tokenizer.MaxTokenLength = maxTokenLength;

            TokenStream chain = new ClassicFilter(tokenizer);
            chain = new LowerCaseFilter(matchVersion, chain);
            chain = new StopFilter(matchVersion, chain, stopwords);

            return new ClassicTokenStreamComponents(this, tokenizer, chain);
        }

        // Components wrapper that re-applies the analyzer's current max token
        // length to the tokenizer whenever the reader is replaced.
        private sealed class ClassicTokenStreamComponents : TokenStreamComponents
        {
            private readonly ClassicAnalyzer owner;
            private readonly ClassicTokenizer tokenizer;

            public ClassicTokenStreamComponents(ClassicAnalyzer owner, ClassicTokenizer tokenizer, TokenStream sink)
                : base(tokenizer, sink)
            {
                this.owner = owner;
                this.tokenizer = tokenizer;
            }

            public override void SetReader(TextReader reader)
            {
                tokenizer.MaxTokenLength = owner.maxTokenLength;
                base.SetReader(reader);
            }
        }
    }
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/7a4b442f/src/contrib/Analyzers/Standard/ClassicFilter.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Standard/ClassicFilter.cs b/src/contrib/Analyzers/Standard/ClassicFilter.cs
new file mode 100644
index 0000000..eac2d3e
--- /dev/null
+++ b/src/contrib/Analyzers/Standard/ClassicFilter.cs
@@ -0,0 +1,59 @@
+using Lucene.Net.Analysis.Tokenattributes;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
namespace Lucene.Net.Analysis.Standard
{
    /// <summary>
    /// Normalizes tokens extracted with <see cref="ClassicTokenizer"/>:
    /// removes a trailing "'s" from APOSTROPHE-typed tokens and strips the
    /// dots from ACRONYM-typed tokens.
    /// </summary>
    public class ClassicFilter : TokenFilter
    {
        private static readonly String APOSTROPHE_TYPE = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.APOSTROPHE];
        private static readonly String ACRONYM_TYPE = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.ACRONYM];

        // This filter inspects the token type and rewrites the term buffer in place.
        private readonly ITypeAttribute typeAtt;
        private readonly ICharTermAttribute termAtt;

        /// <summary>Constructs a filter which normalizes tokens from the input stream.</summary>
        public ClassicFilter(TokenStream input)
            : base(input)
        {
            typeAtt = AddAttribute<ITypeAttribute>();
            termAtt = AddAttribute<ICharTermAttribute>();
        }

        public override bool IncrementToken()
        {
            if (!input.IncrementToken())
            {
                return false;
            }

            string tokenType = typeAtt.Type;
            char[] term = termAtt.Buffer;
            int length = termAtt.Length;

            if (tokenType == APOSTROPHE_TYPE
                && length >= 2
                && term[length - 2] == '\''
                && (term[length - 1] == 's' || term[length - 1] == 'S'))
            {
                // Trailing 's or 'S: strip the last two characters.
                termAtt.SetLength(length - 2);
            }
            else if (tokenType == ACRONYM_TYPE)
            {
                // Acronym: compact the buffer, dropping every '.'.
                int writePos = 0;
                for (int readPos = 0; readPos < length; readPos++)
                {
                    char c = term[readPos];
                    if (c != '.')
                    {
                        term[writePos++] = c;
                    }
                }
                termAtt.SetLength(writePos);
            }

            return true;
        }
    }
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/7a4b442f/src/contrib/Analyzers/Standard/ClassicFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Standard/ClassicFilterFactory.cs b/src/contrib/Analyzers/Standard/ClassicFilterFactory.cs
new file mode 100644
index 0000000..378004b
--- /dev/null
+++ b/src/contrib/Analyzers/Standard/ClassicFilterFactory.cs
@@ -0,0 +1,25 @@
+using Lucene.Net.Analysis.Util;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
namespace Lucene.Net.Analysis.Standard
{
    /// <summary>
    /// Factory for <see cref="ClassicFilter"/>. Accepts no arguments of its own.
    /// </summary>
    public class ClassicFilterFactory : TokenFilterFactory
    {
        /// <summary>Creates a new ClassicFilterFactory.</summary>
        /// <param name="args">Factory arguments; must be empty after base-class consumption.</param>
        /// <exception cref="ArgumentException">If any unconsumed arguments remain.</exception>
        public ClassicFilterFactory(IDictionary<String, String> args)
            : base(args)
        {
            if (args.Count > 0)
            {
                // In .NET, IDictionary.ToString() prints only the type name
                // (unlike Java's Map.toString()), so spell out the remaining
                // key/value pairs to make the message actionable.
                throw new ArgumentException("Unknown parameters: "
                    + string.Join(", ", args.Select(kvp => kvp.Key + "=" + kvp.Value)));
            }
        }

        public override TokenStream Create(TokenStream input)
        {
            return new ClassicFilter(input);
        }
    }
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/7a4b442f/src/contrib/Analyzers/Standard/ClassicTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Standard/ClassicTokenizer.cs b/src/contrib/Analyzers/Standard/ClassicTokenizer.cs
new file mode 100644
index 0000000..bad1c9e
--- /dev/null
+++ b/src/contrib/Analyzers/Standard/ClassicTokenizer.cs
@@ -0,0 +1,131 @@
+using Lucene.Net.Analysis.Tokenattributes;
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Text;
+using Version = Lucene.Net.Util.Version;
+
namespace Lucene.Net.Analysis.Standard
{
    /// <summary>
    /// A grammar-based tokenizer using the classic (pre-3.1 StandardTokenizer)
    /// grammar, driven by the generated <see cref="ClassicTokenizerImpl"/> scanner.
    /// </summary>
    public sealed class ClassicTokenizer : Tokenizer
    {
        // Token type ids produced by the scanner; indexes into TOKEN_TYPES.
        public const int ALPHANUM = 0;
        public const int APOSTROPHE = 1;
        public const int ACRONYM = 2;
        public const int COMPANY = 3;
        public const int EMAIL = 4;
        public const int HOST = 5;
        public const int NUM = 6;
        public const int CJ = 7;

        public const int ACRONYM_DEP = 8;

        /// <summary>String token type names, indexed by the constants above.</summary>
        public static readonly string[] TOKEN_TYPES = new string[] {
            "<ALPHANUM>",
            "<APOSTROPHE>",
            "<ACRONYM>",
            "<COMPANY>",
            "<EMAIL>",
            "<HOST>",
            "<NUM>",
            "<CJ>",
            "<ACRONYM_DEP>"
        };

        private IStandardTokenizerInterface scanner;

        private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;

        // This tokenizer generates four attributes:
        // term, offset, position increment and type.
        private readonly ICharTermAttribute termAtt;
        private readonly IOffsetAttribute offsetAtt;
        private readonly IPositionIncrementAttribute posIncrAtt;
        private readonly ITypeAttribute typeAtt;

        /// <summary>
        /// Maximum allowed token length; tokens longer than this are skipped.
        /// </summary>
        public int MaxTokenLength
        {
            get { return maxTokenLength; }
            set { maxTokenLength = value; }
        }

        public ClassicTokenizer(Version? matchVersion, TextReader input)
            : base(input)
        {
            termAtt = AddAttribute<ICharTermAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();
            posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
            typeAtt = AddAttribute<ITypeAttribute>();

            Init(matchVersion);
        }

        public ClassicTokenizer(Version? matchVersion, AttributeFactory factory, TextReader input)
            : base(factory, input)
        {
            termAtt = AddAttribute<ICharTermAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();
            posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
            typeAtt = AddAttribute<ITypeAttribute>();

            Init(matchVersion);
        }

        private void Init(Version? matchVersion)
        {
            // The scanner starts with a null reader: callers must invoke Reset
            // before tokenizing (a best-effort NPE otherwise).
            this.scanner = new ClassicTokenizerImpl(null);
        }

        public override bool IncrementToken()
        {
            ClearAttributes();
            int skippedPositions = 0;

            for (; ; )
            {
                int tokenType = scanner.GetNextToken();

                if (tokenType == StandardTokenizerInterface.YYEOF)
                {
                    return false;
                }

                if (scanner.YYLength > maxTokenLength)
                {
                    // Too-long term: skip it, but still count its position.
                    skippedPositions++;
                    continue;
                }

                posIncrAtt.PositionIncrement = skippedPositions + 1;
                scanner.GetText(termAtt);
                int startOffset = scanner.YYChar;
                offsetAtt.SetOffset(CorrectOffset(startOffset), CorrectOffset(startOffset + termAtt.Length));

                if (tokenType == ACRONYM_DEP)
                {
                    // Deprecated acronym form: report it as HOST and drop the extra '.'.
                    typeAtt.Type = TOKEN_TYPES[HOST];
                    termAtt.SetLength(termAtt.Length - 1);
                }
                else
                {
                    typeAtt.Type = TOKEN_TYPES[tokenType];
                }
                return true;
            }
        }

        public override void End()
        {
            // Set the final offset to just past the last scanned character.
            int finalOffset = CorrectOffset(scanner.YYChar + scanner.YYLength);
            offsetAtt.SetOffset(finalOffset, finalOffset);
        }

        public override void Reset()
        {
            scanner.YYReset(input);
        }
    }
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/7a4b442f/src/contrib/Analyzers/Standard/ClassicTokenizerFactory.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Standard/ClassicTokenizerFactory.cs b/src/contrib/Analyzers/Standard/ClassicTokenizerFactory.cs
new file mode 100644
index 0000000..2bcd775
--- /dev/null
+++ b/src/contrib/Analyzers/Standard/ClassicTokenizerFactory.cs
@@ -0,0 +1,31 @@
+using Lucene.Net.Analysis.Util;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
namespace Lucene.Net.Analysis.Standard
{
    /// <summary>
    /// Factory for <see cref="ClassicTokenizer"/>.
    /// Supports a single optional parameter, "maxTokenLength".
    /// </summary>
    public class ClassicTokenizerFactory : TokenizerFactory
    {
        // Maximum token length passed to every tokenizer this factory creates.
        private readonly int maxTokenLength;

        /// <summary>Creates a new ClassicTokenizerFactory.</summary>
        /// <param name="args">Factory arguments; "maxTokenLength" is consumed here.</param>
        /// <exception cref="ArgumentException">If any unconsumed arguments remain.</exception>
        public ClassicTokenizerFactory(IDictionary<String, String> args)
            : base(args)
        {
            AssureMatchVersion();
            maxTokenLength = GetInt(args, "maxTokenLength", StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
            if (args.Count > 0)
            {
                // In .NET, IDictionary.ToString() prints only the type name
                // (unlike Java's Map.toString()), so spell out the remaining
                // key/value pairs to make the message actionable.
                throw new ArgumentException("Unknown parameters: "
                    + string.Join(", ", args.Select(kvp => kvp.Key + "=" + kvp.Value)));
            }
        }

        public override Tokenizer Create(Net.Util.AttributeSource.AttributeFactory factory, System.IO.TextReader input)
        {
            ClassicTokenizer tokenizer = new ClassicTokenizer(luceneMatchVersion, factory, input);
            tokenizer.MaxTokenLength = maxTokenLength;
            return tokenizer;
        }
    }
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/7a4b442f/src/contrib/Analyzers/Standard/ClassicTokenizerImpl.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Standard/ClassicTokenizerImpl.cs b/src/contrib/Analyzers/Standard/ClassicTokenizerImpl.cs
new file mode 100644
index 0000000..9a096ac
--- /dev/null
+++ b/src/contrib/Analyzers/Standard/ClassicTokenizerImpl.cs
@@ -0,0 +1,657 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Analysis.Standard
+{
+ internal class ClassicTokenizerImpl : IStandardTokenizerInterface
+ {
+ /** This character denotes the end of file */
+ public const int YYEOF = -1;
+
+ /** initial size of the lookahead buffer */
+ private const int ZZ_BUFFERSIZE = 4096;
+
+ /** lexical states */
+ public const int YYINITIAL = 0;
+
+ /**
+ * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l
+ * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l
+ * at the beginning of a line
+ * l is of the form l = 2*k, k a non negative integer
+ */
+ private readonly int[] ZZ_LEXSTATE = {
+ 0, 0
+ };
+
+ /**
+ * Translates characters to character classes
+ */
+ private const string ZZ_CMAP_PACKED =
+ "\u0009\0\u0001\0\u0001\u000d\u0001\0\u0001\0\u0001\u000c\u0012\0\u0001\0\u0005\0\u0001\u0005" +
+ "\u0001\u0003\u0004\0\u0001\u0009\u0001\u0007\u0001\u0004\u0001\u0009\u000a\u0002\u0006\0\u0001\u0006\u001a\u000a" +
+ "\u0004\0\u0001\u0008\u0001\0\u001a\u000a\u002f\0\u0001\u000a\u000a\0\u0001\u000a\u0004\0\u0001\u000a" +
+ "\u0005\0\u0017\u000a\u0001\0\u001f\u000a\u0001\0\u0128\u000a\u0002\0\u0012\u000a\u001c\0\u005e\u000a" +
+ "\u0002\0\u0009\u000a\u0002\0\u0007\u000a\u000e\0\u0002\u000a\u000e\0\u0005\u000a\u0009\0\u0001\u000a" +
+ "\u008b\0\u0001\u000a\u000b\0\u0001\u000a\u0001\0\u0003\u000a\u0001\0\u0001\u000a\u0001\0\u0014\u000a" +
+ "\u0001\0\u002c\u000a\u0001\0\u0008\u000a\u0002\0\u001a\u000a\u000c\0\u0082\u000a\u000a\0\u0039\u000a" +
+ "\u0002\0\u0002\u000a\u0002\0\u0002\u000a\u0003\0\u0026\u000a\u0002\0\u0002\u000a\u0037\0\u0026\u000a" +
+ "\u0002\0\u0001\u000a\u0007\0\u0027\u000a\u0048\0\u001b\u000a\u0005\0\u0003\u000a\u002e\0\u001a\u000a" +
+ "\u0005\0\u000b\u000a\u0015\0\u000a\u0002\u0007\0\u0063\u000a\u0001\0\u0001\u000a\u000f\0\u0002\u000a" +
+ "\u0009\0\u000a\u0002\u0003\u000a\u0013\0\u0001\u000a\u0001\0\u001b\u000a\u0053\0\u0026\u000a\u015f\0" +
+ "\u0035\u000a\u0003\0\u0001\u000a\u0012\0\u0001\u000a\u0007\0\u000a\u000a\u0004\0\u000a\u0002\u0015\0" +
+ "\u0008\u000a\u0002\0\u0002\u000a\u0002\0\u0016\u000a\u0001\0\u0007\u000a\u0001\0\u0001\u000a\u0003\0" +
+ "\u0004\u000a\u0022\0\u0002\u000a\u0001\0\u0003\u000a\u0004\0\u000a\u0002\u0002\u000a\u0013\0\u0006\u000a" +
+ "\u0004\0\u0002\u000a\u0002\0\u0016\u000a\u0001\0\u0007\u000a\u0001\0\u0002\u000a\u0001\0\u0002\u000a" +
+ "\u0001\0\u0002\u000a\u001f\0\u0004\u000a\u0001\0\u0001\u000a\u0007\0\u000a\u0002\u0002\0\u0003\u000a" +
+ "\u0010\0\u0007\u000a\u0001\0\u0001\u000a\u0001\0\u0003\u000a\u0001\0\u0016\u000a\u0001\0\u0007\u000a" +
+ "\u0001\0\u0002\u000a\u0001\0\u0005\u000a\u0003\0\u0001\u000a\u0012\0\u0001\u000a\u000f\0\u0001\u000a" +
+ "\u0005\0\u000a\u0002\u0015\0\u0008\u000a\u0002\0\u0002\u000a\u0002\0\u0016\u000a\u0001\0\u0007\u000a" +
+ "\u0001\0\u0002\u000a\u0002\0\u0004\u000a\u0003\0\u0001\u000a\u001e\0\u0002\u000a\u0001\0\u0003\u000a" +
+ "\u0004\0\u000a\u0002\u0015\0\u0006\u000a\u0003\0\u0003\u000a\u0001\0\u0004\u000a\u0003\0\u0002\u000a" +
+ "\u0001\0\u0001\u000a\u0001\0\u0002\u000a\u0003\0\u0002\u000a\u0003\0\u0003\u000a\u0003\0\u0008\u000a" +
+ "\u0001\0\u0003\u000a\u002d\0\u0009\u0002\u0015\0\u0008\u000a\u0001\0\u0003\u000a\u0001\0\u0017\u000a" +
+ "\u0001\0\u000a\u000a\u0001\0\u0005\u000a\u0026\0\u0002\u000a\u0004\0\u000a\u0002\u0015\0\u0008\u000a" +
+ "\u0001\0\u0003\u000a\u0001\0\u0017\u000a\u0001\0\u000a\u000a\u0001\0\u0005\u000a\u0024\0\u0001\u000a" +
+ "\u0001\0\u0002\u000a\u0004\0\u000a\u0002\u0015\0\u0008\u000a\u0001\0\u0003\u000a\u0001\0\u0017\u000a" +
+ "\u0001\0\u0010\u000a\u0026\0\u0002\u000a\u0004\0\u000a\u0002\u0015\0\u0012\u000a\u0003\0\u0018\u000a" +
+ "\u0001\0\u0009\u000a\u0001\0\u0001\u000a\u0002\0\u0007\u000a\u0039\0\u0001\u0001\u0030\u000a\u0001\u0001" +
+ "\u0002\u000a\u000c\u0001\u0007\u000a\u0009\u0001\u000a\u0002\u0027\0\u0002\u000a\u0001\0\u0001\u000a\u0002\0" +
+ "\u0002\u000a\u0001\0\u0001\u000a\u0002\0\u0001\u000a\u0006\0\u0004\u000a\u0001\0\u0007\u000a\u0001\0" +
+ "\u0003\u000a\u0001\0\u0001\u000a\u0001\0\u0001\u000a\u0002\0\u0002\u000a\u0001\0\u0004\u000a\u0001\0" +
+ "\u0002\u000a\u0009\0\u0001\u000a\u0002\0\u0005\u000a\u0001\0\u0001\u000a\u0009\0\u000a\u0002\u0002\0" +
+ "\u0002\u000a\u0022\0\u0001\u000a\u001f\0\u000a\u0002\u0016\0\u0008\u000a\u0001\0\u0022\u000a\u001d\0" +
+ "\u0004\u000a\u0074\0\u0022\u000a\u0001\0\u0005\u000a\u0001\0\u0002\u000a\u0015\0\u000a\u0002\u0006\0" +
+ "\u0006\u000a\u004a\0\u0026\u000a\u000a\0\u0027\u000a\u0009\0\u005a\u000a\u0005\0\u0044\u000a\u0005\0" +
+ "\u0052\u000a\u0006\0\u0007\u000a\u0001\0\u003f\u000a\u0001\0\u0001\u000a\u0001\0\u0004\u000a\u0002\0" +
+ "\u0007\u000a\u0001\0\u0001\u000a\u0001\0\u0004\u000a\u0002\0\u0027\u000a\u0001\0\u0001\u000a\u0001\0" +
+ "\u0004\u000a\u0002\0\u001f\u000a\u0001\0\u0001\u000a\u0001\0\u0004\u000a\u0002\0\u0007\u000a\u0001\0" +
+ "\u0001\u000a\u0001\0\u0004\u000a\u0002\0\u0007\u000a\u0001\0\u0007\u000a\u0001\0\u0017\u000a\u0001\0" +
+ "\u001f\u000a\u0001\0\u0001\u000a\u0001\0\u0004\u000a\u0002\0\u0007\u000a\u0001\0\u0027\u000a\u0001\0" +
+ "\u0013\u000a\u000e\0\u0009\u0002\u002e\0\u0055\u000a\u000c\0\u026c\u000a\u0002\0\u0008\u000a\u000a\0" +
+ "\u001a\u000a\u0005\0\u004b\u000a\u0095\0\u0034\u000a\u002c\0\u000a\u0002\u0026\0\u000a\u0002\u0006\0" +
+ "\u0058\u000a\u0008\0\u0029\u000a\u0557\0\u009c\u000a\u0004\0\u005a\u000a\u0006\0\u0016\u000a\u0002\0" +
+ "\u0006\u000a\u0002\0\u0026\u000a\u0002\0\u0006\u000a\u0002\0\u0008\u000a\u0001\0\u0001\u000a\u0001\0" +
+ "\u0001\u000a\u0001\0\u0001\u000a\u0001\0\u001f\u000a\u0002\0\u0035\u000a\u0001\0\u0007\u000a\u0001\0" +
+ "\u0001\u000a\u0003\0\u0003\u000a\u0001\0\u0007\u000a\u0003\0\u0004\u000a\u0002\0\u0006\u000a\u0004\0" +
+ "\u000d\u000a\u0005\0\u0003\u000a\u0001\0\u0007\u000a\u0082\0\u0001\u000a\u0082\0\u0001\u000a\u0004\0" +
+ "\u0001\u000a\u0002\0\u000a\u000a\u0001\0\u0001\u000a\u0003\0\u0005\u000a\u0006\0\u0001\u000a\u0001\0" +
+ "\u0001\u000a\u0001\0\u0001\u000a\u0001\0\u0004\u000a\u0001\0\u0003\u000a\u0001\0\u0007\u000a\u0ecb\0" +
+ "\u0002\u000a\u002a\0\u0005\u000a\u000a\0\u0001\u000b\u0054\u000b\u0008\u000b\u0002\u000b\u0002\u000b\u005a\u000b" +
+ "\u0001\u000b\u0003\u000b\u0006\u000b\u0028\u000b\u0003\u000b\u0001\0\u005e\u000a\u0011\0\u0018\u000a\u0038\0" +
+ "\u0010\u000b\u0100\0\u0080\u000b\u0080\0\u19b6\u000b\u000a\u000b\u0040\0\u51a6\u000b\u005a\u000b\u048d\u000a" +
+ "\u0773\0\u2ba4\u000a\u215c\0\u012e\u000b\u00d2\u000b\u0007\u000a\u000c\0\u0005\u000a\u0005\0\u0001\u000a" +
+ "\u0001\0\u000a\u000a\u0001\0\u000d\u000a\u0001\0\u0005\u000a\u0001\0\u0001\u000a\u0001\0\u0002\u000a" +
+ "\u0001\0\u0002\u000a\u0001\0\u006c\u000a\u0021\0\u016b\u000a\u0012\0\u0040\u000a\u0002\0\u0036\u000a" +
+ "\u0028\0\u000c\u000a\u0074\0\u0003\u000a\u0001\0\u0001\u000a\u0001\0\u0087\u000a\u0013\0\u000a\u0002" +
+ "\u0007\0\u001a\u000a\u0006\0\u001a\u000a\u000a\0\u0001\u000b\u003a\u000b\u001f\u000a\u0003\0\u0006\u000a" +
+ "\u0002\0\u0006\u000a\u0002\0\u0006\u000a\u0002\0\u0003\u000a\u0023\0";
+
+ /**
+ * Translates characters to character classes
+ */
+ private static readonly char[] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED);
+
+ /**
+ * Translates DFA states to action switch labels.
+ */
+ private static readonly int[] ZZ_ACTION = zzUnpackAction();
+
+ private const String ZZ_ACTION_PACKED_0 =
+ "\u0001\0\u0001\u0001\u0003\u0002\u0001\u0003\u0001\u0001\u000b\0\u0001\u0002\u0003\u0004" +
+ "\u0002\0\u0001\u0005\u0001\0\u0001\u0005\u0003\u0004\u0006\u0005\u0001\u0006\u0001\u0004" +
+ "\u0002\u0007\u0001\u0008\u0001\0\u0001\u0008\u0003\0\u0002\u0008\u0001\u0009\u0001\u000a" +
+ "\u0001\u0004";
+
+ private static int[] zzUnpackAction()
+ {
+ int[] result = new int[51];
+ int offset = 0;
+ offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int zzUnpackAction(String packed, int offset, int[] result)
+ {
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.Length;
+ while (i < l)
+ {
+ int count = packed[i++];
+ int value = packed[i++];
+ do result[j++] = value; while (--count > 0);
+ }
+ return j;
+ }
+
+ /**
+ * Translates a state to a row index in the transition table
+ */
+ private static readonly int[] ZZ_ROWMAP = zzUnpackRowMap();
+
+ private const String ZZ_ROWMAP_PACKED_0 =
+ "\0\0\0\u000e\0\u001c\0\u002a\0\u0038\0\u000e\0\u0046\0\u0054" +
+ "\0\u0062\0\u0070\0\u007e\0\u008c\0\u009a\0\u00a8\0\u00b6\0\u00c4" +
+ "\0\u00d2\0\u00e0\0\u00ee\0\u00fc\0\u010a\0\u0118\0\u0126\0\u0134" +
+ "\0\u0142\0\u0150\0\u015e\0\u016c\0\u017a\0\u0188\0\u0196\0\u01a4" +
+ "\0\u01b2\0\u01c0\0\u01ce\0\u01dc\0\u01ea\0\u01f8\0\u00d2\0\u0206" +
+ "\0\u0214\0\u0222\0\u0230\0\u023e\0\u024c\0\u025a\0\u0054\0\u008c" +
+ "\0\u0268\0\u0276\0\u0284";
+
+ private static int[] zzUnpackRowMap()
+ {
+ int[] result = new int[51];
+ int offset = 0;
+ offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int zzUnpackRowMap(String packed, int offset, int[] result)
+ {
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.Length;
+ while (i < l)
+ {
+ int high = packed[i++] << 16;
+ result[j++] = high | packed[i++];
+ }
+ return j;
+ }
+
+ /**
+ * The transition table of the DFA
+ */
+ private static readonly int[] ZZ_TRANS = zzUnpackTrans();
+
+ private const String ZZ_TRANS_PACKED_0 =
+ "\u0001\u0002\u0001\u0003\u0001\u0004\u0007\u0002\u0001\u0005\u0001\u0006\u0001\u0007\u0001\u0002" +
+ "\u000f\0\u0002\u0003\u0001\0\u0001\u0008\u0001\0\u0001\u0009\u0002\u000a\u0001\u000b" +
+ "\u0001\u0003\u0004\0\u0001\u0003\u0001\u0004\u0001\0\u0001\u000c\u0001\0\u0001\u0009" +
+ "\u0002\u000d\u0001\u000e\u0001\u0004\u0004\0\u0001\u0003\u0001\u0004\u0001\u000f\u0001\u0010" +
+ "\u0001\u0011\u0001\u0012\u0002\u000a\u0001\u000b\u0001\u0013\u0010\0\u0001\u0002\u0001\0" +
+ "\u0001\u0014\u0001\u0015\u0007\0\u0001\u0016\u0004\0\u0002\u0017\u0007\0\u0001\u0017" +
+ "\u0004\0\u0001\u0018\u0001\u0019\u0007\0\u0001\u001a\u0005\0\u0001\u001b\u0007\0" +
+ "\u0001\u000b\u0004\0\u0001\u001c\u0001\u001d\u0007\0\u0001\u001e\u0004\0\u0001\u001f" +
+ "\u0001\u0020\u0007\0\u0001\u0021\u0004\0\u0001\u0022\u0001\u0023\u0007\0\u0001\u0024" +
+ "\u000d\0\u0001\u0025\u0004\0\u0001\u0014\u0001\u0015\u0007\0\u0001\u0026\u000d\0" +
+ "\u0001\u0027\u0004\0\u0002\u0017\u0007\0\u0001\u0028\u0004\0\u0001\u0003\u0001\u0004" +
+ "\u0001\u000f\u0001\u0008\u0001\u0011\u0001\u0012\u0002\u000a\u0001\u000b\u0001\u0013\u0004\0" +
+ "\u0002\u0014\u0001\0\u0001\u0029\u0001\0\u0001\u0009\u0002\u002a\u0001\0\u0001\u0014" +
+ "\u0004\0\u0001\u0014\u0001\u0015\u0001\0\u0001\u002b\u0001\0\u0001\u0009\u0002\u002c" +
+ "\u0001\u002d\u0001\u0015\u0004\0\u0001\u0014\u0001\u0015\u0001\0\u0001\u0029\u0001\0" +
+ "\u0001\u0009\u0002\u002a\u0001\0\u0001\u0016\u0004\0\u0002\u0017\u0001\0\u0001\u002e" +
+ "\u0002\0\u0001\u002e\u0002\0\u0001\u0017\u0004\0\u0002\u0018\u0001\0\u0001\u002a" +
+ "\u0001\0\u0001\u0009\u0002\u002a\u0001\0\u0001\u0018\u0004\0\u0001\u0018\u0001\u0019" +
+ "\u0001\0\u0001\u002c\u0001\0\u0001\u0009\u0002\u002c\u0001\u002d\u0001\u0019\u0004\0" +
+ "\u0001\u0018\u0001\u0019\u0001\0\u0001\u002a\u0001\0\u0001\u0009\u0002\u002a\u0001\0" +
+ "\u0001\u001a\u0005\0\u0001\u001b\u0001\0\u0001\u002d\u0002\0\u0003\u002d\u0001\u001b" +
+ "\u0004\0\u0002\u001c\u0001\0\u0001\u002f\u0001\0\u0001\u0009\u0002\u000a\u0001\u000b" +
+ "\u0001\u001c\u0004\0\u0001\u001c\u0001\u001d\u0001\0\u0001\u0030\u0001\0\u0001\u0009" +
+ "\u0002\u000d\u0001\u000e\u0001\u001d\u0004\0\u0001\u001c\u0001\u001d\u0001\0\u0001\u002f" +
+ "\u0001\0\u0001\u0009\u0002\u000a\u0001\u000b\u0001\u001e\u0004\0\u0002\u001f\u0001\0" +
+ "\u0001\u000a\u0001\0\u0001\u0009\u0002\u000a\u0001\u000b\u0001\u001f\u0004\0\u0001\u001f" +
+ "\u0001\u0020\u0001\0\u0001\u000d\u0001\0\u0001\u0009\u0002\u000d\u0001\u000e\u0001\u0020" +
+ "\u0004\0\u0001\u001f\u0001\u0020\u0001\0\u0001\u000a\u0001\0\u0001\u0009\u0002\u000a" +
+ "\u0001\u000b\u0001\u0021\u0004\0\u0002\u0022\u0001\0\u0001\u000b\u0002\0\u0003\u000b" +
+ "\u0001\u0022\u0004\0\u0001\u0022\u0001\u0023\u0001\0\u0001\u000e\u0002\0\u0003\u000e" +
+ "\u0001\u0023\u0004\0\u0001\u0022\u0001\u0023\u0001\0\u0001\u000b\u0002\0\u0003\u000b" +
+ "\u0001\u0024\u0006\0\u0001\u000f\u0006\0\u0001\u0025\u0004\0\u0001\u0014\u0001\u0015" +
+ "\u0001\0\u0001\u0031\u0001\0\u0001\u0009\u0002\u002a\u0001\0\u0001\u0016\u0004\0" +
+ "\u0002\u0017\u0001\0\u0001\u002e\u0002\0\u0001\u002e\u0002\0\u0001\u0028\u0004\0" +
+ "\u0002\u0014\u0007\0\u0001\u0014\u0004\0\u0002\u0018\u0007\0\u0001\u0018\u0004\0" +
+ "\u0002\u001c\u0007\0\u0001\u001c\u0004\0\u0002\u001f\u0007\0\u0001\u001f\u0004\0" +
+ "\u0002\u0022\u0007\0\u0001\u0022\u0004\0\u0002\u0032\u0007\0\u0001\u0032\u0004\0" +
+ "\u0002\u0014\u0007\0\u0001\u0033\u0004\0\u0002\u0032\u0001\0\u0001\u002e\u0002\0" +
+ "\u0001\u002e\u0002\0\u0001\u0032\u0004\0\u0002\u0014\u0001\0\u0001\u0031\u0001\0" +
+ "\u0001\u0009\u0002\u002a\u0001\0\u0001\u0014\u0003\0";
+
+ private static int[] zzUnpackTrans()
+ {
+ int[] result = new int[658];
+ int offset = 0;
+ offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int zzUnpackTrans(String packed, int offset, int[] result)
+ {
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.Length;
+ while (i < l)
+ {
+ int count = packed[i++];
+ int value = packed[i++];
+ value--;
+ do result[j++] = value; while (--count > 0);
+ }
+ return j;
+ }
+
+ /* error codes */
+ private const int ZZ_UNKNOWN_ERROR = 0;
+ private const int ZZ_NO_MATCH = 1;
+ private const int ZZ_PUSHBACK_2BIG = 2;
+
+ /* error messages for the codes above */
+ private static readonly String[] ZZ_ERROR_MSG = {
+ "Unkown internal scanner error",
+ "Error: could not match input",
+ "Error: pushback value was too large"
+ };
+
+ /**
+ * ZZ_ATTRIBUTE[aState] contains the attributes of state <code>aState</code>
+ */
+ private static readonly int[] ZZ_ATTRIBUTE = zzUnpackAttribute();
+
+ private const String ZZ_ATTRIBUTE_PACKED_0 =
+ "\u0001\0\u0001\u0009\u0003\u0001\u0001\u0009\u0001\u0001\u000b\0\u0004\u0001\u0002\0" +
+ "\u0001\u0001\u0001\0\u000f\u0001\u0001\0\u0001\u0001\u0003\0\u0005\u0001";
+
+ private static int[] zzUnpackAttribute()
+ {
+ int[] result = new int[51];
+ int offset = 0;
+ offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int zzUnpackAttribute(String packed, int offset, int[] result)
+ {
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.Length;
+ while (i < l)
+ {
+ int count = packed[i++];
+ int value = packed[i++];
+ do result[j++] = value; while (--count > 0);
+ }
+ return j;
+ }
+
+ /** the input device */
+ private TextReader zzReader;
+
+ /** the current state of the DFA */
+ private int zzState;
+
+ /** the current lexical state */
+ private int zzLexicalState = YYINITIAL;
+
+ /** this buffer contains the current text to be matched and is
+ the source of the yytext() string */
+ private char[] zzBuffer = new char[ZZ_BUFFERSIZE];
+
+ /** the textposition at the last accepting state */
+ private int zzMarkedPos;
+
+ /** the current text position in the buffer */
+ private int zzCurrentPos;
+
+ /** startRead marks the beginning of the yytext() string in the buffer */
+ private int zzStartRead;
+
+ /** endRead marks the last character in the buffer, that has been read
+ from input */
+ private int zzEndRead;
+
+ /** number of newlines encountered up to the start of the matched text */
+ private int yyline;
+
+ /** the number of characters up to the start of the matched text */
+ private int yychar;
+
+ /**
+ * the number of characters from the last newline up to the start of the
+ * matched text
+ */
+ private int yycolumn;
+
+ /**
+ * zzAtBOL == true <=> the scanner is currently at the beginning of a line
+ */
+ private bool zzAtBOL = true;
+
+ /** zzAtEOF == true <=> the scanner is at the EOF */
+ private bool zzAtEOF;
+
+ /** denotes if the user-EOF-code has already been executed */
+ private bool zzEOFDone;
+
+
+ /* user code: */
+
+ public const int ALPHANUM = StandardTokenizer.ALPHANUM;
+ public const int APOSTROPHE = StandardTokenizer.APOSTROPHE;
+ public const int ACRONYM = StandardTokenizer.ACRONYM;
+ public const int COMPANY = StandardTokenizer.COMPANY;
+ public const int EMAIL = StandardTokenizer.EMAIL;
+ public const int HOST = StandardTokenizer.HOST;
+ public const int NUM = StandardTokenizer.NUM;
+ public const int CJ = StandardTokenizer.CJ;
+ public const int ACRONYM_DEP = StandardTokenizer.ACRONYM_DEP;
+
+ public static readonly String[] TOKEN_TYPES = StandardTokenizer.TOKEN_TYPES;
+
+ public int YYChar
+ {
+ get { return yychar; }
+ }
+
+ public void GetText(Tokenattributes.ICharTermAttribute t)
+ {
+ t.CopyBuffer(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead);
+ }
+
+ /**
+ * Creates a new scanner
+ * There is also a java.io.InputStream version of this constructor.
+ *
+ * @param in the java.io.Reader to read input from.
+ */
+ internal ClassicTokenizerImpl(TextReader input)
+ {
+ this.zzReader = input;
+ }
+
+ private static char[] zzUnpackCMap(String packed)
+ {
+ char[] map = new char[0x10000];
+ int i = 0; /* index in packed string */
+ int j = 0; /* index in unpacked array */
+ while (i < 1154)
+ {
+ int count = packed[i++];
+ char value = packed[i++];
+ do map[j++] = value; while (--count > 0);
+ }
+ return map;
+ }
+
+ /**
+ * Refills the input buffer, compacting and growing it as needed.
+ *
+ * @return false iff there was new input; true at end of stream.
+ */
+ private bool zzRefill()
+ {
+
+ /* first: make room (if you can) */
+ if (zzStartRead > 0)
+ {
+ Array.Copy(zzBuffer, zzStartRead,
+ zzBuffer, 0,
+ zzEndRead - zzStartRead);
+
+ /* translate stored positions */
+ zzEndRead -= zzStartRead;
+ zzCurrentPos -= zzStartRead;
+ zzMarkedPos -= zzStartRead;
+ zzStartRead = 0;
+ }
+
+ /* is the buffer big enough? */
+ if (zzCurrentPos >= zzBuffer.Length)
+ {
+ /* if not: blow it up */
+ char[] newBuffer = new char[zzCurrentPos * 2];
+ Array.Copy(zzBuffer, 0, newBuffer, 0, zzBuffer.Length);
+ zzBuffer = newBuffer;
+ }
+
+ /* finally: fill the buffer with new input */
+ int numRead = zzReader.Read(zzBuffer, zzEndRead,
+ zzBuffer.Length - zzEndRead);
+
+ if (numRead > 0)
+ {
+ zzEndRead += numRead;
+ return false;
+ }
+ // unlikely but not impossible: read 0 characters, but not at end of stream
+ if (numRead == 0)
+ {
+ // probe one char to distinguish a short read from true end of stream
+ int c = zzReader.Read();
+ if (c <= 0)
+ {
+ return true;
+ }
+ else
+ {
+ zzBuffer[zzEndRead++] = (char)c;
+ return false;
+ }
+ }
+
+ // numRead < 0
+ return true;
+ }
+
+ /** Marks the scanner as at EOF, invalidates the buffer, and closes the reader. */
+ public void yyclose()
+ {
+ zzAtEOF = true; /* indicate end of file */
+ zzEndRead = zzStartRead; /* invalidate buffer */
+
+ if (zzReader == null)
+ return;
+
+ zzReader.Close();
+ }
+
+ /**
+ * Resets the scanner to read from a new reader: restores the initial
+ * lexical state, discards buffered input and position counters, and
+ * shrinks the buffer back to ZZ_BUFFERSIZE if it had grown.
+ */
+ public void YYReset(TextReader reader)
+ {
+ zzReader = reader;
+ zzAtBOL = true;
+ zzAtEOF = false;
+ zzEOFDone = false;
+ zzEndRead = zzStartRead = 0;
+ zzCurrentPos = zzMarkedPos = 0;
+ yyline = yychar = yycolumn = 0;
+ zzLexicalState = YYINITIAL;
+ if (zzBuffer.Length > ZZ_BUFFERSIZE)
+ zzBuffer = new char[ZZ_BUFFERSIZE];
+ }
+
+ /** Returns the current lexical state. */
+ public int yystate()
+ {
+ return zzLexicalState;
+ }
+
+ /** Enters the lexical state newState. */
+ public void yybegin(int newState)
+ {
+ zzLexicalState = newState;
+ }
+
+ /** Returns the text matched by the current regular expression. */
+ public String yytext()
+ {
+ int start = zzStartRead;
+ int length = zzMarkedPos - start;
+ return new String(zzBuffer, start, length);
+ }
+
+ /** Returns the character at position pos (0-based) within the matched text. */
+ public char yycharat(int pos)
+ {
+ return zzBuffer[zzStartRead + pos];
+ }
+
+ /** Length of the currently matched text. */
+ public int YYLength
+ {
+ get { return zzMarkedPos - zzStartRead; }
+ }
+
+ /**
+ * Reports an error that occurred while scanning; always throws,
+ * never returns normally. Falls back to the "unknown error" message
+ * if errorCode is out of range.
+ *
+ * @param errorCode index into ZZ_ERROR_MSG of the message to report.
+ */
+ private void zzScanError(int errorCode)
+ {
+ String message;
+ try
+ {
+ message = ZZ_ERROR_MSG[errorCode];
+ }
+ catch (IndexOutOfRangeException) // exception value unused; avoids CS0168 warning
+ {
+ message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];
+ }
+
+ throw new Exception(message);
+ }
+
+ /**
+ * Pushes the specified number of characters back into the input stream;
+ * they will be read again by the next call to GetNextToken.
+ *
+ * @param number characters to push back; must not exceed YYLength,
+ * otherwise a scan error is raised.
+ */
+ public void yypushback(int number)
+ {
+ if (number > YYLength)
+ zzScanError(ZZ_PUSHBACK_2BIG);
+
+ zzMarkedPos -= number;
+ }
+
+ /**
+ * Resumes scanning until the next token is matched, the end of input
+ * is reached, or a scanning error occurs (which throws).
+ *
+ * Drives the generated DFA: ZZ_CMAP translates input chars to classes,
+ * ZZ_ROWMAP/ZZ_TRANS encode transitions, and ZZ_ATTRIBUTE marks
+ * accepting states. Statement order here is load-bearing (cached
+ * positions are written back before zzRefill and re-read after).
+ *
+ * @return one of the token-type constants (ALPHANUM, NUM, ...), or
+ * YYEOF at the end of input.
+ */
+ public int GetNextToken()
+ {
+ int zzInput;
+ int zzAction;
+
+ // cached fields:
+ int zzCurrentPosL;
+ int zzMarkedPosL;
+ int zzEndReadL = zzEndRead;
+ char[] zzBufferL = zzBuffer;
+ char[] zzCMapL = ZZ_CMAP;
+
+ int[] zzTransL = ZZ_TRANS;
+ int[] zzRowMapL = ZZ_ROWMAP;
+ int[] zzAttrL = ZZ_ATTRIBUTE;
+
+ while (true)
+ {
+ zzMarkedPosL = zzMarkedPos;
+
+ // advance the running character offset past the previous match
+ yychar += zzMarkedPosL - zzStartRead;
+
+ zzAction = -1;
+
+ zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;
+
+ zzState = ZZ_LEXSTATE[zzLexicalState];
+
+ // set up zzAction for empty match case:
+ int zzAttributes = zzAttrL[zzState];
+ if ((zzAttributes & 1) == 1)
+ {
+ zzAction = zzState;
+ }
+
+
+ //zzForAction:
+ {
+ while (true)
+ {
+
+ if (zzCurrentPosL < zzEndReadL)
+ zzInput = zzBufferL[zzCurrentPosL++];
+ else if (zzAtEOF)
+ {
+ zzInput = YYEOF;
+ break;
+ }
+ else
+ {
+ // store back cached positions
+ zzCurrentPos = zzCurrentPosL;
+ zzMarkedPos = zzMarkedPosL;
+ bool eof = zzRefill();
+ // get translated positions and possibly new buffer
+ zzCurrentPosL = zzCurrentPos;
+ zzMarkedPosL = zzMarkedPos;
+ zzBufferL = zzBuffer;
+ zzEndReadL = zzEndRead;
+ if (eof)
+ {
+ zzInput = YYEOF;
+ break;
+ }
+ else
+ {
+ zzInput = zzBufferL[zzCurrentPosL++];
+ }
+ }
+ // DFA transition; -1 means no transition -> fall back to last accept
+ int zzNext = zzTransL[zzRowMapL[zzState] + zzCMapL[zzInput]];
+ if (zzNext == -1) break;
+ zzState = zzNext;
+
+ zzAttributes = zzAttrL[zzState];
+ if ((zzAttributes & 1) == 1)
+ {
+ zzAction = zzState;
+ zzMarkedPosL = zzCurrentPosL;
+ if ((zzAttributes & 8) == 8) break;
+ }
+
+ }
+ }
+
+ // store back cached position
+ zzMarkedPos = zzMarkedPosL;
+
+ // dispatch the action for the accepting state (cases 11..20 are the
+ // generated fall-through guards and are intentionally empty)
+ switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction])
+ {
+ case 1:
+ { /* Break so we don't hit fall-through warning: */
+ break;/* ignore */
+ }
+ case 11: break;
+ case 2:
+ {
+ return ALPHANUM;
+ }
+ case 12: break;
+ case 3:
+ {
+ return CJ;
+ }
+ case 13: break;
+ case 4:
+ {
+ return HOST;
+ }
+ case 14: break;
+ case 5:
+ {
+ return NUM;
+ }
+ case 15: break;
+ case 6:
+ {
+ return APOSTROPHE;
+ }
+ case 16: break;
+ case 7:
+ {
+ return COMPANY;
+ }
+ case 17: break;
+ case 8:
+ {
+ return ACRONYM_DEP;
+ }
+ case 18: break;
+ case 9:
+ {
+ return ACRONYM;
+ }
+ case 19: break;
+ case 10:
+ {
+ return EMAIL;
+ }
+ case 20: break;
+ default:
+ if (zzInput == YYEOF && zzStartRead == zzCurrentPos)
+ {
+ zzAtEOF = true;
+ return YYEOF;
+ }
+ else
+ {
+ zzScanError(ZZ_NO_MATCH);
+ }
+ break;
+ }
+ }
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/7a4b442f/src/contrib/Analyzers/Standard/IStandardTokenizerInterface.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Standard/IStandardTokenizerInterface.cs b/src/contrib/Analyzers/Standard/IStandardTokenizerInterface.cs
new file mode 100644
index 0000000..883e7a0
--- /dev/null
+++ b/src/contrib/Analyzers/Standard/IStandardTokenizerInterface.cs
@@ -0,0 +1,27 @@
+using Lucene.Net.Analysis.Tokenattributes;
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Analysis.Standard
+{
+ /// <summary>
+ /// Common interface for the generated scanner implementations selected by
+ /// StandardTokenizer (ClassicTokenizerImpl, StandardTokenizerImpl, and the
+ /// versioned Std31/Std34 variants).
+ /// </summary>
+ public interface IStandardTokenizerInterface
+ {
+ /// <summary>Copies the matched text into the given term attribute.</summary>
+ void GetText(ICharTermAttribute t);
+
+ /// <summary>Character offset of the current match from the start of input.</summary>
+ int YYChar { get; }
+
+ /// <summary>Resets the scanner to read from a new reader.</summary>
+ void YYReset(TextReader reader);
+
+ /// <summary>Length of the matched text.</summary>
+ int YYLength { get; }
+
+ /// <summary>Returns the next token type, or StandardTokenizerInterface.YYEOF.</summary>
+ int GetNextToken();
+ }
+
+ /// <summary>Holds the sentinel shared by all scanner implementations.</summary>
+ public static class StandardTokenizerInterface
+ {
+ /// <summary>Sentinel returned by GetNextToken at end of input.</summary>
+ public const int YYEOF = -1;
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/7a4b442f/src/contrib/Analyzers/Standard/StandardAnalyzer.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Standard/StandardAnalyzer.cs b/src/contrib/Analyzers/Standard/StandardAnalyzer.cs
new file mode 100644
index 0000000..dead459
--- /dev/null
+++ b/src/contrib/Analyzers/Standard/StandardAnalyzer.cs
@@ -0,0 +1,70 @@
+using Lucene.Net.Analysis.Core;
+using Lucene.Net.Analysis.Util;
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Text;
+using Version = Lucene.Net.Util.Version;
+
+namespace Lucene.Net.Analysis.Standard
+{
+ /// <summary>
+ /// Analyzer built from StandardTokenizer, StandardFilter, LowerCaseFilter
+ /// and StopFilter, using a configurable list of stop words.
+ /// </summary>
+ public sealed class StandardAnalyzer : StopwordAnalyzerBase
+ {
+ /// <summary>Default maximum allowed token length.</summary>
+ public const int DEFAULT_MAX_TOKEN_LENGTH = 255;
+
+ // tokens longer than this are silently skipped by the tokenizer
+ private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
+
+ /// <summary>Stop words used by default (English).</summary>
+ public static readonly CharArraySet STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
+
+ /// <summary>Builds an analyzer with the given stop words.</summary>
+ public StandardAnalyzer(Version? matchVersion, CharArraySet stopWords)
+ : base(matchVersion, stopWords)
+ {
+ }
+
+ /// <summary>Builds an analyzer with the default English stop words.</summary>
+ public StandardAnalyzer(Version? matchVersion)
+ : this(matchVersion, STOP_WORDS_SET)
+ {
+ }
+
+ /// <summary>Builds an analyzer with stop words loaded from the given reader.</summary>
+ public StandardAnalyzer(Version? matchVersion, TextReader stopwords)
+ : this(matchVersion, LoadStopwordSet(stopwords, matchVersion))
+ {
+ }
+
+ /// <summary>
+ /// Maximum token length; longer tokens are discarded by the tokenizer.
+ /// Takes effect for the next TokenStream obtained from this analyzer.
+ /// </summary>
+ public int MaxTokenLength
+ {
+ get { return maxTokenLength; }
+ set { maxTokenLength = value; }
+ }
+
+ public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
+ {
+ StandardTokenizer src = new StandardTokenizer(matchVersion, reader);
+ src.MaxTokenLength = maxTokenLength;
+ TokenStream tok = new StandardFilter(matchVersion, src);
+ tok = new LowerCaseFilter(matchVersion, tok);
+ tok = new StopFilter(matchVersion, tok, stopwords);
+ return new AnonymousTokenStreamComponents(this, src, tok);
+ }
+
+ /// <summary>
+ /// Components wrapper that re-applies the analyzer's current
+ /// MaxTokenLength to the cached tokenizer whenever the reader is reset.
+ /// </summary>
+ private sealed class AnonymousTokenStreamComponents : TokenStreamComponents
+ {
+ private readonly StandardTokenizer src;
+ private readonly StandardAnalyzer parent;
+
+ public AnonymousTokenStreamComponents(StandardAnalyzer parent, StandardTokenizer src, TokenStream tok)
+ : base(src, tok)
+ {
+ this.parent = parent;
+ this.src = src;
+ }
+
+ public override void SetReader(TextReader reader)
+ {
+ // pick up any change to parent.MaxTokenLength made since construction
+ src.MaxTokenLength = parent.maxTokenLength;
+ base.SetReader(reader);
+ }
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/7a4b442f/src/contrib/Analyzers/Standard/StandardFilter.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Standard/StandardFilter.cs b/src/contrib/Analyzers/Standard/StandardFilter.cs
new file mode 100644
index 0000000..9381883
--- /dev/null
+++ b/src/contrib/Analyzers/Standard/StandardFilter.cs
@@ -0,0 +1,73 @@
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using Version = Lucene.Net.Util.Version;
+
+namespace Lucene.Net.Analysis.Standard
+{
+ /// <summary>
+ /// Normalizes tokens extracted with StandardTokenizer. For pre-3.1
+ /// match versions it applies the classic normalizations (strip trailing
+ /// 's, remove dots from acronyms); for 3.1+ it is a no-op pass-through.
+ /// </summary>
+ public class StandardFilter : TokenFilter
+ {
+ private readonly Version? matchVersion;
+
+ public StandardFilter(Version? matchVersion, TokenStream input)
+ : base(input)
+ {
+ this.matchVersion = matchVersion;
+
+ typeAtt = AddAttribute<ITypeAttribute>();
+ termAtt = AddAttribute<ICharTermAttribute>();
+ }
+
+ // type strings produced by the classic grammar that trigger normalization
+ private static readonly String APOSTROPHE_TYPE = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.APOSTROPHE];
+ private static readonly String ACRONYM_TYPE = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.ACRONYM];
+
+ // this filters uses attribute type
+ private readonly ITypeAttribute typeAtt; // = addAttribute(TypeAttribute.class);
+ private readonly ICharTermAttribute termAtt; // = addAttribute(CharTermAttribute.class);
+
+ public override bool IncrementToken()
+ {
+ if (matchVersion.GetValueOrDefault().OnOrAfter(Version.LUCENE_31))
+ return input.IncrementToken(); // TODO: add some niceties for the new grammar
+ else
+ return IncrementTokenClassic();
+ }
+
+ /// <summary>
+ /// Classic (pre-3.1) normalization: strips a trailing 's from
+ /// APOSTROPHE tokens and removes dots from ACRONYM tokens, editing
+ /// the term buffer in place.
+ /// </summary>
+ public bool IncrementTokenClassic()
+ {
+ if (!input.IncrementToken())
+ {
+ return false;
+ }
+
+ char[] buffer = termAtt.Buffer;
+ int bufferLength = termAtt.Length;
+ String type = typeAtt.Type;
+
+ if (type == APOSTROPHE_TYPE && // remove 's
+ bufferLength >= 2 &&
+ buffer[bufferLength - 2] == '\'' &&
+ (buffer[bufferLength - 1] == 's' || buffer[bufferLength - 1] == 'S'))
+ {
+ // Strip last 2 characters off
+ termAtt.SetLength(bufferLength - 2);
+ }
+ else if (type == ACRONYM_TYPE)
+ { // remove dots
+ // compact the buffer in place, skipping '.' characters
+ int upto = 0;
+ for (int i = 0; i < bufferLength; i++)
+ {
+ char c = buffer[i];
+ if (c != '.')
+ buffer[upto++] = c;
+ }
+ termAtt.SetLength(upto);
+ }
+
+ return true;
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/7a4b442f/src/contrib/Analyzers/Standard/StandardFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Standard/StandardFilterFactory.cs b/src/contrib/Analyzers/Standard/StandardFilterFactory.cs
new file mode 100644
index 0000000..447b5e3
--- /dev/null
+++ b/src/contrib/Analyzers/Standard/StandardFilterFactory.cs
@@ -0,0 +1,26 @@
+using Lucene.Net.Analysis.Util;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Analysis.Standard
+{
+ /// <summary>
+ /// Factory for <see cref="StandardFilter"/>. Accepts no parameters beyond
+ /// those consumed by the base factory.
+ /// </summary>
+ public class StandardFilterFactory : TokenFilterFactory
+ {
+ public StandardFilterFactory(IDictionary<String, String> args)
+ : base(args)
+ {
+ AssureMatchVersion();
+ if (args.Count > 0)
+ {
+ // Dictionary has no useful ToString() in .NET (unlike Java's Map),
+ // so list the offending parameter names explicitly.
+ throw new ArgumentException("Unknown parameters: " + string.Join(", ", args.Keys));
+ }
+ }
+
+ /// <summary>Wraps the input stream in a <see cref="StandardFilter"/>.</summary>
+ public override TokenStream Create(TokenStream input)
+ {
+ return new StandardFilter(luceneMatchVersion, input);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/7a4b442f/src/contrib/Analyzers/Standard/StandardTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Standard/StandardTokenizer.cs b/src/contrib/Analyzers/Standard/StandardTokenizer.cs
new file mode 100644
index 0000000..4c3d375
--- /dev/null
+++ b/src/contrib/Analyzers/Standard/StandardTokenizer.cs
@@ -0,0 +1,167 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Text;
+using Lucene.Net.Util;
+using Version = Lucene.Net.Util.Version;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Analysis.Standard.Std31;
+using Lucene.Net.Analysis.Standard.Std34;
+
+namespace Lucene.Net.Analysis.Standard
+{
+ /// <summary>
+ /// Grammar-based tokenizer. Delegates to a version-specific generated
+ /// scanner (selected in Init from matchVersion) and exposes term, offset,
+ /// position-increment and type attributes.
+ /// </summary>
+ public sealed class StandardTokenizer : Tokenizer
+ {
+ // version-specific generated scanner; assigned in Init
+ private IStandardTokenizerInterface scanner;
+
+ // token-type ids; obsolete ones are only produced by the classic grammar
+ public const int ALPHANUM = 0;
+ [Obsolete]
+ public const int APOSTROPHE = 1;
+ [Obsolete]
+ public const int ACRONYM = 2;
+ [Obsolete]
+ public const int COMPANY = 3;
+ public const int EMAIL = 4;
+ [Obsolete]
+ public const int HOST = 5;
+ public const int NUM = 6;
+ [Obsolete]
+ public const int CJ = 7;
+ [Obsolete]
+ public const int ACRONYM_DEP = 8;
+ public const int SOUTHEAST_ASIAN = 9;
+ public const int IDEOGRAPHIC = 10;
+ public const int HIRAGANA = 11;
+ public const int KATAKANA = 12;
+ public const int HANGUL = 13;
+
+ /// <summary>String token types, indexed by the constants above.</summary>
+ public static readonly string[] TOKEN_TYPES = new string[] {
+ "<ALPHANUM>",
+ "<APOSTROPHE>",
+ "<ACRONYM>",
+ "<COMPANY>",
+ "<EMAIL>",
+ "<HOST>",
+ "<NUM>",
+ "<CJ>",
+ "<ACRONYM_DEP>",
+ "<SOUTHEAST_ASIAN>",
+ "<IDEOGRAPHIC>",
+ "<HIRAGANA>",
+ "<KATAKANA>",
+ "<HANGUL>"
+ };
+
+ private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
+
+ /// <summary>
+ /// Maximum token length; tokens longer than this are skipped (their
+ /// position increment is still counted — see IncrementToken).
+ /// </summary>
+ public int MaxTokenLength
+ {
+ get { return maxTokenLength; }
+ set { maxTokenLength = value; }
+ }
+
+ /// <summary>Creates a tokenizer for the given match version and input.</summary>
+ public StandardTokenizer(Version? matchVersion, TextReader input)
+ : base(input)
+ {
+ termAtt = AddAttribute<ICharTermAttribute>();
+ offsetAtt = AddAttribute<IOffsetAttribute>();
+ posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
+ typeAtt = AddAttribute<ITypeAttribute>();
+
+ Init(matchVersion.GetValueOrDefault());
+ }
+
+ /// <summary>Creates a tokenizer using the given attribute factory.</summary>
+ public StandardTokenizer(Version? matchVersion, AttributeFactory factory, TextReader input)
+ : base(factory, input)
+ {
+ termAtt = AddAttribute<ICharTermAttribute>();
+ offsetAtt = AddAttribute<IOffsetAttribute>();
+ posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
+ typeAtt = AddAttribute<ITypeAttribute>();
+
+ Init(matchVersion.GetValueOrDefault());
+ }
+
+ // Selects the generated scanner matching the requested compatibility
+ // version. Passing null is deliberate: the reader is supplied later
+ // via Reset -> scanner.YYReset(input).
+ private void Init(Version matchVersion)
+ {
+ // best effort NPE if you dont call reset
+ if (matchVersion.OnOrAfter(Version.LUCENE_40))
+ {
+ this.scanner = new StandardTokenizerImpl(null);
+ }
+ else if (matchVersion.OnOrAfter(Version.LUCENE_34))
+ {
+ this.scanner = new StandardTokenizerImpl34(null);
+ }
+ else if (matchVersion.OnOrAfter(Version.LUCENE_31))
+ {
+ this.scanner = new StandardTokenizerImpl31(null);
+ }
+ else
+ {
+ this.scanner = new ClassicTokenizerImpl(null);
+ }
+ }
+
+ // this tokenizer generates three attributes:
+ // term offset, positionIncrement and type
+ private readonly ICharTermAttribute termAtt; // = addAttribute(CharTermAttribute.class);
+ private readonly IOffsetAttribute offsetAtt; // = addAttribute(OffsetAttribute.class);
+ private readonly IPositionIncrementAttribute posIncrAtt; // = addAttribute(PositionIncrementAttribute.class);
+ private readonly ITypeAttribute typeAtt; // = addAttribute(TypeAttribute.class);
+
+ /// <summary>
+ /// Advances to the next token. Tokens longer than MaxTokenLength are
+ /// skipped, but each skipped token bumps the position increment of the
+ /// next emitted token.
+ /// </summary>
+ public override bool IncrementToken()
+ {
+ ClearAttributes();
+ int posIncr = 1;
+
+ while (true)
+ {
+ int tokenType = scanner.GetNextToken();
+
+ if (tokenType == StandardTokenizerInterface.YYEOF)
+ {
+ return false;
+ }
+
+ if (scanner.YYLength <= maxTokenLength)
+ {
+ posIncrAtt.PositionIncrement = posIncr;
+ scanner.GetText(termAtt);
+ int start = scanner.YYChar;
+ offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + termAtt.Length));
+ // This 'if' should be removed in the next release. For now, it converts
+ // invalid acronyms to HOST. When removed, only the 'else' part should
+ // remain.
+ if (tokenType == StandardTokenizer.ACRONYM_DEP)
+ {
+ typeAtt.Type = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HOST];
+ termAtt.SetLength(termAtt.Length - 1); // remove extra '.'
+ }
+ else
+ {
+ typeAtt.Type = StandardTokenizer.TOKEN_TYPES[tokenType];
+ }
+ return true;
+ }
+ else
+ // When we skip a too-long term, we still increment the
+ // position increment
+ posIncr++;
+ }
+ }
+
+ // NOTE(review): does not call base.End(); confirm against Tokenizer's
+ // contract whether the base implementation must run here.
+ public override void End()
+ {
+ // set final offset
+ int finalOffset = CorrectOffset(scanner.YYChar + scanner.YYLength);
+ offsetAtt.SetOffset(finalOffset, finalOffset);
+ }
+
+ /// <summary>Resets the underlying scanner to read from the current input.</summary>
+ public override void Reset()
+ {
+ scanner.YYReset(input);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/7a4b442f/src/contrib/Analyzers/Standard/StandardTokenizerFactory.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Standard/StandardTokenizerFactory.cs b/src/contrib/Analyzers/Standard/StandardTokenizerFactory.cs
new file mode 100644
index 0000000..bfc64ca
--- /dev/null
+++ b/src/contrib/Analyzers/Standard/StandardTokenizerFactory.cs
@@ -0,0 +1,31 @@
+using Lucene.Net.Analysis.Util;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Analysis.Standard
+{
+ /// <summary>
+ /// Factory for <see cref="StandardTokenizer"/>. Supports an optional
+ /// "maxTokenLength" parameter (default StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH).
+ /// </summary>
+ public class StandardTokenizerFactory : TokenizerFactory
+ {
+ private readonly int maxTokenLength;
+
+ public StandardTokenizerFactory(IDictionary<String, String> args)
+ : base(args)
+ {
+ AssureMatchVersion();
+ maxTokenLength = GetInt(args, "maxTokenLength", StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
+ if (args.Count > 0)
+ {
+ // Dictionary has no useful ToString() in .NET (unlike Java's Map),
+ // so list the offending parameter names explicitly.
+ throw new ArgumentException("Unknown parameters: " + string.Join(", ", args.Keys));
+ }
+ }
+
+ /// <summary>Creates a StandardTokenizer configured with maxTokenLength.</summary>
+ public override Tokenizer Create(Net.Util.AttributeSource.AttributeFactory factory, System.IO.TextReader input)
+ {
+ StandardTokenizer tokenizer = new StandardTokenizer(luceneMatchVersion, factory, input);
+ tokenizer.MaxTokenLength = maxTokenLength;
+ return tokenizer;
+ }
+ }
+}