You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by ar...@apache.org on 2008/06/25 04:53:12 UTC
svn commit: r671406 [1/3] - in
/incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis: ./ Standard/
Author: aroush
Date: Tue Jun 24 19:53:11 2008
New Revision: 671406
URL: http://svn.apache.org/viewvc?rev=671406&view=rev
Log:
Release: Apache Lucene.Net.2.3.1 build 001 "Alpha"
Added:
incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/CachingTokenFilter.cs
incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/CharArraySet.cs
incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/SinkTokenizer.cs
incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/StandardTokenizerImpl.cs
incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/StandardTokenizerImpl.jflex
incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/TeeTokenFilter.cs
Modified:
incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Analyzer.cs
incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/CharTokenizer.cs
incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/ISOLatin1AccentFilter.cs
incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/KeywordAnalyzer.cs
incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/KeywordTokenizer.cs
incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/LengthFilter.cs
incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/LowerCaseFilter.cs
incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Package.html
incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/PerFieldAnalyzerWrapper.cs
incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/PorterStemFilter.cs
incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/PorterStemmer.cs
incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/SimpleAnalyzer.cs
incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/CharStream.cs
incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/FastCharStream.cs
incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/Package.html
incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/ParseException.cs
incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/StandardAnalyzer.cs
incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/StandardFilter.cs
incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/StandardTokenizer.cs
incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/StandardTokenizer.jj
incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/StandardTokenizerConstants.cs
incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/StandardTokenizerTokenManager.cs
incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/Token.cs
incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/TokenMgrError.cs
incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/StopAnalyzer.cs
incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/StopFilter.cs
incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Token.cs
incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/TokenFilter.cs
incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/TokenStream.cs
incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Tokenizer.cs
incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/WhitespaceAnalyzer.cs
incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/WordlistLoader.cs
Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Analyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/Analyzer.cs?rev=671406&r1=671405&r2=671406&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Analyzer.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Analyzer.cs Tue Jun 24 19:53:11 2008
@@ -39,9 +39,41 @@
/// field name for backward compatibility.
/// </summary>
public abstract TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader);
-
-
- /// <summary> Invoked before indexing a Fieldable instance if
+
+ /// <summary>Creates a TokenStream that is allowed to be re-used
+ /// from the previous time that the same thread called
+ /// this method. Callers that do not need to use more
+ /// than one TokenStream at the same time from this
+ /// analyzer should use this method for better
+ /// performance.
+ /// </summary>
+ public virtual TokenStream ReusableTokenStream(System.String fieldName, System.IO.TextReader reader)
+ {
+ return TokenStream(fieldName, reader);
+ }
+
+ private System.LocalDataStoreSlot tokenStreams = System.Threading.Thread.AllocateDataSlot();
+
+ /// <summary>Used by Analyzers that implement reusableTokenStream
+ /// to retrieve previously saved TokenStreams for re-use
+ /// by the same thread.
+ /// </summary>
+ protected internal virtual System.Object GetPreviousTokenStream()
+ {
+ return System.Threading.Thread.GetData(tokenStreams);
+ }
+
+ /// <summary>Used by Analyzers that implement reusableTokenStream
+ /// to save a TokenStream for later re-use by the same
+ /// thread.
+ /// </summary>
+ protected internal virtual void SetPreviousTokenStream(System.Object obj)
+ {
+ System.Threading.Thread.SetData(tokenStreams, obj);
+ }
+
+
+ /// <summary> Invoked before indexing a Fieldable instance if
/// terms have already been added to that field. This allows custom
/// analyzers to place an automatic position increment gap between
/// Fieldable instances using the same field name. The default value
Added: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/CachingTokenFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/CachingTokenFilter.cs?rev=671406&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/CachingTokenFilter.cs (added)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/CachingTokenFilter.cs Tue Jun 24 19:53:11 2008
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+
+namespace Lucene.Net.Analysis
+{
+
+ /// <summary> This class can be used if the Tokens of a TokenStream
+ /// are intended to be consumed more than once. It caches
+ /// all Tokens locally in a List.
+ ///
+ /// CachingTokenFilter implements the optional method
+ /// {@link TokenStream#Reset()}, which repositions the
+ /// stream to the first Token.
+ ///
+ /// </summary>
+ public class CachingTokenFilter : TokenFilter
+ {
+ private System.Collections.IList cache;
+ private System.Collections.IEnumerator iterator;
+
+ public CachingTokenFilter(TokenStream input) : base(input)
+ {
+ }
+
+ public override Token Next()
+ {
+ if (cache == null)
+ {
+ // fill cache lazily
+ cache = new System.Collections.ArrayList();
+ FillCache();
+ iterator = cache.GetEnumerator();
+ }
+
+ if (!iterator.MoveNext())
+ {
+ // the cache is exhausted, return null
+ return null;
+ }
+
+ return (Token) iterator.Current;
+ }
+
+ public override void Reset()
+ {
+ if (cache != null)
+ {
+ iterator = cache.GetEnumerator();
+ }
+ }
+
+ private void FillCache()
+ {
+ Token token;
+ while ((token = input.Next()) != null)
+ {
+ cache.Add(token);
+ }
+ }
+ }
+}
\ No newline at end of file
Added: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/CharArraySet.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/CharArraySet.cs?rev=671406&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/CharArraySet.cs (added)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/CharArraySet.cs Tue Jun 24 19:53:11 2008
@@ -0,0 +1,396 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+
+namespace Lucene.Net.Analysis
+{
+
+
+ /// <summary> A simple class that stores Strings as char[]'s in a
+ /// hash table. Note that this is not a general purpose
+ /// class. For example, it cannot remove items from the
+ /// set, nor does it resize its hash table to be smaller,
+ /// etc. It is designed to be quick to test if a char[]
+ /// is in the set without the necessity of converting it
+ /// to a String first.
+ /// </summary>
+
+ public class CharArraySet : System.Collections.Hashtable
+ {
+
+ private const int INIT_SIZE = 8;
+ private char[][] entries;
+ private int count;
+ private bool ignoreCase;
+
+ /// <summary>Create set with enough capacity to hold startSize
+ /// terms
+ /// </summary>
+ public CharArraySet(int startSize, bool ignoreCase)
+ {
+ this.ignoreCase = ignoreCase;
+ int size = INIT_SIZE;
+ while (startSize + (startSize >> 2) > size)
+ size <<= 1;
+ entries = new char[size][];
+ }
+
+ /// <summary>Create set from a Collection of char[] or String </summary>
+ public CharArraySet(System.Collections.ICollection c, bool ignoreCase) : this(c.Count, ignoreCase)
+ {
+ System.Collections.IEnumerator e = c.GetEnumerator();
+ while (e.MoveNext())
+ {
+ Add(e.Current);
+ }
+ }
+
+ /// <summary>true if the <code>len</code> chars of <code>text</code> starting at <code>off</code>
+ /// are in the set
+ /// </summary>
+ public virtual bool Contains(char[] text, int off, int len)
+ {
+ return entries[GetSlot(text, off, len)] != null;
+ }
+
+ // {{Doug-2.3.1}}: commented to determine if used internally to library
+ // /// <summary>true if the <code>CharSequence</code> is in the set </summary>
+ // public virtual bool Contains(CharSequence cs)
+ // {
+ // return entries[GetSlot(cs)] != null;
+ // }
+
+ private int GetSlot(char[] text, int off, int len)
+ {
+ int code = GetHashCode(text, off, len);
+ int pos = code & (entries.Length - 1);
+ char[] text2 = entries[pos];
+ if (text2 != null && !Equals(text, off, len, text2))
+ {
+ int inc = ((code >> 8) + code) | 1;
+ do
+ {
+ code += inc;
+ pos = code & (entries.Length - 1);
+ text2 = entries[pos];
+ }
+ while (text2 != null && !Equals(text, off, len, text2));
+ }
+ return pos;
+ }
+
+ // {{Doug-2.3.1}}: commented to determine if used internally to library
+ // /// <summary>Returns true if the String is in the set </summary>
+ // private int GetSlot(CharSequence text)
+ // {
+ // int code = GetHashCode(text);
+ // int pos = code & (entries.Length - 1);
+ // char[] text2 = entries[pos];
+ // if (text2 != null && !Equals(text, text2))
+ // {
+ // int inc = ((code >> 8) + code) | 1;
+ // do
+ // {
+ // code += inc;
+ // pos = code & (entries.Length - 1);
+ // text2 = entries[pos];
+ // }
+ // while (text2 != null && !Equals(text, text2));
+ // }
+ // return pos;
+ // }
+
+ // {{Doug-2.3.1}}: commented to determine if used internally to library
+ // /// <summary>Add this CharSequence into the set </summary>
+ // public virtual bool Add(CharSequence text)
+ // {
+ // return Add(text.toString()); // could be more efficient
+ // }
+
+ /// <summary>Add this String into the set </summary>
+ public virtual bool Add(System.String text)
+ {
+ return Add(text.ToCharArray());
+ }
+
+ /// <summary>Add this char[] directly to the set.
+ /// If ignoreCase is true for this Set, the text array will be directly modified.
+ /// The user should never modify this text array after calling this method.
+ /// </summary>
+ public virtual bool Add(char[] text)
+ {
+ if (ignoreCase)
+ for (int i = 0; i < text.Length; i++)
+ text[i] = System.Char.ToLower(text[i]);
+ int slot = GetSlot(text, 0, text.Length);
+ if (entries[slot] != null)
+ return false;
+ entries[slot] = text;
+ count++;
+
+ if (count + (count >> 2) > entries.Length)
+ {
+ Rehash();
+ }
+
+ return true;
+ }
+
+ private bool Equals(char[] text1, int off, int len, char[] text2)
+ {
+ if (len != text2.Length)
+ return false;
+ if (ignoreCase)
+ {
+ for (int i = 0; i < len; i++)
+ {
+ if (System.Char.ToLower(text1[off + i]) != text2[i])
+ return false;
+ }
+ }
+ else
+ {
+ for (int i = 0; i < len; i++)
+ {
+ if (text1[off + i] != text2[i])
+ return false;
+ }
+ }
+ return true;
+ }
+
+ // {{Doug-2.3.1}}: commented to determine if used internally to library
+ // private bool Equals(CharSequence text1, char[] text2)
+ // {
+ // int len = text1.length();
+ // if (len != text2.Length)
+ // return false;
+ // if (ignoreCase)
+ // {
+ // for (int i = 0; i < len; i++)
+ // {
+ // if (Character.toLowerCase(text1.charAt(i)) != text2[i])
+ // return false;
+ // }
+ // }
+ // else
+ // {
+ // for (int i = 0; i < len; i++)
+ // {
+ // if (text1.charAt(i) != text2[i])
+ // return false;
+ // }
+ // }
+ // return true;
+ // }
+
+ private void Rehash()
+ {
+ int newSize = 2 * entries.Length;
+ char[][] oldEntries = entries;
+ entries = new char[newSize][];
+
+ for (int i = 0; i < oldEntries.Length; i++)
+ {
+ char[] text = oldEntries[i];
+ if (text != null)
+ {
+ // todo: could be faster... no need to compare strings on collision
+ entries[GetSlot(text, 0, text.Length)] = text;
+ }
+ }
+ }
+
+ private int GetHashCode(char[] text, int offset, int len)
+ {
+ int code = 0;
+ int stop = offset + len;
+ if (ignoreCase)
+ {
+ for (int i = offset; i < stop; i++)
+ {
+ code = code * 31 + System.Char.ToLower(text[i]);
+ }
+ }
+ else
+ {
+ for (int i = offset; i < stop; i++)
+ {
+ code = code * 31 + text[i];
+ }
+ }
+ return code;
+ }
+
+ // {{Doug-2.3.1}}: commented to determine if used internally to library
+ // private int GetHashCode(CharSequence text)
+ // {
+ // int code;
+ // if (ignoreCase)
+ // {
+ // code = 0;
+ // int len = text.length();
+ // for (int i = 0; i < len; i++)
+ // {
+ // code = code * 31 + Character.toLowerCase(text.charAt(i));
+ // }
+ // }
+ // else
+ // {
+ // if (false && text is System.String)
+ // {
+ // code = text.hashCode();
+ // }
+ // else
+ // {
+ // code = 0;
+ // int len = text.length();
+ // for (int i = 0; i < len; i++)
+ // {
+ // code = code * 31 + text.charAt(i);
+ // }
+ // }
+ // }
+ // return code;
+ // }
+
+ public virtual int Size()
+ {
+ return count;
+ }
+
+ public virtual bool IsEmpty()
+ {
+ return count == 0;
+ }
+
+ public override bool Contains(System.Object o)
+ {
+ if (o is char[])
+ {
+ char[] text = (char[]) o;
+ return Contains(text, 0, text.Length);
+ }
+ else if (o is String)
+ {
+ return Contains((String) o);
+ }
+ // {{Doug-2.3.1}}: commented to determine if used internally to library
+ // else if (o is CharSequence)
+ // {
+ // return Contains((CharSequence) o);
+ // }
+ return false;
+ }
+
+ public virtual bool Add(System.Object o)
+ {
+ if (o is char[])
+ {
+ return Add((char[]) o);
+ }
+ else if (o is System.String)
+ {
+ return Add((System.String) o);
+ }
+ // {{Doug-2.3.1}}: commented to determine if used internally to library
+ // else if (o is CharSequence)
+ // {
+ // return Add((CharSequence) o);
+ // }
+ else
+ {
+ return Add(o.ToString());
+ }
+ }
+
+ /// <summary>The Iterator<String> for this set. Strings are constructed on the fly, so
+ /// use <code>nextCharArray</code> for more efficient access.
+ /// </summary>
+ public class CharArraySetIterator : System.Collections.IEnumerator
+ {
+ private void InitBlock(CharArraySet enclosingInstance)
+ {
+ this.enclosingInstance = enclosingInstance;
+ }
+ private CharArraySet enclosingInstance;
+ /// <summary>Returns the next String, as a Set<String> would...
+ /// use nextCharArray() for better efficiency.
+ /// </summary>
+ public virtual System.Object Current
+ {
+ get
+ {
+ return new System.String(NextCharArray());
+ }
+
+ }
+ public CharArraySet Enclosing_Instance
+ {
+ get
+ {
+ return enclosingInstance;
+ }
+
+ }
+ internal int pos = - 1;
+ internal char[] next_Renamed_Field;
+ internal CharArraySetIterator(CharArraySet enclosingInstance)
+ {
+ InitBlock(enclosingInstance);
+ GoNext();
+ }
+
+ private void GoNext()
+ {
+ next_Renamed_Field = null;
+ pos++;
+ while (pos < Enclosing_Instance.entries.Length && (next_Renamed_Field = Enclosing_Instance.entries[pos]) == null)
+ pos++;
+ }
+
+ public virtual bool MoveNext()
+ {
+ return next_Renamed_Field != null;
+ }
+
+ /// <summary>do not modify the returned char[] </summary>
+ public virtual char[] NextCharArray()
+ {
+ char[] ret = next_Renamed_Field;
+ GoNext();
+ return ret;
+ }
+
+ public virtual void Remove()
+ {
+ throw new System.NotSupportedException();
+ }
+
+ virtual public void Reset()
+ {
+ }
+ }
+
+
+ public new System.Collections.IEnumerator GetEnumerator()
+ {
+ return new CharArraySetIterator(this);
+ }
+ }
+}
\ No newline at end of file
Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/CharTokenizer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/CharTokenizer.cs?rev=671406&r1=671405&r2=671406&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/CharTokenizer.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/CharTokenizer.cs Tue Jun 24 19:53:11 2008
@@ -30,7 +30,6 @@
private int offset = 0, bufferIndex = 0, dataLen = 0;
private const int MAX_WORD_LEN = 255;
private const int IO_BUFFER_SIZE = 1024;
- private char[] buffer = new char[MAX_WORD_LEN];
private char[] ioBuffer = new char[IO_BUFFER_SIZE];
/// <summary>Returns true iff a character should be included in a token. This
@@ -48,43 +47,44 @@
{
return c;
}
-
- /// <summary>Returns the next token in the stream, or null at EOS. </summary>
- public override Token Next()
+
+ public override Token Next(Token token)
{
+ token.Clear();
int length = 0;
- int start = offset;
+ int start = bufferIndex;
+ char[] buffer = token.TermBuffer();
while (true)
{
- char c;
-
- offset++;
+
if (bufferIndex >= dataLen)
{
- dataLen = input.Read((System.Char[]) ioBuffer, 0, ioBuffer.Length);
+ offset += dataLen;
+ dataLen = input is Lucene.Net.Index.DocumentsWriter.ReusableStringReader ? ((Lucene.Net.Index.DocumentsWriter.ReusableStringReader) input).Read(ioBuffer) : input.Read((System.Char[]) ioBuffer, 0, ioBuffer.Length);
+ if (dataLen == -1)
+ {
+ if (length > 0)
+ break;
+ else
+ return null;
+ }
bufferIndex = 0;
}
- ;
- if (dataLen <= 0)
- {
- if (length > 0)
- break;
- else
- return null;
- }
- else
- c = ioBuffer[bufferIndex++];
-
+
+ char c = ioBuffer[bufferIndex++];
+
if (IsTokenChar(c))
{
// if it's a token char
-
+
if (length == 0)
// start of token
- start = offset - 1;
-
+ start = offset + bufferIndex - 1;
+ else if (length == buffer.Length)
+ buffer = token.ResizeTermBuffer(1 + length);
+
buffer[length++] = Normalize(c); // buffer it, normalized
-
+
if (length == MAX_WORD_LEN)
// buffer overflow!
break;
@@ -93,8 +93,19 @@
// at non-Letter w/ chars
break; // return 'em
}
-
- return new Token(new System.String(buffer, 0, length), start, start + length);
+
+ token.termLength = length;
+ token.startOffset = start;
+ token.endOffset = start + length;
+ return token;
+ }
+
+ public override void Reset(System.IO.TextReader input)
+ {
+ base.Reset(input);
+ bufferIndex = 0;
+ offset = 0;
+ dataLen = 0;
}
}
}
\ No newline at end of file
Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/ISOLatin1AccentFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/ISOLatin1AccentFilter.cs?rev=671406&r1=671405&r2=671406&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/ISOLatin1AccentFilter.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/ISOLatin1AccentFilter.cs Tue Jun 24 19:53:11 2008
@@ -31,173 +31,260 @@
public ISOLatin1AccentFilter(TokenStream input) : base(input)
{
}
-
- public override Token Next()
+
+ private char[] output = new char[256];
+ private int outputPos;
+
+ public override Token Next(Token result)
{
- Token t = input.Next();
- if (t != null)
- t.SetTermText(RemoveAccents(t.TermText()));
- return t;
- }
+ result = input.Next(result);
+ if (result != null)
+ {
+ char[] buffer = result.TermBuffer();
+ int length = result.TermLength();
+ // If no characters actually require rewriting then we
+ // just return token as-is:
+ for (int i = 0; i < length; i++)
+ {
+ char c = buffer[i];
+ if (c >= '\u00c0' && c <= '\u0178')
+ {
+ RemoveAccents(buffer, length);
+ result.SetTermBuffer(output, 0, outputPos);
+ break;
+ }
+ }
+ return result;
+ }
+ else
+ return null;
+ }
/// <summary> To replace accented characters in a String by unaccented equivalents.</summary>
- public static System.String RemoveAccents(System.String input)
+ public void RemoveAccents(char[] input, int length)
{
- System.Text.StringBuilder output = new System.Text.StringBuilder();
- for (int i = 0; i < input.Length; i++)
+
+ // Worst-case length required:
+ int maxSizeNeeded = 2 * length;
+
+ int size = output.Length;
+ while (size < maxSizeNeeded)
+ size *= 2;
+
+ if (size != output.Length)
+ output = new char[size];
+
+ outputPos = 0;
+
+ int pos = 0;
+
+ for (int i = 0; i < length; i++, pos++)
{
- long val = input[i];
+ char c = input[pos];
- switch (input[i])
+ // Quick test: if it's not in range then just keep
+ // current character
+ if (c < '\u00c0')
+ output[outputPos++] = c;
+ else
{
-
- case '\u00C0': // À
- case '\u00C1': // Á
- case '\u00C2': // Â
- case '\u00C3': // Ã
- case '\u00C4': // Ä
- case '\u00C5': // Å
- output.Append("A");
- break;
-
- case '\u00C6': // Æ
- output.Append("AE");
- break;
-
- case '\u00C7': // Ç
- output.Append("C");
- break;
-
- case '\u00C8': // È
- case '\u00C9': // É
- case '\u00CA': // Ê
- case '\u00CB': // Ë
- output.Append("E");
- break;
-
- case '\u00CC': // Ì
- case '\u00CD': // Í
- case '\u00CE': // Î
- case '\u00CF': // Ï
- output.Append("I");
- break;
-
- case '\u00D0': // Ð
- output.Append("D");
- break;
-
- case '\u00D1': // Ñ
- output.Append("N");
- break;
-
- case '\u00D2': // Ò
- case '\u00D3': // Ó
- case '\u00D4': // Ô
- case '\u00D5': // Õ
- case '\u00D6': // Ö
- case '\u00D8': // Ø
- output.Append("O");
- break;
-
- case '\u0152': // Œ
- output.Append("OE");
- break;
-
- case '\u00DE': // Þ
- output.Append("TH");
- break;
-
- case '\u00D9': // Ù
- case '\u00DA': // Ú
- case '\u00DB': // Û
- case '\u00DC': // Ü
- output.Append("U");
- break;
-
- case '\u00DD': // Ý
- case '\u0178': // Ÿ
- output.Append("Y");
- break;
-
- case '\u00E0': // à
- case '\u00E1': // á
- case '\u00E2': // â
- case '\u00E3': // ã
- case '\u00E4': // ä
- case '\u00E5': // å
- output.Append("a");
- break;
-
- case '\u00E6': // æ
- output.Append("ae");
- break;
-
- case '\u00E7': // ç
- output.Append("c");
- break;
-
- case '\u00E8': // è
- case '\u00E9': // é
- case '\u00EA': // ê
- case '\u00EB': // ë
- output.Append("e");
- break;
-
- case '\u00EC': // ì
- case '\u00ED': // í
- case '\u00EE': // î
- case '\u00EF': // ï
- output.Append("i");
- break;
-
- case '\u00F0': // ð
- output.Append("d");
- break;
-
- case '\u00F1': // ñ
- output.Append("n");
- break;
-
- case '\u00F2': // ò
- case '\u00F3': // ó
- case '\u00F4': // ô
- case '\u00F5': // õ
- case '\u00F6': // ö
- case '\u00F8': // ø
- output.Append("o");
- break;
-
- case '\u0153': // œ
- output.Append("oe");
- break;
-
- case '\u00DF': // ß
- output.Append("ss");
- break;
-
- case '\u00FE': // þ
- output.Append("th");
- break;
-
- case '\u00F9': // ù
- case '\u00FA': // ú
- case '\u00FB': // û
- case '\u00FC': // ü
- output.Append("u");
- break;
-
- case '\u00FD': // ý
- case '\u00FF': // ÿ
- output.Append("y");
- break;
-
- default:
- output.Append(input[i]);
- break;
-
+ switch (c)
+ {
+
+ case '\u00C0':
+ // À
+ case '\u00C1':
+ // Á
+ case '\u00C2':
+ // Â
+ case '\u00C3':
+ // Ã
+ case '\u00C4':
+ // Ä
+ case '\u00C5': // Å
+ output[outputPos++] = 'A';
+ break;
+
+ case '\u00C6': // Æ
+ output[outputPos++] = 'A';
+ output[outputPos++] = 'E';
+ break;
+
+ case '\u00C7': // Ç
+ output[outputPos++] = 'C';
+ break;
+
+ case '\u00C8':
+ // È
+ case '\u00C9':
+ // É
+ case '\u00CA':
+ // Ê
+ case '\u00CB': // Ë
+ output[outputPos++] = 'E';
+ break;
+
+ case '\u00CC':
+ // Ì
+ case '\u00CD':
+ // Í
+ case '\u00CE':
+ // Î
+ case '\u00CF': // Ï
+ output[outputPos++] = 'I';
+ break;
+
+ case '\u00D0': // Ð
+ output[outputPos++] = 'D';
+ break;
+
+ case '\u00D1': // Ñ
+ output[outputPos++] = 'N';
+ break;
+
+ case '\u00D2':
+ // Ò
+ case '\u00D3':
+ // Ó
+ case '\u00D4':
+ // Ô
+ case '\u00D5':
+ // Õ
+ case '\u00D6':
+ // Ö
+ case '\u00D8': // Ø
+ output[outputPos++] = 'O';
+ break;
+
+ case '\u0152': // Œ
+ output[outputPos++] = 'O';
+ output[outputPos++] = 'E';
+ break;
+
+ case '\u00DE': // Þ
+ output[outputPos++] = 'T';
+ output[outputPos++] = 'H';
+ break;
+
+ case '\u00D9':
+ // Ù
+ case '\u00DA':
+ // Ú
+ case '\u00DB':
+ // Û
+ case '\u00DC': // Ü
+ output[outputPos++] = 'U';
+ break;
+
+ case '\u00DD':
+ // Ý
+ case '\u0178': // Ÿ
+ output[outputPos++] = 'Y';
+ break;
+
+ case '\u00E0':
+ // à
+ case '\u00E1':
+ // á
+ case '\u00E2':
+ // â
+ case '\u00E3':
+ // ã
+ case '\u00E4':
+ // ä
+ case '\u00E5': // å
+ output[outputPos++] = 'a';
+ break;
+
+ case '\u00E6': // æ
+ output[outputPos++] = 'a';
+ output[outputPos++] = 'e';
+ break;
+
+ case '\u00E7': // ç
+ output[outputPos++] = 'c';
+ break;
+
+ case '\u00E8':
+ // è
+ case '\u00E9':
+ // é
+ case '\u00EA':
+ // ê
+ case '\u00EB': // ë
+ output[outputPos++] = 'e';
+ break;
+
+ case '\u00EC':
+ // ì
+ case '\u00ED':
+ // í
+ case '\u00EE':
+ // î
+ case '\u00EF': // ï
+ output[outputPos++] = 'i';
+ break;
+
+ case '\u00F0': // ð
+ output[outputPos++] = 'd';
+ break;
+
+ case '\u00F1': // ñ
+ output[outputPos++] = 'n';
+ break;
+
+ case '\u00F2':
+ // ò
+ case '\u00F3':
+ // ó
+ case '\u00F4':
+ // ô
+ case '\u00F5':
+ // õ
+ case '\u00F6':
+ // ö
+ case '\u00F8': // ø
+ output[outputPos++] = 'o';
+ break;
+
+ case '\u0153': // œ
+ output[outputPos++] = 'o';
+ output[outputPos++] = 'e';
+ break;
+
+ case '\u00DF': // ß
+ output[outputPos++] = 's';
+ output[outputPos++] = 's';
+ break;
+
+ case '\u00FE': // þ
+ output[outputPos++] = 't';
+ output[outputPos++] = 'h';
+ break;
+
+ case '\u00F9':
+ // ù
+ case '\u00FA':
+ // ú
+ case '\u00FB':
+ // û
+ case '\u00FC': // ü
+ output[outputPos++] = 'u';
+ break;
+
+ case '\u00FD':
+ // ý
+ case '\u00FF': // ÿ
+ output[outputPos++] = 'y';
+ break;
+
+ default:
+ output[outputPos++] = c;
+ break;
+
+ }
}
}
- return output.ToString();
}
}
}
\ No newline at end of file
Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/KeywordAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/KeywordAnalyzer.cs?rev=671406&r1=671405&r2=671406&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/KeywordAnalyzer.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/KeywordAnalyzer.cs Tue Jun 24 19:53:11 2008
@@ -29,5 +29,18 @@
{
return new KeywordTokenizer(reader);
}
+
+ public override TokenStream ReusableTokenStream(System.String fieldName, System.IO.TextReader reader)
+ {
+ Tokenizer tokenizer = (Tokenizer)GetPreviousTokenStream();
+ if (tokenizer == null)
+ {
+ tokenizer = new KeywordTokenizer(reader);
+ SetPreviousTokenStream(tokenizer);
+ }
+ else
+ tokenizer.Reset(reader);
+ return tokenizer;
+ }
}
}
\ No newline at end of file
Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/KeywordTokenizer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/KeywordTokenizer.cs?rev=671406&r1=671405&r2=671406&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/KeywordTokenizer.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/KeywordTokenizer.cs Tue Jun 24 19:53:11 2008
@@ -27,7 +27,6 @@
private const int DEFAULT_BUFFER_SIZE = 256;
private bool done;
- private char[] buffer;
public KeywordTokenizer(System.IO.TextReader input) : this(input, DEFAULT_BUFFER_SIZE)
{
@@ -35,29 +34,36 @@
public KeywordTokenizer(System.IO.TextReader input, int bufferSize) : base(input)
{
- this.buffer = new char[bufferSize];
this.done = false;
}
-
- public override Token Next()
+
+ public override Token Next(Token result)
{
if (!done)
{
done = true;
- System.Text.StringBuilder buffer = new System.Text.StringBuilder();
- int length;
+ int upto = 0;
+ result.Clear();
+ char[] buffer = result.TermBuffer();
while (true)
{
- length = input.Read((System.Char[]) this.buffer, 0, this.buffer.Length);
- if (length <= 0)
+ int length = input.Read(buffer, upto, buffer.Length - upto);
+ if (length == -1)
break;
-
- buffer.Append(this.buffer, 0, length);
+ upto += length;
+ if (upto == buffer.Length)
+ buffer = result.ResizeTermBuffer(1 + buffer.Length);
}
- System.String text = buffer.ToString();
- return new Token(text, 0, text.Length);
+ result.termLength = upto;
+ return result;
}
return null;
}
+
+ public override void Reset(System.IO.TextReader input)
+ {
+ base.Reset(input);
+ this.done = false;
+ }
}
}
\ No newline at end of file
Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/LengthFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/LengthFilter.cs?rev=671406&r1=671405&r2=671406&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/LengthFilter.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/LengthFilter.cs Tue Jun 24 19:53:11 2008
@@ -22,10 +22,9 @@
/// <summary> Removes words that are too long and too short from the stream.
///
+ ///
/// </summary>
- /// <author> David Spencer
- /// </author>
- /// <version> $Id: LengthFilter.java 347992 2005-11-21 21:41:43Z dnaber $
+ /// <version> $Id: LengthFilter.java 564715 2007-08-10 18:34:33Z mikemccand $
/// </version>
public sealed class LengthFilter : TokenFilter
{
@@ -43,10 +42,10 @@
}
/// <summary> Returns the next input Token whose termText() is the right len</summary>
- public override Token Next()
+ public override Token Next(Token result)
{
// return the first non-stop word found
- for (Token token = input.Next(); token != null; token = input.Next())
+ for (Token token = input.Next(result); token != null; token = input.Next(result))
{
int len = token.TermText().Length;
if (len >= min && len <= max)
Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/LowerCaseFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/LowerCaseFilter.cs?rev=671406&r1=671405&r2=671406&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/LowerCaseFilter.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/LowerCaseFilter.cs Tue Jun 24 19:53:11 2008
@@ -23,7 +23,7 @@
/// <summary> Normalizes token text to lower case.
///
/// </summary>
- /// <version> $Id: LowerCaseFilter.java 150259 2004-03-29 22:48:07Z cutting $
+ /// <version> $Id: LowerCaseFilter.java 564715 2007-08-10 18:34:33Z mikemccand $
/// </version>
public sealed class LowerCaseFilter : TokenFilter
{
@@ -31,16 +31,21 @@
{
}
- public override Token Next()
+ public override Token Next(Token result)
{
- Token t = input.Next();
-
- if (t == null)
+ result = input.Next(result);
+ if (result != null)
+ {
+
+ char[] buffer = result.TermBuffer();
+ int length = result.termLength;
+ for (int i = 0; i < length; i++)
+ buffer[i] = System.Char.ToLower(buffer[i]);
+
+ return result;
+ }
+ else
return null;
-
- t.termText = t.termText.ToLower();
-
- return t;
}
}
}
\ No newline at end of file
Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Package.html
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/Package.html?rev=671406&r1=671405&r2=671406&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Package.html (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Package.html Tue Jun 24 19:53:11 2008
@@ -1,10 +1,256 @@
-<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
-<html>
-<head>
- <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
- <meta name="Author" content="Doug Cutting">
-</head>
-<body>
-API and code to convert text into indexable tokens.
-</body>
-</html>
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<html>
+<head>
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
+ <meta name="Author" content="Doug Cutting">
+</head>
+<body>
+<p>API and code to convert text into indexable/searchable tokens. Covers {@link org.apache.lucene.analysis.Analyzer} and related classes.</p>
+<h2>Parsing? Tokenization? Analysis!</h2>
+<p>
+Lucene, an indexing and search library, accepts only plain text input.
+<p>
+<h2>Parsing</h2>
+<p>
+Applications that build their search capabilities upon Lucene may support documents in various formats &ndash; HTML, XML, PDF, Word &ndash; just to name a few.
+Lucene does not care about the <i>Parsing</i> of these and other document formats, and it is the responsibility of the
+application using Lucene to use an appropriate <i>Parser</i> to convert the original format into plain text before passing that plain text to Lucene.
+<p>
+<h2>Tokenization</h2>
+<p>
+Plain text passed to Lucene for indexing goes through a process generally called tokenization &ndash; namely breaking of the
+input text into small indexing elements &ndash;
+{@link org.apache.lucene.analysis.Token Tokens}.
+The way input text is broken into tokens very
+much dictates further capabilities of search upon that text.
+For instance, sentence beginnings and endings can be identified to provide for more accurate phrase
+and proximity searches (though sentence identification is not provided by Lucene).
+<p>
+In some cases simply breaking the input text into tokens is not enough &ndash; a deeper <i>Analysis</i> is needed,
+providing for several functions, including (but not limited to):
+<ul>
+ <li><a href = "http://en.wikipedia.org/wiki/Stemming">Stemming</a> &ndash;
+ Replacing of words by their stems.
+ For instance with English stemming "bikes" is replaced by "bike";
+ now query "bike" can find both documents containing "bike" and those containing "bikes".
+ </li>
+ <li><a href = "http://en.wikipedia.org/wiki/Stop_words">Stop Words Filtering</a> &ndash;
+ Common words like "the", "and" and "a" rarely add any value to a search.
+ Removing them shrinks the index size and increases performance.
+ It may also reduce some "noise" and actually improve search quality.
+ </li>
+ <li><a href = "http://en.wikipedia.org/wiki/Text_normalization">Text Normalization</a> &ndash;
+ Stripping accents and other character markings can make for better searching.
+ </li>
+ <li><a href = "http://en.wikipedia.org/wiki/Synonym">Synonym Expansion</a> &ndash;
+ Adding in synonyms at the same token position as the current word can mean better
+ matching when users search with words in the synonym set.
+ </li>
+</ul>
+<p>
+<h2>Core Analysis</h2>
+<p>
+ The analysis package provides the mechanism to convert Strings and Readers into tokens that can be indexed by Lucene. There
+ are three main classes in the package from which all analysis processes are derived. These are:
+ <ul>
+ <li>{@link org.apache.lucene.analysis.Analyzer} &ndash; An Analyzer is responsible for building a {@link org.apache.lucene.analysis.TokenStream} which can be consumed
+ by the indexing and searching processes. See below for more information on implementing your own Analyzer.</li>
+ <li>{@link org.apache.lucene.analysis.Tokenizer} &ndash; A Tokenizer is a {@link org.apache.lucene.analysis.TokenStream} and is responsible for breaking
+ up incoming text into {@link org.apache.lucene.analysis.Token}s. In most cases, an Analyzer will use a Tokenizer as the first step in
+ the analysis process.</li>
+ <li>{@link org.apache.lucene.analysis.TokenFilter} &ndash; A TokenFilter is also a {@link org.apache.lucene.analysis.TokenStream} and is responsible
+ for modifying {@link org.apache.lucene.analysis.Token}s that have been created by the Tokenizer. Common modifications performed by a
+ TokenFilter are: deletion, stemming, synonym injection, and down casing. Not all Analyzers require TokenFilters.</li>
+ </ul>
+</p>
+<h2>Hints, Tips and Traps</h2>
+<p>
+ The synergy between {@link org.apache.lucene.analysis.Analyzer} and {@link org.apache.lucene.analysis.Tokenizer}
+ is sometimes confusing. To ease this confusion, here are some clarifications:
+ <ul>
+ <li>The {@link org.apache.lucene.analysis.Analyzer} is responsible for the entire task of
+ <u>creating</u> tokens out of the input text, while the {@link org.apache.lucene.analysis.Tokenizer}
+ is only responsible for <u>breaking</u> the input text into tokens. Very likely, tokens created
+ by the {@link org.apache.lucene.analysis.Tokenizer} would be modified or even omitted
+ by the {@link org.apache.lucene.analysis.Analyzer} (via one or more
+ {@link org.apache.lucene.analysis.TokenFilter}s) before being returned.
+ </li>
+ <li>{@link org.apache.lucene.analysis.Tokenizer} is a {@link org.apache.lucene.analysis.TokenStream},
+ but {@link org.apache.lucene.analysis.Analyzer} is not.
+ </li>
+ <li>{@link org.apache.lucene.analysis.Analyzer} is "field aware", but
+ {@link org.apache.lucene.analysis.Tokenizer} is not.
+ </li>
+ </ul>
+</p>
+<p>
+ Lucene Java provides a number of analysis capabilities, the most commonly used one being the {@link
+ org.apache.lucene.analysis.standard.StandardAnalyzer}. Many applications will have a long and industrious life with nothing more
+ than the StandardAnalyzer. However, there are a few other classes/packages that are worth mentioning:
+ <ol>
+ <li>{@link org.apache.lucene.analysis.PerFieldAnalyzerWrapper} &ndash; Most Analyzers perform the same operation on all
+ {@link org.apache.lucene.document.Field}s. The PerFieldAnalyzerWrapper can be used to associate a different Analyzer with different
+ {@link org.apache.lucene.document.Field}s.</li>
+ <li>The contrib/analyzers library located at the root of the Lucene distribution has a number of different Analyzer implementations to solve a variety
+ of different problems related to searching. Many of the Analyzers are designed to analyze non-English languages.</li>
+ <li>The {@link org.apache.lucene.analysis.snowball contrib/snowball library}
+ located at the root of the Lucene distribution has Analyzer and TokenFilter
+ implementations for a variety of Snowball stemmers.
+ See <a href = "http://snowball.tartarus.org">http://snowball.tartarus.org</a>
+ for more information on Snowball stemmers.</li>
+ <li>There are a variety of Tokenizer and TokenFilter implementations in this package. Take a look around, chances are someone has implemented what you need.</li>
+ </ol>
+</p>
+<p>
+ Analysis is one of the main causes of performance degradation during indexing. Simply put, the more you analyze the slower the indexing (in most cases).
+ Perhaps your application would be just fine using the simple {@link org.apache.lucene.analysis.WhitespaceTokenizer} combined with a
+ {@link org.apache.lucene.analysis.StopFilter}. The contrib/benchmark library can be useful for testing out the speed of the analysis process.
+</p>
+<h2>Invoking the Analyzer</h2>
+<p>
+ Applications usually do not invoke analysis &ndash; Lucene does it for them:
+ <ul>
+ <li>At indexing, as a consequence of
+ {@link org.apache.lucene.index.IndexWriter#addDocument(org.apache.lucene.document.Document) addDocument(doc)},
+ the Analyzer in effect for indexing is invoked for each indexed field of the added document.
+ </li>
+ <li>At search, as a consequence of
+ {@link org.apache.lucene.queryParser.QueryParser#parse(java.lang.String) QueryParser.parse(queryText)},
+ the QueryParser may invoke the Analyzer in effect.
+ Note that for some queries analysis does not take place, e.g. wildcard queries.
+ </li>
+ </ul>
+ However an application might invoke Analysis of any text for testing or for any other purpose, something like:
+ <PRE>
+ Analyzer analyzer = new StandardAnalyzer(); // or any other analyzer
+ TokenStream ts = analyzer.tokenStream("myfield",new StringReader("some text goes here"));
+ Token t = ts.next();
+ while (t!=null) {
+ System.out.println("token: "+t);
+ t = ts.next();
+ }
+ </PRE>
+</p>
+<h2>Indexing Analysis vs. Search Analysis</h2>
+<p>
+ Selecting the "correct" analyzer is crucial
+ for search quality, and can also affect indexing and search performance.
+ The "correct" analyzer differs between applications.
+ Lucene java's wiki page
+ <a href = "http://wiki.apache.org/lucene-java/AnalysisParalysis">AnalysisParalysis</a>
+ provides some data on "analyzing your analyzer".
+ Here are some rules of thumb:
+ <ol>
+ <li>Test test test... (did we say test?)</li>
+ <li>Beware of over-analysis &ndash; it might hurt indexing performance.</li>
+ <li>Start with same analyzer for indexing and search, otherwise searches would not find what they are supposed to...</li>
+ <li>In some cases a different analyzer is required for indexing and search, for instance:
+ <ul>
+ <li>Certain searches require more stop words to be filtered. (I.e. more than those that were filtered at indexing.)</li>
+ <li>Query expansion by synonyms, acronyms, auto spell correction, etc.</li>
+ </ul>
+ This might sometimes require a modified analyzer &ndash; see the next section on how to do that.
+ </li>
+ </ol>
+</p>
+<h2>Implementing your own Analyzer</h2>
+<p>Creating your own Analyzer is straightforward. It usually involves either wrapping an existing Tokenizer and set of TokenFilters to create a new Analyzer
+or creating both the Analyzer and a Tokenizer or TokenFilter. Before pursuing this approach, you may find it worthwhile
+to explore the contrib/analyzers library and/or ask on the java-user@lucene.apache.org mailing list first to see if what you need already exists.
+If you are still committed to creating your own Analyzer or TokenStream derivation (Tokenizer or TokenFilter) have a look at
+the source code of any one of the many samples located in this package.
+</p>
+<p>
+ The following sections discuss some aspects of implementing your own analyzer.
+</p>
+<h3>Field Section Boundaries</h3>
+<p>
+ When {@link org.apache.lucene.document.Document#add(org.apache.lucene.document.Fieldable) document.add(field)}
+ is called multiple times for the same field name, we could say that each such call creates a new
+ section for that field in that document.
+ In fact, a separate call to
+ {@link org.apache.lucene.analysis.Analyzer#tokenStream(java.lang.String, java.io.Reader) tokenStream(field,reader)}
+ would take place for each of these so called "sections".
+ However, the default Analyzer behavior is to treat all these sections as one large section.
+ This allows phrase search and proximity search to seamlessly cross
+ boundaries between these "sections".
+ In other words, if a certain field "f" is added like this:
+ <PRE>
+ document.add(new Field("f","first ends",...));
+ document.add(new Field("f","starts two",...));
+ indexWriter.addDocument(document);
+ </PRE>
+ Then, a phrase search for "ends starts" would find that document.
+ Where desired, this behavior can be modified by introducing a "position gap" between consecutive field "sections",
+ simply by overriding
+ {@link org.apache.lucene.analysis.Analyzer#getPositionIncrementGap(java.lang.String) Analyzer.getPositionIncrementGap(fieldName)}:
+ <PRE>
+ Analyzer myAnalyzer = new StandardAnalyzer() {
+ public int getPositionIncrementGap(String fieldName) {
+ return 10;
+ }
+ };
+ </PRE>
+</p>
+<h3>Token Position Increments</h3>
+<p>
+ By default, all tokens created by Analyzers and Tokenizers have a
+ {@link org.apache.lucene.analysis.Token#getPositionIncrement() position increment} of one.
+ This means that the position stored for that token in the index would be one more than
+ that of the previous token.
+ Recall that phrase and proximity searches rely on position info.
+</p>
+<p>
+ If the selected analyzer filters the stop words "is" and "the", then for a document
+ containing the string "blue is the sky", only the tokens "blue", "sky" are indexed,
+ with position("sky") = 1 + position("blue"). Now, a phrase query "blue is the sky"
+ would find that document, because the same analyzer filters the same stop words from
+ that query. But also the phrase query "blue sky" would find that document.
+</p>
+<p>
+ If this behavior does not fit the application needs,
+ a modified analyzer can be used, that would increment further the positions of
+ tokens following a removed stop word, using
+ {@link org.apache.lucene.analysis.Token#setPositionIncrement(int)}.
+ This can be done with something like:
+ <PRE>
+ public TokenStream tokenStream(final String fieldName, Reader reader) {
+ final TokenStream ts = someAnalyzer.tokenStream(fieldName, reader);
+ TokenStream res = new TokenStream() {
+ public Token next() throws IOException {
+ int extraIncrement = 0;
+ while (true) {
+ Token t = ts.next();
+ if (t!=null) {
+ if (stopWords.contains(t.termText())) {
+ extraIncrement++; // filter this word
+ continue;
+ }
+ if (extraIncrement>0) {
+ t.setPositionIncrement(t.getPositionIncrement()+extraIncrement);
+ }
+ }
+ return t;
+ }
+ }
+ };
+ return res;
+ }
+ </PRE>
+ Now, with this modified analyzer, the phrase query "blue sky" would find that document.
+ But note that this is yet not a perfect solution, because any phrase query "blue w1 w2 sky"
+ where both w1 and w2 are stop words would match that document.
+</p>
+<p>
+ Few more use cases for modifying position increments are:
+ <ol>
+ <li>Inhibiting phrase and proximity matches across sentence boundaries &ndash; for this, a tokenizer that
+ identifies a new sentence can add 1 to the position increment of the first token of the new sentence.</li>
+ <li>Injecting synonyms &ndash; here, synonyms of a token should be added after that token,
+ and their position increment should be set to 0.
+ As a result, all synonyms of a token would be considered to appear in exactly the
+ same position as that token, and so they would be seen by phrase and proximity searches.</li>
+ </ol>
+</p>
+</body>
+</html>
Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/PerFieldAnalyzerWrapper.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/PerFieldAnalyzerWrapper.cs?rev=671406&r1=671405&r2=671406&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/PerFieldAnalyzerWrapper.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/PerFieldAnalyzerWrapper.cs Tue Jun 24 19:53:11 2008
@@ -78,7 +78,16 @@
return analyzer.TokenStream(fieldName, reader);
}
-
+
+ public override TokenStream ReusableTokenStream(System.String fieldName, System.IO.TextReader reader)
+ {
+ Analyzer analyzer = (Analyzer)analyzerMap[fieldName];
+ if (analyzer == null)
+ analyzer = defaultAnalyzer;
+
+ return analyzer.ReusableTokenStream(fieldName, reader);
+ }
+
/// <summary>Return the positionIncrementGap from the analyzer assigned to fieldName </summary>
public override int GetPositionIncrementGap(System.String fieldName)
{
Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/PorterStemFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/PorterStemFilter.cs?rev=671406&r1=671405&r2=671406&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/PorterStemFilter.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/PorterStemFilter.cs Tue Jun 24 19:53:11 2008
@@ -46,21 +46,18 @@
{
stemmer = new PorterStemmer();
}
-
- /// <summary>Returns the next input Token, after being stemmed </summary>
- public override Token Next()
+
+ public override Token Next(Token result)
{
- Token token = input.Next();
- if (token == null)
- return null;
- else
+ result = input.Next(result);
+ if (result != null)
{
- System.String s = stemmer.Stem(token.termText);
- if ((System.Object) s != (System.Object) token.termText)
- // Yes, I mean object reference comparison here
- token.termText = s;
- return token;
+ if (stemmer.Stem(result.TermBuffer(), 0, result.termLength))
+ result.SetTermBuffer(stemmer.GetResultBuffer(), 0, stemmer.GetResultLength());
+ return result;
}
+ else
+ return null;
}
}
}
\ No newline at end of file
Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/PorterStemmer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/PorterStemmer.cs?rev=671406&r1=671405&r2=671406&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/PorterStemmer.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/PorterStemmer.cs Tue Jun 24 19:53:11 2008
@@ -84,8 +84,7 @@
if (b.Length <= i + EXTRA)
{
char[] new_b = new char[b.Length + INC];
- for (int c = 0; c < b.Length; c++)
- new_b[c] = b[c];
+ Array.Copy(b, 0, new_b, 0, b.Length);
b = new_b;
}
b[i++] = ch;
@@ -643,8 +642,7 @@
char[] new_b = new char[wordLen + EXTRA];
b = new_b;
}
- for (int j = 0; j < wordLen; j++)
- b[j] = wordBuffer[offset + j];
+ Array.Copy(wordBuffer, offset, b, 0, wordLen);
i = wordLen;
return Stem(0);
}
Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/SimpleAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/SimpleAnalyzer.cs?rev=671406&r1=671405&r2=671406&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/SimpleAnalyzer.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/SimpleAnalyzer.cs Tue Jun 24 19:53:11 2008
@@ -28,5 +28,18 @@
{
return new LowerCaseTokenizer(reader);
}
+
+ public override TokenStream ReusableTokenStream(System.String fieldName, System.IO.TextReader reader)
+ {
+ Tokenizer tokenizer = (Tokenizer) GetPreviousTokenStream();
+ if (tokenizer == null)
+ {
+ tokenizer = new LowerCaseTokenizer(reader);
+ SetPreviousTokenStream(tokenizer);
+ }
+ else
+ tokenizer.Reset(reader);
+ return tokenizer;
+ }
}
}
\ No newline at end of file
Added: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/SinkTokenizer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/SinkTokenizer.cs?rev=671406&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/SinkTokenizer.cs (added)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/SinkTokenizer.cs Tue Jun 24 19:53:11 2008
@@ -0,0 +1,109 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+
+namespace Lucene.Net.Analysis
+{
+
+
+ /// <summary> A SinkTokenizer can be used to cache Tokens for use in an Analyzer
+ ///
+ /// </summary>
+ /// <seealso cref="TeeTokenFilter">
+ ///
+ ///
+ /// </seealso>
+ public class SinkTokenizer : Tokenizer
+ {
+ protected internal System.Collections.IList lst = new System.Collections.ArrayList();
+ protected internal System.Collections.IEnumerator iter;
+
+ public SinkTokenizer(System.Collections.IList input)
+ {
+ this.lst = input;
+ if (this.lst == null)
+ this.lst = new System.Collections.ArrayList();
+ }
+
+ public SinkTokenizer()
+ {
+ this.lst = new System.Collections.ArrayList();
+ }
+
+ public SinkTokenizer(int initCap)
+ {
+ this.lst = new System.Collections.ArrayList(initCap);
+ }
+
+ /// <summary> Get the tokens in the internal List.
+ /// <p/>
+ /// WARNING: Adding tokens to this list requires the {@link #Reset()} method to be called in order for them
+ /// to be made available. Also, this Tokenizer does nothing to protect against {@link java.util.ConcurrentModificationException}s
+ /// in the case of adds happening while {@link #Next(Lucene.Net.Analysis.Token)} is being called.
+ ///
+ /// </summary>
+ /// <returns> A List of {@link Lucene.Net.Analysis.Token}s
+ /// </returns>
+ public virtual System.Collections.IList GetTokens()
+ {
+ return lst;
+ }
+
+ /// <summary> Returns the next token out of the list of cached tokens</summary>
+ /// <returns> The next {@link Lucene.Net.Analysis.Token} in the Sink.
+ /// </returns>
+ /// <throws> IOException </throws>
+ public override Token Next()
+ {
+ if (iter == null)
+ iter = lst.GetEnumerator();
+ return iter.MoveNext() ? (Token) iter.Current : null;
+ }
+
+
+
+ /// <summary> Override this method to cache only certain tokens, or new tokens based
+ /// on the old tokens.
+ ///
+ /// </summary>
+ /// <param name="t">The {@link Lucene.Net.Analysis.Token} to add to the sink
+ /// </param>
+ public virtual void Add(Token t)
+ {
+ if (t == null)
+ return ;
+ lst.Add((Token) t.Clone());
+ }
+
+ public override void Close()
+ {
+ //nothing to close
+ input = null;
+ lst = null;
+ }
+
+ /// <summary> Reset the internal data structures to the start at the front of the list of tokens. Should be called
+ /// if tokens were added to the list after an invocation of {@link #Next(Token)}
+ /// </summary>
+ /// <throws> IOException </throws>
+ public override void Reset()
+ {
+ iter = lst.GetEnumerator();
+ }
+ }
+}
\ No newline at end of file
Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/CharStream.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/Standard/CharStream.cs?rev=671406&r1=671405&r2=671406&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/CharStream.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/CharStream.cs Tue Jun 24 19:53:11 2008
@@ -14,8 +14,9 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-
+// {{Aroush-2.3.1}} remove this file from SVN
/* Generated By:JavaCC: Do not edit this line. CharStream.java Version 3.0 */
+/*
using System;
namespace Lucene.Net.Analysis.Standard
@@ -117,4 +118,5 @@
/// </summary>
void Done();
}
-}
\ No newline at end of file
+}
+*/
\ No newline at end of file
Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/FastCharStream.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/Standard/FastCharStream.cs?rev=671406&r1=671405&r2=671406&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/FastCharStream.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/FastCharStream.cs Tue Jun 24 19:53:11 2008
@@ -14,7 +14,8 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-
+// {{Aroush-2.3.1}} remove this file from SVN
+/*
using System;
namespace Lucene.Net.Analysis.Standard
@@ -148,4 +149,5 @@
return 1;
}
}
-}
\ No newline at end of file
+}
+*/
\ No newline at end of file
Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/Package.html
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/Standard/Package.html?rev=671406&r1=671405&r2=671406&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/Package.html (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/Package.html Tue Jun 24 19:53:11 2008
@@ -1,15 +1,10 @@
-<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
-<html>
-<head>
- <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
- <meta name="Author" content="Doug Cutting">
-</head>
-<body>
-A grammar-based tokenizer constructed with JavaCC.
-<p>Note that JavaCC defines lots of public classes, methods and fields
-that do not need to be public. These clutter the documentation.
-Sorry.
-<p>Note that because JavaCC defines a class named <tt>Token</tt>, <tt>Lucene.Net.Analysis.Token</tt>
-must always be fully qualified in source code in this package.
-</body>
-</html>
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<html>
+<head>
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
+ <meta name="Author" content="Stanislaw Osinski">
+</head>
+<body>
+A fast grammar-based tokenizer constructed with JFlex.
+</body>
+</html>
Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/ParseException.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/Standard/ParseException.cs?rev=671406&r1=671405&r2=671406&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/ParseException.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/ParseException.cs Tue Jun 24 19:53:11 2008
@@ -14,9 +14,9 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-
+// {{Aroush-2.3.1}} remove this file from SVN
/* Generated By:JavaCC: Do not edit this line. ParseException.java Version 0.7pre6 */
-
+/*
using System;
namespace Lucene.Net.Analysis.Standard
@@ -227,4 +227,5 @@
return retval.ToString();
}
}
-}
\ No newline at end of file
+}
+*/
\ No newline at end of file
Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/StandardAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/Standard/StandardAnalyzer.cs?rev=671406&r1=671405&r2=671406&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/StandardAnalyzer.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/StandardAnalyzer.cs Tue Jun 24 19:53:11 2008
@@ -26,12 +26,22 @@
/// LowerCaseFilter} and {@link StopFilter}, using a list of English stop words.
///
/// </summary>
- /// <version> $Id: StandardAnalyzer.java 219090 2005-07-14 20:36:28Z dnaber $
+ /// <version> $Id: StandardAnalyzer.java 613280 2008-01-18 21:27:10Z gsingers $
/// </version>
public class StandardAnalyzer : Analyzer
{
private System.Collections.Hashtable stopSet;
+ /// <summary> Specifies whether deprecated acronyms should be replaced with HOST type.
+ /// This is false by default to support backward compatibility.
+ ///
+ /// </summary>
+ /// <deprecated> this should be removed in the next release (3.0).
+ ///
+ /// See https://issues.apache.org/jira/browse/LUCENE-1068
+ /// </deprecated>
+ private bool replaceInvalidAcronym = false;
+
/// <summary>An array containing some common English words that are usually not
/// useful for searching.
/// </summary>
@@ -70,20 +80,162 @@
stopSet = WordlistLoader.GetWordSet(stopwords);
}
- /// <summary>Constructs a {@link StandardTokenizer} filtered by a {@link
- /// StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}.
- /// </summary>
- public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
- {
- TokenStream result = new StandardTokenizer(reader);
- result = new StandardFilter(result);
- result = new LowerCaseFilter(result);
- result = new StopFilter(result, stopSet);
- return result;
- }
- static StandardAnalyzer()
- {
- STOP_WORDS = StopAnalyzer.ENGLISH_STOP_WORDS;
- }
- }
+ /// <summary> </summary>
+ /// <param name="replaceInvalidAcronym">Set to true if this analyzer should replace mischaracterized acronyms in the StandardTokenizer
+ ///
+ /// See https://issues.apache.org/jira/browse/LUCENE-1068
+ ///
+ /// </param>
+ /// <deprecated> Remove in 3.X and make true the only valid value
+ /// </deprecated>
+ public StandardAnalyzer(bool replaceInvalidAcronym):this(STOP_WORDS)
+ {
+ this.replaceInvalidAcronym = replaceInvalidAcronym;
+ }
+
+ /// <param name="stopwords">The stopwords to use
+ /// </param>
+ /// <param name="replaceInvalidAcronym">Set to true if this analyzer should replace mischaracterized acronyms in the StandardTokenizer
+ ///
+ /// See https://issues.apache.org/jira/browse/LUCENE-1068
+ ///
+ /// </param>
+ /// <deprecated> Remove in 3.X and make true the only valid value
+ /// </deprecated>
+ public StandardAnalyzer(System.IO.TextReader stopwords, bool replaceInvalidAcronym):this(stopwords)
+ {
+ this.replaceInvalidAcronym = replaceInvalidAcronym;
+ }
+
+ /// <param name="stopwords">The stopwords to use
+ /// </param>
+ /// <param name="replaceInvalidAcronym">Set to true if this analyzer should replace mischaracterized acronyms in the StandardTokenizer
+ ///
+ /// See https://issues.apache.org/jira/browse/LUCENE-1068
+ ///
+ /// </param>
+ /// <deprecated> Remove in 3.X and make true the only valid value
+ /// </deprecated>
+ public StandardAnalyzer(System.IO.FileInfo stopwords, bool replaceInvalidAcronym):this(stopwords)
+ {
+ this.replaceInvalidAcronym = replaceInvalidAcronym;
+ }
+
+ /// <summary> </summary>
+ /// <param name="stopwords">The stopwords to use
+ /// </param>
+ /// <param name="replaceInvalidAcronym">Set to true if this analyzer should replace mischaracterized acronyms in the StandardTokenizer
+ ///
+ /// See https://issues.apache.org/jira/browse/LUCENE-1068
+ ///
+ /// </param>
+ /// <deprecated> Remove in 3.X and make true the only valid value
+ /// </deprecated>
+ public StandardAnalyzer(System.String[] stopwords, bool replaceInvalidAcronym):this(stopwords)
+ {
+ this.replaceInvalidAcronym = replaceInvalidAcronym;
+ }
+
+ /// <param name="stopwords">The stopwords to use
+ /// </param>
+ /// <param name="replaceInvalidAcronym">Set to true if this analyzer should replace mischaracterized acronyms in the StandardTokenizer
+ ///
+ /// See https://issues.apache.org/jira/browse/LUCENE-1068
+ ///
+ /// </param>
+ /// <deprecated> Remove in 3.X and make true the only valid value
+ /// </deprecated>
+ public StandardAnalyzer(System.Collections.Hashtable stopwords, bool replaceInvalidAcronym) : this(stopwords)
+ {
+ this.replaceInvalidAcronym = replaceInvalidAcronym;
+ }
+
+ /// <summary>Constructs a {@link StandardTokenizer} filtered by a {@link
+ /// StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}.
+ /// </summary>
+ public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
+ {
+ StandardTokenizer tokenStream = new StandardTokenizer(reader, replaceInvalidAcronym);
+ tokenStream.SetMaxTokenLength(maxTokenLength);
+ TokenStream result = new StandardFilter(tokenStream);
+ result = new LowerCaseFilter(result);
+ result = new StopFilter(result, stopSet);
+ return result;
+ }
+
+ private sealed class SavedStreams
+ {
+ internal StandardTokenizer tokenStream;
+ internal TokenStream filteredTokenStream;
+ }
+
+ /// <summary>Default maximum allowed token length </summary>
+ public const int DEFAULT_MAX_TOKEN_LENGTH = 255;
+
+ private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
+
+ /// <summary> Set maximum allowed token length. If a token is seen
+ /// that exceeds this length then it is discarded. This
+ /// setting only takes effect the next time tokenStream or
+ /// reusableTokenStream is called.
+ /// </summary>
+ public virtual void SetMaxTokenLength(int length)
+ {
+ maxTokenLength = length;
+ }
+
+ /// <seealso cref="SetMaxTokenLength">
+ /// </seealso>
+ public virtual int GetMaxTokenLength()
+ {
+ return maxTokenLength;
+ }
+
+ public override TokenStream ReusableTokenStream(System.String fieldName, System.IO.TextReader reader)
+ {
+ SavedStreams streams = (SavedStreams) GetPreviousTokenStream();
+ if (streams == null)
+ {
+ streams = new SavedStreams();
+ SetPreviousTokenStream(streams);
+ streams.tokenStream = new StandardTokenizer(reader);
+ streams.filteredTokenStream = new StandardFilter(streams.tokenStream);
+ streams.filteredTokenStream = new LowerCaseFilter(streams.filteredTokenStream);
+ streams.filteredTokenStream = new StopFilter(streams.filteredTokenStream, stopSet);
+ }
+ else
+ {
+ streams.tokenStream.Reset(reader);
+ }
+ streams.tokenStream.SetMaxTokenLength(maxTokenLength);
+
+ streams.tokenStream.SetReplaceInvalidAcronym(replaceInvalidAcronym);
+
+ return streams.filteredTokenStream;
+ }
+
+ /// <summary> </summary>
+ /// <returns> true if this Analyzer is replacing mischaracterized acronyms in the StandardTokenizer
+ ///
+ /// See https://issues.apache.org/jira/browse/LUCENE-1068
+ /// </returns>
+ public virtual bool IsReplaceInvalidAcronym()
+ {
+ return replaceInvalidAcronym;
+ }
+
+ /// <summary> </summary>
+ /// <param name="replaceInvalidAcronym">Set to true if this Analyzer is replacing mischaracterized acronyms in the StandardTokenizer
+ ///
+ /// See https://issues.apache.org/jira/browse/LUCENE-1068
+ /// </param>
+ public virtual void SetReplaceInvalidAcronym(bool replaceInvalidAcronym)
+ {
+ this.replaceInvalidAcronym = replaceInvalidAcronym;
+ }
+ static StandardAnalyzer()
+ {
+ STOP_WORDS = StopAnalyzer.ENGLISH_STOP_WORDS;
+ }
+ }
}
\ No newline at end of file
Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/StandardFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/Standard/StandardFilter.cs?rev=671406&r1=671405&r2=671406&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/StandardFilter.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/StandardFilter.cs Tue Jun 24 19:53:11 2008
@@ -17,59 +17,66 @@
using System;
-using Lucene.Net.Analysis;
+using Token = Lucene.Net.Analysis.Token;
+using TokenFilter = Lucene.Net.Analysis.TokenFilter;
+using TokenStream = Lucene.Net.Analysis.TokenStream;
namespace Lucene.Net.Analysis.Standard
{
- /// <summary>Normalizes tokens extracted with {@link StandardTokenizer}. </summary>
+ /// <summary>Normalizes tokens extracted with {@link StandardTokenizer}. </summary>
- public sealed class StandardFilter : TokenFilter
- {
+ public sealed class StandardFilter:TokenFilter
+ {
- /// <summary>Construct filtering <i>in</i>. </summary>
- public StandardFilter(TokenStream in_Renamed) : base(in_Renamed)
- {
- }
+ /// <summary>Construct filtering <i>in</i>. </summary>
+ public StandardFilter(TokenStream in_Renamed):base(in_Renamed)
+ {
+ }
- private static readonly System.String APOSTROPHE_TYPE = Lucene.Net.Analysis.Standard.StandardTokenizerConstants.tokenImage[Lucene.Net.Analysis.Standard.StandardTokenizerConstants.APOSTROPHE];
- private static readonly System.String ACRONYM_TYPE = Lucene.Net.Analysis.Standard.StandardTokenizerConstants.tokenImage[Lucene.Net.Analysis.Standard.StandardTokenizerConstants.ACRONYM];
+ private static readonly System.String APOSTROPHE_TYPE;
+ private static readonly System.String ACRONYM_TYPE;
- /// <summary>Returns the next token in the stream, or null at EOS.
- /// <p>Removes <tt>'s</tt> from the end of words.
- /// <p>Removes dots from acronyms.
- /// </summary>
- public override Lucene.Net.Analysis.Token Next()
- {
- Lucene.Net.Analysis.Token t = input.Next();
+ /// <summary>Returns the next token in the stream, or null at EOS.
+ /// <p>Removes <tt>'s</tt> from the end of words.
+ /// <p>Removes dots from acronyms.
+ /// </summary>
+ public override Token Next(Token result)
+ {
+ Token t = input.Next(result);
- if (t == null)
- return null;
+ if (t == null)
+ return null;
- System.String text = t.TermText();
- System.String type = t.Type();
+ char[] buffer = t.TermBuffer();
+ int bufferLength = t.TermLength();
+ System.String type = t.Type();
- if (type == APOSTROPHE_TYPE && (text.EndsWith("'s") || text.EndsWith("'S")))
- {
- return new Lucene.Net.Analysis.Token(text.Substring(0, (text.Length - 2) - (0)), t.StartOffset(), t.EndOffset(), type);
- }
- else if (type == ACRONYM_TYPE)
- {
- // remove dots
- System.Text.StringBuilder trimmed = new System.Text.StringBuilder();
- for (int i = 0; i < text.Length; i++)
- {
- char c = text[i];
- if (c != '.')
- trimmed.Append(c);
- }
- return new Lucene.Net.Analysis.Token(trimmed.ToString(), t.StartOffset(), t.EndOffset(), type);
- }
- else
- {
- return t;
- }
- }
- }
+ if (type == APOSTROPHE_TYPE && bufferLength >= 2 && buffer[bufferLength - 2] == '\'' && (buffer[bufferLength - 1] == 's' || buffer[bufferLength - 1] == 'S'))
+ {
+ // Strip last 2 characters off
+ t.SetTermLength(bufferLength - 2);
+ }
+ else if (type == ACRONYM_TYPE)
+ {
+ // remove dots
+ int upto = 0;
+ for (int i = 0; i < bufferLength; i++)
+ {
+ char c = buffer[i];
+ if (c != '.')
+ buffer[upto++] = c;
+ }
+ t.SetTermLength(upto);
+ }
+
+ return t;
+ }
+ static StandardFilter()
+ {
+ APOSTROPHE_TYPE = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.APOSTROPHE];
+ ACRONYM_TYPE = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM];
+ }
+ }
}
\ No newline at end of file