You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by sy...@apache.org on 2015/04/15 01:32:27 UTC
[1/3] lucenenet git commit: TextReader.Read actually returns 0
Repository: lucenenet
Updated Branches:
refs/heads/master e670c1e76 -> b4eaf2fc4
TextReader.Read actually returns 0
Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/8d7a54fc
Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/8d7a54fc
Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/8d7a54fc
Branch: refs/heads/master
Commit: 8d7a54fc66ecc1ffe2e4d7af59d6b73c81854db7
Parents: e670c1e
Author: Itamar Syn-Hershko <it...@code972.com>
Authored: Tue Apr 14 02:22:15 2015 +0300
Committer: Itamar Syn-Hershko <it...@code972.com>
Committed: Tue Apr 14 02:22:15 2015 +0300
----------------------------------------------------------------------
.../Analysis/Util/CharacterUtils.cs | 874 +++++++++----------
1 file changed, 437 insertions(+), 437 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/8d7a54fc/src/Lucene.Net.Analysis.Common/Analysis/Util/CharacterUtils.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Util/CharacterUtils.cs b/src/Lucene.Net.Analysis.Common/Analysis/Util/CharacterUtils.cs
index e0e9a78..4d2e076 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Util/CharacterUtils.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Util/CharacterUtils.cs
@@ -8,443 +8,443 @@ using Version = Lucene.Net.Util.LuceneVersion;
namespace Lucene.Net.Analysis.Util
{
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
/// <summary>
- /// <seealso cref="CharacterUtils"/> provides a unified interface to Character-related
- /// operations to implement backwards compatible character operations based on a
- /// <seealso cref="LuceneVersion"/> instance.
- ///
- /// @lucene.internal
- /// </summary>
- public abstract class CharacterUtils
- {
- private static readonly CharacterUtils JAVA_4 = new Java4CharacterUtils();
- private static readonly CharacterUtils JAVA_5 = new Java5CharacterUtils();
-
- /// <summary>
- /// Returns a <seealso cref="CharacterUtils"/> implementation according to the given
- /// <seealso cref="LuceneVersion"/> instance.
- /// </summary>
- /// <param name="matchVersion">
- /// a version instance </param>
- /// <returns> a <seealso cref="CharacterUtils"/> implementation according to the given
- /// <seealso cref="LuceneVersion"/> instance. </returns>
- public static CharacterUtils GetInstance(LuceneVersion matchVersion)
- {
- return matchVersion.OnOrAfter(LuceneVersion.LUCENE_31) ? JAVA_5 : JAVA_4;
- }
-
- /// <summary>
- /// Return a <seealso cref="CharacterUtils"/> instance compatible with Java 1.4. </summary>
- public static CharacterUtils Java4Instance
- {
- get
- {
- return JAVA_4;
- }
- }
-
- /// <summary>
- /// Returns the code point at the given index of the <seealso cref="CharSequence"/>.
- /// Depending on the <seealso cref="LuceneVersion"/> passed to
- /// <seealso cref="CharacterUtils#getInstance(Version)"/> this method mimics the behavior
- /// of <seealso cref="Character#codePointAt(char[], int)"/> as it would have been
- /// available on a Java 1.4 JVM or on a later virtual machine version.
- /// </summary>
- /// <param name="seq">
- /// a character sequence </param>
- /// <param name="offset">
- /// the offset to the char values in the chars array to be converted
- /// </param>
- /// <returns> the Unicode code point at the given index </returns>
- /// <exception cref="NullPointerException">
- /// - if the sequence is null. </exception>
- /// <exception cref="IndexOutOfBoundsException">
- /// - if the value offset is negative or not less than the length of
- /// the character sequence. </exception>
- public abstract int CodePointAt(string seq, int offset);
-
- /// <summary>
- /// Returns the code point at the given index of the char array where only elements
- /// with index less than the limit are used.
- /// Depending on the <seealso cref="LuceneVersion"/> passed to
- /// <seealso cref="CharacterUtils#getInstance(Version)"/> this method mimics the behavior
- /// of <seealso cref="Character#codePointAt(char[], int)"/> as it would have been
- /// available on a Java 1.4 JVM or on a later virtual machine version.
- /// </summary>
- /// <param name="chars">
- /// a character array </param>
- /// <param name="offset">
- /// the offset to the char values in the chars array to be converted </param>
- /// <param name="limit"> the index afer the last element that should be used to calculate
- /// codepoint.
- /// </param>
- /// <returns> the Unicode code point at the given index </returns>
- /// <exception cref="NullPointerException">
- /// - if the array is null. </exception>
- /// <exception cref="IndexOutOfBoundsException">
- /// - if the value offset is negative or not less than the length of
- /// the char array. </exception>
- public abstract int CodePointAt(char[] chars, int offset, int limit);
-
- /// <summary>
- /// Return the number of characters in <code>seq</code>. </summary>
- public abstract int CodePointCount(string seq);
-
- /// <summary>
- /// Creates a new <seealso cref="CharacterBuffer"/> and allocates a <code>char[]</code>
- /// of the given bufferSize.
- /// </summary>
- /// <param name="bufferSize">
- /// the internal char buffer size, must be <code>>= 2</code> </param>
- /// <returns> a new <seealso cref="CharacterBuffer"/> instance. </returns>
- public static CharacterBuffer NewCharacterBuffer(int bufferSize)
- {
- if (bufferSize < 2)
- {
- throw new System.ArgumentException("buffersize must be >= 2");
- }
- return new CharacterBuffer(new char[bufferSize], 0, 0);
- }
-
-
- /// <summary>
- /// Converts each unicode codepoint to lowerCase via <seealso cref="Character#toLowerCase(int)"/> starting
- /// at the given offset. </summary>
- /// <param name="buffer"> the char buffer to lowercase </param>
- /// <param name="offset"> the offset to start at </param>
- /// <param name="limit"> the max char in the buffer to lower case </param>
- public void ToLower(char[] buffer, int offset, int limit)
- {
- Debug.Assert(buffer.Length >= limit);
- Debug.Assert(offset <= 0 && offset <= buffer.Length);
- for (int i = offset; i < limit;)
- {
- i += Character.ToChars(char.ToLower((char)CodePointAt(buffer, i, limit)), buffer, i);
- }
- }
-
- /// <summary>
- /// Converts each unicode codepoint to UpperCase via <seealso cref="Character#toUpperCase(int)"/> starting
- /// at the given offset. </summary>
- /// <param name="buffer"> the char buffer to UPPERCASE </param>
- /// <param name="offset"> the offset to start at </param>
- /// <param name="limit"> the max char in the buffer to lower case </param>
- public void ToUpper(char[] buffer, int offset, int limit)
- {
- Debug.Assert(buffer.Length >= limit);
- Debug.Assert(offset <= 0 && offset <= buffer.Length);
- for (int i = offset; i < limit;)
- {
- i += Character.ToChars(char.ToUpper((char)CodePointAt(buffer, i, limit)), buffer, i);
- }
- }
-
- /// <summary>
- /// Converts a sequence of Java characters to a sequence of unicode code points. </summary>
- /// <returns> the number of code points written to the destination buffer </returns>
- public int toCodePoints(char[] src, int srcOff, int srcLen, int[] dest, int destOff)
- {
- if (srcLen < 0)
- {
- throw new System.ArgumentException("srcLen must be >= 0");
- }
- int codePointCount_Renamed = 0;
- for (int i = 0; i < srcLen;)
- {
- int cp = CodePointAt(src, srcOff + i, srcOff + srcLen);
- int charCount = Character.CharCount(cp);
- dest[destOff + codePointCount_Renamed++] = cp;
- i += charCount;
- }
- return codePointCount_Renamed;
- }
-
- /// <summary>
- /// Converts a sequence of unicode code points to a sequence of Java characters. </summary>
- /// <returns> the number of chars written to the destination buffer </returns>
- public int toChars(int[] src, int srcOff, int srcLen, char[] dest, int destOff)
- {
- if (srcLen < 0)
- {
- throw new System.ArgumentException("srcLen must be >= 0");
- }
- int written = 0;
- for (int i = 0; i < srcLen; ++i)
- {
- written += Character.ToChars(src[srcOff + i], dest, destOff + written);
- }
- return written;
- }
-
- /// <summary>
- /// Fills the <seealso cref="CharacterBuffer"/> with characters read from the given
- /// reader <seealso cref="Reader"/>. This method tries to read <code>numChars</code>
- /// characters into the <seealso cref="CharacterBuffer"/>, each call to fill will start
- /// filling the buffer from offset <code>0</code> up to <code>numChars</code>.
- /// In case code points can span across 2 java characters, this method may
- /// only fill <code>numChars - 1</code> characters in order not to split in
- /// the middle of a surrogate pair, even if there are remaining characters in
- /// the <seealso cref="Reader"/>.
- /// <para>
- /// Depending on the <seealso cref="LuceneVersion"/> passed to
- /// <seealso cref="CharacterUtils#getInstance(Version)"/> this method implements
- /// supplementary character awareness when filling the given buffer. For all
- /// <seealso cref="LuceneVersion"/> > 3.0 <seealso cref="#fill(CharacterBuffer, Reader, int)"/> guarantees
- /// that the given <seealso cref="CharacterBuffer"/> will never contain a high surrogate
- /// character as the last element in the buffer unless it is the last available
- /// character in the reader. In other words, high and low surrogate pairs will
- /// always be preserved across buffer boarders.
- /// </para>
- /// <para>
- /// A return value of <code>false</code> means that this method call exhausted
- /// the reader, but there may be some bytes which have been read, which can be
- /// verified by checking whether <code>buffer.getLength() > 0</code>.
- /// </para>
- /// </summary>
- /// <param name="buffer">
- /// the buffer to fill. </param>
- /// <param name="reader">
- /// the reader to read characters from. </param>
- /// <param name="numChars">
- /// the number of chars to read </param>
- /// <returns> <code>false</code> if and only if reader.read returned -1 while trying to fill the buffer </returns>
- /// <exception cref="IOException">
- /// if the reader throws an <seealso cref="IOException"/>. </exception>
- public abstract bool Fill(CharacterBuffer buffer, Reader reader, int numChars);
-
- /// <summary>
- /// Convenience method which calls <code>fill(buffer, reader, buffer.buffer.length)</code>. </summary>
- public virtual bool Fill(CharacterBuffer buffer, Reader reader)
- {
- return Fill(buffer, reader, buffer.buffer.Length);
- }
-
- /// <summary>
- /// Return the index within <code>buf[start:start+count]</code> which is by <code>offset</code>
- /// code points from <code>index</code>.
- /// </summary>
- public abstract int OffsetByCodePoints(char[] buf, int start, int count, int index, int offset);
-
- internal static int ReadFully(Reader reader, char[] dest, int offset, int len)
- {
- int read = 0;
- while (read < len)
- {
- int r = reader.Read(dest, offset + read, len - read);
- if (r == -1)
- {
- break;
- }
- read += r;
- }
- return read;
- }
-
- private sealed class Java5CharacterUtils : CharacterUtils
- {
- internal Java5CharacterUtils()
- {
- }
-
- public override int CodePointAt(string seq, int offset)
- {
- return Character.CodePointAt(seq, offset);
- }
-
- public override int CodePointAt(char[] chars, int offset, int limit)
- {
- return Character.CodePointAt(chars, offset, limit);
- }
-
- public override bool Fill(CharacterBuffer buffer, Reader reader, int numChars)
- {
- Debug.Assert(buffer.buffer.Length >= 2);
- if (numChars < 2 || numChars > buffer.buffer.Length)
- {
- throw new System.ArgumentException("numChars must be >= 2 and <= the buffer size");
- }
- char[] charBuffer = buffer.buffer;
- buffer.offset = 0;
- int offset;
-
- // Install the previously saved ending high surrogate:
- if (buffer.lastTrailingHighSurrogate != 0)
- {
- charBuffer[0] = buffer.lastTrailingHighSurrogate;
- buffer.lastTrailingHighSurrogate = (char)0;
- offset = 1;
- }
- else
- {
- offset = 0;
- }
-
- int read = ReadFully(reader, charBuffer, offset, numChars - offset);
-
- buffer.length = offset + read;
- bool result = buffer.length == numChars;
- if (buffer.length < numChars)
- {
- // We failed to fill the buffer. Even if the last char is a high
- // surrogate, there is nothing we can do
- return result;
- }
-
- if (char.IsHighSurrogate(charBuffer[buffer.length - 1]))
- {
- buffer.lastTrailingHighSurrogate = charBuffer[--buffer.length];
- }
- return result;
- }
-
- public override int CodePointCount(string seq)
- {
- return Character.CodePointCount(seq, 0, seq.Length);
- }
-
- public override int OffsetByCodePoints(char[] buf, int start, int count, int index, int offset)
- {
- return Character.OffsetByCodePoints(buf, start, count, index, offset);
- }
- }
-
- private sealed class Java4CharacterUtils : CharacterUtils
- {
- internal Java4CharacterUtils()
- {
- }
-
- public override int CodePointAt(string seq, int offset)
- {
- return seq[offset];
- }
-
- public override int CodePointAt(char[] chars, int offset, int limit)
- {
- if (offset >= limit)
- {
- throw new System.IndexOutOfRangeException("offset must be less than limit");
- }
- return chars[offset];
- }
-
- public override bool Fill(CharacterBuffer buffer, Reader reader, int numChars)
- {
- Debug.Assert(buffer.buffer.Length >= 1);
- if (numChars < 1 || numChars > buffer.buffer.Length)
- {
- throw new System.ArgumentException("numChars must be >= 1 and <= the buffer size");
- }
- buffer.offset = 0;
- int read = ReadFully(reader, buffer.buffer, 0, numChars);
- buffer.length = read;
- buffer.lastTrailingHighSurrogate = (char)0;
- return read == numChars;
- }
-
- public override int CodePointCount(string seq)
- {
- return seq.Length;
- }
-
- public override int OffsetByCodePoints(char[] buf, int start, int count, int index, int offset)
- {
- int result = index + offset;
- if (result < 0 || result > count)
- {
- throw new System.IndexOutOfRangeException();
- }
- return result;
- }
-
- }
-
- /// <summary>
- /// A simple IO buffer to use with
- /// <seealso cref="CharacterUtils#fill(CharacterBuffer, Reader)"/>.
- /// </summary>
- public sealed class CharacterBuffer
- {
-
- internal readonly char[] buffer;
- internal int offset;
- internal int length;
- // NOTE: not private so outer class can access without
- // $access methods:
- internal char lastTrailingHighSurrogate;
-
- internal CharacterBuffer(char[] buffer, int offset, int length)
- {
- this.buffer = buffer;
- this.offset = offset;
- this.length = length;
- }
-
- /// <summary>
- /// Returns the internal buffer
- /// </summary>
- /// <returns> the buffer </returns>
- public char[] Buffer
- {
- get
- {
- return buffer;
- }
- }
-
- /// <summary>
- /// Returns the data offset in the internal buffer.
- /// </summary>
- /// <returns> the offset </returns>
- public int Offset
- {
- get
- {
- return offset;
- }
- }
-
- /// <summary>
- /// Return the length of the data in the internal buffer starting at
- /// <seealso cref="#getOffset()"/>
- /// </summary>
- /// <returns> the length </returns>
- public int Length
- {
- get
- {
- return length;
- }
- }
-
- /// <summary>
- /// Resets the CharacterBuffer. All internals are reset to its default
- /// values.
- /// </summary>
- public void reset()
- {
- offset = 0;
- length = 0;
- lastTrailingHighSurrogate = (char)0;
- }
- }
-
- }
+ /// <seealso cref="CharacterUtils"/> provides a unified interface to Character-related
+ /// operations to implement backwards compatible character operations based on a
+ /// <seealso cref="LuceneVersion"/> instance.
+ ///
+ /// @lucene.internal
+ /// </summary>
+ public abstract class CharacterUtils
+ {
+ private static readonly CharacterUtils JAVA_4 = new Java4CharacterUtils();
+ private static readonly CharacterUtils JAVA_5 = new Java5CharacterUtils();
+
+ /// <summary>
+ /// Returns a <seealso cref="CharacterUtils"/> implementation according to the given
+ /// <seealso cref="LuceneVersion"/> instance.
+ /// </summary>
+ /// <param name="matchVersion">
+ /// a version instance </param>
+ /// <returns> a <seealso cref="CharacterUtils"/> implementation according to the given
+ /// <seealso cref="LuceneVersion"/> instance. </returns>
+ public static CharacterUtils GetInstance(LuceneVersion matchVersion)
+ {
+ return matchVersion.OnOrAfter(LuceneVersion.LUCENE_31) ? JAVA_5 : JAVA_4;
+ }
+
+ /// <summary>
+ /// Return a <seealso cref="CharacterUtils"/> instance compatible with Java 1.4. </summary>
+ public static CharacterUtils Java4Instance
+ {
+ get
+ {
+ return JAVA_4;
+ }
+ }
+
+ /// <summary>
+ /// Returns the code point at the given index of the <seealso cref="CharSequence"/>.
+ /// Depending on the <seealso cref="LuceneVersion"/> passed to
+ /// <seealso cref="CharacterUtils#getInstance(Version)"/> this method mimics the behavior
+ /// of <seealso cref="Character#codePointAt(char[], int)"/> as it would have been
+ /// available on a Java 1.4 JVM or on a later virtual machine version.
+ /// </summary>
+ /// <param name="seq">
+ /// a character sequence </param>
+ /// <param name="offset">
+ /// the offset to the char values in the chars array to be converted
+ /// </param>
+ /// <returns> the Unicode code point at the given index </returns>
+ /// <exception cref="NullPointerException">
+ /// - if the sequence is null. </exception>
+ /// <exception cref="IndexOutOfBoundsException">
+ /// - if the value offset is negative or not less than the length of
+ /// the character sequence. </exception>
+ public abstract int CodePointAt(string seq, int offset);
+
+ /// <summary>
+ /// Returns the code point at the given index of the char array where only elements
+ /// with index less than the limit are used.
+ /// Depending on the <seealso cref="LuceneVersion"/> passed to
+ /// <seealso cref="CharacterUtils#getInstance(Version)"/> this method mimics the behavior
+ /// of <seealso cref="Character#codePointAt(char[], int)"/> as it would have been
+ /// available on a Java 1.4 JVM or on a later virtual machine version.
+ /// </summary>
+ /// <param name="chars">
+ /// a character array </param>
+ /// <param name="offset">
+ /// the offset to the char values in the chars array to be converted </param>
+ /// <param name="limit"> the index after the last element that should be used to calculate
+ /// codepoint.
+ /// </param>
+ /// <returns> the Unicode code point at the given index </returns>
+ /// <exception cref="NullPointerException">
+ /// - if the array is null. </exception>
+ /// <exception cref="IndexOutOfBoundsException">
+ /// - if the value offset is negative or not less than the length of
+ /// the char array. </exception>
+ public abstract int CodePointAt(char[] chars, int offset, int limit);
+
+ /// <summary>
+ /// Return the number of characters in <code>seq</code>. </summary>
+ public abstract int CodePointCount(string seq);
+
+ /// <summary>
+ /// Creates a new <seealso cref="CharacterBuffer"/> and allocates a <code>char[]</code>
+ /// of the given bufferSize.
+ /// </summary>
+ /// <param name="bufferSize">
+ /// the internal char buffer size, must be <code>>= 2</code> </param>
+ /// <returns> a new <seealso cref="CharacterBuffer"/> instance. </returns>
+ public static CharacterBuffer NewCharacterBuffer(int bufferSize)
+ {
+ if (bufferSize < 2)
+ {
+ throw new System.ArgumentException("buffersize must be >= 2");
+ }
+ return new CharacterBuffer(new char[bufferSize], 0, 0);
+ }
+
+
+ /// <summary>
+ /// Converts each unicode codepoint to lowerCase via <seealso cref="Character#toLowerCase(int)"/> starting
+ /// at the given offset. </summary>
+ /// <param name="buffer"> the char buffer to lowercase </param>
+ /// <param name="offset"> the offset to start at </param>
+ /// <param name="limit"> the max char in the buffer to lower case </param>
+ public void ToLower(char[] buffer, int offset, int limit)
+ {
+ Debug.Assert(buffer.Length >= limit);
+ Debug.Assert(offset <= 0 && offset <= buffer.Length);
+ for (int i = offset; i < limit; )
+ {
+ i += Character.ToChars(char.ToLower((char)CodePointAt(buffer, i, limit)), buffer, i);
+ }
+ }
+
+ /// <summary>
+ /// Converts each unicode codepoint to UpperCase via <seealso cref="Character#toUpperCase(int)"/> starting
+ /// at the given offset. </summary>
+ /// <param name="buffer"> the char buffer to UPPERCASE </param>
+ /// <param name="offset"> the offset to start at </param>
+ /// <param name="limit"> the max char in the buffer to upper case </param>
+ public void ToUpper(char[] buffer, int offset, int limit)
+ {
+ Debug.Assert(buffer.Length >= limit);
+ Debug.Assert(offset <= 0 && offset <= buffer.Length);
+ for (int i = offset; i < limit; )
+ {
+ i += Character.ToChars(char.ToUpper((char)CodePointAt(buffer, i, limit)), buffer, i);
+ }
+ }
+
+ /// <summary>
+ /// Converts a sequence of Java characters to a sequence of unicode code points. </summary>
+ /// <returns> the number of code points written to the destination buffer </returns>
+ public int toCodePoints(char[] src, int srcOff, int srcLen, int[] dest, int destOff)
+ {
+ if (srcLen < 0)
+ {
+ throw new System.ArgumentException("srcLen must be >= 0");
+ }
+ int codePointCount_Renamed = 0;
+ for (int i = 0; i < srcLen; )
+ {
+ int cp = CodePointAt(src, srcOff + i, srcOff + srcLen);
+ int charCount = Character.CharCount(cp);
+ dest[destOff + codePointCount_Renamed++] = cp;
+ i += charCount;
+ }
+ return codePointCount_Renamed;
+ }
+
+ /// <summary>
+ /// Converts a sequence of unicode code points to a sequence of Java characters. </summary>
+ /// <returns> the number of chars written to the destination buffer </returns>
+ public int toChars(int[] src, int srcOff, int srcLen, char[] dest, int destOff)
+ {
+ if (srcLen < 0)
+ {
+ throw new System.ArgumentException("srcLen must be >= 0");
+ }
+ int written = 0;
+ for (int i = 0; i < srcLen; ++i)
+ {
+ written += Character.ToChars(src[srcOff + i], dest, destOff + written);
+ }
+ return written;
+ }
+
+ /// <summary>
+ /// Fills the <seealso cref="CharacterBuffer"/> with characters read from the given
+ /// reader <seealso cref="Reader"/>. This method tries to read <code>numChars</code>
+ /// characters into the <seealso cref="CharacterBuffer"/>, each call to fill will start
+ /// filling the buffer from offset <code>0</code> up to <code>numChars</code>.
+ /// In case code points can span across 2 java characters, this method may
+ /// only fill <code>numChars - 1</code> characters in order not to split in
+ /// the middle of a surrogate pair, even if there are remaining characters in
+ /// the <seealso cref="Reader"/>.
+ /// <para>
+ /// Depending on the <seealso cref="LuceneVersion"/> passed to
+ /// <seealso cref="CharacterUtils#getInstance(Version)"/> this method implements
+ /// supplementary character awareness when filling the given buffer. For all
+ /// <seealso cref="LuceneVersion"/> > 3.0 <seealso cref="#fill(CharacterBuffer, Reader, int)"/> guarantees
+ /// that the given <seealso cref="CharacterBuffer"/> will never contain a high surrogate
+ /// character as the last element in the buffer unless it is the last available
+ /// character in the reader. In other words, high and low surrogate pairs will
+ /// always be preserved across buffer borders.
+ /// </para>
+ /// <para>
+ /// A return value of <code>false</code> means that this method call exhausted
+ /// the reader, but there may be some characters which have been read, which can be
+ /// verified by checking whether <code>buffer.Length > 0</code>.
+ /// </para>
+ /// </summary>
+ /// <param name="buffer">
+ /// the buffer to fill. </param>
+ /// <param name="reader">
+ /// the reader to read characters from. </param>
+ /// <param name="numChars">
+ /// the number of chars to read </param>
+ /// <returns> <code>false</code> if and only if reader.Read returned 0 while trying to fill the buffer </returns>
+ /// <exception cref="IOException">
+ /// if the reader throws an <seealso cref="IOException"/>. </exception>
+ public abstract bool Fill(CharacterBuffer buffer, Reader reader, int numChars);
+
+ /// <summary>
+ /// Convenience method which calls <code>fill(buffer, reader, buffer.buffer.length)</code>. </summary>
+ public virtual bool Fill(CharacterBuffer buffer, Reader reader)
+ {
+ return Fill(buffer, reader, buffer.buffer.Length);
+ }
+
+ /// <summary>
+ /// Return the index within <code>buf[start:start+count]</code> which is by <code>offset</code>
+ /// code points from <code>index</code>.
+ /// </summary>
+ public abstract int OffsetByCodePoints(char[] buf, int start, int count, int index, int offset);
+
+ internal static int ReadFully(Reader reader, char[] dest, int offset, int len)
+ {
+ int read = 0;
+ while (read < len)
+ {
+ int r = reader.Read(dest, offset + read, len - read);
+ if (r == 0)
+ {
+ break;
+ }
+ read += r;
+ }
+ return read;
+ }
+
+ private sealed class Java5CharacterUtils : CharacterUtils
+ {
+ internal Java5CharacterUtils()
+ {
+ }
+
+ public override int CodePointAt(string seq, int offset)
+ {
+ return Character.CodePointAt(seq, offset);
+ }
+
+ public override int CodePointAt(char[] chars, int offset, int limit)
+ {
+ return Character.CodePointAt(chars, offset, limit);
+ }
+
+ public override bool Fill(CharacterBuffer buffer, Reader reader, int numChars)
+ {
+ Debug.Assert(buffer.buffer.Length >= 2);
+ if (numChars < 2 || numChars > buffer.buffer.Length)
+ {
+ throw new System.ArgumentException("numChars must be >= 2 and <= the buffer size");
+ }
+ char[] charBuffer = buffer.buffer;
+ buffer.offset = 0;
+ int offset;
+
+ // Install the previously saved ending high surrogate:
+ if (buffer.lastTrailingHighSurrogate != 0)
+ {
+ charBuffer[0] = buffer.lastTrailingHighSurrogate;
+ buffer.lastTrailingHighSurrogate = (char)0;
+ offset = 1;
+ }
+ else
+ {
+ offset = 0;
+ }
+
+ int read = ReadFully(reader, charBuffer, offset, numChars - offset);
+
+ buffer.length = offset + read;
+ bool result = buffer.length == numChars;
+ if (buffer.length < numChars)
+ {
+ // We failed to fill the buffer. Even if the last char is a high
+ // surrogate, there is nothing we can do
+ return result;
+ }
+
+ if (char.IsHighSurrogate(charBuffer[buffer.length - 1]))
+ {
+ buffer.lastTrailingHighSurrogate = charBuffer[--buffer.length];
+ }
+ return result;
+ }
+
+ public override int CodePointCount(string seq)
+ {
+ return Character.CodePointCount(seq, 0, seq.Length);
+ }
+
+ public override int OffsetByCodePoints(char[] buf, int start, int count, int index, int offset)
+ {
+ return Character.OffsetByCodePoints(buf, start, count, index, offset);
+ }
+ }
+
+ private sealed class Java4CharacterUtils : CharacterUtils
+ {
+ internal Java4CharacterUtils()
+ {
+ }
+
+ public override int CodePointAt(string seq, int offset)
+ {
+ return seq[offset];
+ }
+
+ public override int CodePointAt(char[] chars, int offset, int limit)
+ {
+ if (offset >= limit)
+ {
+ throw new System.IndexOutOfRangeException("offset must be less than limit");
+ }
+ return chars[offset];
+ }
+
+ public override bool Fill(CharacterBuffer buffer, Reader reader, int numChars)
+ {
+ Debug.Assert(buffer.buffer.Length >= 1);
+ if (numChars < 1 || numChars > buffer.buffer.Length)
+ {
+ throw new System.ArgumentException("numChars must be >= 1 and <= the buffer size");
+ }
+ buffer.offset = 0;
+ int read = ReadFully(reader, buffer.buffer, 0, numChars);
+ buffer.length = read;
+ buffer.lastTrailingHighSurrogate = (char)0;
+ return read == numChars;
+ }
+
+ public override int CodePointCount(string seq)
+ {
+ return seq.Length;
+ }
+
+ public override int OffsetByCodePoints(char[] buf, int start, int count, int index, int offset)
+ {
+ int result = index + offset;
+ if (result < 0 || result > count)
+ {
+ throw new System.IndexOutOfRangeException();
+ }
+ return result;
+ }
+
+ }
+
+ /// <summary>
+ /// A simple IO buffer to use with
+ /// <seealso cref="CharacterUtils#fill(CharacterBuffer, Reader)"/>.
+ /// </summary>
+ public sealed class CharacterBuffer
+ {
+
+ internal readonly char[] buffer;
+ internal int offset;
+ internal int length;
+ // NOTE: not private so outer class can access without
+ // $access methods:
+ internal char lastTrailingHighSurrogate;
+
+ internal CharacterBuffer(char[] buffer, int offset, int length)
+ {
+ this.buffer = buffer;
+ this.offset = offset;
+ this.length = length;
+ }
+
+ /// <summary>
+ /// Returns the internal buffer
+ /// </summary>
+ /// <returns> the buffer </returns>
+ public char[] Buffer
+ {
+ get
+ {
+ return buffer;
+ }
+ }
+
+ /// <summary>
+ /// Returns the data offset in the internal buffer.
+ /// </summary>
+ /// <returns> the offset </returns>
+ public int Offset
+ {
+ get
+ {
+ return offset;
+ }
+ }
+
+ /// <summary>
+ /// Return the length of the data in the internal buffer starting at
+ /// <seealso cref="#getOffset()"/>
+ /// </summary>
+ /// <returns> the length </returns>
+ public int Length
+ {
+ get
+ {
+ return length;
+ }
+ }
+
+ /// <summary>
+ /// Resets the CharacterBuffer. All internals are reset to their default
+ /// values.
+ /// </summary>
+ public void reset()
+ {
+ offset = 0;
+ length = 0;
+ lastTrailingHighSurrogate = (char)0;
+ }
+ }
+
+ }
}
\ No newline at end of file
[2/3] lucenenet git commit: More porting work
Posted by sy...@apache.org.
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b4eaf2fc/src/Lucene.Net.Analysis.Common/Analysis/Standard/StandardAnalyzer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Standard/StandardAnalyzer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Standard/StandardAnalyzer.cs
index ecb534f..d0502aa 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Standard/StandardAnalyzer.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Standard/StandardAnalyzer.cs
@@ -2,145 +2,148 @@
using Lucene.Net.Analysis.Core;
using Lucene.Net.Analysis.Util;
using Lucene.Net.Util;
-using org.apache.lucene.analysis.standard;
namespace Lucene.Net.Analysis.Standard
{
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
/// <summary>
- /// Filters <seealso cref="StandardTokenizer"/> with <seealso cref="StandardFilter"/>, {@link
- /// LowerCaseFilter} and <seealso cref="StopFilter"/>, using a list of
- /// English stop words.
- ///
- /// <a name="version"/>
- /// <para>You must specify the required <seealso cref="LuceneVersion"/>
- /// compatibility when creating StandardAnalyzer:
- /// <ul>
- /// <li> As of 3.4, Hiragana and Han characters are no longer wrongly split
- /// from their combining characters. If you use a previous version number,
- /// you get the exact broken behavior for backwards compatibility.
- /// <li> As of 3.1, StandardTokenizer implements Unicode text segmentation,
- /// and StopFilter correctly handles Unicode 4.0 supplementary characters
- /// in stopwords. <seealso cref="ClassicTokenizer"/> and <seealso cref="ClassicAnalyzer"/>
- /// are the pre-3.1 implementations of StandardTokenizer and
- /// StandardAnalyzer.
- /// <li> As of 2.9, StopFilter preserves position increments
- /// <li> As of 2.4, Tokens incorrectly identified as acronyms
- /// are corrected (see <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)
- /// </ul>
- /// </para>
- /// </summary>
- public sealed class StandardAnalyzer : StopwordAnalyzerBase
- {
-
- /// <summary>
- /// Default maximum allowed token length </summary>
- public const int DEFAULT_MAX_TOKEN_LENGTH = 255;
-
- private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
-
- /// <summary>
- /// An unmodifiable set containing some common English words that are usually not
- /// useful for searching.
- /// </summary>
- public static readonly CharArraySet STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
-
- /// <summary>
- /// Builds an analyzer with the given stop words. </summary>
- /// <param name="matchVersion"> Lucene version to match See {@link
- /// <a href="#version">above</a>} </param>
- /// <param name="stopWords"> stop words </param>
- public StandardAnalyzer(LuceneVersion matchVersion, CharArraySet stopWords) : base(matchVersion, stopWords)
- {
- }
-
- /// <summary>
- /// Builds an analyzer with the default stop words ({@link
- /// #STOP_WORDS_SET}). </summary>
- /// <param name="matchVersion"> Lucene version to match See {@link
- /// <a href="#version">above</a>} </param>
- public StandardAnalyzer(LuceneVersion matchVersion) : this(matchVersion, STOP_WORDS_SET)
- {
- }
-
- /// <summary>
- /// Builds an analyzer with the stop words from the given reader. </summary>
- /// <seealso cref= WordlistLoader#getWordSet(Reader, Version) </seealso>
- /// <param name="matchVersion"> Lucene version to match See {@link
- /// <a href="#version">above</a>} </param>
- /// <param name="stopwords"> Reader to read stop words from </param>
- public StandardAnalyzer(LuceneVersion matchVersion, TextReader stopwords) : this(matchVersion, loadStopwordSet(stopwords, matchVersion))
- {
- }
-
- /// <summary>
- /// Set maximum allowed token length. If a token is seen
- /// that exceeds this length then it is discarded. This
- /// setting only takes effect the next time tokenStream or
- /// tokenStream is called.
- /// </summary>
- public int MaxTokenLength
- {
- set
- {
- maxTokenLength = value;
- }
- get
- {
- return maxTokenLength;
- }
- }
-
-
- protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
- {
- var src = new StandardTokenizer(matchVersion, reader);
- src.MaxTokenLength = maxTokenLength;
- TokenStream tok = new StandardFilter(matchVersion, src);
- tok = new LowerCaseFilter(matchVersion, tok);
- tok = new StopFilter(matchVersion, tok, stopwords);
- return new TokenStreamComponentsAnonymousInnerClassHelper(this, src, tok, reader);
- }
-
- private class TokenStreamComponentsAnonymousInnerClassHelper : TokenStreamComponents
- {
- private readonly StandardAnalyzer outerInstance;
-
- private TextReader reader;
- private readonly StandardTokenizer src;
-
- public TokenStreamComponentsAnonymousInnerClassHelper(StandardAnalyzer outerInstance, StandardTokenizer src, TokenStream tok, Reader reader) : base(src, tok)
- {
- this.outerInstance = outerInstance;
- this.reader = reader;
- this.src = src;
- }
-
- protected internal override Reader Reader
- {
- set
- {
- src.MaxTokenLength = outerInstance.maxTokenLength;
- base.Reader = value;
- }
- }
- }
- }
+ /// Filters <seealso cref="StandardTokenizer"/> with <seealso cref="StandardFilter"/>,
+ /// <seealso cref="LowerCaseFilter"/> and <seealso cref="StopFilter"/>, using a list of
+ /// English stop words.
+ ///
+ /// <a name="version"/>
+ /// <para>You must specify the required <seealso cref="LuceneVersion"/>
+ /// compatibility when creating StandardAnalyzer:
+ /// <ul>
+ /// <li> As of 3.4, Hiragana and Han characters are no longer wrongly split
+ /// from their combining characters. If you use a previous version number,
+ /// you get the exact broken behavior for backwards compatibility.
+ /// <li> As of 3.1, StandardTokenizer implements Unicode text segmentation,
+ /// and StopFilter correctly handles Unicode 4.0 supplementary characters
+ /// in stopwords. <seealso cref="ClassicTokenizer"/> and <seealso cref="ClassicAnalyzer"/>
+ /// are the pre-3.1 implementations of StandardTokenizer and
+ /// StandardAnalyzer.
+ /// <li> As of 2.9, StopFilter preserves position increments
+ /// <li> As of 2.4, Tokens incorrectly identified as acronyms
+ /// are corrected (see <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)
+ /// </ul>
+ /// </para>
+ /// </summary>
+ public sealed class StandardAnalyzer : StopwordAnalyzerBase
+ {
+
+ /// <summary>
+ /// Default maximum allowed token length </summary>
+ public const int DEFAULT_MAX_TOKEN_LENGTH = 255;
+
+ private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
+
+ /// <summary>
+ /// An unmodifiable set containing some common English words that are usually not
+ /// useful for searching.
+ /// </summary>
+ public static readonly CharArraySet STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
+
+ /// <summary>
+ /// Builds an analyzer with the given stop words. </summary>
+ /// <param name="matchVersion"> Lucene version to match; see the version
+ /// compatibility notes in the class summary </param>
+ /// <param name="stopWords"> stop words </param>
+ public StandardAnalyzer(LuceneVersion matchVersion, CharArraySet stopWords)
+ : base(matchVersion, stopWords)
+ {
+ }
+
+ /// <summary>
+ /// Builds an analyzer with the default stop words
+ /// (<see cref="STOP_WORDS_SET"/>). </summary>
+ /// <param name="matchVersion"> Lucene version to match; see the version
+ /// compatibility notes in the class summary </param>
+ public StandardAnalyzer(LuceneVersion matchVersion)
+ : this(matchVersion, STOP_WORDS_SET)
+ {
+ }
+
+ /// <summary>
+ /// Builds an analyzer with the stop words from the given reader. </summary>
+ /// <remarks> See also (Javadoc-style reference): WordlistLoader#getWordSet(Reader, Version) </remarks>
+ /// <param name="matchVersion"> Lucene version to match; see the version
+ /// compatibility notes in the class summary </param>
+ /// <param name="stopwords"> Reader to read stop words from </param>
+ public StandardAnalyzer(LuceneVersion matchVersion, TextReader stopwords)
+ : this(matchVersion, loadStopwordSet(stopwords, matchVersion))
+ {
+ }
+
+ /// <summary>
+ /// Gets or sets the maximum allowed token length. If a token is seen
+ /// that exceeds this length then it is discarded. The value is applied
+ /// to the tokenizer when CreateComponents runs and again each time the
+ /// components' Reader is set.
+ /// </summary>
+ public int MaxTokenLength
+ {
+ set
+ {
+ maxTokenLength = value;
+ }
+ get
+ {
+ return maxTokenLength;
+ }
+ }
+
+ /// <summary> Builds the chain StandardTokenizer -> StandardFilter -> LowerCaseFilter -> StopFilter over the reader. </summary>
+ public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
+ {
+ var src = new StandardTokenizer(matchVersion, reader);
+ src.MaxTokenLength = maxTokenLength;
+ TokenStream tok = new StandardFilter(matchVersion, src);
+ tok = new LowerCaseFilter(matchVersion, tok);
+ tok = new StopFilter(matchVersion, tok, stopwords);
+ return new TokenStreamComponentsAnonymousInnerClassHelper(this, src, tok, reader);
+ }
+ /// <summary> Components that re-apply the analyzer's current max token length whenever the reader is set. </summary>
+ private class TokenStreamComponentsAnonymousInnerClassHelper : TokenStreamComponents
+ {
+ private readonly StandardAnalyzer outerInstance;
+ // NOTE(review): 'reader' is assigned in the constructor but never read within this class.
+ private TextReader reader;
+ private readonly StandardTokenizer src;
+
+ public TokenStreamComponentsAnonymousInnerClassHelper(StandardAnalyzer outerInstance, StandardTokenizer src, TokenStream tok, TextReader reader)
+ : base(src, tok)
+ {
+ this.outerInstance = outerInstance;
+ this.reader = reader;
+ this.src = src;
+ }
+ /// <summary> Copies the analyzer's max token length onto the tokenizer, then forwards the reader to the base class. </summary>
+ protected override TextReader Reader
+ {
+ set
+ {
+ src.MaxTokenLength = outerInstance.maxTokenLength;
+ base.Reader = value;
+ }
+ }
+ }
+ }
}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b4eaf2fc/src/Lucene.Net.Analysis.Common/Analysis/Standard/StandardFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Standard/StandardFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Standard/StandardFilter.cs
index a2641ce..6093cd6 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Standard/StandardFilter.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Standard/StandardFilter.cs
@@ -1,102 +1,91 @@
-using Lucene.Net.Analysis.Standard;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
-namespace org.apache.lucene.analysis.standard
+namespace Lucene.Net.Analysis.Standard
{
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+ /// <summary>
+ /// Normalizes tokens extracted with <seealso cref="StandardTokenizer"/>.
+ /// </summary>
+ public class StandardFilter : TokenFilter
+ {
+ private readonly LuceneVersion matchVersion;
- using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
- using TypeAttribute = org.apache.lucene.analysis.tokenattributes.TypeAttribute;
- using Version = org.apache.lucene.util.Version;
+ public StandardFilter(LuceneVersion matchVersion, TokenStream @in)
+ : base(@in)
+ {
+ this.matchVersion = matchVersion;
+ typeAtt = AddAttribute<ITypeAttribute>();
+ termAtt = AddAttribute<ICharTermAttribute>();
+ }
- /// <summary>
- /// Normalizes tokens extracted with <seealso cref="StandardTokenizer"/>.
- /// </summary>
- public class StandardFilter : TokenFilter
- {
- private readonly Version matchVersion;
+ private static readonly string APOSTROPHE_TYPE = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.APOSTROPHE];
+ private static readonly string ACRONYM_TYPE = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.ACRONYM];
- public StandardFilter(Version matchVersion, TokenStream @in) : base(@in)
- {
- this.matchVersion = matchVersion;
- }
+ // this filter uses the type and term attributes
+ private readonly ITypeAttribute typeAtt;
+ private readonly ICharTermAttribute termAtt;
- private static readonly string APOSTROPHE_TYPE = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.APOSTROPHE];
- private static readonly string ACRONYM_TYPE = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.ACRONYM];
+ public override bool IncrementToken()
+ {
+ if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
+ {
+ return input.IncrementToken(); // TODO: add some niceties for the new grammar
+ }
+ else
+ {
+ return IncrementTokenClassic();
+ }
+ }
- // this filters uses attribute type
- private readonly TypeAttribute typeAtt = addAttribute(typeof(TypeAttribute));
- private readonly CharTermAttribute termAtt = addAttribute(typeof(CharTermAttribute));
+ public bool IncrementTokenClassic()
+ {
+ if (!input.IncrementToken())
+ {
+ return false;
+ }
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: @Override public final boolean incrementToken() throws java.io.IOException
- public override bool incrementToken()
- {
- if (matchVersion.onOrAfter(Version.LUCENE_31))
- {
- return input.incrementToken(); // TODO: add some niceties for the new grammar
- }
- else
- {
- return incrementTokenClassic();
- }
- }
+ char[] buffer = termAtt.Buffer();
+ int bufferLength = termAtt.Length;
+ string type = typeAtt.Type;
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public final boolean incrementTokenClassic() throws java.io.IOException
- public bool incrementTokenClassic()
- {
- if (!input.incrementToken())
- {
- return false;
- }
+ if (type == APOSTROPHE_TYPE && bufferLength >= 2 && buffer[bufferLength - 2] == '\'' && (buffer[bufferLength - 1] == 's' || buffer[bufferLength - 1] == 'S')) // remove 's
+ {
+ // Strip last 2 characters off
+ termAtt.Length = bufferLength - 2;
+ } // remove dots
+ else if (type == ACRONYM_TYPE)
+ {
+ int upto = 0;
+ for (int i = 0; i < bufferLength; i++)
+ {
+ char c = buffer[i];
+ if (c != '.')
+ {
+ buffer[upto++] = c;
+ }
+ }
+ termAtt.Length = upto;
+ }
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final char[] buffer = termAtt.buffer();
- char[] buffer = termAtt.buffer();
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final int bufferLength = termAtt.length();
- int bufferLength = termAtt.length();
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final String type = typeAtt.type();
- string type = typeAtt.type();
-
- if (type == APOSTROPHE_TYPE && bufferLength >= 2 && buffer[bufferLength - 2] == '\'' && (buffer[bufferLength - 1] == 's' || buffer[bufferLength - 1] == 'S')) // remove 's
- {
- // Strip last 2 characters off
- termAtt.Length = bufferLength - 2;
- } // remove dots
- else if (type == ACRONYM_TYPE)
- {
- int upto = 0;
- for (int i = 0;i < bufferLength;i++)
- {
- char c = buffer[i];
- if (c != '.')
- {
- buffer[upto++] = c;
- }
- }
- termAtt.Length = upto;
- }
-
- return true;
- }
- }
+ return true;
+ }
+ }
}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b4eaf2fc/src/Lucene.Net.Analysis.Common/Analysis/Standard/StandardFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Standard/StandardFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Standard/StandardFilterFactory.cs
index eab0156..b634397 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Standard/StandardFilterFactory.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Standard/StandardFilterFactory.cs
@@ -1,4 +1,5 @@
using System.Collections.Generic;
+using Lucene.Net.Analysis.Standard;
using TokenFilterFactory = Lucene.Net.Analysis.Util.TokenFilterFactory;
namespace org.apache.lucene.analysis.standard
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b4eaf2fc/src/Lucene.Net.Analysis.Common/Analysis/Standard/StandardTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Standard/StandardTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Standard/StandardTokenizer.cs
index abf55e8..e47b481 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Standard/StandardTokenizer.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Standard/StandardTokenizer.cs
@@ -16,7 +16,10 @@
*/
using System;
using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
using org.apache.lucene.analysis.standard;
+using Version = Lucene.Net.Util.LuceneVersion;
+using Reader = System.IO.TextReader;
namespace Lucene.Net.Analysis.Standard
{
@@ -144,15 +147,15 @@ namespace Lucene.Net.Analysis.Standard
{
this.scanner = new StandardTokenizerImpl(input);
}
- else if (matchVersion.onOrAfter(Version.LUCENE_40))
+ else if (matchVersion.OnOrAfter(Version.LUCENE_40))
{
this.scanner = new StandardTokenizerImpl40(input);
}
- else if (matchVersion.onOrAfter(Version.LUCENE_34))
+ else if (matchVersion.OnOrAfter(Version.LUCENE_34))
{
this.scanner = new StandardTokenizerImpl34(input);
}
- else if (matchVersion.onOrAfter(Version.LUCENE_31))
+ else if (matchVersion.OnOrAfter(Version.LUCENE_31))
{
this.scanner = new StandardTokenizerImpl31(input);
}
@@ -229,9 +232,9 @@ namespace Lucene.Net.Analysis.Standard
posIncrAtt.PositionIncrement = posIncrAtt.PositionIncrement + skippedPositions;
}
- public override void Close()
+ public override void Dispose()
{
- base.Close();
+ base.Dispose();
scanner.yyreset(input);
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b4eaf2fc/src/Lucene.Net.Analysis.Common/Analysis/Standard/StandardTokenizerFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Standard/StandardTokenizerFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Standard/StandardTokenizerFactory.cs
index 0b6bbe6..2c4560f 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Standard/StandardTokenizerFactory.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Standard/StandardTokenizerFactory.cs
@@ -1,8 +1,9 @@
using System.Collections.Generic;
-using Lucene.Net.Analysis.Standard;
-using TokenizerFactory = Lucene.Net.Analysis.Util.TokenizerFactory;
+using System.IO;
+using Lucene.Net.Analysis.Util;
+using Lucene.Net.Util;
-namespace org.apache.lucene.analysis.standard
+namespace Lucene.Net.Analysis.Standard
{
/*
@@ -21,12 +22,7 @@ namespace org.apache.lucene.analysis.standard
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-
- using TokenizerFactory = TokenizerFactory;
- using AttributeFactory = org.apache.lucene.util.AttributeSource.AttributeFactory;
-
-
- /// <summary>
+ /// <summary>
/// Factory for <seealso cref="StandardTokenizer"/>.
/// <pre class="prettyprint">
/// <fieldType name="text_stndrd" class="solr.TextField" positionIncrementGap="100">
@@ -51,9 +47,9 @@ namespace org.apache.lucene.analysis.standard
}
}
- public override StandardTokenizer create(AttributeFactory factory, Reader input)
+ public override Tokenizer Create(AttributeSource.AttributeFactory factory, TextReader input)
{
- StandardTokenizer tokenizer = new StandardTokenizer(luceneMatchVersion, factory, input);
+ var tokenizer = new StandardTokenizer(luceneMatchVersion, factory, input);
tokenizer.MaxTokenLength = maxTokenLength;
return tokenizer;
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b4eaf2fc/src/Lucene.Net.Analysis.Common/Analysis/Standard/StandardTokenizerImpl.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Standard/StandardTokenizerImpl.cs b/src/Lucene.Net.Analysis.Common/Analysis/Standard/StandardTokenizerImpl.cs
index 86ba884..44a9bbe 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Standard/StandardTokenizerImpl.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Standard/StandardTokenizerImpl.cs
@@ -22,8 +22,6 @@ namespace org.apache.lucene.analysis.standard
* limitations under the License.
*/
- using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-
/// <summary>
/// This class implements Word Break rules from the Unicode Text Segmentation
/// algorithm, as specified in
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b4eaf2fc/src/Lucene.Net.Analysis.Common/Analysis/Standard/UAX29URLEmailAnalyzer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Standard/UAX29URLEmailAnalyzer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Standard/UAX29URLEmailAnalyzer.cs
index 628ca23..273896b 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Standard/UAX29URLEmailAnalyzer.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Standard/UAX29URLEmailAnalyzer.cs
@@ -1,9 +1,10 @@
using Lucene.Net.Analysis.Core;
-using Lucene.Net.Analysis.Standard;
using Lucene.Net.Analysis.Util;
-using StopwordAnalyzerBase = Lucene.Net.Analysis.Util.StopwordAnalyzerBase;
+using Lucene.Net.Util;
+using org.apache.lucene.analysis.standard;
+using Reader = System.IO.TextReader;
-namespace org.apache.lucene.analysis.standard
+namespace Lucene.Net.Analysis.Standard
{
/*
@@ -22,18 +23,9 @@ namespace org.apache.lucene.analysis.standard
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-
- using LowerCaseFilter = LowerCaseFilter;
- using StopAnalyzer = StopAnalyzer;
- using StopFilter = StopFilter;
- using CharArraySet = CharArraySet;
- using StopwordAnalyzerBase = StopwordAnalyzerBase;
- using Version = org.apache.lucene.util.Version;
-
-
- /// <summary>
+ /// <summary>
/// Filters <seealso cref="org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer"/>
- /// with <seealso cref="org.apache.lucene.analysis.standard.StandardFilter"/>,
+ /// with <seealso cref="StandardFilter"/>,
/// <seealso cref="LowerCaseFilter"/> and
/// <seealso cref="StopFilter"/>, using a list of
/// English stop words.
@@ -64,7 +56,7 @@ namespace org.apache.lucene.analysis.standard
/// <param name="matchVersion"> Lucene version to match See {@link
/// <a href="#version">above</a>} </param>
/// <param name="stopWords"> stop words </param>
- public UAX29URLEmailAnalyzer(Version matchVersion, CharArraySet stopWords) : base(matchVersion, stopWords)
+ public UAX29URLEmailAnalyzer(LuceneVersion matchVersion, CharArraySet stopWords) : base(matchVersion, stopWords)
{
}
@@ -73,7 +65,7 @@ namespace org.apache.lucene.analysis.standard
/// #STOP_WORDS_SET}). </summary>
/// <param name="matchVersion"> Lucene version to match See {@link
/// <a href="#version">above</a>} </param>
- public UAX29URLEmailAnalyzer(Version matchVersion) : this(matchVersion, STOP_WORDS_SET)
+ public UAX29URLEmailAnalyzer(LuceneVersion matchVersion) : this(matchVersion, STOP_WORDS_SET)
{
}
@@ -83,9 +75,7 @@ namespace org.apache.lucene.analysis.standard
/// <param name="matchVersion"> Lucene version to match See {@link
/// <a href="#version">above</a>} </param>
/// <param name="stopwords"> Reader to read stop words from </param>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public UAX29URLEmailAnalyzer(org.apache.lucene.util.Version matchVersion, java.io.Reader stopwords) throws java.io.IOException
- public UAX29URLEmailAnalyzer(Version matchVersion, Reader stopwords) : this(matchVersion, loadStopwordSet(stopwords, matchVersion))
+ public UAX29URLEmailAnalyzer(LuceneVersion matchVersion, Reader stopwords) : this(matchVersion, loadStopwordSet(stopwords, matchVersion))
{
}
@@ -108,12 +98,8 @@ namespace org.apache.lucene.analysis.standard
}
-//JAVA TO C# CONVERTER WARNING: 'final' parameters are not available in .NET:
-//ORIGINAL LINE: @Override protected TokenStreamComponents createComponents(final String fieldName, final java.io.Reader reader)
- protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
+ public override TokenStreamComponents CreateComponents(string fieldName, Reader reader)
{
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final UAX29URLEmailTokenizer src = new UAX29URLEmailTokenizer(matchVersion, reader);
UAX29URLEmailTokenizer src = new UAX29URLEmailTokenizer(matchVersion, reader);
src.MaxTokenLength = maxTokenLength;
TokenStream tok = new StandardFilter(matchVersion, src);
@@ -127,19 +113,16 @@ namespace org.apache.lucene.analysis.standard
private readonly UAX29URLEmailAnalyzer outerInstance;
private Reader reader;
- private org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer src;
+ private UAX29URLEmailTokenizer src;
- public TokenStreamComponentsAnonymousInnerClassHelper(UAX29URLEmailAnalyzer outerInstance, org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer src, TokenStream tok, Reader reader) : base(src, tok)
+ public TokenStreamComponentsAnonymousInnerClassHelper(UAX29URLEmailAnalyzer outerInstance, UAX29URLEmailTokenizer src, TokenStream tok, Reader reader) : base(src, tok)
{
this.outerInstance = outerInstance;
this.reader = reader;
this.src = src;
}
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: @Override protected void setReader(final java.io.Reader reader) throws java.io.IOException
-//JAVA TO C# CONVERTER WARNING: 'final' parameters are not available in .NET:
- protected internal override Reader Reader
+ protected override Reader Reader
{
set
{
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b4eaf2fc/src/Lucene.Net.Analysis.Common/Analysis/Util/CharArraySet.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Util/CharArraySet.cs b/src/Lucene.Net.Analysis.Common/Analysis/Util/CharArraySet.cs
index f319675..4faa921 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Util/CharArraySet.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Util/CharArraySet.cs
@@ -1,7 +1,6 @@
using System.Collections.Generic;
using System.Text;
using Lucene.Net.Util;
-using org.apache.lucene.analysis.util;
namespace Lucene.Net.Analysis.Util
{
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b4eaf2fc/src/Lucene.Net.Analysis.Common/Analysis/Wikipedia/WikipediaTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Wikipedia/WikipediaTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Wikipedia/WikipediaTokenizer.cs
index 1fd76f8..f4fa262 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Wikipedia/WikipediaTokenizer.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Wikipedia/WikipediaTokenizer.cs
@@ -1,7 +1,4 @@
-using System.Collections.Generic;
-using System.Text;
-
-/*
+/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
@@ -17,20 +14,14 @@ using System.Text;
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+using System.Collections.Generic;
+using System.Text;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
-namespace org.apache.lucene.analysis.wikipedia
+namespace Lucene.Net.Analysis.Wikipedia
{
-
- using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
- using FlagsAttribute = org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
- using OffsetAttribute = org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
- using PositionIncrementAttribute = org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
- using TypeAttribute = org.apache.lucene.analysis.tokenattributes.TypeAttribute;
- using AttributeSource = org.apache.lucene.util.AttributeSource;
-
-
-
- /// <summary>
+ /// <summary>
/// Extension of StandardTokenizer that is aware of Wikipedia syntax. It is based off of the
/// Wikipedia tutorial available at http://en.wikipedia.org/wiki/Wikipedia:Tutorial, but it may not be complete.
/// <p/>
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b4eaf2fc/src/Lucene.Net.Analysis.Common/Analysis/Wikipedia/WikipediaTokenizerFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Wikipedia/WikipediaTokenizerFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Wikipedia/WikipediaTokenizerFactory.cs
index ad7027f..e320469 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Wikipedia/WikipediaTokenizerFactory.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Wikipedia/WikipediaTokenizerFactory.cs
@@ -1,4 +1,5 @@
using System.Collections.Generic;
+using Lucene.Net.Analysis.Wikipedia;
using TokenizerFactory = Lucene.Net.Analysis.Util.TokenizerFactory;
namespace org.apache.lucene.analysis.wikipedia
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b4eaf2fc/src/Lucene.Net.Core/Util/StringHelper.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Core/Util/StringHelper.cs b/src/Lucene.Net.Core/Util/StringHelper.cs
index 9a8dc3c..74d6992 100644
--- a/src/Lucene.Net.Core/Util/StringHelper.cs
+++ b/src/Lucene.Net.Core/Util/StringHelper.cs
@@ -66,18 +66,18 @@ namespace Lucene.Net.Util
}
}
- private static IComparer<string> versionComparator = new ComparatorAnonymousInnerClassHelper();
+ private static readonly IComparer<string> versionComparator = new ComparatorAnonymousInnerClassHelper();
- private class ComparatorAnonymousInnerClassHelper : IComparer<string>
+ private sealed class ComparatorAnonymousInnerClassHelper : IComparer<string>
{
public ComparatorAnonymousInnerClassHelper()
{
}
- public virtual int Compare(string a, string b)
+ public int Compare(string a, string b)
{
- StringTokenizer aTokens = new StringTokenizer(a, ".");
- StringTokenizer bTokens = new StringTokenizer(b, ".");
+ var aTokens = new StringTokenizer(a, ".");
+ var bTokens = new StringTokenizer(b, ".");
while (aTokens.HasMoreTokens())
{
[3/3] lucenenet git commit: More porting work
Posted by sy...@apache.org.
More porting work
Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/b4eaf2fc
Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/b4eaf2fc
Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/b4eaf2fc
Branch: refs/heads/master
Commit: b4eaf2fc441dfd5d32732eda844ef1e8e62588a1
Parents: 8d7a54f
Author: Itamar Syn-Hershko <it...@code972.com>
Authored: Wed Apr 15 02:32:11 2015 +0300
Committer: Itamar Syn-Hershko <it...@code972.com>
Committed: Wed Apr 15 02:32:11 2015 +0300
----------------------------------------------------------------------
.../Compound/CompoundWordTokenFilterBase.cs | 365 ++++++-----
.../Analysis/Core/UpperCaseFilter.cs | 114 ++--
.../Ngram/Lucene43EdgeNGramTokenizer.cs | 609 +++++++++----------
.../Analysis/Standard/ClassicAnalyzer.cs | 299 +++++----
.../Analysis/Standard/ClassicFilter.cs | 153 +++--
.../Analysis/Standard/ClassicFilterFactory.cs | 92 ++-
.../Analysis/Standard/ClassicTokenizer.cs | 369 ++++++-----
.../Analysis/Standard/ClassicTokenizerImpl.cs | 14 +-
.../Analysis/Standard/StandardAnalyzer.cs | 273 +++++----
.../Analysis/Standard/StandardFilter.cs | 167 +++--
.../Analysis/Standard/StandardFilterFactory.cs | 1 +
.../Analysis/Standard/StandardTokenizer.cs | 13 +-
.../Standard/StandardTokenizerFactory.cs | 18 +-
.../Analysis/Standard/StandardTokenizerImpl.cs | 2 -
.../Analysis/Standard/UAX29URLEmailAnalyzer.cs | 43 +-
.../Analysis/Util/CharArraySet.cs | 1 -
.../Analysis/Wikipedia/WikipediaTokenizer.cs | 23 +-
.../Wikipedia/WikipediaTokenizerFactory.cs | 1 +
src/Lucene.Net.Core/Util/StringHelper.cs | 10 +-
19 files changed, 1239 insertions(+), 1328 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b4eaf2fc/src/Lucene.Net.Analysis.Common/Analysis/Compound/CompoundWordTokenFilterBase.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/CompoundWordTokenFilterBase.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/CompoundWordTokenFilterBase.cs
index ba8fd6c..c6bc4cd 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Compound/CompoundWordTokenFilterBase.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Compound/CompoundWordTokenFilterBase.cs
@@ -2,193 +2,192 @@
using System.Diagnostics;
using Lucene.Net.Analysis.Tokenattributes;
using Lucene.Net.Analysis.Util;
+using Lucene.Net.Support;
using Lucene.Net.Util;
namespace Lucene.Net.Analysis.Compound
{
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
/// <summary>
- /// Base class for decomposition token filters.
- /// <para>
- ///
- /// <a name="version"></a>
- /// You must specify the required <seealso cref="LuceneVersion"/> compatibility when creating
- /// CompoundWordTokenFilterBase:
- /// <ul>
- /// <li>As of 3.1, CompoundWordTokenFilterBase correctly handles Unicode 4.0
- /// supplementary characters in strings and char arrays provided as compound word
- /// dictionaries.
- /// <li>As of 4.4, <seealso cref="CompoundWordTokenFilterBase"/> doesn't update offsets.
- /// </ul>
- /// </para>
- /// </summary>
- public abstract class CompoundWordTokenFilterBase : TokenFilter
- {
- /// <summary>
- /// The default for minimal word length that gets decomposed
- /// </summary>
- public const int DEFAULT_MIN_WORD_SIZE = 5;
-
- /// <summary>
- /// The default for minimal length of subwords that get propagated to the output of this filter
- /// </summary>
- public const int DEFAULT_MIN_SUBWORD_SIZE = 2;
-
- /// <summary>
- /// The default for maximal length of subwords that get propagated to the output of this filter
- /// </summary>
- public const int DEFAULT_MAX_SUBWORD_SIZE = 15;
-
- protected internal readonly LuceneVersion matchVersion;
- protected internal readonly CharArraySet dictionary;
- protected internal readonly LinkedList<CompoundToken> tokens;
- protected internal readonly int minWordSize;
- protected internal readonly int minSubwordSize;
- protected internal readonly int maxSubwordSize;
- protected internal readonly bool onlyLongestMatch;
-
- protected internal readonly CharTermAttribute termAtt = addAttribute(typeof(CharTermAttribute));
- protected internal readonly OffsetAttribute offsetAtt = addAttribute(typeof(OffsetAttribute));
- private readonly PositionIncrementAttribute posIncAtt = addAttribute(typeof(PositionIncrementAttribute));
-
- private AttributeSource.State current;
-
- protected internal CompoundWordTokenFilterBase(LuceneVersion matchVersion, TokenStream input, CharArraySet dictionary, bool onlyLongestMatch) : this(matchVersion, input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch)
- {
- }
-
- protected internal CompoundWordTokenFilterBase(LuceneVersion matchVersion, TokenStream input, CharArraySet dictionary) : this(matchVersion, input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false)
- {
- }
-
- protected internal CompoundWordTokenFilterBase(LuceneVersion matchVersion, TokenStream input, CharArraySet dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, bool onlyLongestMatch) : base(input)
- {
- this.matchVersion = matchVersion;
- this.tokens = new LinkedList<CompoundToken>();
- if (minWordSize < 0)
- {
- throw new System.ArgumentException("minWordSize cannot be negative");
- }
- this.minWordSize = minWordSize;
- if (minSubwordSize < 0)
- {
- throw new System.ArgumentException("minSubwordSize cannot be negative");
- }
- this.minSubwordSize = minSubwordSize;
- if (maxSubwordSize < 0)
- {
- throw new System.ArgumentException("maxSubwordSize cannot be negative");
- }
- this.maxSubwordSize = maxSubwordSize;
- this.onlyLongestMatch = onlyLongestMatch;
- this.dictionary = dictionary;
- }
-
- public override bool IncrementToken()
- {
- if (tokens.Count > 0)
- {
- Debug.Assert(current != null);
- CompoundToken token = tokens.First.Value; tokens.RemoveFirst();
- RestoreState(current); // keep all other attributes untouched
- termAtt.SetEmpty().Append(token.txt);
- offsetAtt.SetOffset(token.startOffset, token.endOffset);
- posIncAtt.PositionIncrement = 0;
- return true;
- }
-
- current = null; // not really needed, but for safety
- if (input.incrementToken())
- {
- // Only words longer than minWordSize get processed
- if (termAtt.length() >= this.minWordSize)
- {
- decompose();
- // only capture the state if we really need it for producing new tokens
- if (tokens.Count > 0)
- {
- current = captureState();
- }
- }
- // return original token:
- return true;
- }
- else
- {
- return false;
- }
- }
-
- /// <summary>
- /// Decomposes the current <seealso cref="#termAtt"/> and places <seealso cref="CompoundToken"/> instances in the <seealso cref="#tokens"/> list.
- /// The original token may not be placed in the list, as it is automatically passed through this filter.
- /// </summary>
- protected internal abstract void decompose();
-
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: @Override public void reset() throws java.io.IOException
- public override void reset()
- {
- base.reset();
- tokens.Clear();
- current = null;
- }
-
- /// <summary>
- /// Helper class to hold decompounded token information
- /// </summary>
- protected internal class CompoundToken
- {
- private readonly CompoundWordTokenFilterBase outerInstance;
-
- public readonly string txt;
- public readonly int startOffset, endOffset;
-
- /// <summary>
- /// Construct the compound token based on a slice of the current <seealso cref="CompoundWordTokenFilterBase#termAtt"/>. </summary>
- public CompoundToken(CompoundWordTokenFilterBase outerInstance, int offset, int length)
- {
- this.outerInstance = outerInstance;
- this.txt = outerInstance.termAtt.subSequence(offset, offset + length);
-
- // offsets of the original word
- int startOff = outerInstance.offsetAtt.startOffset();
- int endOff = outerInstance.offsetAtt.endOffset();
-
- if (outerInstance.matchVersion.onOrAfter(LuceneVersion.LUCENE_44) || endOff - startOff != outerInstance.termAtt.length())
- {
- // if length by start + end offsets doesn't match the term text then assume
- // this is a synonym and don't adjust the offsets.
- this.startOffset = startOff;
- this.endOffset = endOff;
- }
- else
- {
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final int newStart = startOff + offset;
- int newStart = startOff + offset;
- this.startOffset = newStart;
- this.endOffset = newStart + length;
- }
- }
-
- }
- }
-
+ /// Base class for decomposition token filters.
+ /// <para>
+ ///
+ /// <a name="version"></a>
+ /// You must specify the required <seealso cref="LuceneVersion"/> compatibility when creating
+ /// CompoundWordTokenFilterBase:
+ /// <ul>
+ /// <li>As of 3.1, CompoundWordTokenFilterBase correctly handles Unicode 4.0
+ /// supplementary characters in strings and char arrays provided as compound word
+ /// dictionaries.
+ /// <li>As of 4.4, <seealso cref="CompoundWordTokenFilterBase"/> doesn't update offsets.
+ /// </ul>
+ /// </para>
+ /// </summary>
+ public abstract class CompoundWordTokenFilterBase : TokenFilter
+ {
+ /// <summary>
+ /// The default for minimal word length that gets decomposed
+ /// </summary>
+ public const int DEFAULT_MIN_WORD_SIZE = 5;
+
+ /// <summary>
+ /// The default for minimal length of subwords that get propagated to the output of this filter
+ /// </summary>
+ public const int DEFAULT_MIN_SUBWORD_SIZE = 2;
+
+ /// <summary>
+ /// The default for maximal length of subwords that get propagated to the output of this filter
+ /// </summary>
+ public const int DEFAULT_MAX_SUBWORD_SIZE = 15;
+
+ protected internal readonly LuceneVersion matchVersion;
+ protected internal readonly CharArraySet dictionary;
+ protected internal readonly LinkedList<CompoundToken> tokens;
+ protected internal readonly int minWordSize;
+ protected internal readonly int minSubwordSize;
+ protected internal readonly int maxSubwordSize;
+ protected internal readonly bool onlyLongestMatch;
+
+ protected internal readonly CharTermAttribute termAtt;
+ protected internal readonly IOffsetAttribute offsetAtt;
+ private readonly IPositionIncrementAttribute posIncAtt;
+
+ private AttributeSource.State current;
+
+ protected CompoundWordTokenFilterBase(LuceneVersion matchVersion, TokenStream input, CharArraySet dictionary, bool onlyLongestMatch)
+ : this(matchVersion, input, dictionary, DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch)
+ {
+ }
+
+ protected CompoundWordTokenFilterBase(LuceneVersion matchVersion, TokenStream input, CharArraySet dictionary)
+ : this(matchVersion, input, dictionary, DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false)
+ {
+ }
+
+ protected CompoundWordTokenFilterBase(LuceneVersion matchVersion, TokenStream input, CharArraySet dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, bool onlyLongestMatch)
+ : base(input)
+ {
+ termAtt = AddAttribute<ICharTermAttribute>() as CharTermAttribute;
+ offsetAtt = AddAttribute<IOffsetAttribute>();
+ posIncAtt = AddAttribute<IPositionIncrementAttribute>();
+
+ this.matchVersion = matchVersion;
+ this.tokens = new LinkedList<CompoundToken>();
+ if (minWordSize < 0)
+ {
+ throw new System.ArgumentException("minWordSize cannot be negative");
+ }
+ this.minWordSize = minWordSize;
+ if (minSubwordSize < 0)
+ {
+ throw new System.ArgumentException("minSubwordSize cannot be negative");
+ }
+ this.minSubwordSize = minSubwordSize;
+ if (maxSubwordSize < 0)
+ {
+ throw new System.ArgumentException("maxSubwordSize cannot be negative");
+ }
+ this.maxSubwordSize = maxSubwordSize;
+ this.onlyLongestMatch = onlyLongestMatch;
+ this.dictionary = dictionary;
+ }
+
+ public override bool IncrementToken()
+ {
+ if (tokens.Count > 0)
+ {
+ Debug.Assert(current != null);
+ CompoundToken token = tokens.First.Value; tokens.RemoveFirst();
+ RestoreState(current); // keep all other attributes untouched
+ termAtt.SetEmpty().Append(token.txt);
+ offsetAtt.SetOffset(token.startOffset, token.endOffset);
+ posIncAtt.PositionIncrement = 0;
+ return true;
+ }
+
+ current = null; // not really needed, but for safety
+ if (input.IncrementToken())
+ {
+ // Only words longer than minWordSize get processed
+ if (termAtt.Length >= this.minWordSize)
+ {
+ Decompose();
+ // only capture the state if we really need it for producing new tokens
+ if (tokens.Count > 0)
+ {
+ current = CaptureState();
+ }
+ }
+ // return original token:
+ return true;
+ }
+ else
+ {
+ return false;
+ }
+ }
+
+ /// <summary>
+ /// Decomposes the current <seealso cref="#termAtt"/> and places <seealso cref="CompoundToken"/> instances in the <seealso cref="#tokens"/> list.
+ /// The original token may not be placed in the list, as it is automatically passed through this filter.
+ /// </summary>
+ protected abstract void Decompose();
+
+ public override void Reset()
+ {
+ base.Reset();
+ tokens.Clear();
+ current = null;
+ }
+
+ /// <summary>
+ /// Helper class to hold decompounded token information
+ /// </summary>
+ protected internal class CompoundToken
+ {
+ public readonly ICharSequence txt;
+ public readonly int startOffset, endOffset;
+
+ /// <summary>
+ /// Construct the compound token based on a slice of the current <seealso cref="CompoundWordTokenFilterBase#termAtt"/>. </summary>
+ public CompoundToken(CompoundWordTokenFilterBase outerInstance, int offset, int length)
+ {
+ this.txt = outerInstance.termAtt.SubSequence(offset, offset + length);
+
+ // offsets of the original word
+ int startOff = outerInstance.offsetAtt.StartOffset();
+ int endOff = outerInstance.offsetAtt.EndOffset();
+
+ if (outerInstance.matchVersion.OnOrAfter(LuceneVersion.LUCENE_44) || endOff - startOff != outerInstance.termAtt.Length)
+ {
+ // if length by start + end offsets doesn't match the term text then assume
+ // this is a synonym and don't adjust the offsets.
+ this.startOffset = startOff;
+ this.endOffset = endOff;
+ }
+ else
+ {
+ int newStart = startOff + offset;
+ this.startOffset = newStart;
+ this.endOffset = newStart + length;
+ }
+ }
+ }
+ }
}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b4eaf2fc/src/Lucene.Net.Analysis.Common/Analysis/Core/UpperCaseFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Core/UpperCaseFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Core/UpperCaseFilter.cs
index 6b722ad..c8b5f5f 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Core/UpperCaseFilter.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Core/UpperCaseFilter.cs
@@ -4,65 +4,65 @@ using Lucene.Net.Util;
namespace Lucene.Net.Analysis.Core
{
-
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
/// <summary>
- /// Normalizes token text to UPPER CASE.
- /// <a name="version"/>
- /// <para>You must specify the required <seealso cref="LuceneVersion"/>
- /// compatibility when creating UpperCaseFilter
- ///
- /// </para>
- /// <para><b>NOTE:</b> In Unicode, this transformation may lose information when the
- /// upper case character represents more than one lower case character. Use this filter
- /// when you require uppercase tokens. Use the <seealso cref="LowerCaseFilter"/> for
- /// general search matching
- /// </para>
- /// </summary>
- public sealed class UpperCaseFilter : TokenFilter
- {
- private readonly CharacterUtils charUtils;
- private readonly ICharTermAttribute termAtt;;
+ /// Normalizes token text to UPPER CASE.
+ /// <a name="version"/>
+ /// <para>You must specify the required <seealso cref="LuceneVersion"/>
+ /// compatibility when creating UpperCaseFilter
+ ///
+ /// </para>
+ /// <para><b>NOTE:</b> In Unicode, this transformation may lose information when the
+ /// upper case character represents more than one lower case character. Use this filter
+ /// when you require uppercase tokens. Use the <seealso cref="LowerCaseFilter"/> for
+ /// general search matching
+ /// </para>
+ /// </summary>
+ public sealed class UpperCaseFilter : TokenFilter
+ {
+ private readonly CharacterUtils charUtils;
+ private readonly ICharTermAttribute termAtt;
- /// <summary>
- /// Create a new UpperCaseFilter, that normalizes token text to upper case.
- /// </summary>
- /// <param name="matchVersion"> See <a href="#version">above</a> </param>
- /// <param name="in"> TokenStream to filter </param>
- public UpperCaseFilter(LuceneVersion matchVersion, TokenStream @in) : base(@in)
- {
- termAtt = AddAttribute<ICharTermAttribute>();
- termAtt = AddAttribute<ICharTermAttribute>();
- charUtils = CharacterUtils.GetInstance(matchVersion);
- }
+ /// <summary>
+ /// Create a new UpperCaseFilter, that normalizes token text to upper case.
+ /// </summary>
+ /// <param name="matchVersion"> See <a href="#version">above</a> </param>
+ /// <param name="in"> TokenStream to filter </param>
+ public UpperCaseFilter(LuceneVersion matchVersion, TokenStream @in)
+ : base(@in)
+ {
+ termAtt = AddAttribute<ICharTermAttribute>();
+ termAtt = AddAttribute<ICharTermAttribute>();
+ charUtils = CharacterUtils.GetInstance(matchVersion);
+ }
- public override bool IncrementToken()
- {
- if (input.IncrementToken())
- {
- charUtils.ToUpper(termAtt.Buffer(), 0, termAtt.Length);
- return true;
- }
- else
- {
- return false;
- }
- }
- }
+ public override bool IncrementToken()
+ {
+ if (input.IncrementToken())
+ {
+ charUtils.ToUpper(termAtt.Buffer(), 0, termAtt.Length);
+ return true;
+ }
+ else
+ {
+ return false;
+ }
+ }
+ }
}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b4eaf2fc/src/Lucene.Net.Analysis.Common/Analysis/Ngram/Lucene43EdgeNGramTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/Lucene43EdgeNGramTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/Lucene43EdgeNGramTokenizer.cs
index 3827b36..c277918 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/Lucene43EdgeNGramTokenizer.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/Lucene43EdgeNGramTokenizer.cs
@@ -1,323 +1,308 @@
using System;
using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
using Reader = System.IO.TextReader;
using Version = Lucene.Net.Util.LuceneVersion;
namespace Lucene.Net.Analysis.Ngram
{
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
/// <summary>
- /// Old version of <seealso cref="EdgeNGramTokenizer"/> which doesn't handle correctly
- /// supplementary characters.
- /// </summary>
- [Obsolete]
- public sealed class Lucene43EdgeNGramTokenizer : Tokenizer
- {
- public const Side DEFAULT_SIDE = Side.FRONT;
- public const int DEFAULT_MAX_GRAM_SIZE = 1;
- public const int DEFAULT_MIN_GRAM_SIZE = 1;
-
- private readonly CharTermAttribute termAtt = addAttribute(typeof(CharTermAttribute));
- private readonly OffsetAttribute offsetAtt = addAttribute(typeof(OffsetAttribute));
- private readonly PositionIncrementAttribute posIncrAtt = addAttribute(typeof(PositionIncrementAttribute));
-
- /// <summary>
- /// Specifies which side of the input the n-gram should be generated from </summary>
- public enum Side
- {
-
- /// <summary>
- /// Get the n-gram from the front of the input </summary>
-//JAVA TO C# CONVERTER TODO TASK: The following line could not be converted:
- FRONT
- {
- public String getLabel() { return "front"
- }
- },
-
- /// <summary>
- /// Get the n-gram from the end of the input </summary>
-//JAVA TO C# CONVERTER TODO TASK: The following line could not be converted:
- BACK
- {
- public String getLabel()
- {
- return "back";
- }
- }
-
- public =
-
- // Get the appropriate Side from a string
- public static Side getSide(String sideName)
- {
-//JAVA TO C# CONVERTER TODO TASK: The following line could not be converted:
- if (FRONT.getLabel().equals(sideName))
- {
- return FRONT;
- }
-//JAVA TO C# CONVERTER TODO TASK: The following line could not be converted:
- if (BACK.getLabel().equals(sideName))
- {
- return BACK;
- }
- return null;
- }
- }
-
- private int minGram;
- private int maxGram;
- private int gramSize;
- private Side side;
- private bool started;
- private int inLen; // length of the input AFTER trim()
- private int charsRead; // length of the input
- private string inStr;
-
-
- /// <summary>
- /// Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
- /// </summary>
- /// <param name="version"> the <a href="#version">Lucene match version</a> </param>
- /// <param name="input"> <seealso cref="Reader"/> holding the input to be tokenized </param>
- /// <param name="side"> the <seealso cref="Side"/> from which to chop off an n-gram </param>
- /// <param name="minGram"> the smallest n-gram to generate </param>
- /// <param name="maxGram"> the largest n-gram to generate </param>
- [Obsolete]
- public Lucene43EdgeNGramTokenizer(Version version, Reader input, Side side, int minGram, int maxGram) : base(input)
- {
- init(version, side, minGram, maxGram);
- }
-
- /// <summary>
- /// Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
- /// </summary>
- /// <param name="version"> the <a href="#version">Lucene match version</a> </param>
- /// <param name="factory"> <seealso cref="org.apache.lucene.util.AttributeSource.AttributeFactory"/> to use </param>
- /// <param name="input"> <seealso cref="Reader"/> holding the input to be tokenized </param>
- /// <param name="side"> the <seealso cref="Side"/> from which to chop off an n-gram </param>
- /// <param name="minGram"> the smallest n-gram to generate </param>
- /// <param name="maxGram"> the largest n-gram to generate </param>
- [Obsolete]
- public Lucene43EdgeNGramTokenizer(Version version, AttributeFactory factory, Reader input, Side side, int minGram, int maxGram) : base(factory, input)
- {
- init(version, side, minGram, maxGram);
- }
-
- /// <summary>
- /// Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
- /// </summary>
- /// <param name="version"> the <a href="#version">Lucene match version</a> </param>
- /// <param name="input"> <seealso cref="Reader"/> holding the input to be tokenized </param>
- /// <param name="sideLabel"> the name of the <seealso cref="Side"/> from which to chop off an n-gram </param>
- /// <param name="minGram"> the smallest n-gram to generate </param>
- /// <param name="maxGram"> the largest n-gram to generate </param>
- [Obsolete]
- public Lucene43EdgeNGramTokenizer(Version version, Reader input, string sideLabel, int minGram, int maxGram) : this(version, input, Side.getSide(sideLabel), minGram, maxGram)
- {
- }
-
- /// <summary>
- /// Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
- /// </summary>
- /// <param name="version"> the <a href="#version">Lucene match version</a> </param>
- /// <param name="factory"> <seealso cref="org.apache.lucene.util.AttributeSource.AttributeFactory"/> to use </param>
- /// <param name="input"> <seealso cref="Reader"/> holding the input to be tokenized </param>
- /// <param name="sideLabel"> the name of the <seealso cref="Side"/> from which to chop off an n-gram </param>
- /// <param name="minGram"> the smallest n-gram to generate </param>
- /// <param name="maxGram"> the largest n-gram to generate </param>
- [Obsolete]
- public Lucene43EdgeNGramTokenizer(Version version, AttributeFactory factory, Reader input, string sideLabel, int minGram, int maxGram) : this(version, factory, input, Side.getSide(sideLabel), minGram, maxGram)
- {
- }
-
- /// <summary>
- /// Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
- /// </summary>
- /// <param name="version"> the <a href="#version">Lucene match version</a> </param>
- /// <param name="input"> <seealso cref="Reader"/> holding the input to be tokenized </param>
- /// <param name="minGram"> the smallest n-gram to generate </param>
- /// <param name="maxGram"> the largest n-gram to generate </param>
- public Lucene43EdgeNGramTokenizer(Version version, Reader input, int minGram, int maxGram) : this(version, input, Side.FRONT, minGram, maxGram)
- {
- }
-
- /// <summary>
- /// Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
- /// </summary>
- /// <param name="version"> the <a href="#version">Lucene match version</a> </param>
- /// <param name="factory"> <seealso cref="org.apache.lucene.util.AttributeSource.AttributeFactory"/> to use </param>
- /// <param name="input"> <seealso cref="Reader"/> holding the input to be tokenized </param>
- /// <param name="minGram"> the smallest n-gram to generate </param>
- /// <param name="maxGram"> the largest n-gram to generate </param>
- public Lucene43EdgeNGramTokenizer(Version version, AttributeFactory factory, Reader input, int minGram, int maxGram) : this(version, factory, input, Side.FRONT, minGram, maxGram)
- {
- }
-
- private void init(Version version, Side side, int minGram, int maxGram)
- {
- if (version == null)
- {
- throw new System.ArgumentException("version must not be null");
- }
-
- if (side == null)
- {
- throw new System.ArgumentException("sideLabel must be either front or back");
- }
-
- if (minGram < 1)
- {
- throw new System.ArgumentException("minGram must be greater than zero");
- }
-
- if (minGram > maxGram)
- {
- throw new System.ArgumentException("minGram must not be greater than maxGram");
- }
-
- if (version.onOrAfter(Version.LUCENE_44))
- {
- if (side == Side.BACK)
- {
- throw new System.ArgumentException("Side.BACK is not supported anymore as of Lucene 4.4");
- }
- }
- else
- {
- maxGram = Math.Min(maxGram, 1024);
- }
-
- this.minGram = minGram;
- this.maxGram = maxGram;
- this.side = side;
- }
-
- /// <summary>
- /// Returns the next token in the stream, or null at EOS. </summary>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: @Override public boolean incrementToken() throws java.io.IOException
- public override bool incrementToken()
- {
- clearAttributes();
- // if we are just starting, read the whole input
- if (!started)
- {
- started = true;
- gramSize = minGram;
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final int limit = side == Side.FRONT ? maxGram : 1024;
- int limit = side == Side.FRONT ? maxGram : 1024;
- char[] chars = new char[Math.Min(1024, limit)];
- charsRead = 0;
- // TODO: refactor to a shared readFully somewhere:
- bool exhausted = false;
- while (charsRead < limit)
- {
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final int inc = input.read(chars, charsRead, chars.length-charsRead);
- int inc = input.read(chars, charsRead, chars.Length - charsRead);
- if (inc == -1)
- {
- exhausted = true;
- break;
- }
- charsRead += inc;
- if (charsRead == chars.Length && charsRead < limit)
- {
- chars = ArrayUtil.grow(chars);
- }
- }
-
- inStr = new string(chars, 0, charsRead);
- inStr = inStr.Trim();
-
- if (!exhausted)
- {
- // Read extra throwaway chars so that on end() we
- // report the correct offset:
- char[] throwaway = new char[1024];
- while (true)
- {
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final int inc = input.read(throwaway, 0, throwaway.length);
- int inc = input.read(throwaway, 0, throwaway.Length);
- if (inc == -1)
- {
- break;
- }
- charsRead += inc;
- }
- }
-
- inLen = inStr.length();
- if (inLen == 0)
- {
- return false;
- }
- posIncrAtt.PositionIncrement = 1;
- }
- else
- {
- posIncrAtt.PositionIncrement = 0;
- }
-
- // if the remaining input is too short, we can't generate any n-grams
- if (gramSize > inLen)
- {
- return false;
- }
-
- // if we have hit the end of our n-gram size range, quit
- if (gramSize > maxGram || gramSize > inLen)
- {
- return false;
- }
-
- // grab gramSize chars from front or back
- int start = side == Side.FRONT ? 0 : inLen - gramSize;
- int end = start + gramSize;
- termAtt.setEmpty().append(inStr, start, end);
- offsetAtt.setOffset(correctOffset(start), correctOffset(end));
- gramSize++;
- return true;
- }
-
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: @Override public void end() throws java.io.IOException
- public override void end()
- {
- base.end();
- // set final offset
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final int finalOffset = correctOffset(charsRead);
- int finalOffset = correctOffset(charsRead);
- this.offsetAtt.setOffset(finalOffset, finalOffset);
- }
-
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: @Override public void reset() throws java.io.IOException
- public override void reset()
- {
- base.reset();
- started = false;
- }
-}
+ /// Old version of <seealso cref="EdgeNGramTokenizer"/> which doesn't handle correctly
+ /// supplementary characters.
+ /// </summary>
+ [Obsolete]
+ public sealed class Lucene43EdgeNGramTokenizer : Tokenizer
+ {
+ public const Side DEFAULT_SIDE = Side.FRONT;
+ public const int DEFAULT_MAX_GRAM_SIZE = 1;
+ public const int DEFAULT_MIN_GRAM_SIZE = 1;
+
+ private readonly CharTermAttribute termAtt;
+ private readonly OffsetAttribute offsetAtt;
+ private readonly PositionIncrementAttribute posIncrAtt;
+
+ /// <summary>
+ /// Specifies which side of the input the n-gram should be taken from. </summary>
+ public enum Side
+ {
+
+ /// <summary>
+ /// Take the n-gram from the front (start) of the input. </summary>
+ FRONT,
+
+ /// <summary>
+ /// Take the n-gram from the back (end) of the input. </summary>
+ BACK,
+ }
+
+        /// <summary>
+        /// Maps a <see cref="Side"/> value to its lower-case label ("front" or "back").
+        /// </summary>
+        /// <param name="side"> the side to label </param>
+        /// <returns> the label, or null when the value is neither FRONT nor BACK </returns>
+        private static string GetSideLabel(Side side)
+        {
+            switch (side)
+            {
+                case Side.FRONT:
+                    return "front";
+                case Side.BACK:
+                    return "back";
+                default:
+                    return null;
+            }
+        }
+
+
+        /// <summary>
+        /// Resolves a side label to the matching <see cref="Side"/>.
+        /// </summary>
+        /// <param name="sideName"> the label to look up; compared against "front" and "back" </param>
+        /// <returns> the matching side, or null when the label matches neither </returns>
+        internal static Side? GetSide(String sideName)
+        {
+            // FRONT is tested before BACK, mirroring the lookup order of the labels.
+            foreach (Side candidate in new[] { Side.FRONT, Side.BACK })
+            {
+                if (GetSideLabel(candidate).Equals(sideName))
+                {
+                    return candidate;
+                }
+            }
+            return null;
+        }
+
+ private int minGram;
+ private int maxGram;
+ private int gramSize;
+ private Side side;
+ private bool started;
+ private int inLen; // length of the input AFTER trim()
+ private int charsRead; // length of the input
+ private string inStr;
+
+
+ /// <summary>
+ /// Creates a Lucene43EdgeNGramTokenizer that generates n-grams of every size from minGram up to maxGram
+ /// </summary>
+ /// <param name="version"> the <a href="#version">Lucene match version</a> </param>
+ /// <param name="input"> <seealso cref="Reader"/> holding the input to be tokenized </param>
+ /// <param name="side"> the <seealso cref="Side"/> of the input (front or back) from which each n-gram is taken </param>
+ /// <param name="minGram"> the smallest n-gram to generate; must be at least 1 </param>
+ /// <param name="maxGram"> the largest n-gram to generate; must not be smaller than minGram </param>
+ [Obsolete]
+ public Lucene43EdgeNGramTokenizer(Version version, Reader input, Side side, int minGram, int maxGram)
+ : base(input)
+ {
+ init(version, side, minGram, maxGram);
+ }
+
+ /// <summary>
+ /// Creates a Lucene43EdgeNGramTokenizer that generates n-grams of every size from minGram up to maxGram
+ /// </summary>
+ /// <param name="version"> the <a href="#version">Lucene match version</a> </param>
+ /// <param name="factory"> <seealso cref="AttributeFactory"/> to use </param>
+ /// <param name="input"> <seealso cref="Reader"/> holding the input to be tokenized </param>
+ /// <param name="side"> the <seealso cref="Side"/> of the input (front or back) from which each n-gram is taken </param>
+ /// <param name="minGram"> the smallest n-gram to generate; must be at least 1 </param>
+ /// <param name="maxGram"> the largest n-gram to generate; must not be smaller than minGram </param>
+ [Obsolete]
+ public Lucene43EdgeNGramTokenizer(Version version, AttributeFactory factory, Reader input, Side side, int minGram, int maxGram)
+ : base(factory, input)
+ {
+ init(version, side, minGram, maxGram);
+ }
+
+        /// <summary>
+        /// Creates a Lucene43EdgeNGramTokenizer that generates n-grams of every size from minGram up to maxGram,
+        /// with the side given by its label.
+        /// </summary>
+        /// <param name="version"> the <a href="#version">Lucene match version</a> </param>
+        /// <param name="input"> <seealso cref="Reader"/> holding the input to be tokenized </param>
+        /// <param name="sideLabel"> the name of the <seealso cref="Side"/> from which to chop off an n-gram ("front" or "back") </param>
+        /// <param name="minGram"> the smallest n-gram to generate </param>
+        /// <param name="maxGram"> the largest n-gram to generate </param>
+        /// <exception cref="ArgumentException"> if <paramref name="sideLabel"/> is neither "front" nor "back" </exception>
+        [Obsolete]
+        public Lucene43EdgeNGramTokenizer(Version version, Reader input, string sideLabel, int minGram, int maxGram)
+            : this(version, input, ParseSideLabel(sideLabel), minGram, maxGram)
+        {
+        }
+
+        /// <summary>
+        /// Creates a Lucene43EdgeNGramTokenizer that generates n-grams of every size from minGram up to maxGram,
+        /// with the side given by its label.
+        /// </summary>
+        /// <param name="version"> the <a href="#version">Lucene match version</a> </param>
+        /// <param name="factory"> <seealso cref="AttributeFactory"/> to use </param>
+        /// <param name="input"> <seealso cref="Reader"/> holding the input to be tokenized </param>
+        /// <param name="sideLabel"> the name of the <seealso cref="Side"/> from which to chop off an n-gram ("front" or "back") </param>
+        /// <param name="minGram"> the smallest n-gram to generate </param>
+        /// <param name="maxGram"> the largest n-gram to generate </param>
+        /// <exception cref="ArgumentException"> if <paramref name="sideLabel"/> is neither "front" nor "back" </exception>
+        [Obsolete]
+        public Lucene43EdgeNGramTokenizer(Version version, AttributeFactory factory, Reader input, string sideLabel, int minGram, int maxGram)
+            : this(version, factory, input, ParseSideLabel(sideLabel), minGram, maxGram)
+        {
+        }
+
+        /// <summary>
+        /// Converts a side label into a <see cref="Side"/>. GetSide returns a nullable value,
+        /// which cannot be passed where a plain Side is required, so the unknown-label case
+        /// is rejected here before the value is unwrapped.
+        /// </summary>
+        private static Side ParseSideLabel(string sideLabel)
+        {
+            Side? parsed = GetSide(sideLabel);
+            if (!parsed.HasValue)
+            {
+                throw new System.ArgumentException("sideLabel must be either front or back");
+            }
+            return parsed.Value;
+        }
+
+ /// <summary>
+ /// Creates a Lucene43EdgeNGramTokenizer that generates n-grams of every size from minGram up to maxGram, taken from the front of the input
+ /// </summary>
+ /// <param name="version"> the <a href="#version">Lucene match version</a> </param>
+ /// <param name="input"> <seealso cref="Reader"/> holding the input to be tokenized </param>
+ /// <param name="minGram"> the smallest n-gram to generate; must be at least 1 </param>
+ /// <param name="maxGram"> the largest n-gram to generate; must not be smaller than minGram </param>
+ public Lucene43EdgeNGramTokenizer(Version version, Reader input, int minGram, int maxGram)
+ : this(version, input, Side.FRONT, minGram, maxGram)
+ {
+ }
+
+ /// <summary>
+ /// Creates a Lucene43EdgeNGramTokenizer that generates n-grams of every size from minGram up to maxGram, taken from the front of the input
+ /// </summary>
+ /// <param name="version"> the <a href="#version">Lucene match version</a> </param>
+ /// <param name="factory"> <seealso cref="AttributeFactory"/> to use </param>
+ /// <param name="input"> <seealso cref="Reader"/> holding the input to be tokenized </param>
+ /// <param name="minGram"> the smallest n-gram to generate; must be at least 1 </param>
+ /// <param name="maxGram"> the largest n-gram to generate; must not be smaller than minGram </param>
+ public Lucene43EdgeNGramTokenizer(Version version, AttributeFactory factory, Reader input, int minGram, int maxGram)
+ : this(version, factory, input, Side.FRONT, minGram, maxGram)
+ {
+ }
+
+        /// <summary>
+        /// Validates the constructor arguments and stores the tokenizer configuration.
+        /// </summary>
+        /// <param name="version"> the Lucene match version; from 4.4 onwards Side.BACK is rejected,
+        /// for earlier versions maxGram is capped at 1024 </param>
+        /// <param name="side"> the side of the input from which n-grams are taken </param>
+        /// <param name="minGram"> the smallest n-gram size; must be at least 1 </param>
+        /// <param name="maxGram"> the largest n-gram size; must not be smaller than minGram </param>
+        /// <exception cref="ArgumentException"> if any argument fails the checks above </exception>
+        private void init(Version version, Side side, int minGram, int maxGram)
+        {
+            // NOTE(review): if Version is a value type this comparison can never be true - confirm.
+            if (version == null)
+            {
+                throw new System.ArgumentException("version must not be null");
+            }
+
+            // Side is a non-nullable enum, so a null comparison is always false; check instead
+            // that the value is one of the declared members (FRONT or BACK).
+            if (!Enum.IsDefined(typeof(Side), side))
+            {
+                throw new System.ArgumentException("sideLabel must be either front or back");
+            }
+
+            if (minGram < 1)
+            {
+                throw new System.ArgumentException("minGram must be greater than zero");
+            }
+
+            if (minGram > maxGram)
+            {
+                throw new System.ArgumentException("minGram must not be greater than maxGram");
+            }
+
+            if (version.OnOrAfter(Version.LUCENE_44))
+            {
+                if (side == Side.BACK)
+                {
+                    throw new System.ArgumentException("Side.BACK is not supported anymore as of Lucene 4.4");
+                }
+            }
+            else
+            {
+                // Older match versions never produce grams longer than 1024 chars.
+                maxGram = Math.Min(maxGram, 1024);
+            }
+
+            this.minGram = minGram;
+            this.maxGram = maxGram;
+            this.side = side;
+        }
+
+        /// <summary>
+        /// Advances to the next edge n-gram. The first call after the tokenizer is (re)started
+        /// reads the input, trims surrounding whitespace and emits the gram of size minGram;
+        /// every later call emits a gram one character longer at the same position.
+        /// </summary>
+        /// <returns> true if a token was produced; false once no further n-gram can be generated </returns>
+        public override bool IncrementToken()
+        {
+            ClearAttributes();
+            // if we are just starting, read the whole input
+            if (!started)
+            {
+                started = true;
+                gramSize = minGram;
+                // FRONT grams need at most maxGram chars; BACK grams need the end of the input,
+                // so up to 1024 chars are buffered.
+                int limit = side == Side.FRONT ? maxGram : 1024;
+                char[] chars = new char[Math.Min(1024, limit)];
+                charsRead = 0;
+                // TODO: refactor to a shared readFully somewhere:
+                bool exhausted = false;
+                while (charsRead < limit)
+                {
+                    int inc = input.Read(chars, charsRead, chars.Length - charsRead);
+                    // a non-positive count is treated as end of input
+                    if (inc <= 0)
+                    {
+                        exhausted = true;
+                        break;
+                    }
+                    charsRead += inc;
+                    if (charsRead == chars.Length && charsRead < limit)
+                    {
+                        chars = ArrayUtil.Grow(chars);
+                    }
+                }
+
+                inStr = new string(chars, 0, charsRead);
+                inStr = inStr.Trim();
+
+                if (!exhausted)
+                {
+                    // Read extra throwaway chars so that on end() we
+                    // report the correct offset:
+                    var throwaway = new char[1024];
+                    while (true)
+                    {
+                        int inc = input.Read(throwaway, 0, throwaway.Length);
+                        if (inc <= 0)
+                        {
+                            break;
+                        }
+                        charsRead += inc;
+                    }
+                }
+
+                inLen = inStr.Length;
+                if (inLen == 0)
+                {
+                    return false;
+                }
+                posIncrAtt.PositionIncrement = 1;
+            }
+            else
+            {
+                // all grams after the first are stacked on the same position
+                posIncrAtt.PositionIncrement = 0;
+            }
+
+            // Stop once the gram size exceeds the configured maximum or the length of
+            // the trimmed input: no further n-gram can be generated.
+            if (gramSize > maxGram || gramSize > inLen)
+            {
+                return false;
+            }
+
+            // grab gramSize chars from front or back
+            int start = side == Side.FRONT ? 0 : inLen - gramSize;
+            int end = start + gramSize;
+            termAtt.SetEmpty().Append(inStr, start, end);
+            offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(end));
+            gramSize++;
+            return true;
+        }
+
+        /// <summary>
+        /// Finishes the stream and sets both the start and end offset to the corrected
+        /// count of characters read from the input.
+        /// </summary>
+        public override void End()
+        {
+            base.End();
+            // both ends of the final offset are the same corrected position
+            int endOfInput = CorrectOffset(charsRead);
+            offsetAtt.SetOffset(endOfInput, endOfInput);
+        }
+
+        /// <summary>
+        /// Resets the base tokenizer and clears the started flag, so the next
+        /// IncrementToken() call reads the input again.
+        /// </summary>
+        public override void Reset()
+        {
+            base.Reset();
+            this.started = false;
+        }
+ }
}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b4eaf2fc/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicAnalyzer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicAnalyzer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicAnalyzer.cs
index de32d23..0dd0529 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicAnalyzer.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicAnalyzer.cs
@@ -1,164 +1,149 @@
using Lucene.Net.Analysis.Core;
-using Lucene.Net.Analysis.Standard;
using Lucene.Net.Analysis.Util;
-using StopwordAnalyzerBase = Lucene.Net.Analysis.Util.StopwordAnalyzerBase;
+using Lucene.Net.Util;
+using org.apache.lucene.analysis.standard;
+using Reader = System.IO.TextReader;
-namespace org.apache.lucene.analysis.standard
+namespace Lucene.Net.Analysis.Standard
{
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
- using org.apache.lucene.analysis;
- using LowerCaseFilter = LowerCaseFilter;
- using StopAnalyzer = StopAnalyzer;
- using StopFilter = StopFilter;
- using CharArraySet = CharArraySet;
- using StopwordAnalyzerBase = StopwordAnalyzerBase;
- using WordlistLoader = WordlistLoader;
- using Version = org.apache.lucene.util.Version;
-
-
- /// <summary>
- /// Filters <seealso cref="ClassicTokenizer"/> with <seealso cref="ClassicFilter"/>, {@link
- /// LowerCaseFilter} and <seealso cref="StopFilter"/>, using a list of
- /// English stop words.
- ///
- /// <a name="version"/>
- /// <para>You must specify the required <seealso cref="Version"/>
- /// compatibility when creating ClassicAnalyzer:
- /// <ul>
- /// <li> As of 3.1, StopFilter correctly handles Unicode 4.0
- /// supplementary characters in stopwords
- /// <li> As of 2.9, StopFilter preserves position
- /// increments
- /// <li> As of 2.4, Tokens incorrectly identified as acronyms
- /// are corrected (see <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)
- /// </ul>
- ///
- /// ClassicAnalyzer was named StandardAnalyzer in Lucene versions prior to 3.1.
- /// As of 3.1, <seealso cref="StandardAnalyzer"/> implements Unicode text segmentation,
- /// as specified by UAX#29.
- /// </para>
- /// </summary>
- public sealed class ClassicAnalyzer : StopwordAnalyzerBase
- {
-
- /// <summary>
- /// Default maximum allowed token length </summary>
- public const int DEFAULT_MAX_TOKEN_LENGTH = 255;
-
- private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
-
- /// <summary>
- /// An unmodifiable set containing some common English words that are usually not
- /// useful for searching.
- /// </summary>
- public static readonly CharArraySet STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
-
- /// <summary>
- /// Builds an analyzer with the given stop words. </summary>
- /// <param name="matchVersion"> Lucene version to match See {@link
- /// <a href="#version">above</a>} </param>
- /// <param name="stopWords"> stop words </param>
- public ClassicAnalyzer(Version matchVersion, CharArraySet stopWords) : base(matchVersion, stopWords)
- {
- }
-
- /// <summary>
- /// Builds an analyzer with the default stop words ({@link
- /// #STOP_WORDS_SET}). </summary>
- /// <param name="matchVersion"> Lucene version to match See {@link
- /// <a href="#version">above</a>} </param>
- public ClassicAnalyzer(Version matchVersion) : this(matchVersion, STOP_WORDS_SET)
- {
- }
-
- /// <summary>
- /// Builds an analyzer with the stop words from the given reader. </summary>
- /// <seealso cref= WordlistLoader#getWordSet(Reader, Version) </seealso>
- /// <param name="matchVersion"> Lucene version to match See {@link
- /// <a href="#version">above</a>} </param>
- /// <param name="stopwords"> Reader to read stop words from </param>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public ClassicAnalyzer(org.apache.lucene.util.Version matchVersion, java.io.Reader stopwords) throws java.io.IOException
- public ClassicAnalyzer(Version matchVersion, Reader stopwords) : this(matchVersion, loadStopwordSet(stopwords, matchVersion))
- {
- }
-
- /// <summary>
- /// Set maximum allowed token length. If a token is seen
- /// that exceeds this length then it is discarded. This
- /// setting only takes effect the next time tokenStream or
- /// tokenStream is called.
- /// </summary>
- public int MaxTokenLength
- {
- set
- {
- maxTokenLength = value;
- }
- get
- {
- return maxTokenLength;
- }
- }
-
-
-//JAVA TO C# CONVERTER WARNING: 'final' parameters are not available in .NET:
-//ORIGINAL LINE: @Override protected TokenStreamComponents createComponents(final String fieldName, final java.io.Reader reader)
- protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
- {
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final ClassicTokenizer src = new ClassicTokenizer(matchVersion, reader);
- ClassicTokenizer src = new ClassicTokenizer(matchVersion, reader);
- src.MaxTokenLength = maxTokenLength;
- TokenStream tok = new ClassicFilter(src);
- tok = new LowerCaseFilter(matchVersion, tok);
- tok = new StopFilter(matchVersion, tok, stopwords);
- return new TokenStreamComponentsAnonymousInnerClassHelper(this, src, tok, reader);
- }
-
- private class TokenStreamComponentsAnonymousInnerClassHelper : TokenStreamComponents
- {
- private readonly ClassicAnalyzer outerInstance;
-
- private Reader reader;
- private org.apache.lucene.analysis.standard.ClassicTokenizer src;
-
- public TokenStreamComponentsAnonymousInnerClassHelper(ClassicAnalyzer outerInstance, org.apache.lucene.analysis.standard.ClassicTokenizer src, TokenStream tok, Reader reader) : base(src, tok)
- {
- this.outerInstance = outerInstance;
- this.reader = reader;
- this.src = src;
- }
-
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: @Override protected void setReader(final java.io.Reader reader) throws java.io.IOException
-//JAVA TO C# CONVERTER WARNING: 'final' parameters are not available in .NET:
- protected internal override Reader Reader
- {
- set
- {
- src.MaxTokenLength = outerInstance.maxTokenLength;
- base.Reader = value;
- }
- }
- }
- }
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+    /// <summary>
+    /// Filters <see cref="ClassicTokenizer"/> with <see cref="ClassicFilter"/>,
+    /// <see cref="LowerCaseFilter"/> and <see cref="StopFilter"/>, using a list of
+    /// English stop words.
+    ///
+    /// <a name="version"/>
+    /// <para>A <see cref="LuceneVersion"/> compatibility level must be specified
+    /// when creating a ClassicAnalyzer:
+    /// <list type="bullet">
+    /// <item><description>As of 3.1, StopFilter correctly handles Unicode 4.0
+    /// supplementary characters in stopwords</description></item>
+    /// <item><description>As of 2.9, StopFilter preserves position
+    /// increments</description></item>
+    /// <item><description>As of 2.4, Tokens incorrectly identified as acronyms
+    /// are corrected (see <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)</description></item>
+    /// </list>
+    /// </para>
+    /// <para>
+    /// ClassicAnalyzer was named StandardAnalyzer in Lucene versions prior to 3.1.
+    /// As of 3.1, <see cref="StandardAnalyzer"/> implements Unicode text segmentation,
+    /// as specified by UAX#29.
+    /// </para>
+    /// </summary>
+    public sealed class ClassicAnalyzer : StopwordAnalyzerBase
+    {
+        /// <summary>
+        /// Default maximum allowed token length. </summary>
+        public const int DEFAULT_MAX_TOKEN_LENGTH = 255;
+
+        /// <summary>
+        /// An unmodifiable set containing some common English words that are usually not
+        /// useful for searching.
+        /// </summary>
+        public static readonly CharArraySet STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
+
+        // Limit handed to the tokenizer whenever components are created or re-targeted.
+        private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
+
+        /// <summary>
+        /// Builds an analyzer with the given stop words. </summary>
+        /// <param name="matchVersion"> Lucene version to match; see
+        /// <a href="#version">above</a> </param>
+        /// <param name="stopWords"> stop words </param>
+        public ClassicAnalyzer(LuceneVersion matchVersion, CharArraySet stopWords)
+            : base(matchVersion, stopWords)
+        {
+        }
+
+        /// <summary>
+        /// Builds an analyzer with the default stop words (<see cref="STOP_WORDS_SET"/>). </summary>
+        /// <param name="matchVersion"> Lucene version to match; see
+        /// <a href="#version">above</a> </param>
+        public ClassicAnalyzer(LuceneVersion matchVersion)
+            : this(matchVersion, STOP_WORDS_SET)
+        {
+        }
+
+        /// <summary>
+        /// Builds an analyzer with the stop words read from the given reader. </summary>
+        /// <param name="matchVersion"> Lucene version to match; see
+        /// <a href="#version">above</a> </param>
+        /// <param name="stopwords"> Reader to read stop words from </param>
+        public ClassicAnalyzer(LuceneVersion matchVersion, Reader stopwords)
+            : this(matchVersion, loadStopwordSet(stopwords, matchVersion))
+        {
+        }
+
+        /// <summary>
+        /// Maximum allowed token length. The value is passed to the tokenizer when
+        /// components are created and again each time a new reader is assigned to them,
+        /// so a change only takes effect from that point on.
+        /// </summary>
+        public int MaxTokenLength
+        {
+            get { return maxTokenLength; }
+            set { maxTokenLength = value; }
+        }
+
+        /// <summary>
+        /// Builds the analysis chain: ClassicTokenizer, then ClassicFilter,
+        /// LowerCaseFilter and StopFilter.
+        /// </summary>
+        public override TokenStreamComponents CreateComponents(string fieldName, Reader reader)
+        {
+            ClassicTokenizer tokenizer = new ClassicTokenizer(matchVersion, reader);
+            tokenizer.MaxTokenLength = maxTokenLength;
+
+            TokenStream normalized = new ClassicFilter(tokenizer);
+            TokenStream lowered = new LowerCaseFilter(matchVersion, normalized);
+            TokenStream filtered = new StopFilter(matchVersion, lowered, stopwords);
+
+            return new TokenStreamComponentsAnonymousInnerClassHelper(this, tokenizer, filtered, reader);
+        }
+
+        /// <summary>
+        /// Components wrapper that re-applies the analyzer's current maximum token length
+        /// to the tokenizer every time a new reader is assigned.
+        /// </summary>
+        private class TokenStreamComponentsAnonymousInnerClassHelper : TokenStreamComponents
+        {
+            private readonly ClassicAnalyzer analyzer;
+            private readonly ClassicTokenizer tokenizer;
+            private Reader reader;
+
+            public TokenStreamComponentsAnonymousInnerClassHelper(ClassicAnalyzer outerInstance, ClassicTokenizer src, TokenStream tok, Reader reader)
+                : base(src, tok)
+            {
+                this.analyzer = outerInstance;
+                this.reader = reader;
+                this.tokenizer = src;
+            }
+
+            protected override Reader Reader
+            {
+                set
+                {
+                    // refresh the limit first, then hand the reader to the base components
+                    tokenizer.MaxTokenLength = analyzer.maxTokenLength;
+                    base.Reader = value;
+                }
+            }
+        }
+    }
}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b4eaf2fc/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicFilter.cs
index 9ee4b32..60bd1dd 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicFilter.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicFilter.cs
@@ -1,92 +1,85 @@
-namespace org.apache.lucene.analysis.standard
-{
-
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
+using Lucene.Net.Analysis.Tokenattributes;
- using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
- using TypeAttribute = org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+namespace Lucene.Net.Analysis.Standard
+{
- /// <summary>
- /// Normalizes tokens extracted with <seealso cref="ClassicTokenizer"/>. </summary>
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+ /// <summary>
+ /// Normalizes tokens extracted with <seealso cref="ClassicTokenizer"/>. </summary>
- public class ClassicFilter : TokenFilter
- {
+ public class ClassicFilter : TokenFilter
+ {
- /// <summary>
- /// Construct filtering <i>in</i>. </summary>
- public ClassicFilter(TokenStream @in) : base(@in)
- {
- }
+ /// <summary>
+ /// Construct filtering <i>in</i>. </summary>
+ public ClassicFilter(TokenStream @in)
+ : base(@in)
+ {
+ typeAtt = AddAttribute<ITypeAttribute>();
+ termAtt = AddAttribute<ICharTermAttribute>();
+ }
- private static readonly string APOSTROPHE_TYPE = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.APOSTROPHE];
- private static readonly string ACRONYM_TYPE = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.ACRONYM];
+ private static readonly string APOSTROPHE_TYPE = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.APOSTROPHE];
+ private static readonly string ACRONYM_TYPE = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.ACRONYM];
- // this filters uses attribute type
- private readonly TypeAttribute typeAtt = addAttribute(typeof(TypeAttribute));
- private readonly CharTermAttribute termAtt = addAttribute(typeof(CharTermAttribute));
+ // this filters uses attribute type
+ private readonly ITypeAttribute typeAtt;
+ private readonly ICharTermAttribute termAtt;
- /// <summary>
- /// Returns the next token in the stream, or null at EOS.
- /// <para>Removes <tt>'s</tt> from the end of words.
- /// </para>
- /// <para>Removes dots from acronyms.
- /// </para>
- /// </summary>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: @Override public final boolean incrementToken() throws java.io.IOException
- public override bool incrementToken()
- {
- if (!input.incrementToken())
- {
- return false;
- }
+ /// <summary>
+ /// Returns the next token in the stream, or null at EOS.
+ /// <para>Removes <tt>'s</tt> from the end of words.
+ /// </para>
+ /// <para>Removes dots from acronyms.
+ /// </para>
+ /// </summary>
+ public override bool IncrementToken()
+ {
+ if (!input.IncrementToken())
+ {
+ return false;
+ }
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final char[] buffer = termAtt.buffer();
- char[] buffer = termAtt.buffer();
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final int bufferLength = termAtt.length();
- int bufferLength = termAtt.length();
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final String type = typeAtt.type();
- string type = typeAtt.type();
+ char[] buffer = termAtt.Buffer();
+ int bufferLength = termAtt.Length;
+ string type = typeAtt.Type;
- if (type == APOSTROPHE_TYPE && bufferLength >= 2 && buffer[bufferLength - 2] == '\'' && (buffer[bufferLength - 1] == 's' || buffer[bufferLength - 1] == 'S')) // remove 's
- {
- // Strip last 2 characters off
- termAtt.Length = bufferLength - 2;
- } // remove dots
- else if (type == ACRONYM_TYPE)
- {
- int upto = 0;
- for (int i = 0;i < bufferLength;i++)
- {
- char c = buffer[i];
- if (c != '.')
- {
- buffer[upto++] = c;
- }
- }
- termAtt.Length = upto;
- }
+ if (type == APOSTROPHE_TYPE && bufferLength >= 2 && buffer[bufferLength - 2] == '\'' && (buffer[bufferLength - 1] == 's' || buffer[bufferLength - 1] == 'S')) // remove 's
+ {
+ // Strip last 2 characters off
+ termAtt.Length = bufferLength - 2;
+ } // remove dots
+ else if (type == ACRONYM_TYPE)
+ {
+ int upto = 0;
+ for (int i = 0; i < bufferLength; i++)
+ {
+ char c = buffer[i];
+ if (c != '.')
+ {
+ buffer[upto++] = c;
+ }
+ }
+ termAtt.Length = upto;
+ }
- return true;
- }
- }
+ return true;
+ }
+ }
}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b4eaf2fc/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicFilterFactory.cs
index 2107ccc..45d7cd0 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicFilterFactory.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicFilterFactory.cs
@@ -1,55 +1,53 @@
using System.Collections.Generic;
-using TokenFilterFactory = Lucene.Net.Analysis.Util.TokenFilterFactory;
+using Lucene.Net.Analysis.Util;
-namespace org.apache.lucene.analysis.standard
+namespace Lucene.Net.Analysis.Standard
{
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+ /// <summary>
+ /// Factory for <seealso cref="ClassicFilter"/>.
+ /// <pre class="prettyprint">
+ /// &lt;fieldType name="text_clssc" class="solr.TextField" positionIncrementGap="100"&gt;
+ /// &lt;analyzer&gt;
+ /// &lt;tokenizer class="solr.ClassicTokenizerFactory"/&gt;
+ /// &lt;filter class="solr.ClassicFilterFactory"/&gt;
+ /// &lt;/analyzer&gt;
+ /// &lt;/fieldType&gt;</pre>
+ /// </summary>
+ public class ClassicFilterFactory : TokenFilterFactory
+ {
- using TokenFilterFactory = TokenFilterFactory;
+ /// <summary>
+ /// Creates a new ClassicFilterFactory </summary>
+ public ClassicFilterFactory(IDictionary<string, string> args)
+ : base(args)
+ {
+ if (args.Count > 0)
+ {
+ throw new System.ArgumentException("Unknown parameters: " + args);
+ }
+ }
- /// <summary>
- /// Factory for <seealso cref="ClassicFilter"/>.
- /// <pre class="prettyprint">
- /// &lt;fieldType name="text_clssc" class="solr.TextField" positionIncrementGap="100"&gt;
- /// &lt;analyzer&gt;
- /// &lt;tokenizer class="solr.ClassicTokenizerFactory"/&gt;
- /// &lt;filter class="solr.ClassicFilterFactory"/&gt;
- /// &lt;/analyzer&gt;
- /// &lt;/fieldType&gt;</pre>
- /// </summary>
- public class ClassicFilterFactory : TokenFilterFactory
- {
-
- /// <summary>
- /// Creates a new ClassicFilterFactory </summary>
- public ClassicFilterFactory(IDictionary<string, string> args) : base(args)
- {
- if (args.Count > 0)
- {
- throw new System.ArgumentException("Unknown parameters: " + args);
- }
- }
-
- public override TokenFilter create(TokenStream input)
- {
- return new ClassicFilter(input);
- }
- }
+ public override TokenStream Create(TokenStream input)
+ {
+ return new ClassicFilter(input);
+ }
+ }
}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b4eaf2fc/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicTokenizer.cs
index f9c680e..3ef7a9e 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicTokenizer.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicTokenizer.cs
@@ -15,198 +15,185 @@
* limitations under the License.
*/
-using Lucene.Net.Analysis.Standard;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
+using org.apache.lucene.analysis.standard;
+using Reader = System.IO.TextReader;
-namespace org.apache.lucene.analysis.standard
+namespace Lucene.Net.Analysis.Standard
{
-
-
- using OffsetAttribute = org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
- using PositionIncrementAttribute = org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
- using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
- using TypeAttribute = org.apache.lucene.analysis.tokenattributes.TypeAttribute;
- using Version = org.apache.lucene.util.Version;
-
- /// <summary>
- /// A grammar-based tokenizer constructed with JFlex
- ///
- /// <para> This should be a good tokenizer for most European-language documents:
- ///
- /// <ul>
- /// <li>Splits words at punctuation characters, removing punctuation. However, a
- /// dot that's not followed by whitespace is considered part of a token.
- /// <li>Splits words at hyphens, unless there's a number in the token, in which case
- /// the whole token is interpreted as a product number and is not split.
- /// <li>Recognizes email addresses and internet hostnames as one token.
- /// </ul>
- ///
- /// </para>
- /// <para>Many applications have specific tokenizer needs. If this tokenizer does
- /// not suit your application, please consider copying this source code
- /// directory to your project and maintaining your own grammar-based tokenizer.
- ///
- /// ClassicTokenizer was named StandardTokenizer in Lucene versions prior to 3.1.
- /// As of 3.1, <seealso cref="StandardTokenizer"/> implements Unicode text segmentation,
- /// as specified by UAX#29.
- /// </para>
- /// </summary>
-
- public sealed class ClassicTokenizer : Tokenizer
- {
- /// <summary>
- /// A private instance of the JFlex-constructed scanner </summary>
- private StandardTokenizerInterface scanner;
-
- public const int ALPHANUM = 0;
- public const int APOSTROPHE = 1;
- public const int ACRONYM = 2;
- public const int COMPANY = 3;
- public const int EMAIL = 4;
- public const int HOST = 5;
- public const int NUM = 6;
- public const int CJ = 7;
-
- public const int ACRONYM_DEP = 8;
-
- /// <summary>
- /// String token types that correspond to token type int constants </summary>
- public static readonly string[] TOKEN_TYPES = new string [] {"<ALPHANUM>", "<APOSTROPHE>", "<ACRONYM>", "<COMPANY>", "<EMAIL>", "<HOST>", "<NUM>", "<CJ>", "<ACRONYM_DEP>"};
-
- private int skippedPositions;
-
- private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
-
- /// <summary>
- /// Set the max allowed token length. Any token longer
- /// than this is skipped.
- /// </summary>
- public int MaxTokenLength
- {
- set
- {
- if (value < 1)
- {
- throw new System.ArgumentException("maxTokenLength must be greater than zero");
- }
- this.maxTokenLength = value;
- }
- get
- {
- return maxTokenLength;
- }
- }
-
-
- /// <summary>
- /// Creates a new instance of the <seealso cref="ClassicTokenizer"/>. Attaches
- /// the <code>input</code> to the newly created JFlex scanner.
- /// </summary>
- /// <param name="input"> The input reader
- ///
- /// See http://issues.apache.org/jira/browse/LUCENE-1068 </param>
- public ClassicTokenizer(Version matchVersion, Reader input) : base(input)
- {
- init(matchVersion);
- }
-
- /// <summary>
- /// Creates a new ClassicTokenizer with a given <seealso cref="org.apache.lucene.util.AttributeSource.AttributeFactory"/>
- /// </summary>
- public ClassicTokenizer(Version matchVersion, AttributeFactory factory, Reader input) : base(factory, input)
- {
- init(matchVersion);
- }
-
- private void init(Version matchVersion)
- {
- this.scanner = new ClassicTokenizerImpl(input);
- }
-
- // this tokenizer generates three attributes:
- // term offset, positionIncrement and type
- private readonly CharTermAttribute termAtt = addAttribute(typeof(CharTermAttribute));
- private readonly OffsetAttribute offsetAtt = addAttribute(typeof(OffsetAttribute));
- private readonly PositionIncrementAttribute posIncrAtt = addAttribute(typeof(PositionIncrementAttribute));
- private readonly TypeAttribute typeAtt = addAttribute(typeof(TypeAttribute));
-
- /*
- * (non-Javadoc)
- *
- * @see org.apache.lucene.analysis.TokenStream#next()
- */
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: @Override public final boolean incrementToken() throws java.io.IOException
- public override bool incrementToken()
- {
- clearAttributes();
- skippedPositions = 0;
-
- while (true)
- {
- int tokenType = scanner.NextToken;
-
- if (tokenType == StandardTokenizerInterface_Fields.YYEOF)
- {
- return false;
- }
-
- if (scanner.yylength() <= maxTokenLength)
- {
- posIncrAtt.PositionIncrement = skippedPositions + 1;
- scanner.getText(termAtt);
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final int start = scanner.yychar();
- int start = scanner.yychar();
- offsetAtt.setOffset(correctOffset(start), correctOffset(start + termAtt.length()));
-
- if (tokenType == ClassicTokenizer.ACRONYM_DEP)
- {
- typeAtt.Type = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.HOST];
- termAtt.Length = termAtt.length() - 1; // remove extra '.'
- }
- else
- {
- typeAtt.Type = ClassicTokenizer.TOKEN_TYPES[tokenType];
- }
- return true;
- }
- else
- // When we skip a too-long term, we still increment the
- // position increment
- {
- skippedPositions++;
- }
- }
- }
-
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: @Override public final void end() throws java.io.IOException
- public override void end()
- {
- base.end();
- // set final offset
- int finalOffset = correctOffset(scanner.yychar() + scanner.yylength());
- offsetAtt.setOffset(finalOffset, finalOffset);
- // adjust any skipped tokens
- posIncrAtt.PositionIncrement = posIncrAtt.PositionIncrement + skippedPositions;
- }
-
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: @Override public void close() throws java.io.IOException
- public override void close()
- {
- base.close();
- scanner.yyreset(input);
- }
-
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: @Override public void reset() throws java.io.IOException
- public override void reset()
- {
- base.reset();
- scanner.yyreset(input);
- skippedPositions = 0;
- }
- }
+ /// <summary>
+ /// A grammar-based tokenizer constructed with JFlex
+ ///
+ /// <para> This should be a good tokenizer for most European-language documents:
+ ///
+ /// <ul>
+ /// <li>Splits words at punctuation characters, removing punctuation. However, a
+ /// dot that's not followed by whitespace is considered part of a token.</li>
+ /// <li>Splits words at hyphens, unless there's a number in the token, in which case
+ /// the whole token is interpreted as a product number and is not split.</li>
+ /// <li>Recognizes email addresses and internet hostnames as one token.</li>
+ /// </ul>
+ ///
+ /// </para>
+ /// <para>Many applications have specific tokenizer needs. If this tokenizer does
+ /// not suit your application, please consider copying this source code
+ /// directory to your project and maintaining your own grammar-based tokenizer.
+ ///
+ /// ClassicTokenizer was named StandardTokenizer in Lucene versions prior to 3.1.
+ /// As of 3.1, <seealso cref="StandardTokenizer"/> implements Unicode text segmentation,
+ /// as specified by UAX#29.
+ /// </para>
+ /// </summary>
+
+ public sealed class ClassicTokenizer : Tokenizer
+ {
+ /// <summary>
+ /// A private instance of the JFlex-constructed scanner </summary>
+ private StandardTokenizerInterface scanner;
+
+ public const int ALPHANUM = 0;
+ public const int APOSTROPHE = 1;
+ public const int ACRONYM = 2;
+ public const int COMPANY = 3;
+ public const int EMAIL = 4;
+ public const int HOST = 5;
+ public const int NUM = 6;
+ public const int CJ = 7;
+
+ public const int ACRONYM_DEP = 8;
+
+ /// <summary>
+ /// String token types that correspond to token type int constants </summary>
+ public static readonly string[] TOKEN_TYPES = new string[] { "<ALPHANUM>", "<APOSTROPHE>", "<ACRONYM>", "<COMPANY>", "<EMAIL>", "<HOST>", "<NUM>", "<CJ>", "<ACRONYM_DEP>" };
+
+ private int skippedPositions;
+
+ private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
+
+ /// <summary>
+ /// Set the max allowed token length. Any token longer
+ /// than this is skipped.
+ /// </summary>
+ public int MaxTokenLength
+ {
+ set
+ {
+ if (value < 1)
+ {
+ throw new System.ArgumentException("maxTokenLength must be greater than zero");
+ }
+ this.maxTokenLength = value;
+ }
+ get
+ {
+ return maxTokenLength;
+ }
+ }
+
+
+ /// <summary>
+ /// Creates a new instance of the <seealso cref="ClassicTokenizer"/>. Attaches
+ /// the <code>input</code> to the newly created JFlex scanner.
+ /// </summary>
+ /// <param name="input"> The input reader
+ ///
+ /// See http://issues.apache.org/jira/browse/LUCENE-1068 </param>
+ public ClassicTokenizer(LuceneVersion matchVersion, Reader input)
+ : base(input)
+ {
+ Init(matchVersion);
+ }
+
+ /// <summary>
+ /// Creates a new ClassicTokenizer with a given <seealso cref="Lucene.Net.Util.AttributeSource.AttributeFactory"/>
+ /// </summary>
+ public ClassicTokenizer(LuceneVersion matchVersion, AttributeFactory factory, Reader input)
+ : base(factory, input)
+ {
+ Init(matchVersion);
+ }
+
+ private void Init(LuceneVersion matchVersion)
+ {
+ this.scanner = new ClassicTokenizerImpl(input);
+ }
+
+ // this tokenizer generates four attributes:
+ // term, offset, positionIncrement and type
+ private readonly CharTermAttribute termAtt;
+ private readonly OffsetAttribute offsetAtt;
+ private readonly PositionIncrementAttribute posIncrAtt;
+ private readonly TypeAttribute typeAtt;
+ /*
+ * (non-Javadoc)
+ *
+ * @see org.apache.lucene.analysis.TokenStream#next()
+ */
+ public override bool IncrementToken()
+ {
+ ClearAttributes();
+ skippedPositions = 0;
+
+ while (true)
+ {
+ int tokenType = scanner.NextToken;
+
+ if (tokenType == StandardTokenizerInterface_Fields.YYEOF)
+ {
+ return false;
+ }
+
+ if (scanner.yylength() <= maxTokenLength)
+ {
+ posIncrAtt.PositionIncrement = skippedPositions + 1;
+ scanner.getText(termAtt);
+
+ int start = scanner.yychar();
+ offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + termAtt.Length));
+
+ if (tokenType == ClassicTokenizer.ACRONYM_DEP)
+ {
+ typeAtt.Type = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.HOST];
+ termAtt.Length = termAtt.Length - 1; // remove extra '.'
+ }
+ else
+ {
+ typeAtt.Type = ClassicTokenizer.TOKEN_TYPES[tokenType];
+ }
+ return true;
+ }
+ else
+ // When we skip a too-long term, we still increment the
+ // position increment
+ {
+ skippedPositions++;
+ }
+ }
+ }
+
+ public override void End()
+ {
+ base.End();
+ // set final offset
+ int finalOffset = CorrectOffset(scanner.yychar() + scanner.yylength());
+ offsetAtt.SetOffset(finalOffset, finalOffset);
+ // adjust any skipped tokens
+ posIncrAtt.PositionIncrement = posIncrAtt.PositionIncrement + skippedPositions;
+ }
+
+ public override void Dispose()
+ {
+ base.Dispose();
+ scanner.yyreset(input);
+ }
+
+ public override void Reset()
+ {
+ base.Reset();
+ scanner.yyreset(input);
+ skippedPositions = 0;
+ }
+ }
}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b4eaf2fc/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicTokenizerImpl.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicTokenizerImpl.cs b/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicTokenizerImpl.cs
index 4d30289..f2ad424 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicTokenizerImpl.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicTokenizerImpl.cs
@@ -1,7 +1,9 @@
/* The following code was generated by JFlex 1.5.1 */
using System;
using System.IO;
+using Lucene.Net.Analysis.Tokenattributes;
using org.apache.lucene.analysis.standard;
+using Reader = System.IO.TextReader;
namespace Lucene.Net.Analysis.Standard
{
@@ -286,9 +288,9 @@ namespace Lucene.Net.Analysis.Standard
/// <summary>
/// Fills CharTermAttribute with the current token text.
/// </summary>
- public void getText(CharTermAttribute t)
+ public void getText(ICharTermAttribute t)
{
- t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead);
+ t.CopyBuffer(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead);
}
@@ -359,7 +361,7 @@ namespace Lucene.Net.Analysis.Standard
}
/* finally: fill the buffer with new input */
- int numRead = zzReader.read(zzBuffer, zzEndRead, zzBuffer.Length - zzEndRead);
+ int numRead = zzReader.Read(zzBuffer, zzEndRead, zzBuffer.Length - zzEndRead);
if (numRead > 0)
{
@@ -369,7 +371,7 @@ namespace Lucene.Net.Analysis.Standard
// unlikely but not impossible: read 0 characters, but not at end of stream
if (numRead == 0)
{
- int c = zzReader.read();
+ int c = zzReader.Read();
if (c == -1)
{
return true;
@@ -389,8 +391,6 @@ namespace Lucene.Net.Analysis.Standard
/// <summary>
/// Closes the input stream.
/// </summary>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public final void yyclose() throws java.io.IOException
public void yyclose()
{
zzAtEOF = true; // indicate end of file
@@ -398,7 +398,7 @@ namespace Lucene.Net.Analysis.Standard
if (zzReader != null)
{
- zzReader.close();
+ zzReader.Close();
}
}