You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by ar...@apache.org on 2007/05/01 20:45:35 UTC
svn commit: r534192 [4/19] - in /incubator/lucene.net/trunk/C#: ./ src/
src/Demo/ src/Demo/DeleteFiles/ src/Demo/DemoLib/ src/Demo/DemoLib/HTML/
src/Demo/IndexFiles/ src/Demo/IndexHtml/ src/Demo/SearchFiles/
src/Lucene.Net/ src/Lucene.Net/Analysis/ src...
Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/StandardTokenizer.jj
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/Standard/StandardTokenizer.jj?view=diff&rev=534192&r1=534191&r2=534192
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/StandardTokenizer.jj (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/StandardTokenizer.jj Tue May 1 11:45:26 2007
@@ -1,178 +1,196 @@
-/**f
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-options {
- STATIC = false;
-//IGNORE_CASE = true;
-//BUILD_PARSER = false;
- UNICODE_INPUT = true;
- USER_CHAR_STREAM = true;
- OPTIMIZE_TOKEN_MANAGER = true;
-//DEBUG_TOKEN_MANAGER = true;
-}
-PARSER_BEGIN(StandardTokenizer)
-
-package org.apache.lucene.analysis.standard;
-
-import java.io.*;
-
-/** A grammar-based tokenizer constructed with JavaCC.
- *
- * <p> This should be a good tokenizer for most European-language documents:
- *
- * <ul>
- * <li>Splits words at punctuation characters, removing punctuation. However, a
- * dot that's not followed by whitespace is considered part of a token.
- * <li>Splits words at hyphens, unless there's a number in the token, in which case
- * the whole token is interpreted as a product number and is not split.
- * <li>Recognizes email addresses and internet hostnames as one token.
- * </ul>
- *
- * <p>Many applications have specific tokenizer needs. If this tokenizer does
- * not suit your application, please consider copying this source code
- * directory to your project and maintaining your own grammar-based tokenizer.
- */
-public class StandardTokenizer extends org.apache.lucene.analysis.Tokenizer {
-
- /** Constructs a tokenizer for this Reader. */
- public StandardTokenizer(Reader reader) {
- this(new FastCharStream(reader));
- this.input = reader;
- }
-}
-
-PARSER_END(StandardTokenizer)
-
-TOKEN : { // token patterns
-
- // basic word: a sequence of digits & letters
- <ALPHANUM: (<LETTER>|<DIGIT>|<KOREAN>)+ >
-
- // internal apostrophes: O'Reilly, you're, O'Reilly's
- // use a post-filter to remove possesives
-| <APOSTROPHE: <ALPHA> ("'" <ALPHA>)+ >
-
- // acronyms: U.S.A., I.B.M., etc.
- // use a post-filter to remove dots
-| <ACRONYM: <ALPHA> "." (<ALPHA> ".")+ >
-
- // company names like AT&T and Excite@Home.
-| <COMPANY: <ALPHA> ("&"|"@") <ALPHA> >
-
- // email addresses
-| <EMAIL: <ALPHANUM> (("."|"-"|"_") <ALPHANUM>)* "@" <ALPHANUM> (("."|"-") <ALPHANUM>)+ >
-
- // hostname
-| <HOST: <ALPHANUM> ("." <ALPHANUM>)+ >
-
- // floating point, serial, model numbers, ip addresses, etc.
- // every other segment must have at least one digit
-| <NUM: (<ALPHANUM> <P> <HAS_DIGIT>
- | <HAS_DIGIT> <P> <ALPHANUM>
- | <ALPHANUM> (<P> <HAS_DIGIT> <P> <ALPHANUM>)+
- | <HAS_DIGIT> (<P> <ALPHANUM> <P> <HAS_DIGIT>)+
- | <ALPHANUM> <P> <HAS_DIGIT> (<P> <ALPHANUM> <P> <HAS_DIGIT>)+
- | <HAS_DIGIT> <P> <ALPHANUM> (<P> <HAS_DIGIT> <P> <ALPHANUM>)+
- )
- >
-| <#P: ("_"|"-"|"/"|"."|",") >
-| <#HAS_DIGIT: // at least one digit
- (<LETTER>|<DIGIT>)*
- <DIGIT>
- (<LETTER>|<DIGIT>)*
- >
-
-| < #ALPHA: (<LETTER>)+>
-| < #LETTER: // unicode letters
- [
- "\u0041"-"\u005a",
- "\u0061"-"\u007a",
- "\u00c0"-"\u00d6",
- "\u00d8"-"\u00f6",
- "\u00f8"-"\u00ff",
- "\u0100"-"\u1fff"
- ]
- >
-| < CJ: // Chinese, Japanese
- [
- "\u3040"-"\u318f",
- "\u3300"-"\u337f",
- "\u3400"-"\u3d2d",
- "\u4e00"-"\u9fff",
- "\uf900"-"\ufaff"
- ]
- >
-| < KOREAN: // Korean
- [
- "\uac00"-"\ud7af"
- ]
- >
-| < #DIGIT: // unicode digits
- [
- "\u0030"-"\u0039",
- "\u0660"-"\u0669",
- "\u06f0"-"\u06f9",
- "\u0966"-"\u096f",
- "\u09e6"-"\u09ef",
- "\u0a66"-"\u0a6f",
- "\u0ae6"-"\u0aef",
- "\u0b66"-"\u0b6f",
- "\u0be7"-"\u0bef",
- "\u0c66"-"\u0c6f",
- "\u0ce6"-"\u0cef",
- "\u0d66"-"\u0d6f",
- "\u0e50"-"\u0e59",
- "\u0ed0"-"\u0ed9",
- "\u1040"-"\u1049"
- ]
- >
-}
-
-SKIP : { // skip unrecognized chars
- <NOISE: ~[] >
-}
-
-/** Returns the next token in the stream, or null at EOS.
- * <p>The returned token's type is set to an element of {@link
- * StandardTokenizerConstants#tokenImage}.
- */
-org.apache.lucene.analysis.Token next() throws IOException :
-{
- Token token = null;
-}
-{
- ( token = <ALPHANUM> |
- token = <APOSTROPHE> |
- token = <ACRONYM> |
- token = <COMPANY> |
- token = <EMAIL> |
- token = <HOST> |
- token = <NUM> |
- token = <CJ> |
- token = <EOF>
- )
- {
- if (token.kind == EOF) {
- return null;
- } else {
- return
- new org.apache.lucene.analysis.Token(token.image,
- token.beginColumn,token.endColumn,
- tokenImage[token.kind]);
- }
- }
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+options {
+ STATIC = false;
+//IGNORE_CASE = true;
+//BUILD_PARSER = false;
+ UNICODE_INPUT = true;
+ USER_CHAR_STREAM = true;
+ OPTIMIZE_TOKEN_MANAGER = true;
+//DEBUG_TOKEN_MANAGER = true;
+}
+PARSER_BEGIN(StandardTokenizer)
+
+package Lucene.Net.Analysis.Standard;
+
+import java.io.*;
+
+/** A grammar-based tokenizer constructed with JavaCC.
+ *
+ * <p> This should be a good tokenizer for most European-language documents:
+ *
+ * <ul>
+ * <li>Splits words at punctuation characters, removing punctuation. However, a
+ * dot that's not followed by whitespace is considered part of a token.
+ * <li>Splits words at hyphens, unless there's a number in the token, in which case
+ * the whole token is interpreted as a product number and is not split.
+ * <li>Recognizes email addresses and internet hostnames as one token.
+ * </ul>
+ *
+ * <p>Many applications have specific tokenizer needs. If this tokenizer does
+ * not suit your application, please consider copying this source code
+ * directory to your project and maintaining your own grammar-based tokenizer.
+ */
+public class StandardTokenizer extends Lucene.Net.Analysis.Tokenizer {
+
+ /** Constructs a tokenizer for this Reader. */
+ public StandardTokenizer(Reader reader) {
+ this(new FastCharStream(reader));
+ this.input = reader;
+ }
+}
+
+PARSER_END(StandardTokenizer)
+
+TOKEN : { // token patterns
+
+ // basic word: a sequence of digits & letters
+ <ALPHANUM: (<LETTER>|<DIGIT>|<KOREAN>)+ >
+
+ // internal apostrophes: O'Reilly, you're, O'Reilly's
+ // use a post-filter to remove possessives
+| <APOSTROPHE: <ALPHA> ("'" <ALPHA>)+ >
+
+ // acronyms: U.S.A., I.B.M., etc.
+ // use a post-filter to remove dots
+| <ACRONYM: <ALPHA> "." (<ALPHA> ".")+ >
+
+ // company names like AT&T and Excite@Home.
+| <COMPANY: <ALPHA> ("&"|"@") <ALPHA> >
+
+ // email addresses
+| <EMAIL: <ALPHANUM> (("."|"-"|"_") <ALPHANUM>)* "@" <ALPHANUM> (("."|"-") <ALPHANUM>)+ >
+
+ // hostname
+| <HOST: <ALPHANUM> ("." <ALPHANUM>)+ >
+
+ // floating point, serial, model numbers, ip addresses, etc.
+ // every other segment must have at least one digit
+| <NUM: (<ALPHANUM> <P> <HAS_DIGIT>
+ | <HAS_DIGIT> <P> <ALPHANUM>
+ | <ALPHANUM> (<P> <HAS_DIGIT> <P> <ALPHANUM>)+
+ | <HAS_DIGIT> (<P> <ALPHANUM> <P> <HAS_DIGIT>)+
+ | <ALPHANUM> <P> <HAS_DIGIT> (<P> <ALPHANUM> <P> <HAS_DIGIT>)+
+ | <HAS_DIGIT> <P> <ALPHANUM> (<P> <HAS_DIGIT> <P> <ALPHANUM>)+
+ )
+ >
+| <#P: ("_"|"-"|"/"|"."|",") >
+| <#HAS_DIGIT: // at least one digit
+ (<LETTER>|<DIGIT>)*
+ <DIGIT>
+ (<LETTER>|<DIGIT>)*
+ >
+
+| < #ALPHA: (<LETTER>)+>
+| < #LETTER: // unicode letters
+ [
+ "\u0041"-"\u005a",
+ "\u0061"-"\u007a",
+ "\u00c0"-"\u00d6",
+ "\u00d8"-"\u00f6",
+ "\u00f8"-"\u00ff",
+ "\u0100"-"\u1fff",
+ "\uffa0"-"\uffdc"
+ ]
+ >
+| < CJ: // Chinese, Japanese
+ [
+ "\u3040"-"\u318f",
+ "\u3100"-"\u312f", // BaPoMoFo (aka ZhuYin)
+ "\u3040"-"\u309F", // Japanese: Hiragana
+ "\u30A0"-"\u30FF", // Japanese: Katakana
+ "\u31F0"-"\u31FF", // Japanese: Katakana Phonetic Extensions
+ "\u3300"-"\u337f",
+ "\u3400"-"\u4dbf", // CJK Unified Ideographs Ext. A
+ "\u4e00"-"\u9fff",
+ "\uf900"-"\ufaff",
+ "\uff65"-"\uff9f"
+
+// Otis: consider adding these, too
+//
+// 2E80-2EFF: CJK Radicals Supplement
+// 2F00-2FDF: Kangxi Radicals
+// 3190-319F: Kanbun
+// 31C0-31EF: CJK Strokes
+// 4E00-9FBF: CJK Unified
+// F900-FAFF: CJK Compatibility Ideographs
+
+ ]
+ >
+| < KOREAN: // Korean
+ [
+ "\uac00"-"\ud7af", // Hangul Syllables
+ "\u1100"-"\u11ff" // Hangul Jamo
+ // "\uac00"-"\ud7a3"
+ ]
+ >
+| < #DIGIT: // unicode digits
+ [
+ "\u0030"-"\u0039",
+ "\u0660"-"\u0669",
+ "\u06f0"-"\u06f9",
+ "\u0966"-"\u096f",
+ "\u09e6"-"\u09ef",
+ "\u0a66"-"\u0a6f",
+ "\u0ae6"-"\u0aef",
+ "\u0b66"-"\u0b6f",
+ "\u0be7"-"\u0bef",
+ "\u0c66"-"\u0c6f",
+ "\u0ce6"-"\u0cef",
+ "\u0d66"-"\u0d6f",
+ "\u0e50"-"\u0e59",
+ "\u0ed0"-"\u0ed9",
+ "\u1040"-"\u1049"
+ ]
+ >
+}
+
+SKIP : { // skip unrecognized chars
+ <NOISE: ~[] >
+}
+
+/** Returns the next token in the stream, or null at EOS.
+ * <p>The returned token's type is set to an element of {@link
+ * StandardTokenizerConstants#tokenImage}.
+ */
+Lucene.Net.Analysis.Token next() throws IOException :
+{
+ Token token = null;
+}
+{
+ ( token = <ALPHANUM> |
+ token = <APOSTROPHE> |
+ token = <ACRONYM> |
+ token = <COMPANY> |
+ token = <EMAIL> |
+ token = <HOST> |
+ token = <NUM> |
+ token = <CJ> |
+ token = <EOF>
+ )
+ {
+ if (token.kind == EOF) {
+ return null;
+ } else {
+ return
+ new Lucene.Net.Analysis.Token(token.image,
+ token.beginColumn,token.endColumn,
+ tokenImage[token.kind]);
+ }
+ }
+}
Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/StandardTokenizerConstants.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/Standard/StandardTokenizerConstants.cs?view=diff&rev=534192&r1=534191&r2=534192
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/StandardTokenizerConstants.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/StandardTokenizerConstants.cs Tue May 1 11:45:26 2007
@@ -21,25 +21,25 @@
namespace Lucene.Net.Analysis.Standard
{
- public class StandardTokenizerConstants
+ public class StandardTokenizerConstants
{
- public const int EOF = 0;
- public const int ALPHANUM = 1;
- public const int APOSTROPHE = 2;
- public const int ACRONYM = 3;
- public const int COMPANY = 4;
- public const int EMAIL = 5;
- public const int HOST = 6;
- public const int NUM = 7;
- public const int P = 8;
- public const int HAS_DIGIT = 9;
- public const int ALPHA = 10;
- public const int LETTER = 11;
- public const int CJ = 12;
- public const int KOREAN = 13;
- public const int DIGIT = 14;
- public const int NOISE = 15;
- public const int DEFAULT = 0;
- public static System.String[] tokenImage = new System.String[]{"<EOF>", "<ALPHANUM>", "<APOSTROPHE>", "<ACRONYM>", "<COMPANY>", "<EMAIL>", "<HOST>", "<NUM>", "<P>", "<HAS_DIGIT>", "<ALPHA>", "<LETTER>", "<CJ>", "<KOREAN>", "<DIGIT>", "<NOISE>"};
- }
+ public const int EOF = 0;
+ public const int ALPHANUM = 1;
+ public const int APOSTROPHE = 2;
+ public const int ACRONYM = 3;
+ public const int COMPANY = 4;
+ public const int EMAIL = 5;
+ public const int HOST = 6;
+ public const int NUM = 7;
+ public const int P = 8;
+ public const int HAS_DIGIT = 9;
+ public const int ALPHA = 10;
+ public const int LETTER = 11;
+ public const int CJ = 12;
+ public const int KOREAN = 13;
+ public const int DIGIT = 14;
+ public const int NOISE = 15;
+ public const int DEFAULT = 0;
+ public static System.String[] tokenImage = new System.String[]{"<EOF>", "<ALPHANUM>", "<APOSTROPHE>", "<ACRONYM>", "<COMPANY>", "<EMAIL>", "<HOST>", "<NUM>", "<P>", "<HAS_DIGIT>", "<ALPHA>", "<LETTER>", "<CJ>", "<KOREAN>", "<DIGIT>", "<NOISE>"};
+ }
}
Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/StandardTokenizerTokenManager.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/Standard/StandardTokenizerTokenManager.cs?view=diff&rev=534192&r1=534191&r2=534192
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/StandardTokenizerTokenManager.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/StandardTokenizerTokenManager.cs Tue May 1 11:45:26 2007
@@ -30,7 +30,7 @@
input_stream.Done();
}
- private void InitBlock()
+ private void InitBlock()
{
System.IO.StreamWriter temp_writer;
temp_writer = new System.IO.StreamWriter(System.Console.OpenStandardOutput(), System.Console.Out.Encoding);
@@ -80,23 +80,25 @@
JjCheckNAdd(jjnextStates[start]);
JjCheckNAdd(jjnextStates[start + 1]);
}
- internal static readonly ulong[] jjbitVec0 = new ulong[]{0x1ff0000000000000L, 0xffffffffffffc000L, 0xffffffffL, 0x600000000000000L};
+ internal static readonly ulong[] jjbitVec0 = new ulong[]{0xfff0000000000000L, 0xffffffffffffdfffL, 0xffffffffL, 0x600000000000000L};
internal static readonly ulong[] jjbitVec2 = new ulong[]{0x0L, 0xffffffffffffffffL, 0xffffffffffffffffL, 0xffffffffffffffffL};
- internal static readonly ulong[] jjbitVec3 = new ulong[]{0xffffffffffffffffL, 0xffffffffffffffffL, 0xffffL, 0x0L};
+ internal static readonly ulong[] jjbitVec3 = new ulong[]{0xffffffffffffffffL, 0xffffffffffffffffL, 0xffffL, 0xffff000000000000L};
internal static readonly ulong[] jjbitVec4 = new ulong[]{0xffffffffffffffffL, 0xffffffffffffffffL, 0x0L, 0x0L};
- internal static readonly ulong[] jjbitVec5 = new ulong[]{0x3fffffffffffL, 0x0L, 0x0L, 0x0L};
- internal static readonly ulong[] jjbitVec6 = new ulong[]{0x0L, 0x0L, 0xfffff00000000000L, 0x7fffffL};
- internal static readonly ulong[] jjbitVec7 = new ulong[]{0xffffffffffffffffL, 0xffffffffffffffffL, 0xffffffffffffL, 0x0L};
- internal static readonly ulong[] jjbitVec8 = new ulong[]{0xfffffffeL, 0x0L, 0x0L, 0x0L};
- internal static readonly ulong[] jjbitVec9 = new ulong[]{0x0L, 0x0L, 0x0L, 0xff7fffffff7fffffL};
- internal static readonly ulong[] jjbitVec10 = new ulong[]{0x1600L, 0x0L, 0x0L, 0x0L};
- internal static readonly ulong[] jjbitVec11 = new ulong[]{0x0L, 0xffc000000000L, 0x0L, 0xffc000000000L};
- internal static readonly ulong[] jjbitVec12 = new ulong[]{0x0L, 0x3ff00000000L, 0x0L, 0x3ff000000000000L};
- internal static readonly ulong[] jjbitVec13 = new ulong[]{0x0L, 0xffc000000000L, 0x0L, 0xff8000000000L};
- internal static readonly ulong[] jjbitVec14 = new ulong[]{0x0L, 0xffc000000000L, 0x0L, 0x0L};
- internal static readonly ulong[] jjbitVec15 = new ulong[]{0x0L, 0x3ff0000L, 0x0L, 0x3ff0000L};
- internal static readonly ulong[] jjbitVec16 = new ulong[]{0x0L, 0x3ffL, 0x0L, 0x0L};
- internal static readonly ulong[] jjbitVec17 = new ulong[]{0xfffffffeL, 0x0L, 0xfffff00000000000L, 0x7fffffL};
+ internal static readonly ulong[] jjbitVec5 = new ulong[]{0xffffffffffffffffL, 0xffffffffffffffffL, 0xffffffffffffffffL, 0x0L};
+ internal static readonly ulong[] jjbitVec6 = new ulong[]{0x0L, 0xffffffe000000000L, 0xffffffffL, 0x0L};
+ internal static readonly ulong[] jjbitVec7 = new ulong[]{0x20000L, 0x0L, 0xfffff00000000000L, 0x7fffffL};
+ internal static readonly ulong[] jjbitVec8 = new ulong[]{0xffffffffffffffffL, 0xffffffffffffffffL, 0xffffffffffffL, 0x0L};
+ internal static readonly ulong[] jjbitVec9 = new ulong[]{0xfffffffeL, 0x0L, 0x0L, 0x0L};
+ internal static readonly ulong[] jjbitVec10 = new ulong[]{0x0L, 0x0L, 0x0L, 0xff7fffffff7fffffL};
+ internal static readonly ulong[] jjbitVec11 = new ulong[]{0x0L, 0x0L, 0xffffffff00000000L, 0x1fffffffL};
+ internal static readonly ulong[] jjbitVec12 = new ulong[]{0x1600L, 0x0L, 0x0L, 0x0L};
+ internal static readonly ulong[] jjbitVec13 = new ulong[]{0x0L, 0xffc000000000L, 0x0L, 0xffc000000000L};
+ internal static readonly ulong[] jjbitVec14 = new ulong[]{0x0L, 0x3ff00000000L, 0x0L, 0x3ff000000000000L};
+ internal static readonly ulong[] jjbitVec15 = new ulong[]{0x0L, 0xffc000000000L, 0x0L, 0xff8000000000L};
+ internal static readonly ulong[] jjbitVec16 = new ulong[]{0x0L, 0xffc000000000L, 0x0L, 0x0L};
+ internal static readonly ulong[] jjbitVec17 = new ulong[]{0x0L, 0x3ff0000L, 0x0L, 0x3ff0000L};
+ internal static readonly ulong[] jjbitVec18 = new ulong[]{0x0L, 0x3ffL, 0x0L, 0x0L};
+ internal static readonly ulong[] jjbitVec19 = new ulong[]{0xfffffffeL, 0x0L, 0xfffff00000000000L, 0x7fffffL};
private int JjMoveNfa_0(int startState, int curPos)
{
int[] nextStates;
@@ -111,7 +113,7 @@
ReInitRounds();
if (curChar < 64)
{
- ulong l = ((ulong) 1L) << curChar;
+ ulong l = ((ulong) 1L) << curChar;
MatchLoop:
do
{
@@ -1206,9 +1208,12 @@
case 51:
return ((jjbitVec4[i2] & l2) != (ulong) 0L);
- case 61:
+ case 77:
return ((jjbitVec5[i2] & l2) != (ulong) 0L);
+ case 255:
+ return ((jjbitVec6[i2] & l2) != (ulong) 0L);
+
default:
if ((jjbitVec0[i1] & l1) != (ulong) 0L)
return true;
@@ -1222,10 +1227,10 @@
{
case 215:
- return ((jjbitVec7[i2] & l2) != (ulong) 0L);
+ return ((jjbitVec8[i2] & l2) != (ulong) 0L);
default:
- if ((jjbitVec6[i1] & l1) != (ulong) 0L)
+ if ((jjbitVec7[i1] & l1) != (ulong) 0L)
return true;
return false;
@@ -1237,10 +1242,13 @@
{
case 0:
- return ((jjbitVec9[i2] & l2) != (ulong) 0L);
+ return ((jjbitVec10[i2] & l2) != (ulong) 0L);
+
+ case 255:
+ return ((jjbitVec11[i2] & l2) != (ulong) 0L);
default:
- if ((jjbitVec8[i1] & l1) != (ulong) 0L)
+ if ((jjbitVec9[i1] & l1) != (ulong) 0L)
return true;
return false;
@@ -1252,23 +1260,23 @@
{
case 6:
- return ((jjbitVec12[i2] & l2) != (ulong) 0L);
+ return ((jjbitVec14[i2] & l2) != (ulong) 0L);
case 11:
- return ((jjbitVec13[i2] & l2) != (ulong) 0L);
+ return ((jjbitVec15[i2] & l2) != (ulong) 0L);
case 13:
- return ((jjbitVec14[i2] & l2) != (ulong) 0L);
+ return ((jjbitVec16[i2] & l2) != (ulong) 0L);
case 14:
- return ((jjbitVec15[i2] & l2) != (ulong) 0L);
+ return ((jjbitVec17[i2] & l2) != (ulong) 0L);
case 16:
- return ((jjbitVec16[i2] & l2) != (ulong) 0L);
+ return ((jjbitVec18[i2] & l2) != (ulong) 0L);
default:
- if ((jjbitVec10[i1] & l1) != (ulong) 0L)
- if ((jjbitVec11[i2] & l2) == (ulong) 0L)
+ if ((jjbitVec12[i1] & l1) != (ulong) 0L)
+ if ((jjbitVec13[i2] & l2) == (ulong) 0L)
return false;
else
return true;
@@ -1282,13 +1290,16 @@
{
case 0:
- return ((jjbitVec9[i2] & l2) != (ulong) 0L);
+ return ((jjbitVec10[i2] & l2) != (ulong) 0L);
case 215:
- return ((jjbitVec7[i2] & l2) != (ulong) 0L);
+ return ((jjbitVec8[i2] & l2) != (ulong) 0L);
+
+ case 255:
+ return ((jjbitVec11[i2] & l2) != (ulong) 0L);
default:
- if ((jjbitVec17[i1] & l1) != (ulong) 0L)
+ if ((jjbitVec19[i1] & l1) != (ulong) 0L)
return true;
return false;
Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/Token.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/Standard/Token.cs?view=diff&rev=534192&r1=534191&r2=534192
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/Token.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/Token.cs Tue May 1 11:45:26 2007
@@ -21,73 +21,73 @@
namespace Lucene.Net.Analysis.Standard
{
- /// <summary> Describes the input token stream.</summary>
+ /// <summary> Describes the input token stream.</summary>
- public class Token
- {
+ public class Token
+ {
- /// <summary> An integer that describes the kind of this token. This numbering
- /// system is determined by JavaCCParser, and a table of these numbers is
- /// stored in the file ...Constants.java.
- /// </summary>
- public int kind;
-
- /// <summary> beginLine and beginColumn describe the position of the first character
- /// of this token; endLine and endColumn describe the position of the
- /// last character of this token.
- /// </summary>
- public int beginLine, beginColumn, endLine, endColumn;
-
- /// <summary> The string image of the token.</summary>
- public System.String image;
-
- /// <summary> A reference to the next regular (non-special) token from the input
- /// stream. If this is the last token from the input stream, or if the
- /// token manager has not read tokens beyond this one, this field is
- /// set to null. This is true only if this token is also a regular
- /// token. Otherwise, see below for a description of the contents of
- /// this field.
- /// </summary>
- public Token next;
-
- /// <summary> This field is used to access special tokens that occur prior to this
- /// token, but after the immediately preceding regular (non-special) token.
- /// If there are no such special tokens, this field is set to null.
- /// When there are more than one such special token, this field refers
- /// to the last of these special tokens, which in turn refers to the next
- /// previous special token through its specialToken field, and so on
- /// until the first special token (whose specialToken field is null).
- /// The next fields of special tokens refer to other special tokens that
- /// immediately follow it (without an intervening regular token). If there
- /// is no such token, this field is null.
- /// </summary>
- public Token specialToken;
-
- /// <summary> Returns the image.</summary>
- public override System.String ToString()
- {
- return image;
- }
-
- /// <summary> Returns a new Token object, by default. However, if you want, you
- /// can create and return subclass objects based on the value of ofKind.
- /// Simply add the cases to the switch for all those special cases.
- /// For example, if you have a subclass of Token called IDToken that
- /// you want to create if ofKind is ID, simlpy add something like :
- ///
- /// case MyParserConstants.ID : return new IDToken();
- ///
- /// to the following switch statement. Then you can cast matchedToken
- /// variable to the appropriate type and use it in your lexical actions.
- /// </summary>
- public static Token NewToken(int ofKind)
- {
- switch (ofKind)
- {
+ /// <summary> An integer that describes the kind of this token. This numbering
+ /// system is determined by JavaCCParser, and a table of these numbers is
+ /// stored in the file ...Constants.java.
+ /// </summary>
+ public int kind;
+
+ /// <summary> beginLine and beginColumn describe the position of the first character
+ /// of this token; endLine and endColumn describe the position of the
+ /// last character of this token.
+ /// </summary>
+ public int beginLine, beginColumn, endLine, endColumn;
+
+ /// <summary> The string image of the token.</summary>
+ public System.String image;
+
+ /// <summary> A reference to the next regular (non-special) token from the input
+ /// stream. If this is the last token from the input stream, or if the
+ /// token manager has not read tokens beyond this one, this field is
+ /// set to null. This is true only if this token is also a regular
+ /// token. Otherwise, see below for a description of the contents of
+ /// this field.
+ /// </summary>
+ public Token next;
+
+ /// <summary> This field is used to access special tokens that occur prior to this
+ /// token, but after the immediately preceding regular (non-special) token.
+ /// If there are no such special tokens, this field is set to null.
+ /// When there are more than one such special token, this field refers
+ /// to the last of these special tokens, which in turn refers to the next
+ /// previous special token through its specialToken field, and so on
+ /// until the first special token (whose specialToken field is null).
+ /// The next fields of special tokens refer to other special tokens that
+ /// immediately follow it (without an intervening regular token). If there
+ /// is no such token, this field is null.
+ /// </summary>
+ public Token specialToken;
+
+ /// <summary> Returns the image.</summary>
+ public override System.String ToString()
+ {
+ return image;
+ }
+
+ /// <summary> Returns a new Token object, by default. However, if you want, you
+ /// can create and return subclass objects based on the value of ofKind.
+ /// Simply add the cases to the switch for all those special cases.
+ /// For example, if you have a subclass of Token called IDToken that
+ /// you want to create if ofKind is ID, simply add something like :
+ ///
+ /// case MyParserConstants.ID : return new IDToken();
+ ///
+ /// to the following switch statement. Then you can cast matchedToken
+ /// variable to the appropriate type and use it in your lexical actions.
+ /// </summary>
+ public static Token NewToken(int ofKind)
+ {
+ switch (ofKind)
+ {
- default: return new Token();
+ default: return new Token();
- }
- }
- }
+ }
+ }
+ }
}
Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/TokenMgrError.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/Standard/TokenMgrError.cs?view=diff&rev=534192&r1=534191&r2=534192
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/TokenMgrError.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/TokenMgrError.cs Tue May 1 11:45:26 2007
@@ -21,141 +21,141 @@
namespace Lucene.Net.Analysis.Standard
{
- [Serializable]
- public class TokenMgrError:System.ApplicationException
- {
- /// <summary> You can also modify the body of this method to customize your error messages.
- /// For example, cases like LOOP_DETECTED and INVALID_LEXICAL_STATE are not
- /// of end-users concern, so you can return something like :
- ///
- /// "Internal Error : Please file a bug report .... "
- ///
- /// from this method for such cases in the release version of your parser.
- /// </summary>
- public override System.String Message
- {
- get
- {
- return base.Message;
- }
+ [Serializable]
+ public class TokenMgrError:System.ApplicationException
+ {
+ /// <summary> You can also modify the body of this method to customize your error messages.
+ /// For example, cases like LOOP_DETECTED and INVALID_LEXICAL_STATE are not
+ /// of end-users concern, so you can return something like :
+ ///
+ /// "Internal Error : Please file a bug report .... "
+ ///
+ /// from this method for such cases in the release version of your parser.
+ /// </summary>
+ public override System.String Message
+ {
+ get
+ {
+ return base.Message;
+ }
- }
- /*
- * Ordinals for various reasons why an Error of this type can be thrown.
- */
-
- /// <summary> Lexical error occured.</summary>
- internal const int LEXICAL_ERROR = 0;
-
- /// <summary> An attempt wass made to create a second instance of a static token manager.</summary>
- internal const int STATIC_LEXER_ERROR = 1;
-
- /// <summary> Tried to change to an invalid lexical state.</summary>
- internal const int INVALID_LEXICAL_STATE = 2;
-
- /// <summary> Detected (and bailed out of) an infinite loop in the token manager.</summary>
- internal const int LOOP_DETECTED = 3;
-
- /// <summary> Indicates the reason why the exception is thrown. It will have
- /// one of the above 4 values.
- /// </summary>
- internal int errorCode;
-
- /// <summary> Replaces unprintable characters by their espaced (or unicode escaped)
- /// equivalents in the given string
- /// </summary>
- protected internal static System.String addEscapes(System.String str)
- {
- System.Text.StringBuilder retval = new System.Text.StringBuilder();
- char ch;
- for (int i = 0; i < str.Length; i++)
- {
- switch (str[i])
- {
-
- case (char) (0):
- continue;
-
- case '\b':
- retval.Append("\\b");
- continue;
-
- case '\t':
- retval.Append("\\t");
- continue;
-
- case '\n':
- retval.Append("\\n");
- continue;
-
- case '\f':
- retval.Append("\\f");
- continue;
-
- case '\r':
- retval.Append("\\r");
- continue;
-
- case '\"':
- retval.Append("\\\"");
- continue;
-
- case '\'':
- retval.Append("\\\'");
- continue;
-
- case '\\':
- retval.Append("\\\\");
- continue;
-
- default:
- if ((ch = str[i]) < 0x20 || ch > 0x7e)
- {
- System.String s = "0000" + System.Convert.ToString(ch, 16);
- retval.Append("\\u" + s.Substring(s.Length - 4, (s.Length) - (s.Length - 4)));
- }
- else
- {
- retval.Append(ch);
- }
- continue;
-
- }
- }
- return retval.ToString();
- }
-
- /// <summary> Returns a detailed message for the Error when it is thrown by the
- /// token manager to indicate a lexical error.
- /// Parameters :
- /// EOFSeen : indicates if EOF caused the lexicl error
- /// curLexState : lexical state in which this error occured
- /// errorLine : line number when the error occured
- /// errorColumn : column number when the error occured
- /// errorAfter : prefix that was seen before this error occured
- /// curchar : the offending character
- /// Note: You can customize the lexical error message by modifying this method.
- /// </summary>
- protected internal static System.String LexicalError(bool EOFSeen, int lexState, int errorLine, int errorColumn, System.String errorAfter, char curChar)
- {
- return ("Lexical error at line " + errorLine + ", column " + errorColumn + ". Encountered: " + (EOFSeen?"<EOF> ":("\"" + addEscapes(System.Convert.ToString(curChar)) + "\"") + " (" + (int) curChar + "), ") + "after : \"" + addEscapes(errorAfter) + "\"");
- }
-
- /*
- * Constructors of various flavors follow.
- */
-
- public TokenMgrError()
- {
- }
-
- public TokenMgrError(System.String message, int reason):base(message)
- {
- errorCode = reason;
- }
-
- public TokenMgrError(bool EOFSeen, int lexState, int errorLine, int errorColumn, System.String errorAfter, char curChar, int reason):this(LexicalError(EOFSeen, lexState, errorLine, errorColumn, errorAfter, curChar), reason)
- {
- }
- }
+ }
+ /*
+ * Ordinals for various reasons why an Error of this type can be thrown.
+ */
+
+ /// <summary> Lexical error occurred.</summary>
+ internal const int LEXICAL_ERROR = 0;
+
+ /// <summary> An attempt was made to create a second instance of a static token manager.</summary>
+ internal const int STATIC_LEXER_ERROR = 1;
+
+ /// <summary> Tried to change to an invalid lexical state.</summary>
+ internal const int INVALID_LEXICAL_STATE = 2;
+
+ /// <summary> Detected (and bailed out of) an infinite loop in the token manager.</summary>
+ internal const int LOOP_DETECTED = 3;
+
+ /// <summary> Indicates the reason why the exception is thrown. It will have
+ /// one of the above 4 values.
+ /// </summary>
+ internal int errorCode;
+
+ /// <summary> Replaces unprintable characters by their escaped (or unicode escaped)
+ /// equivalents in the given string
+ /// </summary>
+ protected internal static System.String addEscapes(System.String str)
+ {
+ System.Text.StringBuilder retval = new System.Text.StringBuilder();
+ char ch;
+ for (int i = 0; i < str.Length; i++)
+ {
+ switch (str[i])
+ {
+
+ case (char) (0):
+ continue;
+
+ case '\b':
+ retval.Append("\\b");
+ continue;
+
+ case '\t':
+ retval.Append("\\t");
+ continue;
+
+ case '\n':
+ retval.Append("\\n");
+ continue;
+
+ case '\f':
+ retval.Append("\\f");
+ continue;
+
+ case '\r':
+ retval.Append("\\r");
+ continue;
+
+ case '\"':
+ retval.Append("\\\"");
+ continue;
+
+ case '\'':
+ retval.Append("\\\'");
+ continue;
+
+ case '\\':
+ retval.Append("\\\\");
+ continue;
+
+ default:
+ if ((ch = str[i]) < 0x20 || ch > 0x7e)
+ {
+ System.String s = "0000" + System.Convert.ToString(ch, 16);
+ retval.Append("\\u" + s.Substring(s.Length - 4, (s.Length) - (s.Length - 4)));
+ }
+ else
+ {
+ retval.Append(ch);
+ }
+ continue;
+
+ }
+ }
+ return retval.ToString();
+ }
+
+ /// <summary> Returns a detailed message for the Error when it is thrown by the
+ /// token manager to indicate a lexical error.
+ /// Parameters :
+ /// EOFSeen : indicates if EOF caused the lexical error
+ /// curLexState : lexical state in which this error occurred
+ /// errorLine : line number when the error occurred
+ /// errorColumn : column number when the error occurred
+ /// errorAfter : prefix that was seen before this error occurred
+ /// curchar : the offending character
+ /// Note: You can customize the lexical error message by modifying this method.
+ /// </summary>
+ protected internal static System.String LexicalError(bool EOFSeen, int lexState, int errorLine, int errorColumn, System.String errorAfter, char curChar)
+ {
+ return ("Lexical error at line " + errorLine + ", column " + errorColumn + ". Encountered: " + (EOFSeen?"<EOF> ":("\"" + addEscapes(System.Convert.ToString(curChar)) + "\"") + " (" + (int) curChar + "), ") + "after : \"" + addEscapes(errorAfter) + "\"");
+ }
+
+ /*
+ * Constructors of various flavors follow.
+ */
+
+ public TokenMgrError()
+ {
+ }
+
+ public TokenMgrError(System.String message, int reason):base(message)
+ {
+ errorCode = reason;
+ }
+
+ public TokenMgrError(bool EOFSeen, int lexState, int errorLine, int errorColumn, System.String errorAfter, char curChar, int reason):this(LexicalError(EOFSeen, lexState, errorLine, errorColumn, errorAfter, curChar), reason)
+ {
+ }
+ }
}
Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/StopAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/StopAnalyzer.cs?view=diff&rev=534192&r1=534191&r2=534192
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/StopAnalyzer.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/StopAnalyzer.cs Tue May 1 11:45:26 2007
@@ -20,55 +20,55 @@
namespace Lucene.Net.Analysis
{
- /// <summary>Filters LetterTokenizer with LowerCaseFilter and StopFilter. </summary>
+ /// <summary>Filters LetterTokenizer with LowerCaseFilter and StopFilter. </summary>
- public sealed class StopAnalyzer : Analyzer
- {
- private System.Collections.Hashtable stopWords;
-
- /// <summary>An array containing some common English words that are not usually useful
- /// for searching.
- /// </summary>
- public static readonly System.String[] ENGLISH_STOP_WORDS = new System.String[]{"a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "no", "not", "of", "on", "or", "s", "such", "t", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"};
-
- /// <summary>Builds an analyzer which removes words in ENGLISH_STOP_WORDS. </summary>
- public StopAnalyzer()
- {
- stopWords = StopFilter.MakeStopSet(ENGLISH_STOP_WORDS);
- }
-
- /// <summary>Builds an analyzer with the stop words from the given set.</summary>
- public StopAnalyzer(System.Collections.Hashtable stopWords)
- {
- this.stopWords = stopWords;
- }
-
- /// <summary>Builds an analyzer which removes words in the provided array. </summary>
- public StopAnalyzer(System.String[] stopWords)
- {
- this.stopWords = StopFilter.MakeStopSet(stopWords);
- }
-
- /// <summary>Builds an analyzer with the stop words from the given file.</summary>
- /// <seealso cref="WordlistLoader.GetWordSet(File)">
- /// </seealso>
- public StopAnalyzer(System.IO.FileInfo stopwordsFile)
- {
- stopWords = WordlistLoader.GetWordSet(stopwordsFile);
- }
-
- /// <summary>Builds an analyzer with the stop words from the given reader.</summary>
- /// <seealso cref="WordlistLoader.GetWordSet(Reader)">
- /// </seealso>
- public StopAnalyzer(System.IO.TextReader stopwords)
- {
- stopWords = WordlistLoader.GetWordSet(stopwords);
- }
-
- /// <summary>Filters LowerCaseTokenizer with StopFilter. </summary>
- public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
- {
- return new StopFilter(new LowerCaseTokenizer(reader), stopWords);
- }
- }
+ public sealed class StopAnalyzer : Analyzer
+ {
+ private System.Collections.Hashtable stopWords;
+
+ /// <summary>An array containing some common English words that are not usually useful
+ /// for searching.
+ /// </summary>
+ public static readonly System.String[] ENGLISH_STOP_WORDS = new System.String[]{"a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "no", "not", "of", "on", "or", "such", "t", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"};
+
+ /// <summary>Builds an analyzer which removes words in ENGLISH_STOP_WORDS. </summary>
+ public StopAnalyzer()
+ {
+ stopWords = StopFilter.MakeStopSet(ENGLISH_STOP_WORDS);
+ }
+
+ /// <summary>Builds an analyzer with the stop words from the given set.</summary>
+ public StopAnalyzer(System.Collections.Hashtable stopWords)
+ {
+ this.stopWords = stopWords;
+ }
+
+ /// <summary>Builds an analyzer which removes words in the provided array. </summary>
+ public StopAnalyzer(System.String[] stopWords)
+ {
+ this.stopWords = StopFilter.MakeStopSet(stopWords);
+ }
+
+ /// <summary>Builds an analyzer with the stop words from the given file.</summary>
+ /// <seealso cref="WordlistLoader.GetWordSet(File)">
+ /// </seealso>
+ public StopAnalyzer(System.IO.FileInfo stopwordsFile)
+ {
+ stopWords = WordlistLoader.GetWordSet(stopwordsFile);
+ }
+
+ /// <summary>Builds an analyzer with the stop words from the given reader.</summary>
+ /// <seealso cref="WordlistLoader.GetWordSet(Reader)">
+ /// </seealso>
+ public StopAnalyzer(System.IO.TextReader stopwords)
+ {
+ stopWords = WordlistLoader.GetWordSet(stopwords);
+ }
+
+ /// <summary>Filters LowerCaseTokenizer with StopFilter. </summary>
+ public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
+ {
+ return new StopFilter(new LowerCaseTokenizer(reader), stopWords);
+ }
+ }
}
Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/StopFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/StopFilter.cs?view=diff&rev=534192&r1=534191&r2=534192
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/StopFilter.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/StopFilter.cs Tue May 1 11:45:26 2007
@@ -20,97 +20,97 @@
namespace Lucene.Net.Analysis
{
- /// <summary> Removes stop words from a token stream.</summary>
+ /// <summary> Removes stop words from a token stream.</summary>
- public sealed class StopFilter : TokenFilter
- {
+ public sealed class StopFilter : TokenFilter
+ {
- private System.Collections.Hashtable stopWords;
- private bool ignoreCase;
+ private System.Collections.Hashtable stopWords;
+ private bool ignoreCase;
- /// <summary> Construct a token stream filtering the given input.</summary>
- public StopFilter(TokenStream input, System.String[] stopWords) : this(input, stopWords, false)
- {
- }
-
- /// <summary> Constructs a filter which removes words from the input
- /// TokenStream that are named in the array of words.
- /// </summary>
- public StopFilter(TokenStream in_Renamed, System.String[] stopWords, bool ignoreCase) : base(in_Renamed)
- {
- this.ignoreCase = ignoreCase;
- this.stopWords = MakeStopSet(stopWords, ignoreCase);
- }
+ /// <summary> Construct a token stream filtering the given input.</summary>
+ public StopFilter(TokenStream input, System.String[] stopWords) : this(input, stopWords, false)
+ {
+ }
+
+ /// <summary> Constructs a filter which removes words from the input
+ /// TokenStream that are named in the array of words.
+ /// </summary>
+ public StopFilter(TokenStream in_Renamed, System.String[] stopWords, bool ignoreCase) : base(in_Renamed)
+ {
+ this.ignoreCase = ignoreCase;
+ this.stopWords = MakeStopSet(stopWords, ignoreCase);
+ }
- /// <summary> Construct a token stream filtering the given input.</summary>
- /// <param name="input">
- /// </param>
- /// <param name="stopWords">The set of Stop Words, as Strings. If ignoreCase is true, all strings should be lower cased
- /// </param>
- /// <param name="ignoreCase">-Ignore case when stopping. The stopWords set must be setup to contain only lower case words
- /// </param>
- public StopFilter(TokenStream input, System.Collections.Hashtable stopWords, bool ignoreCase) : base(input)
- {
- this.ignoreCase = ignoreCase;
- this.stopWords = stopWords;
- }
-
- /// <summary> Constructs a filter which removes words from the input
- /// TokenStream that are named in the Set.
- /// It is crucial that an efficient Set implementation is used
- /// for maximum performance.
- ///
- /// </summary>
- /// <seealso cref="MakeStopSet(String[])">
- /// </seealso>
- public StopFilter(TokenStream in_Renamed, System.Collections.Hashtable stopWords) : this(in_Renamed, stopWords, false)
- {
- }
-
- /// <summary> Builds a Set from an array of stop words,
- /// appropriate for passing into the StopFilter constructor.
- /// This permits this stopWords construction to be cached once when
- /// an Analyzer is constructed.
- ///
- /// </summary>
- /// <seealso cref="MakeStopSet(String[], boolean) passing false to ignoreCase">
- /// </seealso>
- public static System.Collections.Hashtable MakeStopSet(System.String[] stopWords)
- {
- return MakeStopSet(stopWords, false);
- }
-
- /// <summary> </summary>
- /// <param name="stopWords">
- /// </param>
- /// <param name="ignoreCase">If true, all words are lower cased first.
- /// </param>
- /// <returns> a Set containing the words
- /// </returns>
- public static System.Collections.Hashtable MakeStopSet(System.String[] stopWords, bool ignoreCase)
- {
- System.Collections.Hashtable stopTable = new System.Collections.Hashtable(stopWords.Length);
+ /// <summary> Construct a token stream filtering the given input.</summary>
+ /// <param name="input">
+ /// </param>
+ /// <param name="stopWords">The set of Stop Words, as Strings. If ignoreCase is true, all strings should be lower cased
+ /// </param>
+ /// <param name="ignoreCase">-Ignore case when stopping. The stopWords set must be setup to contain only lower case words
+ /// </param>
+ public StopFilter(TokenStream input, System.Collections.Hashtable stopWords, bool ignoreCase) : base(input)
+ {
+ this.ignoreCase = ignoreCase;
+ this.stopWords = stopWords;
+ }
+
+ /// <summary> Constructs a filter which removes words from the input
+ /// TokenStream that are named in the Set.
+ /// It is crucial that an efficient Set implementation is used
+ /// for maximum performance.
+ ///
+ /// </summary>
+ /// <seealso cref="MakeStopSet(String[])">
+ /// </seealso>
+ public StopFilter(TokenStream in_Renamed, System.Collections.Hashtable stopWords) : this(in_Renamed, stopWords, false)
+ {
+ }
+
+ /// <summary> Builds a Set from an array of stop words,
+ /// appropriate for passing into the StopFilter constructor.
+ /// This permits this stopWords construction to be cached once when
+ /// an Analyzer is constructed.
+ ///
+ /// </summary>
+ /// <seealso cref="MakeStopSet(String[], boolean) passing false to ignoreCase">
+ /// </seealso>
+ public static System.Collections.Hashtable MakeStopSet(System.String[] stopWords)
+ {
+ return MakeStopSet(stopWords, false);
+ }
+
+ /// <summary> </summary>
+ /// <param name="stopWords">
+ /// </param>
+ /// <param name="ignoreCase">If true, all words are lower cased first.
+ /// </param>
+ /// <returns> a Set containing the words
+ /// </returns>
+ public static System.Collections.Hashtable MakeStopSet(System.String[] stopWords, bool ignoreCase)
+ {
+ System.Collections.Hashtable stopTable = new System.Collections.Hashtable(stopWords.Length);
for (int i = 0; i < stopWords.Length; i++)
{
System.String tmp = ignoreCase ? stopWords[i].ToLower() : stopWords[i];
stopTable.Add(tmp, tmp);
}
- return stopTable;
- }
+ return stopTable;
+ }
- /// <summary> Returns the next input Token whose termText() is not a stop word.</summary>
- public override Token Next()
- {
- // return the first non-stop word found
- for (Token token = input.Next(); token != null; token = input.Next())
- {
- System.String termText = ignoreCase ? token.termText.ToLower() : token.termText;
- if (!stopWords.Contains(termText))
- return token;
- }
- // reached EOS -- return null
- return null;
- }
- }
+ /// <summary> Returns the next input Token whose termText() is not a stop word.</summary>
+ public override Token Next()
+ {
+ // return the first non-stop word found
+ for (Token token = input.Next(); token != null; token = input.Next())
+ {
+ System.String termText = ignoreCase ? token.termText.ToLower() : token.termText;
+ if (!stopWords.Contains(termText))
+ return token;
+ }
+ // reached EOS -- return null
+ return null;
+ }
+ }
}
Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Token.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/Token.cs?view=diff&rev=534192&r1=534191&r2=534192
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Token.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Token.cs Tue May 1 11:45:26 2007
@@ -20,129 +20,147 @@
namespace Lucene.Net.Analysis
{
- /// <summary>A Token is an occurence of a term from the text of a field. It consists of
- /// a term's text, the start and end offset of the term in the text of the field,
- /// and a type string.
- /// The start and end offsets permit applications to re-associate a token with
- /// its source text, e.g., to display highlighted query terms in a document
- /// browser, or to show matching text fragments in a KWIC (KeyWord In Context)
- /// display, etc.
- /// The type is an interned string, assigned by a lexical analyzer
- /// (a.k.a. tokenizer), naming the lexical or syntactic class that the token
- /// belongs to. For example an end of sentence marker token might be implemented
- /// with type "eos". The default token type is "word".
- /// </summary>
+ /// <summary>A Token is an occurrence of a term from the text of a field. It consists of
+ /// a term's text, the start and end offset of the term in the text of the field,
+ /// and a type string.
+ /// The start and end offsets permit applications to re-associate a token with
+ /// its source text, e.g., to display highlighted query terms in a document
+ /// browser, or to show matching text fragments in a KWIC (KeyWord In Context)
+ /// display, etc.
+ /// The type is an interned string, assigned by a lexical analyzer
+ /// (a.k.a. tokenizer), naming the lexical or syntactic class that the token
+ /// belongs to. For example an end of sentence marker token might be implemented
+ /// with type "eos". The default token type is "word".
+ /// </summary>
- public sealed class Token
- {
- internal System.String termText; // the text of the term
- internal int startOffset; // start in source text
- internal int endOffset; // end in source text
- internal System.String type = "word"; // lexical type
-
- private int positionIncrement = 1;
-
- /// <summary>Constructs a Token with the given term text, and start & end offsets.
- /// The type defaults to "word."
- /// </summary>
- public Token(System.String text, int start, int end)
- {
- termText = text;
- startOffset = start;
- endOffset = end;
- }
-
- /// <summary>Constructs a Token with the given text, start and end offsets, & type. </summary>
- public Token(System.String text, int start, int end, System.String typ)
- {
- termText = text;
- startOffset = start;
- endOffset = end;
- type = typ;
- }
-
- /// <summary>Set the position increment. This determines the position of this token
- /// relative to the previous Token in a {@link TokenStream}, used in phrase
- /// searching.
- ///
- /// <p>The default value is one.
- ///
- /// <p>Some common uses for this are:<ul>
- ///
- /// <li>Set it to zero to put multiple terms in the same position. This is
- /// useful if, e.g., a word has multiple stems. Searches for phrases
- /// including either stem will match. In this case, all but the first stem's
- /// increment should be set to zero: the increment of the first instance
- /// should be one. Repeating a token with an increment of zero can also be
- /// used to boost the scores of matches on that token.
- ///
- /// <li>Set it to values greater than one to inhibit exact phrase matches.
- /// If, for example, one does not want phrases to match across removed stop
- /// words, then one could build a stop word filter that removes stop words and
- /// also sets the increment to the number of stop words removed before each
- /// non-stop word. Then exact phrase queries will only match when the terms
- /// occur with no intervening stop words.
- ///
- /// </ul>
- /// </summary>
- /// <seealso cref="Lucene.Net.index.TermPositions">
- /// </seealso>
- public void SetPositionIncrement(int positionIncrement)
- {
- if (positionIncrement < 0)
- throw new System.ArgumentException("Increment must be zero or greater: " + positionIncrement);
- this.positionIncrement = positionIncrement;
- }
-
- /// <summary>Returns the position increment of this Token.</summary>
- /// <seealso cref="setPositionIncrement">
- /// </seealso>
- public int GetPositionIncrement()
- {
- return positionIncrement;
- }
-
- /// <summary>Returns the Token's term text. </summary>
- public System.String TermText()
- {
- return termText;
- }
-
- /// <summary>Returns this Token's starting offset, the position of the first character
- /// corresponding to this token in the source text.
- /// Note that the difference between endOffset() and startOffset() may not be
- /// equal to termText.length(), as the term text may have been altered by a
- /// stemmer or some other filter.
- /// </summary>
- public int StartOffset()
- {
- return startOffset;
- }
-
- /// <summary>Returns this Token's ending offset, one greater than the position of the
- /// last character corresponding to this token in the source text.
- /// </summary>
- public int EndOffset()
- {
- return endOffset;
- }
-
- /// <summary>Returns this Token's lexical type. Defaults to "word". </summary>
- public System.String Type()
- {
- return type;
- }
-
- public override System.String ToString()
- {
- System.Text.StringBuilder sb = new System.Text.StringBuilder();
- sb.Append("(" + termText + "," + startOffset + "," + endOffset);
- if (!type.Equals("word"))
- sb.Append(",type=" + type);
- if (positionIncrement != 1)
- sb.Append(",posIncr=" + positionIncrement);
- sb.Append(")");
- return sb.ToString();
- }
- }
+ public class Token : System.ICloneable
+ {
+ internal System.String termText; // the text of the term
+ internal int startOffset; // start in source text
+ internal int endOffset; // end in source text
+ internal System.String type = "word"; // lexical type
+
+ private int positionIncrement = 1;
+
+ /// <summary>Constructs a Token with the given term text, and start & end offsets.
+ /// The type defaults to "word."
+ /// </summary>
+ public Token(System.String text, int start, int end)
+ {
+ termText = text;
+ startOffset = start;
+ endOffset = end;
+ }
+
+ /// <summary>Constructs a Token with the given text, start and end offsets, & type. </summary>
+ public Token(System.String text, int start, int end, System.String typ)
+ {
+ termText = text;
+ startOffset = start;
+ endOffset = end;
+ type = typ;
+ }
+
+ /// <summary>Set the position increment. This determines the position of this token
+ /// relative to the previous Token in a {@link TokenStream}, used in phrase
+ /// searching.
+ ///
+ /// <p>The default value is one.
+ ///
+ /// <p>Some common uses for this are:<ul>
+ ///
+ /// <li>Set it to zero to put multiple terms in the same position. This is
+ /// useful if, e.g., a word has multiple stems. Searches for phrases
+ /// including either stem will match. In this case, all but the first stem's
+ /// increment should be set to zero: the increment of the first instance
+ /// should be one. Repeating a token with an increment of zero can also be
+ /// used to boost the scores of matches on that token.
+ ///
+ /// <li>Set it to values greater than one to inhibit exact phrase matches.
+ /// If, for example, one does not want phrases to match across removed stop
+ /// words, then one could build a stop word filter that removes stop words and
+ /// also sets the increment to the number of stop words removed before each
+ /// non-stop word. Then exact phrase queries will only match when the terms
+ /// occur with no intervening stop words.
+ ///
+ /// </ul>
+ /// </summary>
+ /// <seealso cref="Lucene.Net.index.TermPositions">
+ /// </seealso>
+ public void SetPositionIncrement(int positionIncrement)
+ {
+ if (positionIncrement < 0)
+ throw new System.ArgumentException("Increment must be zero or greater: " + positionIncrement);
+ this.positionIncrement = positionIncrement;
+ }
+
+ /// <summary>Returns the position increment of this Token.</summary>
+ /// <seealso cref="setPositionIncrement">
+ /// </seealso>
+ public int GetPositionIncrement()
+ {
+ return positionIncrement;
+ }
+
+ /// <summary>Sets the Token's term text. </summary>
+ public virtual void SetTermText(System.String text)
+ {
+ termText = text;
+ }
+
+ /// <summary>Returns the Token's term text. </summary>
+ public System.String TermText()
+ {
+ return termText;
+ }
+
+ /// <summary>Returns this Token's starting offset, the position of the first character
+ /// corresponding to this token in the source text.
+ /// Note that the difference between endOffset() and startOffset() may not be
+ /// equal to termText.length(), as the term text may have been altered by a
+ /// stemmer or some other filter.
+ /// </summary>
+ public int StartOffset()
+ {
+ return startOffset;
+ }
+
+ /// <summary>Returns this Token's ending offset, one greater than the position of the
+ /// last character corresponding to this token in the source text.
+ /// </summary>
+ public int EndOffset()
+ {
+ return endOffset;
+ }
+
+ /// <summary>Returns this Token's lexical type. Defaults to "word". </summary>
+ public System.String Type()
+ {
+ return type;
+ }
+
+ public override System.String ToString()
+ {
+ System.Text.StringBuilder sb = new System.Text.StringBuilder();
+ sb.Append("(" + termText + "," + startOffset + "," + endOffset);
+ if (!type.Equals("word"))
+ sb.Append(",type=" + type);
+ if (positionIncrement != 1)
+ sb.Append(",posIncr=" + positionIncrement);
+ sb.Append(")");
+ return sb.ToString();
+ }
+
+ public virtual System.Object Clone()
+ {
+ try
+ {
+ return base.MemberwiseClone();
+ }
+ catch (System.Exception e)
+ {
+ throw new System.SystemException("", e); // shouldn't happen since we implement System.ICloneable
+ }
+ }
+ }
}
Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/TokenFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/TokenFilter.cs?view=diff&rev=534192&r1=534191&r2=534192
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/TokenFilter.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/TokenFilter.cs Tue May 1 11:45:26 2007
@@ -20,26 +20,26 @@
namespace Lucene.Net.Analysis
{
- /// <summary>A TokenFilter is a TokenStream whose input is another token stream.
- /// <p>
- /// This is an abstract class.
- /// </summary>
+ /// <summary>A TokenFilter is a TokenStream whose input is another token stream.
+ /// <p>
+ /// This is an abstract class.
+ /// </summary>
- public abstract class TokenFilter : TokenStream
- {
- /// <summary>The source of tokens for this filter. </summary>
- protected internal TokenStream input;
+ public abstract class TokenFilter : TokenStream
+ {
+ /// <summary>The source of tokens for this filter. </summary>
+ protected internal TokenStream input;
- /// <summary>Construct a token stream filtering the given input. </summary>
- protected internal TokenFilter(TokenStream input)
- {
- this.input = input;
- }
+ /// <summary>Construct a token stream filtering the given input. </summary>
+ protected internal TokenFilter(TokenStream input)
+ {
+ this.input = input;
+ }
- /// <summary>Close the input TokenStream. </summary>
- public override void Close()
- {
- input.Close();
- }
- }
+ /// <summary>Close the input TokenStream. </summary>
+ public override void Close()
+ {
+ input.Close();
+ }
+ }
}
Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/TokenStream.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/TokenStream.cs?view=diff&rev=534192&r1=534191&r2=534192
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/TokenStream.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/TokenStream.cs Tue May 1 11:45:26 2007
@@ -20,26 +20,26 @@
namespace Lucene.Net.Analysis
{
- /// <summary>A TokenStream enumerates the sequence of tokens, either from
- /// fields of a document or from query text.
- /// <p>
- /// This is an abstract class. Concrete subclasses are:
- /// <ul>
- /// <li>{@link Tokenizer}, a TokenStream
- /// whose input is a Reader; and
- /// <li>{@link TokenFilter}, a TokenStream
- /// whose input is another TokenStream.
- /// </ul>
- /// </summary>
+ /// <summary>A TokenStream enumerates the sequence of tokens, either from
+ /// fields of a document or from query text.
+ /// <p>
+ /// This is an abstract class. Concrete subclasses are:
+ /// <ul>
+ /// <li>{@link Tokenizer}, a TokenStream
+ /// whose input is a Reader; and
+ /// <li>{@link TokenFilter}, a TokenStream
+ /// whose input is another TokenStream.
+ /// </ul>
+ /// </summary>
- public abstract class TokenStream
- {
- /// <summary>Returns the next token in the stream, or null at EOS. </summary>
- public abstract Token Next();
+ public abstract class TokenStream
+ {
+ /// <summary>Returns the next token in the stream, or null at EOS. </summary>
+ public abstract Token Next();
- /// <summary>Releases resources associated with this stream. </summary>
- public virtual void Close()
- {
- }
- }
+ /// <summary>Releases resources associated with this stream. </summary>
+ public virtual void Close()
+ {
+ }
+ }
}
Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Tokenizer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/Tokenizer.cs?view=diff&rev=534192&r1=534191&r2=534192
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Tokenizer.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Tokenizer.cs Tue May 1 11:45:26 2007
@@ -20,34 +20,34 @@
namespace Lucene.Net.Analysis
{
- /// <summary>A Tokenizer is a TokenStream whose input is a Reader.
- /// <p>
- /// This is an abstract class.
- /// </summary>
+ /// <summary>A Tokenizer is a TokenStream whose input is a Reader.
+ /// <p>
+ /// This is an abstract class.
+ /// </summary>
- public abstract class Tokenizer : TokenStream
- {
- /// <summary>The text source for this Tokenizer. </summary>
- protected internal System.IO.TextReader input;
+ public abstract class Tokenizer : TokenStream
+ {
+ /// <summary>The text source for this Tokenizer. </summary>
+ protected internal System.IO.TextReader input;
- /// <summary>Construct a tokenizer with null input. </summary>
- protected internal Tokenizer()
- {
- }
+ /// <summary>Construct a tokenizer with null input. </summary>
+ protected internal Tokenizer()
+ {
+ }
- /// <summary>Construct a token stream processing the given input. </summary>
- protected internal Tokenizer(System.IO.TextReader input)
- {
- this.input = input;
- }
+ /// <summary>Construct a token stream processing the given input. </summary>
+ protected internal Tokenizer(System.IO.TextReader input)
+ {
+ this.input = input;
+ }
- /// <summary>By default, closes the input Reader. </summary>
- public override void Close()
- {
- if (input != null)
- {
- input.Close();
- }
- }
- }
+ /// <summary>By default, closes the input Reader. </summary>
+ public override void Close()
+ {
+ if (input != null)
+ {
+ input.Close();
+ }
+ }
+ }
}
Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/WhitespaceAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/WhitespaceAnalyzer.cs?view=diff&rev=534192&r1=534191&r2=534192
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/WhitespaceAnalyzer.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/WhitespaceAnalyzer.cs Tue May 1 11:45:26 2007
@@ -20,13 +20,13 @@
namespace Lucene.Net.Analysis
{
- /// <summary>An Analyzer that uses WhitespaceTokenizer. </summary>
+ /// <summary>An Analyzer that uses WhitespaceTokenizer. </summary>
- public sealed class WhitespaceAnalyzer : Analyzer
- {
- public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
- {
- return new WhitespaceTokenizer(reader);
- }
- }
+ public sealed class WhitespaceAnalyzer : Analyzer
+ {
+ public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
+ {
+ return new WhitespaceTokenizer(reader);
+ }
+ }
}
Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/WhitespaceTokenizer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/WhitespaceTokenizer.cs?view=diff&rev=534192&r1=534191&r2=534192
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/WhitespaceTokenizer.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/WhitespaceTokenizer.cs Tue May 1 11:45:26 2007
@@ -20,23 +20,23 @@
namespace Lucene.Net.Analysis
{
- /// <summary>A WhitespaceTokenizer is a tokenizer that divides text at whitespace.
- /// Adjacent sequences of non-Whitespace characters form tokens.
- /// </summary>
+ /// <summary>A WhitespaceTokenizer is a tokenizer that divides text at whitespace.
+ /// Adjacent sequences of non-Whitespace characters form tokens.
+ /// </summary>
- public class WhitespaceTokenizer : CharTokenizer
- {
- /// <summary>Construct a new WhitespaceTokenizer. </summary>
- public WhitespaceTokenizer(System.IO.TextReader in_Renamed) : base(in_Renamed)
- {
- }
+ public class WhitespaceTokenizer : CharTokenizer
+ {
+ /// <summary>Construct a new WhitespaceTokenizer. </summary>
+ public WhitespaceTokenizer(System.IO.TextReader in_Renamed) : base(in_Renamed)
+ {
+ }
- /// <summary>Collects only characters which do not satisfy
- /// {@link Character#isWhitespace(char)}.
- /// </summary>
- protected internal override bool IsTokenChar(char c)
- {
- return !System.Char.IsWhiteSpace(c);
- }
- }
+ /// <summary>Collects only characters which do not satisfy
+ /// {@link Character#isWhitespace(char)}.
+ /// </summary>
+ protected internal override bool IsTokenChar(char c)
+ {
+ return !System.Char.IsWhiteSpace(c);
+ }
+ }
}
Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/WordlistLoader.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/WordlistLoader.cs?view=diff&rev=534192&r1=534191&r2=534192
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/WordlistLoader.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/WordlistLoader.cs Tue May 1 11:45:26 2007
@@ -20,91 +20,110 @@
namespace Lucene.Net.Analysis
{
- /// <summary> Loader for text files that represent a list of stopwords.
- ///
- /// </summary>
- /// <author> Gerhard Schwarz
- /// </author>
- /// <version> $Id: WordlistLoader.java 192989 2005-06-22 19:59:03Z dnaber $
- /// </version>
- public class WordlistLoader
- {
+ /// <summary> Loader for text files that represent a list of stopwords.
+ ///
+ /// </summary>
+ /// <author> Gerhard Schwarz
+ /// </author>
+ /// <version> $Id: WordlistLoader.java 192989 2005-06-22 19:59:03Z dnaber $
+ /// </version>
+ public class WordlistLoader
+ {
- /// <summary> Loads a text file and adds every line as an entry to a HashSet (omitting
- /// leading and trailing whitespace). Every line of the file should contain only
- /// one word. The words need to be in lowercase if you make use of an
- /// Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
- ///
- /// </summary>
- /// <param name="wordfile">File containing the wordlist
- /// </param>
- /// <returns> A HashSet with the file's words
- /// </returns>
- public static System.Collections.Hashtable GetWordSet(System.IO.FileInfo wordfile)
- {
- System.Collections.Hashtable result = new System.Collections.Hashtable();
- System.IO.TextReader reader = null;
- try
- {
- reader = new System.IO.StreamReader(wordfile.FullName, System.Text.Encoding.Default);
- result = GetWordSet(reader);
- }
- finally
- {
- if (reader != null)
- reader.Close();
- }
- return result;
- }
+ /// <summary> Loads a text file and adds every line as an entry to a HashSet (omitting
+ /// leading and trailing whitespace). Every line of the file should contain only
+ /// one word. The words need to be in lowercase if you make use of an
+ /// Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
+ ///
+ /// </summary>
+ /// <param name="wordfile">File containing the wordlist
+ /// </param>
+ /// <returns> A HashSet with the file's words
+ /// </returns>
+ public static System.Collections.Hashtable GetWordSet(System.IO.FileInfo wordfile)
+ {
+ System.Collections.Hashtable result = new System.Collections.Hashtable();
+ System.IO.TextReader reader = null;
+ try
+ {
+ reader = new System.IO.StreamReader(wordfile.FullName, System.Text.Encoding.Default);
+ result = GetWordSet(reader);
+ }
+ finally
+ {
+ if (reader != null)
+ reader.Close();
+ }
+ return result;
+ }
- /// <summary> Reads lines from a Reader and adds every line as an entry to a HashSet (omitting
- /// leading and trailing whitespace). Every line of the Reader should contain only
- /// one word. The words need to be in lowercase if you make use of an
- /// Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
- ///
- /// </summary>
- /// <param name="reader">Reader containing the wordlist
- /// </param>
- /// <returns> A HashSet with the reader's words
- /// </returns>
- public static System.Collections.Hashtable GetWordSet(System.IO.TextReader reader)
- {
- System.Collections.Hashtable result = new System.Collections.Hashtable();
- System.IO.TextReader br = null;
- try
- {
- br = (System.IO.TextReader) reader;
- System.String word = null;
- while ((word = br.ReadLine()) != null)
- {
+ /// <summary> Reads lines from a Reader and adds every line as an entry to a HashSet (omitting
+ /// leading and trailing whitespace). Every line of the Reader should contain only
+ /// one word. The words need to be in lowercase if you make use of an
+ /// Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
+ ///
+ /// </summary>
+ /// <param name="reader">Reader containing the wordlist
+ /// </param>
+ /// <returns> A HashSet with the reader's words
+ /// </returns>
+ public static System.Collections.Hashtable GetWordSet(System.IO.TextReader reader)
+ {
+ System.Collections.Hashtable result = new System.Collections.Hashtable();
+ System.IO.TextReader br = null;
+ try
+ {
+ br = (System.IO.TextReader) reader;
+ System.String word = null;
+ while ((word = br.ReadLine()) != null)
+ {
System.String tmp = word.Trim();
- result.Add(tmp, tmp);
- }
- }
- finally
- {
- if (br != null)
- br.Close();
- }
- return result;
- }
+ result.Add(tmp, tmp);
+ }
+ }
+ finally
+ {
+ if (br != null)
+ br.Close();
+ }
+ return result;
+ }
-
- /// <summary> Builds a wordlist table, using words as both keys and values
- /// for backward compatibility.
- ///
- /// </summary>
- /// <param name="wordSet"> stopword set
- /// </param>
- private static System.Collections.Hashtable MakeWordTable(System.Collections.Hashtable wordSet)
- {
- System.Collections.Hashtable table = System.Collections.Hashtable.Synchronized(new System.Collections.Hashtable());
- for (System.Collections.IEnumerator iter = wordSet.GetEnumerator(); iter.MoveNext(); )
- {
- System.String word = (System.String) iter.Current;
- table[word] = word;
- }
- return table;
- }
- }
+ /// <summary> Reads a stem dictionary. Each line contains:
+ /// <pre>word<b>\t</b>stem</pre>
+ /// (i.e. two tab separated words)
+ ///
+ /// </summary>
+ /// <returns> stem dictionary that overrules the stemming algorithm
+ /// </returns>
+ /// <throws> IOException </throws>
+ public static System.Collections.Hashtable GetStemDict(System.IO.FileInfo wordstemfile)
+ {
+ if (wordstemfile == null)
+ throw new System.NullReferenceException("wordstemfile may not be null");
+ System.Collections.Hashtable result = new System.Collections.Hashtable();
+ System.IO.StreamReader br = null;
+ System.IO.StreamReader fr = null;
+ try
+ {
+ fr = new System.IO.StreamReader(wordstemfile.FullName, System.Text.Encoding.Default);
+ br = new System.IO.StreamReader(fr.BaseStream, fr.CurrentEncoding);
+ System.String line;
+ char[] tab = {'\t'};
+ while ((line = br.ReadLine()) != null)
+ {
+ System.String[] wordstem = line.Split(tab, 2);
+ result[wordstem[0]] = wordstem[1];
+ }
+ }
+ finally
+ {
+ if (fr != null)
+ fr.Close();
+ if (br != null)
+ br.Close();
+ }
+ return result;
+ }
+ }
}
Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/AssemblyInfo.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/AssemblyInfo.cs?view=diff&rev=534192&r1=534191&r2=534192
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/AssemblyInfo.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/AssemblyInfo.cs Tue May 1 11:45:26 2007
@@ -1,3 +1,20 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
using System.Reflection;
using System.Runtime.CompilerServices;
@@ -7,7 +24,7 @@
// associated with an assembly.
//
[assembly: AssemblyTitle("Apache Lucene.Net")]
-[assembly: AssemblyDescription("The Apache Software Foundation Lucene.Net a full-text search engine library")]
+[assembly: AssemblyDescription("The Apache Software Foundation Lucene.Net text search library")]
[assembly: AssemblyConfiguration("")]
[assembly: AssemblyCompany("The Apache Software Foundation")]
[assembly: AssemblyProduct("Lucene.Net")]
@@ -18,6 +35,7 @@
[assembly: AssemblyInformationalVersionAttribute("2.0")]
+
//
// Version information for an assembly consists of the following four values:
//
@@ -29,7 +47,8 @@
// You can specify all the values or you can default the Revision and Build Numbers
// by using the '*' as shown below:
-[assembly: AssemblyVersion("2.0.0.005")]
+[assembly: AssemblyVersion("2.1.0.001")]
+
//
// In order to sign your assembly you must specify a key to use. Refer to the