You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by pn...@apache.org on 2011/03/20 08:30:38 UTC
[Lucene.Net] svn commit: r1083372 [1/3] - in
/incubator/lucene.net/trunk/C#/contrib/Analyzers: ./ Lucene.Net.Analyzers/
Lucene.Net.Analyzers/BR/ Lucene.Net.Analyzers/CJK/ Lucene.Net.Analyzers/Cn/
Lucene.Net.Analyzers/Cz/ Lucene.Net.Analyzers/De/ Lucene.Net.Analyzer...
Author: pnasser
Date: Sun Mar 20 07:30:37 2011
New Revision: 1083372
URL: http://svn.apache.org/viewvc?rev=1083372&view=rev
Log:
LUCENENET 372 - BR, CJK, CN, CZ, DE, FR, NL, RU Analyzers - Tests Missing
Added:
incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/CJK/
incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/CJK/CJKAnalyzer.cs
incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/CJK/CJKTokenizer.cs
incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Cn/
incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Cn/ChineseAnalyzer.cs
incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Cn/ChineseFilter.cs
incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Cn/ChineseTokenizer.cs
incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Cz/
incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Cz/CzechAnalyzer.cs
incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/De/
incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/De/GermanAnalyzer.cs
incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/De/GermanStemFilter.cs
incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/De/GermanStemmer.cs
incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/De/WordlistLoader.cs
incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Fr/
incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Fr/FrenchAnalyzer.cs
incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Fr/FrenchStemFilter.cs
incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Fr/FrenchStemmer.cs
incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Nl/
incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Nl/DutchAnalyzer.cs
incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Nl/DutchStemFilter.cs
incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Nl/DutchStemmer.cs
incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Nl/WordlistLoader.cs
incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Ru/
incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Ru/RussianAnalyzer.cs
incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Ru/RussianCharsets.cs
incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Ru/RussianLetterTokenizer.cs
incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Ru/RussianLowerCaseFilter.cs
incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Ru/RussianStemFilter.cs
incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Ru/RussianStemmer.cs
incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/WordlistLoader.cs
Modified:
incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers.sln
incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/BR/BrazilianAnalyzer.cs
incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Lucene.Net.Analyzers.csproj
Modified: incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers.sln
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Analyzers/Lucene.Net.Analyzers.sln?rev=1083372&r1=1083371&r2=1083372&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers.sln (original)
+++ incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers.sln Sun Mar 20 07:30:37 2011
@@ -1,9 +1,15 @@

-Microsoft Visual Studio Solution File, Format Version 10.00
-# Visual C# Express 2008
+Microsoft Visual Studio Solution File, Format Version 11.00
+# Visual Studio 2010
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net.Analyzers", "Lucene.Net.Analyzers\Lucene.Net.Analyzers.csproj", "{4286E961-9143-4821-B46D-3D39D3736386}"
EndProject
-Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Test", "Test\Test.csproj", "{67D27628-F1D5-4499-9818-B669731925C8}"
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net.TestAnalyzers", "Test\Lucene.Net.TestAnalyzers.csproj", "{67D27628-F1D5-4499-9818-B669731925C8}"
+EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net", "..\..\src\Lucene.Net\Lucene.Net.csproj", "{5D4AD9BE-1FFB-41AB-9943-25737971BF57}"
+EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net.Test", "..\..\src\Test\Lucene.Net.Test.csproj", "{AAF68BCF-F781-45FC-98B3-2B9CEE411E01}"
+EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "DemoLib", "..\..\src\Demo\DemoLib\DemoLib.csproj", "{F04CA2F4-E182-46A8-B914-F46AF5319E83}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
@@ -19,6 +25,18 @@ Global
{67D27628-F1D5-4499-9818-B669731925C8}.Debug|Any CPU.Build.0 = Debug|Any CPU
{67D27628-F1D5-4499-9818-B669731925C8}.Release|Any CPU.ActiveCfg = Release|Any CPU
{67D27628-F1D5-4499-9818-B669731925C8}.Release|Any CPU.Build.0 = Release|Any CPU
+ {5D4AD9BE-1FFB-41AB-9943-25737971BF57}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ {5D4AD9BE-1FFB-41AB-9943-25737971BF57}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ {5D4AD9BE-1FFB-41AB-9943-25737971BF57}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ {5D4AD9BE-1FFB-41AB-9943-25737971BF57}.Release|Any CPU.Build.0 = Release|Any CPU
+ {AAF68BCF-F781-45FC-98B3-2B9CEE411E01}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ {AAF68BCF-F781-45FC-98B3-2B9CEE411E01}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ {AAF68BCF-F781-45FC-98B3-2B9CEE411E01}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ {AAF68BCF-F781-45FC-98B3-2B9CEE411E01}.Release|Any CPU.Build.0 = Release|Any CPU
+ {F04CA2F4-E182-46A8-B914-F46AF5319E83}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ {F04CA2F4-E182-46A8-B914-F46AF5319E83}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ {F04CA2F4-E182-46A8-B914-F46AF5319E83}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ {F04CA2F4-E182-46A8-B914-F46AF5319E83}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
Modified: incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/BR/BrazilianAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Analyzers/Lucene.Net.Analyzers/BR/BrazilianAnalyzer.cs?rev=1083372&r1=1083371&r2=1083372&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/BR/BrazilianAnalyzer.cs (original)
+++ incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/BR/BrazilianAnalyzer.cs Sun Mar 20 07:30:37 2011
@@ -96,7 +96,7 @@ namespace Lucene.Net.Analysis.BR
*/
public BrazilianAnalyzer(FileInfo stopwords)
{
- stoptable = WordlistLoader.GetWordSet(stopwords);
+ stoptable = WordlistLoader.GetWordtable(stopwords);
}
/**
@@ -118,7 +118,7 @@ namespace Lucene.Net.Analysis.BR
*/
public void SetStemExclusionTable(FileInfo exclusionlist)
{
- excltable = WordlistLoader.GetWordSet(exclusionlist);
+ excltable = WordlistLoader.GetWordtable(exclusionlist);
}
/**
Added: incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/CJK/CJKAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Analyzers/Lucene.Net.Analyzers/CJK/CJKAnalyzer.cs?rev=1083372&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/CJK/CJKAnalyzer.cs (added)
+++ incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/CJK/CJKAnalyzer.cs Sun Mar 20 07:30:37 2011
@@ -0,0 +1,128 @@
+using System;
+using System.IO;
+using System.Collections;
+using Lucene.Net.Analysis;
+
+namespace Lucene.Net.Analysis.CJK
+{
+ /* ====================================================================
+ * The Apache Software License, Version 1.1
+ *
+ * Copyright (c) 2004 The Apache Software Foundation. All rights
+ * reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. The end-user documentation included with the redistribution,
+ * if any, must include the following acknowledgment:
+ * "This product includes software developed by the
+ * Apache Software Foundation (http://www.apache.org/)."
+ * Alternately, this acknowledgment may appear in the software itself,
+ * if and wherever such third-party acknowledgments normally appear.
+ *
+ * 4. The names "Apache" and "Apache Software Foundation" and
+ * "Apache Lucene" must not be used to endorse or promote products
+ * derived from this software without prior written permission. For
+ * written permission, please contact apache@apache.org.
+ *
+ * 5. Products derived from this software may not be called "Apache",
+ * "Apache Lucene", nor may "Apache" appear in their name, without
+ * prior written permission of the Apache Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation. For more
+ * information on the Apache Software Foundation, please see
+ * <http://www.apache.org/>.
+ *
+ * $Id: CJKAnalyzer.java,v 1.5 2004/10/17 11:41:41 dnaber Exp $
+ */
+
+ /// <summary>
+ /// Filters CJKTokenizer with StopFilter.
+ ///
+ /// <author>Che, Dong</author>
+ /// </summary>
+ public class CJKAnalyzer : Analyzer
+ {
+ //~ Static fields/initializers ---------------------------------------------
+
+ /// <summary>
+ /// An array containing some common English words that are not usually
+ /// useful for searching, and some double-byte punctuation marks.
+ /// </summary>
+ public static String[] stopWords =
+ {
+ "a", "and", "are", "as", "at", "be",
+ "but", "by", "for", "if", "in",
+ "into", "is", "it", "no", "not",
+ "of", "on", "or", "s", "such", "t",
+ "that", "the", "their", "then",
+ "there", "these", "they", "this",
+ "to", "was", "will", "with", "",
+ "www"
+ };
+
+ //~ Instance fields --------------------------------------------------------
+
+ /// <summary>
+ /// stop word list
+ /// </summary>
+ private Hashtable stopTable;
+
+ //~ Constructors -----------------------------------------------------------
+
+ /// <summary>
+ /// Builds an analyzer which removes words in STOP_WORDS.
+ /// </summary>
+ public CJKAnalyzer()
+ {
+ stopTable = StopFilter.MakeStopSet(stopWords);
+ }
+
+ /// <summary>
+ /// Builds an analyzer which removes words in the provided array.
+ /// </summary>
+ /// <param name="stopWords">stop word array</param>
+ public CJKAnalyzer(String[] stopWords)
+ {
+ stopTable = StopFilter.MakeStopSet(stopWords);
+ }
+
+ //~ Methods ----------------------------------------------------------------
+
+ /// <summary>
+ /// get token stream from input
+ /// </summary>
+ /// <param name="fieldName">lucene field name</param>
+ /// <param name="reader">input reader</param>
+ /// <returns>Token Stream</returns>
+ public override sealed TokenStream TokenStream(String fieldName, TextReader reader)
+ {
+ return new StopFilter(new CJKTokenizer(reader), stopTable);
+ }
+ }
+}
Added: incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/CJK/CJKTokenizer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Analyzers/Lucene.Net.Analyzers/CJK/CJKTokenizer.cs?rev=1083372&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/CJK/CJKTokenizer.cs (added)
+++ incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/CJK/CJKTokenizer.cs Sun Mar 20 07:30:37 2011
@@ -0,0 +1,329 @@
+using System;
+using System.IO;
+using System.Text;
+using Lucene.Net.Analysis;
+
+namespace Lucene.Net.Analysis.CJK
+{
+ /* ====================================================================
+ * The Apache Software License, Version 1.1
+ *
+ * Copyright (c) 2004 The Apache Software Foundation. All rights
+ * reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. The end-user documentation included with the redistribution,
+ * if any, must include the following acknowledgment:
+ * "This product includes software developed by the
+ * Apache Software Foundation (http://www.apache.org/)."
+ * Alternately, this acknowledgment may appear in the software itself,
+ * if and wherever such third-party acknowledgments normally appear.
+ *
+ * 4. The names "Apache" and "Apache Software Foundation" and
+ * "Apache Lucene" must not be used to endorse or promote products
+ * derived from this software without prior written permission. For
+ * written permission, please contact apache@apache.org.
+ *
+ * 5. Products derived from this software may not be called "Apache",
+ * "Apache Lucene", nor may "Apache" appear in their name, without
+ * prior written permission of the Apache Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation. For more
+ * information on the Apache Software Foundation, please see
+ * <http://www.apache.org/>.
+ */
+
+ /// <summary>
+ /// <p>
+ /// CJKTokenizer was modified from StopTokenizer which does a decent job for
+ /// most European languages, and it performs a different tokenization method for double-byte
+ /// characters: a token is returned for each two characters, with overlapping matches.<br>
+ /// Example: "java C1C2C3C4" will be segmented into: "java" "C1C2" "C2C3" "C3C4"; it
+ /// also needs to filter out zero-length tokens ""<br>
+ /// for Digit: digit, '+', '#' will token as letter<br>
+ /// for more info on Asian language (Chinese, Japanese, Korean) text segmentation,
+ /// please search <a
+ /// href="http://www.google.com/search?q=word+chinese+segment">google</a>
+ /// </p>
+ ///
+ /// @author Che, Dong
+ /// @version $Id: CJKTokenizer.java,v 1.3 2003/01/22 20:54:47 otis Exp $
+ /// </summary>
+ public sealed class CJKTokenizer : Tokenizer
+ {
+ //~ Static fields/initializers ---------------------------------------------
+
+ /// <summary>
+ /// Max word length
+ /// </summary>
+ private static int MAX_WORD_LEN = 255;
+
+ /// <summary>
+ /// buffer size
+ /// </summary>
+ private static int IO_BUFFER_SIZE = 256;
+
+ //~ Instance fields --------------------------------------------------------
+
+ /// <summary>
+ /// word offset, used to imply which character(in ) is parsed
+ /// </summary>
+ private int offset = 0;
+
+ /// <summary>
+ /// the index used only for ioBuffer
+ /// </summary>
+ private int bufferIndex = 0;
+
+ /// <summary>
+ /// data length
+ /// </summary>
+ private int dataLen = 0;
+
+ /// <summary>
+ /// character buffer, store the characters which are used to compose <br>
+ /// the returned Token
+ /// </summary>
+ private char[] buffer = new char[MAX_WORD_LEN];
+
+ /// <summary>
+ /// I/O buffer, used to store the content of the input(one of the <br>
+ /// members of Tokenizer)
+ /// </summary>
+ private char[] ioBuffer = new char[IO_BUFFER_SIZE];
+
+ /// <summary>
+ /// word type: single=>ASCII double=>non-ASCII word=>default
+ /// </summary>
+ private String tokenType = "word";
+
+ /// <summary>
+ /// tag: previous character is a cached double-byte character "C1C2C3C4"
+ /// ----(set the C1 isTokened) C1C2 "C2C3C4" ----(set the C2 isTokened)
+ /// C1C2 C2C3 "C3C4" ----(set the C3 isTokened) "C1C2 C2C3 C3C4"
+ /// </summary>
+ private bool preIsTokened = false;
+
+ //~ Constructors -----------------------------------------------------------
+
+ /// <summary>
+ /// Construct a token stream processing the given input.
+ /// </summary>
+ /// <param name="_in">I/O reader</param>
+ public CJKTokenizer(TextReader _in)
+ {
+ input = _in;
+ }
+
+ //~ Methods ----------------------------------------------------------------
+
+ /// <summary>
+ /// Returns the next token in the stream, or null at EOS.
+ /// </summary>
+ /// <returns>Token</returns>
+ public override Token Next()
+ {
+ /** how many character(s) has been stored in buffer */
+ int length = 0;
+
+ /** the position used to create Token */
+ int start = offset;
+
+ while (true)
+ {
+ /** current character */
+ char c;
+
+ /** unicode block of current character for detail */
+ //Character.UnicodeBlock ub;
+
+ offset++;
+
+ if (bufferIndex >= dataLen)
+ {
+ dataLen = input.Read(ioBuffer, 0, ioBuffer.Length);
+ bufferIndex = 0;
+ }
+
+ if (dataLen == 0)
+ {
+ if (length > 0)
+ {
+ if (preIsTokened == true)
+ {
+ length = 0;
+ preIsTokened = false;
+ }
+
+ break;
+ }
+ else
+ {
+ return null;
+ }
+ }
+ else
+ {
+ //get current character
+ c = ioBuffer[bufferIndex++];
+
+ //get the UnicodeBlock of the current character
+ //ub = Character.UnicodeBlock.of(c);
+ }
+
+ //if the current character is ASCII or Extend ASCII
+ if (('\u0000' <= c && c <= '\u007F') ||
+ ('\uFF00' <= c && c <= '\uFFEF'))
+ {
+ if ('\uFF00' <= c && c <= '\uFFEF')
+ {
+ /** convert HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN */
+ int i = (int) c;
+ i = i - 65248;
+ c = (char) i;
+ }
+
+ // if the current character is a letter or "_" "+" "#"
+ if (Char.IsLetterOrDigit(c)
+ || ((c == '_') || (c == '+') || (c == '#'))
+ )
+ {
+ if (length == 0)
+ {
+ // "javaC1C2C3C4linux" <br>
+ // ^--: the current character begin to token the ASCII
+ // letter
+ start = offset - 1;
+ }
+ else if (tokenType == "double")
+ {
+ // "javaC1C2C3C4linux" <br>
+ // ^--: the previous non-ASCII
+ // : the current character
+ offset--;
+ bufferIndex--;
+ tokenType = "single";
+
+ if (preIsTokened == true)
+ {
+ // there is only one non-ASCII has been stored
+ length = 0;
+ preIsTokened = false;
+
+ break;
+ }
+ else
+ {
+ break;
+ }
+ }
+
+ // store the LowerCase(c) in the buffer
+ buffer[length++] = Char.ToLower(c);
+ tokenType = "single";
+
+ // break the procedure if buffer overflowed!
+ if (length == MAX_WORD_LEN)
+ {
+ break;
+ }
+ }
+ else if (length > 0)
+ {
+ if (preIsTokened == true)
+ {
+ length = 0;
+ preIsTokened = false;
+ }
+ else
+ {
+ break;
+ }
+ }
+ }
+ else
+ {
+ // non-ASCII letter, eg."C1C2C3C4"
+ if (Char.IsLetter(c))
+ {
+ if (length == 0)
+ {
+ start = offset - 1;
+ buffer[length++] = c;
+ tokenType = "double";
+ }
+ else
+ {
+ if (tokenType == "single")
+ {
+ offset--;
+ bufferIndex--;
+
+ //return the previous ASCII characters
+ break;
+ }
+ else
+ {
+ buffer[length++] = c;
+ tokenType = "double";
+
+ if (length == 2)
+ {
+ offset--;
+ bufferIndex--;
+ preIsTokened = true;
+
+ break;
+ }
+ }
+ }
+ }
+ else if (length > 0)
+ {
+ if (preIsTokened == true)
+ {
+ // empty the buffer
+ length = 0;
+ preIsTokened = false;
+ }
+ else
+ {
+ break;
+ }
+ }
+ }
+ }
+
+ return new Token(new String(buffer, 0, length), start, start + length,
+ tokenType
+ );
+ }
+ }
+
+}
Added: incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Cn/ChineseAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Analyzers/Lucene.Net.Analyzers/Cn/ChineseAnalyzer.cs?rev=1083372&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Cn/ChineseAnalyzer.cs (added)
+++ incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Cn/ChineseAnalyzer.cs Sun Mar 20 07:30:37 2011
@@ -0,0 +1,92 @@
+using System;
+using System.IO;
+using System.Text;
+using System.Collections;
+
+using Lucene.Net.Analysis;
+
+namespace Lucene.Net.Analysis.Cn
+{
+ /* ====================================================================
+ * The Apache Software License, Version 1.1
+ *
+ * Copyright (c) 2004 The Apache Software Foundation. All rights
+ * reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. The end-user documentation included with the redistribution,
+ * if any, must include the following acknowledgment:
+ * "This product includes software developed by the
+ * Apache Software Foundation (http://www.apache.org/)."
+ * Alternately, this acknowledgment may appear in the software itself,
+ * if and wherever such third-party acknowledgments normally appear.
+ *
+ * 4. The names "Apache" and "Apache Software Foundation" and
+ * "Apache Lucene" must not be used to endorse or promote products
+ * derived from this software without prior written permission. For
+ * written permission, please contact apache@apache.org.
+ *
+ * 5. Products derived from this software may not be called "Apache",
+ * "Apache Lucene", nor may "Apache" appear in their name, without
+ * prior written permission of the Apache Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation. For more
+ * information on the Apache Software Foundation, please see
+ * <http://www.apache.org/>.
+ */
+
+ /// <summary>
+ /// Title: ChineseAnalyzer
+ /// Description:
+ /// Subclass of org.apache.lucene.analysis.Analyzer
+ /// build from a ChineseTokenizer, filtered with ChineseFilter.
+ /// Copyright: Copyright (c) 2001
+ /// Company:
+ /// <author>Yiyi Sun</author>
+ /// <version>$Id: ChineseAnalyzer.java, v 1.2 2003/01/22 20:54:47 ehatcher Exp $</version>
+ /// </summary>
+ public class ChineseAnalyzer : Analyzer
+ {
+
+ public ChineseAnalyzer()
+ {
+ }
+
+ /// <summary>
+ /// Creates a TokenStream which tokenizes all the text in the provided Reader.
+ /// </summary>
+ /// <returns>A TokenStream build from a ChineseTokenizer filtered with ChineseFilter.</returns>
+ public override sealed TokenStream TokenStream(String fieldName, TextReader reader)
+ {
+ TokenStream result = new ChineseTokenizer(reader);
+ result = new ChineseFilter(result);
+ return result;
+ }
+ }
+}
Added: incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Cn/ChineseFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Analyzers/Lucene.Net.Analyzers/Cn/ChineseFilter.cs?rev=1083372&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Cn/ChineseFilter.cs (added)
+++ incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Cn/ChineseFilter.cs Sun Mar 20 07:30:37 2011
@@ -0,0 +1,138 @@
+using System;
+using System.IO;
+using System.Collections;
+using System.Globalization;
+
+using Lucene.Net.Analysis;
+
+namespace Lucene.Net.Analysis.Cn
+{
+ /* ====================================================================
+ * The Apache Software License, Version 1.1
+ *
+ * Copyright (c) 2004 The Apache Software Foundation. All rights
+ * reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. The end-user documentation included with the redistribution,
+ * if any, must include the following acknowledgment:
+ * "This product includes software developed by the
+ * Apache Software Foundation (http://www.apache.org/)."
+ * Alternately, this acknowledgment may appear in the software itself,
+ * if and wherever such third-party acknowledgments normally appear.
+ *
+ * 4. The names "Apache" and "Apache Software Foundation" and
+ * "Apache Lucene" must not be used to endorse or promote products
+ * derived from this software without prior written permission. For
+ * written permission, please contact apache@apache.org.
+ *
+ * 5. Products derived from this software may not be called "Apache",
+ * "Apache Lucene", nor may "Apache" appear in their name, without
+ * prior written permission of the Apache Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation. For more
+ * information on the Apache Software Foundation, please see
+ * <http://www.apache.org/>.
+ */
+
+ /// <summary>
+ /// Title: ChineseFilter
+ /// Description: Filter with a stop word table
+ /// Rule: No digits are allowed.
+ /// English words/tokens should be longer than 1 character.
+ /// One Chinese character as one Chinese word.
+ /// TO DO:
+ /// 1. Add Chinese stop words, such as \ue400
+ /// 2. Dictionary based Chinese word extraction
+ /// 3. Intelligent Chinese word extraction
+ ///
+ /// Copyright: Copyright (c) 2001
+ /// Company:
+ /// <author>Yiyi Sun</author>
+ /// <version>$Id: ChineseFilter.java, v 1.4 2003/01/23 12:49:33 ehatcher Exp $</version>
+ /// </summary>
+ public sealed class ChineseFilter : TokenFilter
+ {
+ // Only English now, Chinese to be added later.
+ public static String[] STOP_WORDS =
+ {
+ "and", "are", "as", "at", "be", "but", "by",
+ "for", "if", "in", "into", "is", "it",
+ "no", "not", "of", "on", "or", "such",
+ "that", "the", "their", "then", "there", "these",
+ "they", "this", "to", "was", "will", "with"
+ };
+
+ private Hashtable stopTable;
+
+ public ChineseFilter(TokenStream _in) : base (_in)
+ {
+ stopTable = new Hashtable(STOP_WORDS.Length);
+
+ for (int i = 0; i < STOP_WORDS.Length; i++)
+ stopTable[STOP_WORDS[i]] = STOP_WORDS[i];
+ }
+
+ public override Token Next()
+ {
+
+ for (Token token = input.Next(); token != null; token = input.Next())
+ {
+ String text = token.TermText();
+
+ // why not key off token type here assuming ChineseTokenizer comes first?
+ if (stopTable[text] == null)
+ {
+ switch (Char.GetUnicodeCategory(text[0]))
+ {
+
+ case UnicodeCategory.LowercaseLetter:
+ case UnicodeCategory.UppercaseLetter:
+
+ // English words/tokens should be longer than 1 character.
+ if (text.Length > 1)
+ {
+ return token;
+ }
+ break;
+ case UnicodeCategory.OtherLetter:
+
+ // One Chinese character as one Chinese word.
+ // Chinese word extraction to be added later here.
+
+ return token;
+ }
+
+ }
+
+ }
+ return null;
+ }
+ }
+}
Added: incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Cn/ChineseTokenizer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Analyzers/Lucene.Net.Analyzers/Cn/ChineseTokenizer.cs?rev=1083372&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Cn/ChineseTokenizer.cs (added)
+++ incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Cn/ChineseTokenizer.cs Sun Mar 20 07:30:37 2011
@@ -0,0 +1,179 @@
+using System;
+using System.IO;
+using System.Text;
+using System.Collections;
+using System.Globalization;
+
+using Lucene.Net.Analysis;
+
+namespace Lucene.Net.Analysis.Cn
+{
+ /* ====================================================================
+ * The Apache Software License, Version 1.1
+ *
+ * Copyright (c) 2004 The Apache Software Foundation. All rights
+ * reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. The end-user documentation included with the redistribution,
+ * if any, must include the following acknowledgment:
+ * "This product includes software developed by the
+ * Apache Software Foundation (http://www.apache.org/)."
+ * Alternately, this acknowledgment may appear in the software itself,
+ * if and wherever such third-party acknowledgments normally appear.
+ *
+ * 4. The names "Apache" and "Apache Software Foundation" and
+ * "Apache Lucene" must not be used to endorse or promote products
+ * derived from this software without prior written permission. For
+ * written permission, please contact apache@apache.org.
+ *
+ * 5. Products derived from this software may not be called "Apache",
+ * "Apache Lucene", nor may "Apache" appear in their name, without
+ * prior written permission of the Apache Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation. For more
+ * information on the Apache Software Foundation, please see
+ * <http://www.apache.org/>.
+ */
+
+ /// <summary>
+ /// Title: ChineseTokenizer
+ /// Description: Extract tokens from the Stream using Character.getType()
+ /// Rule: A Chinese character as a single token
+ /// Copyright: Copyright (c) 2001
+ /// Company:
+ ///
+ /// The difference between thr ChineseTokenizer and the
+ /// CJKTokenizer (id=23545) is that they have different
+ /// token parsing logic.
+ ///
+ /// Let me use an example. If having a Chinese text
+ /// "C1C2C3C4" to be indexed, the tokens returned from the
+ /// ChineseTokenizer are C1, C2, C3, C4. And the tokens
+ /// returned from the CJKTokenizer are C1C2, C2C3, C3C4.
+ ///
+ /// Therefore the index the CJKTokenizer created is much
+ /// larger.
+ ///
+ /// The problem is that when searching for C1, C1C2, C1C3,
+ /// C4C2, C1C2C3 ... the ChineseTokenizer works, but the
+ /// CJKTokenizer will not work.
+ /// <author>Yiyi Sun</author>
+ /// <version>$Id: ChineseTokenizer.java, v 1.4 2003/03/02 13:56:03 otis Exp $</version>
+ /// </summary>
+ public sealed class ChineseTokenizer : Tokenizer
+ {
+
+
+ public ChineseTokenizer(TextReader _in)
+ {
+ input = _in;
+ }
+
+ private int offset = 0, bufferIndex=0, dataLen=0;
+ private static int MAX_WORD_LEN = 255;
+ private static int IO_BUFFER_SIZE = 1024;
+ private char[] buffer = new char[MAX_WORD_LEN];
+ private char[] ioBuffer = new char[IO_BUFFER_SIZE];
+
+ private int length;
+ private int start;
+
+ private void Push(char c)
+ {
+
+ if (length == 0) start = offset-1; // start of token
+ buffer[length++] = Char.ToLower(c); // buffer it
+
+ }
+
+ private Token Flush()
+ {
+
+ if (length > 0)
+ {
+ //System.out.println(new String(buffer, 0, length));
+ return new Token(new String(buffer, 0, length), start, start+length);
+ }
+ else
+ return null;
+ }
+
+ public override Token Next()
+ {
+
+ length = 0;
+ start = offset;
+
+
+ while (true)
+ {
+
+ char c;
+ offset++;
+
+ if (bufferIndex >= dataLen)
+ {
+ dataLen = input.Read(ioBuffer, 0, ioBuffer.Length);
+ bufferIndex = 0;
+ };
+
+ if (dataLen == 0) return Flush();
+ else
+ c = ioBuffer[bufferIndex++];
+
+
+ switch(Char.GetUnicodeCategory(c))
+ {
+
+ case UnicodeCategory.DecimalDigitNumber:
+ case UnicodeCategory.LowercaseLetter:
+ case UnicodeCategory.UppercaseLetter:
+ Push(c);
+ if (length == MAX_WORD_LEN) return Flush();
+ break;
+
+ case UnicodeCategory.OtherLetter:
+ if (length>0)
+ {
+ bufferIndex--;
+ return Flush();
+ }
+ Push(c);
+ return Flush();
+
+ default:
+ if (length>0) return Flush();
+ break;
+ }
+ }
+
+ }
+ }
+}
Added: incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Cz/CzechAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Analyzers/Lucene.Net.Analyzers/Cz/CzechAnalyzer.cs?rev=1083372&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Cz/CzechAnalyzer.cs (added)
+++ incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Cz/CzechAnalyzer.cs Sun Mar 20 07:30:37 2011
@@ -0,0 +1,190 @@
+using System;
+using System.IO;
+using System.Text;
+using System.Collections;
+
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.De;
+using Lucene.Net.Analysis.Standard;
+
+namespace Lucene.Net.Analysis.Cz
+{
+ /* ====================================================================
+ * The Apache Software License, Version 1.1
+ *
+ * Copyright (c) 2004 The Apache Software Foundation. All rights
+ * reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. The end-user documentation included with the redistribution,
+ * if any, must include the following acknowledgment:
+ * "This product includes software developed by the
+ * Apache Software Foundation (http://www.apache.org/)."
+ * Alternately, this acknowledgment may appear in the software itself,
+ * if and wherever such third-party acknowledgments normally appear.
+ *
+ * 4. The names "Apache" and "Apache Software Foundation" and
+ * "Apache Lucene" must not be used to endorse or promote products
+ * derived from this software without prior written permission. For
+ * written permission, please contact apache@apache.org.
+ *
+ * 5. Products derived from this software may not be called "Apache",
+ * "Apache Lucene", nor may "Apache" appear in their name, without
+ * prior written permission of the Apache Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation. For more
+ * information on the Apache Software Foundation, please see
+ * <http://www.apache.org/>.
+ */
+
+ /// <summary>
+ /// Analyzer for Czech language. Supports an external list of stopwords (words that
+ /// will not be indexed at all).
+ /// A default set of stopwords is used unless an alternative list is specified, the
+ /// exclusion list is empty by default.
+ ///
+ /// <author>Lukas Zapletal [lzap@root.cz]</author>
+ /// <version>$Id: CzechAnalyzer.java,v 1.2 2003/01/22 20:54:47 ehatcher Exp $</version>
+ /// </summary>
+ public sealed class CzechAnalyzer : Analyzer
+ {
+ /// <summary>
+ /// List of typical stopwords.
+ /// </summary>
+ public static String[] STOP_WORDS =
+ {
+ "a","s","k","o","i","u","v","z","dnes","cz","t\u00edmto","bude\u0161","budem",
+ "byli","jse\u0161","m\u016fj","sv\u00fdm","ta","tomto","tohle","tuto","tyto",
+ "jej","zda","pro\u010d","m\u00e1te","tato","kam","tohoto","kdo","kte\u0159\u00ed",
+ "mi","n\u00e1m","tom","tomuto","m\u00edt","nic","proto","kterou","byla",
+ "toho","proto\u017ee","asi","ho","na\u0161i","napi\u0161te","re","co\u017e","t\u00edm",
+ "tak\u017ee","sv\u00fdch","jej\u00ed","sv\u00fdmi","jste","aj","tu","tedy","teto",
+ "bylo","kde","ke","prav\u00e9","ji","nad","nejsou","\u010di","pod","t\u00e9ma",
+ "mezi","p\u0159es","ty","pak","v\u00e1m","ani","kdy\u017e","v\u0161ak","neg","jsem",
+ "tento","\u010dl\u00e1nku","\u010dl\u00e1nky","aby","jsme","p\u0159ed","pta","jejich",
+ "byl","je\u0161t\u011b","a\u017e","bez","tak\u00e9","pouze","prvn\u00ed","va\u0161e","kter\u00e1",
+ "n\u00e1s","nov\u00fd","tipy","pokud","m\u016f\u017ee","strana","jeho","sv\u00e9","jin\u00e9",
+ "zpr\u00e1vy","nov\u00e9","nen\u00ed","v\u00e1s","jen","podle","zde","u\u017e","b\u00fdt","v\u00edce",
+ "bude","ji\u017e","ne\u017e","kter\u00fd","by","kter\u00e9","co","nebo","ten","tak",
+ "m\u00e1","p\u0159i","od","po","jsou","jak","dal\u0161\u00ed","ale","si","se","ve",
+ "to","jako","za","zp\u011bt","ze","do","pro","je","na","atd","atp",
+ "jakmile","p\u0159i\u010dem\u017e","j\u00e1","on","ona","ono","oni","ony","my","vy",
+ "j\u00ed","ji","m\u011b","mne","jemu","tomu","t\u011bm","t\u011bmu","n\u011bmu","n\u011bmu\u017e",
+ "jeho\u017e","j\u00ed\u017e","jeliko\u017e","je\u017e","jako\u017e","na\u010de\u017e",
+ };
+
+ /// <summary>
+ /// Contains the stopwords used with the StopFilter.
+ /// </summary>
+ private Hashtable stoptable = new Hashtable();
+
+ /// <summary>
+ /// Builds an analyzer.
+ /// </summary>
+ public CzechAnalyzer()
+ {
+ stoptable = StopFilter.MakeStopSet( STOP_WORDS );
+ }
+
+ /// <summary>
+ /// Builds an analyzer with the given stop words.
+ /// </summary>
+ public CzechAnalyzer( String[] stopwords )
+ {
+ stoptable = StopFilter.MakeStopSet( stopwords );
+ }
+
+ /// <summary>
+ /// Builds an analyzer with the given stop words.
+ /// </summary>
+ public CzechAnalyzer( Hashtable stopwords )
+ {
+ stoptable = stopwords;
+ }
+
+ /// <summary>
+ /// Builds an analyzer with the given stop words.
+ /// </summary>
+ public CzechAnalyzer( FileInfo stopwords )
+ {
+ stoptable = WordlistLoader.GetWordtable( stopwords );
+ }
+
+ /// <summary>
+ /// Loads stopwords hash from resource stream (file, database...).
+ /// </summary>
+ /// <param name="wordfile">File containing the wordlist</param>
+ /// <param name="encoding">Encoding used (win-1250, iso-8859-2, ...}, null for default system encoding</param>
+ public void LoadStopWords( Stream wordfile, String encoding )
+ {
+ if ( wordfile == null )
+ {
+ stoptable = new Hashtable();
+ return;
+ }
+ try
+ {
+ // clear any previous table (if present)
+ stoptable = new Hashtable();
+
+ StreamReader isr;
+ if (encoding == null)
+ isr = new StreamReader(wordfile);
+ else
+ isr = new StreamReader(wordfile, Encoding.GetEncoding(encoding));
+
+ String word;
+ while ( ( word = isr.ReadLine() ) != null )
+ {
+ stoptable[word] = word;
+ }
+
+ }
+ catch ( IOException )
+ {
+ stoptable = null;
+ }
+ }
+
+ /// <summary>
+ /// Creates a TokenStream which tokenizes all the text in the provided Reader.
+ /// <returns>
+ /// A TokenStream build from a StandardTokenizer filtered with
+ /// StandardFilter, StopFilter, GermanStemFilter and LowerCaseFilter
+ /// </returns>
+ public override TokenStream TokenStream( String fieldName, TextReader reader )
+ {
+ TokenStream result = new StandardTokenizer( reader );
+ result = new StandardFilter( result );
+ result = new LowerCaseFilter( result );
+ result = new StopFilter( result, stoptable );
+ return result;
+ }
+ }
+}
Added: incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/De/GermanAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Analyzers/Lucene.Net.Analyzers/De/GermanAnalyzer.cs?rev=1083372&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/De/GermanAnalyzer.cs (added)
+++ incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/De/GermanAnalyzer.cs Sun Mar 20 07:30:37 2011
@@ -0,0 +1,125 @@
+using System;
+using System.IO;
+using System.Collections;
+using Lucene.Net.Analysis.Standard;
+using Lucene.Net.Analysis;
+
+namespace Lucene.Net.Analysis.De
+{
+ /// <summary>
+ /// Analyzer for German language. Supports an external list of stopwords (words that
+ /// will not be indexed at all) and an external list of exclusions (word that will
+ /// not be stemmed, but indexed).
+ /// A default set of stopwords is used unless an alternative list is specified, the
+ /// exclusion list is empty by default.
+ /// </summary>
+ public class GermanAnalyzer : Analyzer
+ {
+ /// <summary>
+ /// List of typical german stopwords.
+ /// </summary>
+ private String[] GERMAN_STOP_WORDS =
+ {
+ "einer", "eine", "eines", "einem", "einen",
+ "der", "die", "das", "dass", "daÃ",
+ "du", "er", "sie", "es",
+ "was", "wer", "wie", "wir",
+ "und", "oder", "ohne", "mit",
+ "am", "im", "in", "aus", "auf",
+ "ist", "sein", "war", "wird",
+ "ihr", "ihre", "ihres",
+ "als", "für", "von",
+ "dich", "dir", "mich", "mir",
+ "mein", "kein",
+ "durch", "wegen"
+ };
+
+ /// <summary>
+ /// Contains the stopwords used with the StopFilter.
+ /// </summary>
+ private Hashtable stoptable = new Hashtable();
+
+ /// <summary>
+ /// Contains words that should be indexed but not stemmed.
+ /// </summary>
+ private Hashtable excltable = new Hashtable();
+
+ /// <summary>
+ /// Builds an analyzer.
+ /// </summary>
+ public GermanAnalyzer()
+ {
+ stoptable = StopFilter.MakeStopSet( GERMAN_STOP_WORDS );
+ }
+
+ /// <summary>
+ /// Builds an analyzer with the given stop words.
+ /// </summary>
+ /// <param name="stopwords"></param>
+ public GermanAnalyzer( String[] stopwords )
+ {
+ stoptable = StopFilter.MakeStopSet( stopwords );
+ }
+
+ /// <summary>
+ /// Builds an analyzer with the given stop words.
+ /// </summary>
+ /// <param name="stopwords"></param>
+ public GermanAnalyzer( Hashtable stopwords )
+ {
+ stoptable = stopwords;
+ }
+
+ /// <summary>
+ /// Builds an analyzer with the given stop words.
+ /// </summary>
+ /// <param name="stopwords"></param>
+ public GermanAnalyzer( FileInfo stopwords )
+ {
+ stoptable = WordlistLoader.GetWordtable( stopwords );
+ }
+
+ /// <summary>
+ /// Builds an exclusionlist from an array of Strings.
+ /// </summary>
+ /// <param name="exclusionlist"></param>
+ public void SetStemExclusionTable( String[] exclusionlist )
+ {
+ excltable = StopFilter.MakeStopSet( exclusionlist );
+ }
+
+ /// <summary>
+ /// Builds an exclusionlist from a Hashtable.
+ /// </summary>
+ /// <param name="exclusionlist"></param>
+ public void SetStemExclusionTable( Hashtable exclusionlist )
+ {
+ excltable = exclusionlist;
+ }
+
+ /// <summary>
+ /// Builds an exclusionlist from the words contained in the given file.
+ /// </summary>
+ /// <param name="exclusionlist"></param>
+ public void SetStemExclusionTable(FileInfo exclusionlist)
+ {
+ excltable = WordlistLoader.GetWordtable(exclusionlist);
+ }
+
+ /// <summary>
+ /// Creates a TokenStream which tokenizes all the text in the provided TextReader.
+ /// </summary>
+ /// <param name="fieldName"></param>
+ /// <param name="reader"></param>
+ /// <returns>A TokenStream build from a StandardTokenizer filtered with StandardFilter, StopFilter, GermanStemFilter</returns>
+ public override TokenStream TokenStream(String fieldName, TextReader reader)
+ {
+ TokenStream result = new StandardTokenizer( reader );
+ result = new StandardFilter( result );
+ result = new LowerCaseFilter(result);
+ result = new StopFilter( result, stoptable );
+ result = new GermanStemFilter( result, excltable );
+ return result;
+ }
+ }
+}
\ No newline at end of file
Added: incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/De/GermanStemFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Analyzers/Lucene.Net.Analyzers/De/GermanStemFilter.cs?rev=1083372&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/De/GermanStemFilter.cs (added)
+++ incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/De/GermanStemFilter.cs Sun Mar 20 07:30:37 2011
@@ -0,0 +1,85 @@
+using System;
+using System.IO;
+using System.Collections;
+
+namespace Lucene.Net.Analysis.De
+{
+ /// <summary>
+ /// A filter that stems German words. It supports a table of words that should
+ /// not be stemmed at all. The stemmer used can be changed at runtime after the
+ /// filter object is created (as long as it is a GermanStemmer).
+ /// </summary>
+ public sealed class GermanStemFilter : TokenFilter
+ {
+ /// <summary>
+ /// The actual token in the input stream.
+ /// </summary>
+ private Token token = null;
+ private GermanStemmer stemmer = null;
+ private Hashtable exclusions = null;
+
+ public GermanStemFilter( TokenStream _in ) : base(_in)
+ {
+ stemmer = new GermanStemmer();
+ }
+
+ /// <summary>
+ /// Builds a GermanStemFilter that uses an exclusiontable.
+ /// </summary>
+ /// <param name="_in"></param>
+ /// <param name="exclusiontable"></param>
+ public GermanStemFilter( TokenStream _in, Hashtable exclusiontable ): this(_in)
+ {
+ exclusions = exclusiontable;
+ }
+
+ /// <summary>
+ /// </summary>
+ /// <returns>Returns the next token in the stream, or null at EOS</returns>
+ public override Token Next()
+
+ {
+ if ( ( token = input.Next() ) == null )
+ {
+ return null;
+ }
+ // Check the exclusiontable
+ else if ( exclusions != null && exclusions.Contains( token.TermText() ) )
+ {
+ return token;
+ }
+ else
+ {
+ String s = stemmer.Stem( token.TermText() );
+ // If not stemmed, dont waste the time creating a new token
+ if ( !s.Equals( token.TermText() ) )
+ {
+ return new Token( s, token.StartOffset(),
+ token.EndOffset(), token.Type() );
+ }
+ return token;
+ }
+ }
+
+ /// <summary>
+ /// Set a alternative/custom GermanStemmer for this filter.
+ /// </summary>
+ /// <param name="stemmer"></param>
+ public void SetStemmer( GermanStemmer stemmer )
+ {
+ if ( stemmer != null )
+ {
+ this.stemmer = stemmer;
+ }
+ }
+
+ /// <summary>
+ /// Set an alternative exclusion list for this filter.
+ /// </summary>
+ /// <param name="exclusiontable"></param>
+ public void SetExclusionTable( Hashtable exclusiontable )
+ {
+ exclusions = exclusiontable;
+ }
+ }
+}
\ No newline at end of file
Added: incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/De/GermanStemmer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Analyzers/Lucene.Net.Analyzers/De/GermanStemmer.cs?rev=1083372&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/De/GermanStemmer.cs (added)
+++ incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/De/GermanStemmer.cs Sun Mar 20 07:30:37 2011
@@ -0,0 +1,287 @@
+using System;
+using System.IO;
+using System.Text;
+using System.Collections;
+
+namespace Lucene.Net.Analysis.De
+{
+ /// <summary>
+ /// A stemmer for German words. The algorithm is based on the report
+ /// "A Fast and Simple Stemming Algorithm for German Words" by Jжrg
+ /// Caumanns (joerg.caumanns@isst.fhg.de).
+ /// </summary>
+ public class GermanStemmer
+ {
+ /// <summary>
+ /// Buffer for the terms while stemming them.
+ /// </summary>
+ private StringBuilder sb = new StringBuilder();
+
+ /// <summary>
+ /// Amount of characters that are removed with <tt>Substitute()</tt> while stemming.
+ /// </summary>
+ private int substCount = 0;
+
+ /// <summary>
+ /// Stemms the given term to an unique <tt>discriminator</tt>.
+ /// </summary>
+ /// <param name="term">The term that should be stemmed.</param>
+ /// <returns>Discriminator for <tt>term</tt></returns>
+ internal String Stem( String term )
+ {
+ // Use lowercase for medium stemming.
+ term = term.ToLower();
+ if ( !IsStemmable( term ) )
+ return term;
+ // Reset the StringBuilder.
+ sb.Remove(0, sb.Length);
+ sb.Insert(0, term);
+ // Stemming starts here...
+ Substitute( sb );
+ Strip( sb );
+ Optimize( sb );
+ Resubstitute( sb );
+ RemoveParticleDenotion( sb );
+ return sb.ToString();
+ }
+
+ /// <summary>
+ /// Checks if a term could be stemmed.
+ /// </summary>
+ /// <param name="term"></param>
+ /// <returns>true if, and only if, the given term consists in letters.</returns>
+ private bool IsStemmable( String term )
+ {
+ for ( int c = 0; c < term.Length; c++ )
+ {
+ if ( !Char.IsLetter(term[c])) return false;
+ }
+ return true;
+ }
+
+ /// <summary>
+ /// Suffix stripping (stemming) on the current term. The stripping is reduced
+ /// to the seven "base" suffixes "e", "s", "n", "t", "em", "er" and * "nd",
+ /// from which all regular suffixes are build of. The simplification causes
+ /// some overstemming, and way more irregular stems, but still provides unique.
+ /// discriminators in the most of those cases.
+ /// The algorithm is context free, except of the length restrictions.
+ /// </summary>
+ /// <param name="buffer"></param>
+ private void Strip( StringBuilder buffer )
+ {
+ bool doMore = true;
+ while ( doMore && buffer.Length > 3 )
+ {
+ if ( ( buffer.Length + substCount > 5 ) &&
+ buffer.ToString().Substring(buffer.Length - 2, 2).Equals( "nd" ) )
+ {
+ buffer.Remove( buffer.Length - 2, 2 );
+ }
+ else if ( ( buffer.Length + substCount > 4 ) &&
+ buffer.ToString().Substring( buffer.Length - 2, 2).Equals( "em" ) )
+ {
+ buffer.Remove( buffer.Length - 2, 2 );
+ }
+ else if ( ( buffer.Length + substCount > 4 ) &&
+ buffer.ToString().Substring( buffer.Length - 2, 2).Equals( "er" ) )
+ {
+ buffer.Remove( buffer.Length - 2, 2 );
+ }
+ else if ( buffer[buffer.Length - 1] == 'e' )
+ {
+ buffer.Remove(buffer.Length - 1, 1);
+ }
+ else if ( buffer[buffer.Length - 1] == 's' )
+ {
+ buffer.Remove(buffer.Length - 1, 1);
+ }
+ else if ( buffer[buffer.Length - 1] == 'n' )
+ {
+ buffer.Remove(buffer.Length - 1, 1);
+ }
+ // "t" occurs only as suffix of verbs.
+ else if ( buffer[buffer.Length - 1] == 't')
+ {
+ buffer.Remove(buffer.Length - 1, 1);
+ }
+ else
+ {
+ doMore = false;
+ }
+ }
+ }
+
+ /// <summary>
+ /// Does some optimizations on the term. This optimisations are contextual.
+ /// </summary>
+ /// <param name="buffer"></param>
+ private void Optimize( StringBuilder buffer )
+ {
+ // Additional step for female plurals of professions and inhabitants.
+ if ( buffer.Length > 5 && buffer.ToString().Substring(buffer.Length - 5, 5).Equals( "erin*" ))
+ {
+ buffer.Remove(buffer.Length - 1, 1);
+ Strip(buffer);
+ }
+ // Additional step for irregular plural nouns like "Matrizen -> Matrix".
+ if ( buffer[buffer.Length - 1] == ('z') )
+ {
+ buffer[buffer.Length - 1] = 'x';
+ }
+ }
+
+ /// <summary>
+ /// Removes a particle denotion ("ge") from a term.
+ /// </summary>
+ /// <param name="buffer"></param>
+ private void RemoveParticleDenotion( StringBuilder buffer )
+ {
+ if ( buffer.Length > 4 )
+ {
+ for ( int c = 0; c < buffer.Length - 3; c++ )
+ {
+ if ( buffer.ToString().Substring( c, 4 ).Equals( "gege" ) )
+ {
+ buffer.Remove(c, 2);
+ return;
+ }
+ }
+ }
+ }
+
+ /// <summary>
+ /// Do some substitutions for the term to reduce overstemming:
+ ///
+ /// - Substitute Umlauts with their corresponding vowel: äöü -> aou,
+ /// "Ã" is substituted by "ss"
+ /// - Substitute a second char of a pair of equal characters with
+ /// an asterisk: ?? -> ?*
+ /// - Substitute some common character combinations with a token:
+ /// sch/ch/ei/ie/ig/st -> $/Ч/%/&/#/!
+ /// </summary>
+ private void Substitute( StringBuilder buffer )
+ {
+ substCount = 0;
+ for ( int c = 0; c < buffer.Length; c++ )
+ {
+ // Replace the second char of a pair of the equal characters with an asterisk
+ if ( c > 0 && buffer[c] == buffer[c - 1])
+ {
+ buffer[c] = '*';
+ }
+ // Substitute Umlauts.
+ else if ( buffer[c] == 'ä' )
+ {
+ buffer[c] = 'a';
+ }
+ else if ( buffer[c] == 'ö' )
+ {
+ buffer[c] = 'o';
+ }
+ else if ( buffer[c] == 'ü' )
+ {
+ buffer[c] = 'u';
+ }
+ // Fix bug so that 'Ã' at the end of a word is replaced.
+ else if ( buffer[c] == 'Ã' )
+ {
+
+ buffer[c] = 's';
+ buffer.Insert(c + 1, 's');
+ substCount++;
+ }
+ // Take care that at least one character is left left side from the current one
+ if ( c < buffer.Length - 1 )
+ {
+ // Masking several common character combinations with an token
+ if ( ( c < buffer.Length - 2 ) && buffer[c] == 's' &&
+ buffer[c + 1] == 'c' && buffer[c + 2] == 'h' )
+ {
+ buffer[c] = '$';
+ buffer.Remove(c + 1, 2);
+ substCount =+ 2;
+ }
+ else if ( buffer[c] == 'c' && buffer[c + 1] == 'h' )
+ {
+ buffer[c] = '§';
+ buffer.Remove(c + 1, 1);
+ substCount++;
+ }
+ else if ( buffer[c] == 'e' && buffer[c + 1] == 'i' )
+ {
+ buffer[c] = '%';
+ buffer.Remove(c + 1, 1);
+ substCount++;
+ }
+ else if ( buffer[c] == 'i' && buffer[c + 1] == 'e' )
+ {
+ buffer[c] = '&';
+ buffer.Remove(c + 1, 1);
+ substCount++;
+ }
+ else if ( buffer[c] == 'i' && buffer[c + 1] == 'g' )
+ {
+ buffer[c] = '#';
+ buffer.Remove(c + 1, 1);
+ substCount++;
+ }
+ else if ( buffer[c] == 's' && buffer[c + 1] == 't' )
+ {
+ buffer[c] = '!';
+ buffer.Remove(c + 1, 1);
+ substCount++;
+ }
+ }
+ }
+ }
+
+ /// <summary>
+ /// Undoes the changes made by Substitute(). That are character pairs and
+ /// character combinations. Umlauts will remain as their corresponding vowel,
+ /// as "?" remains as "ss".
+ /// </summary>
+ /// <param name="buffer"></param>
+ private void Resubstitute( StringBuilder buffer )
+ {
+ for ( int c = 0; c < buffer.Length; c++ )
+ {
+ if ( buffer[c] == '*' )
+ {
+ char x = buffer[c - 1];
+ buffer[c] = x;
+ }
+ else if ( buffer[c] == '$' )
+ {
+ buffer[c] = 's';
+ buffer.Insert( c + 1, new char[]{'c', 'h'}, 0, 2);
+ }
+ else if ( buffer[c] == '§' )
+ {
+ buffer[c] = 'c';
+ buffer.Insert( c + 1, 'h' );
+ }
+ else if ( buffer[c] == '%' )
+ {
+ buffer[c] = 'e';
+ buffer.Insert( c + 1, 'i' );
+ }
+ else if ( buffer[c] == '&' )
+ {
+ buffer[c] = 'i';
+ buffer.Insert( c + 1, 'e' );
+ }
+ else if ( buffer[c] == '#' )
+ {
+ buffer[c] = 'i';
+ buffer.Insert( c + 1, 'g' );
+ }
+ else if ( buffer[c] == '!' )
+ {
+ buffer[c] = 's';
+ buffer.Insert( c + 1, 't' );
+ }
+ }
+ }
+ }
+}
\ No newline at end of file
Added: incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/De/WordlistLoader.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Analyzers/Lucene.Net.Analyzers/De/WordlistLoader.cs?rev=1083372&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/De/WordlistLoader.cs (added)
+++ incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/De/WordlistLoader.cs Sun Mar 20 07:30:37 2011
@@ -0,0 +1,96 @@
+using System;
+using System.IO;
+using System.Collections;
+
+namespace Lucene.Net.Analysis.De
+{
+ /// <summary>
+ /// Loads a text file and adds every line as an entry to a Hashtable. Every line
+ /// should contain only one word. If the file is not found or on any error, an
+ /// empty table is returned.
+ /// </summary>
+ public class WordlistLoader
+ {
+ /// <summary>
+ /// </summary>
+ /// <param name="path">Path to the wordlist</param>
+ /// <param name="wordfile">Name of the wordlist</param>
+ /// <returns></returns>
+ public static Hashtable GetWordtable( String path, String wordfile )
+ {
+ if ( path == null || wordfile == null )
+ {
+ return new Hashtable();
+ }
+ return GetWordtable(new FileInfo(path + "\\" + wordfile));
+ }
+
+ /// <summary>
+ /// </summary>
+ /// <param name="wordfile">Complete path to the wordlist</param>
+ /// <returns></returns>
+ public static Hashtable GetWordtable( String wordfile )
+ {
+ if ( wordfile == null )
+ {
+ return new Hashtable();
+ }
+ return GetWordtable( new FileInfo( wordfile ) );
+ }
+
+ /// <summary>
+ ///
+ /// </summary>
+ /// <param name="wordfile">File containing the wordlist</param>
+ /// <returns></returns>
+ public static Hashtable GetWordtable( FileInfo wordfile )
+ {
+ if ( wordfile == null )
+ {
+ return new Hashtable();
+ }
+ Hashtable result = null;
+ try
+ {
+ StreamReader lnr = new StreamReader(wordfile.FullName);
+ String word = null;
+ String[] stopwords = new String[100];
+ int wordcount = 0;
+ while ( ( word = lnr.ReadLine() ) != null )
+ {
+ wordcount++;
+ if ( wordcount == stopwords.Length )
+ {
+ String[] tmp = new String[stopwords.Length + 50];
+ Array.Copy( stopwords, 0, tmp, 0, wordcount );
+ stopwords = tmp;
+ }
+ stopwords[wordcount-1] = word;
+ }
+ result = MakeWordTable( stopwords, wordcount );
+ }
+ // On error, use an empty table
+ catch (IOException)
+ {
+ result = new Hashtable();
+ }
+ return result;
+ }
+
+ /// <summary>
+ /// Builds the wordlist table.
+ /// </summary>
+ /// <param name="words">Word that where read</param>
+ /// <param name="length">Amount of words that where read into <tt>words</tt></param>
+ /// <returns></returns>
+ private static Hashtable MakeWordTable( String[] words, int length )
+ {
+ Hashtable table = new Hashtable( length );
+ for ( int i = 0; i < length; i++ )
+ {
+ table.Add(words[i], words[i]);
+ }
+ return table;
+ }
+ }
+}
\ No newline at end of file
Added: incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Fr/FrenchAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Analyzers/Lucene.Net.Analyzers/Fr/FrenchAnalyzer.cs?rev=1083372&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Fr/FrenchAnalyzer.cs (added)
+++ incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Fr/FrenchAnalyzer.cs Sun Mar 20 07:30:37 2011
@@ -0,0 +1,197 @@
+using System;
+using System.IO;
+using System.Text;
+using System.Collections;
+
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.De;
+using Lucene.Net.Analysis.Standard;
+
+namespace Lucene.Net.Analysis.Fr
+{
+ /* ====================================================================
+ * The Apache Software License, Version 1.1
+ *
+ * Copyright (c) 2004 The Apache Software Foundation. All rights
+ * reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. The end-user documentation included with the redistribution,
+ * if any, must include the following acknowledgment:
+ * "This product includes software developed by the
+ * Apache Software Foundation (http://www.apache.org/)."
+ * Alternately, this acknowledgment may appear in the software itself,
+ * if and wherever such third-party acknowledgments normally appear.
+ *
+ * 4. The names "Apache" and "Apache Software Foundation" and
+ * "Apache Lucene" must not be used to endorse or promote products
+ * derived from this software without prior written permission. For
+ * written permission, please contact apache@apache.org.
+ *
+ * 5. Products derived from this software may not be called "Apache",
+ * "Apache Lucene", nor may "Apache" appear in their name, without
+ * prior written permission of the Apache Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation. For more
+ * information on the Apache Software Foundation, please see
+ * <http://www.apache.org/>.
+ */
+
+ /// <summary>
    /// Analyzer for the French language. Supports an external list of stopwords (words that
    /// will not be indexed at all) and an external list of exclusions (words that will
    /// not be stemmed, but indexed).
    /// A default set of stopwords is used unless another list is specified; the
    /// exclusion list is empty by default.
+ ///
+ /// <author>Patrick Talbot (based on Gerhard Schwarz work for German)</author>
+ /// <version>$Id: FrenchAnalyzer.java,v 1.9 2004/10/17 11:41:40 dnaber Exp $</version>
+ /// </summary>
+ public sealed class FrenchAnalyzer : Analyzer
+ {
+
+ /// <summary>
+ /// Extended list of typical french stopwords.
+ /// </summary>
+ public static String[] FRENCH_STOP_WORDS =
+ {
+ "a", "afin", "ai", "ainsi", "après", "attendu", "au", "aujourd", "auquel", "aussi",
+ "autre", "autres", "aux", "auxquelles", "auxquels", "avait", "avant", "avec", "avoir",
+ "c", "car", "ce", "ceci", "cela", "celle", "celles", "celui", "cependant", "certain",
+ "certaine", "certaines", "certains", "ces", "cet", "cette", "ceux", "chez", "ci",
+ "combien", "comme", "comment", "concernant", "contre", "d", "dans", "de", "debout",
+ "dedans", "dehors", "delà ", "depuis", "derrière", "des", "désormais", "desquelles",
+ "desquels", "dessous", "dessus", "devant", "devers", "devra", "divers", "diverse",
+ "diverses", "doit", "donc", "dont", "du", "duquel", "durant", "dès", "elle", "elles",
+ "en", "entre", "environ", "est", "et", "etc", "etre", "eu", "eux", "excepté", "hormis",
+ "hors", "hélas", "hui", "il", "ils", "j", "je", "jusqu", "jusque", "l", "la", "laquelle",
+ "le", "lequel", "les", "lesquelles", "lesquels", "leur", "leurs", "lorsque", "lui", "là ",
+ "ma", "mais", "malgré", "me", "merci", "mes", "mien", "mienne", "miennes", "miens", "moi",
+ "moins", "mon", "moyennant", "même", "mêmes", "n", "ne", "ni", "non", "nos", "notre",
+ "nous", "néanmoins", "nôtre", "nôtres", "on", "ont", "ou", "outre", "où", "par", "parmi",
+ "partant", "pas", "passé", "pendant", "plein", "plus", "plusieurs", "pour", "pourquoi",
+ "proche", "près", "puisque", "qu", "quand", "que", "quel", "quelle", "quelles", "quels",
+ "qui", "quoi", "quoique", "revoici", "revoilà ", "s", "sa", "sans", "sauf", "se", "selon",
+ "seront", "ses", "si", "sien", "sienne", "siennes", "siens", "sinon", "soi", "soit",
+ "son", "sont", "sous", "suivant", "sur", "ta", "te", "tes", "tien", "tienne", "tiennes",
+ "tiens", "toi", "ton", "tous", "tout", "toute", "toutes", "tu", "un", "une", "va", "vers",
+ "voici", "voilà ", "vos", "votre", "vous", "vu", "vôtre", "vôtres", "y", "à ", "ça", "ès",
+ "été", "être", "ô"
+ };
+
+ /// <summary>
+ /// Contains the stopwords used with the StopFilter.
+ /// </summary>
+ private Hashtable stoptable = new Hashtable();
+
+ /// <summary>
+ /// Contains words that should be indexed but not stemmed.
+ /// </summary>
+ private Hashtable excltable = new Hashtable();
+
+ /// <summary>
+ /// Builds an analyzer.
+ /// </summary>
+ public FrenchAnalyzer()
+ {
+ stoptable = StopFilter.MakeStopSet( FRENCH_STOP_WORDS );
+ }
+
+ /// <summary>
+ /// Builds an analyzer with the given stop words.
+ /// </summary>
+ public FrenchAnalyzer( String[] stopwords )
+ {
+ stoptable = StopFilter.MakeStopSet( stopwords );
+ }
+
+ /// <summary>
+ /// Builds an analyzer with the given stop words.
+ /// </summary>
+ public FrenchAnalyzer( Hashtable stopwords )
+ {
+ stoptable = stopwords;
+ }
+
+ /// <summary>
+ /// Builds an analyzer with the given stop words.
+ /// </summary>
+ public FrenchAnalyzer( FileInfo stopwords )
+ {
+ stoptable = WordlistLoader.GetWordtable( stopwords );
+ }
+
+ /// <summary>
+ /// Builds an exclusionlist from an array of Strings.
+ /// </summary>
+ public void SetStemExclusionTable( String[] exclusionlist )
+ {
+ excltable = StopFilter.MakeStopSet( exclusionlist );
+ }
+
+ /// <summary>
+ /// Builds an exclusionlist from a Hashtable.
+ /// </summary>
+ public void SetStemExclusionTable( Hashtable exclusionlist )
+ {
+ excltable = exclusionlist;
+ }
+
+ /// <summary>
+ /// Builds an exclusionlist from the words contained in the given file.
+ /// </summary>
+ public void SetStemExclusionTable( FileInfo exclusionlist )
+ {
+ excltable = WordlistLoader.GetWordtable( exclusionlist );
+ }
+
+ /// <summary>
+ /// Creates a TokenStream which tokenizes all the text in the provided Reader.
+ /// </summary>
+ /// <returns>
+ /// A TokenStream build from a StandardTokenizer filtered with
+ /// StandardFilter, StopFilter, FrenchStemFilter and LowerCaseFilter
+ /// </returns>
+ public override TokenStream TokenStream( String fieldName, TextReader reader )
+ {
+
+ if (fieldName==null) throw new ArgumentException("fieldName must not be null");
+ if (reader==null) throw new ArgumentException("readermust not be null");
+
+ TokenStream result = new StandardTokenizer( reader );
+ result = new StandardFilter( result );
+ result = new StopFilter( result, stoptable );
+ result = new FrenchStemFilter( result, excltable );
+ // Convert to lowercase after stemming!
+ result = new LowerCaseFilter( result );
+ return result;
+ }
+ }
+
+}
Added: incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Fr/FrenchStemFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Analyzers/Lucene.Net.Analyzers/Fr/FrenchStemFilter.cs?rev=1083372&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Fr/FrenchStemFilter.cs (added)
+++ incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Fr/FrenchStemFilter.cs Sun Mar 20 07:30:37 2011
@@ -0,0 +1,143 @@
+using System;
+using System.IO;
+using System.Text;
+using System.Collections;
+
+using Lucene.Net.Analysis;
+
+namespace Lucene.Net.Analysis.Fr
+{
+ /* ====================================================================
+ * The Apache Software License, Version 1.1
+ *
+ * Copyright (c) 2004 The Apache Software Foundation. All rights
+ * reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. The end-user documentation included with the redistribution,
+ * if any, must include the following acknowledgment:
+ * "This product includes software developed by the
+ * Apache Software Foundation (http://www.apache.org/)."
+ * Alternately, this acknowledgment may appear in the software itself,
+ * if and wherever such third-party acknowledgments normally appear.
+ *
+ * 4. The names "Apache" and "Apache Software Foundation" and
+ * "Apache Lucene" must not be used to endorse or promote products
+ * derived from this software without prior written permission. For
+ * written permission, please contact apache@apache.org.
+ *
+ * 5. Products derived from this software may not be called "Apache",
+ * "Apache Lucene", nor may "Apache" appear in their name, without
+ * prior written permission of the Apache Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation. For more
+ * information on the Apache Software Foundation, please see
+ * <http://www.apache.org/>.
+ */
+
+ /// <summary>
    /// A filter that stems French words. It supports a table of words that should
    /// not be stemmed at all. The stemmer used can be changed at runtime after the
    /// filter object is created (as long as it is a FrenchStemmer).
    ///
    /// <author>Patrick Talbot (based on Gerhard Schwarz's work for German)</author>
    /// <version>$Id: FrenchStemFilter.java,v 1.2 2004/01/23 20:54:47 ehatcher Exp $</version>
+ /// </summary>
+ public sealed class FrenchStemFilter : TokenFilter
+ {
+
+ /// <summary>
+ /// The actual token in the input stream.
+ /// </summary>
+ private Token token = null;
+ private FrenchStemmer stemmer = null;
+ private Hashtable exclusions = null;
+
+ public FrenchStemFilter( TokenStream _in ) : base(_in)
+ {
+ stemmer = new FrenchStemmer();
+ }
+
+ /// <summary>
+ /// Builds a FrenchStemFilter that uses an exclusiontable.
+ /// </summary>
+ public FrenchStemFilter( TokenStream _in, Hashtable exclusiontable ) : this( _in )
+ {
+ exclusions = exclusiontable;
+ }
+
+ /// <summary>
+ /// Returns the next token in the stream, or null at EOS
+ /// </summary>
+ /// <returns>
+ /// Returns the next token in the stream, or null at EOS
+ /// </returns>
+ public override Token Next()
+ {
+ if ( ( token = input.Next() ) == null )
+ {
+ return null;
+ }
+ // Check the exclusiontable
+ else if ( exclusions != null && exclusions.Contains( token.TermText() ) )
+ {
+ return token;
+ }
+ else
+ {
+ String s = stemmer.Stem( token.TermText() );
+ // If not stemmed, dont waste the time creating a new token
+ if ( !s.Equals( token.TermText() ) )
+ {
+ return new Token( s, 0, s.Length, token.Type() );
+ }
+ return token;
+ }
+ }
+
+ /// <summary>
+ /// Set a alternative/custom FrenchStemmer for this filter.
+ /// </summary>
+ public void SetStemmer( FrenchStemmer stemmer )
+ {
+ if ( stemmer != null )
+ {
+ this.stemmer = stemmer;
+ }
+ }
+
+ /// <summary>
+ /// Set an alternative exclusion list for this filter.
+ /// </summary>
+ public void SetExclusionTable( Hashtable exclusiontable )
+ {
+ exclusions = exclusiontable;
+ }
+ }
+}