You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by cc...@apache.org on 2011/11/21 05:44:59 UTC
[Lucene.Net] svn commit: r1204353 [4/9] - in
/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src:
contrib/Analyzers/ contrib/Analyzers/AR/ contrib/Analyzers/BR/
contrib/Analyzers/CJK/ contrib/Analyzers/Cn/ contrib/Analyzers/Compound/
contrib/Analyzers/Compoun...
Modified: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/De/GermanAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/De/GermanAnalyzer.cs?rev=1204353&r1=1204352&r2=1204353&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/De/GermanAnalyzer.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/De/GermanAnalyzer.cs Mon Nov 21 04:44:55 2011
@@ -20,26 +20,31 @@
*/
using System;
+using System.Collections.Generic;
using System.IO;
using System.Collections;
+using System.Linq;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Analysis;
+using Version = Lucene.Net.Util.Version;
namespace Lucene.Net.Analysis.De
{
- /// <summary>
- /// Analyzer for German language. Supports an external list of stopwords (words that
- /// will not be indexed at all) and an external list of exclusions (word that will
- /// not be stemmed, but indexed).
- /// A default set of stopwords is used unless an alternative list is specified, the
- /// exclusion list is empty by default.
- /// </summary>
- public class GermanAnalyzer : Analyzer
- {
- /// <summary>
- /// List of typical german stopwords.
- /// </summary>
- private String[] GERMAN_STOP_WORDS =
+ /// <summary>
+ /// Analyzer for German language. Supports an external list of stopwords (words that
+ /// will not be indexed at all) and an external list of exclusions (word that will
+ /// not be stemmed, but indexed).
+ /// A default set of stopwords is used unless an alternative list is specified, the
+ /// exclusion list is empty by default.
+ /// </summary>
+ public class GermanAnalyzer : Analyzer
+ {
+ /// <summary>
+ /// List of typical german stopwords.
+ /// </summary>
+ [Obsolete("Use GetDefaultStopSet() instead")]
+ //TODO: make this private in 3.1
+ private static readonly String[] GERMAN_STOP_WORDS =
{
"einer", "eine", "eines", "einem", "einen",
"der", "die", "das", "dass", "daß",
@@ -55,92 +60,150 @@ namespace Lucene.Net.Analysis.De
"durch", "wegen"
};
- /// <summary>
- /// Contains the stopwords used with the StopFilter.
- /// </summary>
- private Hashtable stoptable = new Hashtable();
-
- /// <summary>
- /// Contains words that should be indexed but not stemmed.
- /// </summary>
- private Hashtable excltable = new Hashtable();
-
- /// <summary>
- /// Builds an analyzer.
- /// </summary>
- public GermanAnalyzer()
- {
- stoptable = StopFilter.MakeStopSet( GERMAN_STOP_WORDS );
- }
-
- /// <summary>
- /// Builds an analyzer with the given stop words.
- /// </summary>
- /// <param name="stopwords"></param>
- public GermanAnalyzer( String[] stopwords )
- {
- stoptable = StopFilter.MakeStopSet( stopwords );
- }
-
- /// <summary>
- /// Builds an analyzer with the given stop words.
- /// </summary>
- /// <param name="stopwords"></param>
- public GermanAnalyzer( Hashtable stopwords )
- {
- stoptable = stopwords;
- }
-
- /// <summary>
- /// Builds an analyzer with the given stop words.
- /// </summary>
- /// <param name="stopwords"></param>
- public GermanAnalyzer( FileInfo stopwords )
- {
- stoptable = WordlistLoader.GetWordtable( stopwords );
- }
-
- /// <summary>
- /// Builds an exclusionlist from an array of Strings.
- /// </summary>
- /// <param name="exclusionlist"></param>
- public void SetStemExclusionTable( String[] exclusionlist )
- {
- excltable = StopFilter.MakeStopSet( exclusionlist );
- }
-
- /// <summary>
- /// Builds an exclusionlist from a Hashtable.
- /// </summary>
- /// <param name="exclusionlist"></param>
- public void SetStemExclusionTable( Hashtable exclusionlist )
- {
- excltable = exclusionlist;
- }
-
- /// <summary>
- /// Builds an exclusionlist from the words contained in the given file.
- /// </summary>
- /// <param name="exclusionlist"></param>
- public void SetStemExclusionTable(FileInfo exclusionlist)
- {
- excltable = WordlistLoader.GetWordtable(exclusionlist);
- }
-
- /// <summary>
- /// Creates a TokenStream which tokenizes all the text in the provided TextReader.
- /// </summary>
- /// <param name="fieldName"></param>
- /// <param name="reader"></param>
- /// <returns>A TokenStream build from a StandardTokenizer filtered with StandardFilter, StopFilter, GermanStemFilter</returns>
- public override TokenStream TokenStream(String fieldName, TextReader reader)
- {
- TokenStream result = new StandardTokenizer( reader );
- result = new StandardFilter( result );
- result = new LowerCaseFilter(result);
- result = new StopFilter( result, stoptable );
- result = new GermanStemFilter( result, excltable );
- return result;
- }
- }
+ /// <summary>
+ /// Returns a set of default German-stopwords
+ /// </summary>
+ public static ISet<string> GetDefaultStopSet()
+ {
+ return DefaultSetHolder.DEFAULT_SET;
+ }
+
+ private static class DefaultSetHolder
+ {
+ internal static readonly ISet<string> DEFAULT_SET = CharArraySet.UnmodifiableSet(new CharArraySet(
+ GERMAN_STOP_WORDS,
+ false));
+ }
+
+ /// <summary>
+ /// Contains the stopwords used with the StopFilter.
+ /// </summary>
+ //TODO: make this readonly in 3.1
+ private ISet<string> stopSet;
+
+ /// <summary>
+ /// Contains words that should be indexed but not stemmed.
+ /// </summary>
+ //TODO: make this readonly in 3.1
+ private ISet<string> exclusionSet;
+
+ private Version matchVersion;
+
+ /// <summary>
+ /// Builds an analyzer with the default stop words:
+ /// <see cref="GetDefaultStopSet"/>
+ /// </summary>
+ [Obsolete("Use GermanAnalyzer(Version) instead")]
+ public GermanAnalyzer()
+ : this(Version.LUCENE_23)
+ {
+ }
+
+ /// <summary>
+ /// Builds an analyzer with the default stop words:
+ /// <see cref="GetDefaultStopSet"/>
+ /// </summary>
+ public GermanAnalyzer(Version matchVersion)
+ : this(matchVersion, DefaultSetHolder.DEFAULT_SET)
+ { }
+
+ /// <summary>
+ /// Builds an analyzer with the given stop words.
+ /// </summary>
+ /// <param name="matchVersion">Lucene compatibility version</param>
+ /// <param name="stopwords">a stopword set</param>
+ public GermanAnalyzer(Version matchVersion, ISet<string> stopwords)
+ : this(matchVersion, stopwords, CharArraySet.EMPTY_SET)
+ {
+ }
+
+ /// <summary>
+ /// Builds an analyzer with the given stop words
+ /// </summary>
+ /// <param name="matchVersion">lucene compatibility version</param>
+ /// <param name="stopwords">a stopword set</param>
+ /// <param name="stemExclusionSet">a stemming exclusion set</param>
+ public GermanAnalyzer(Version matchVersion, ISet<string> stopwords, ISet<string> stemExclusionSet)
+ {
+ stopSet = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stopwords));
+ exclusionSet = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stemExclusionSet));
+ SetOverridesTokenStreamMethod(typeof(GermanAnalyzer));
+ this.matchVersion = matchVersion;
+ }
+
+ /// <summary>
+ /// Builds an analyzer with the given stop words.
+ /// </summary>
+ /// <param name="stopwords"></param>
+ [Obsolete("use GermanAnalyzer(Version, Set) instead")]
+ public GermanAnalyzer(Version matchVersion, params string[] stopwords)
+ : this(matchVersion, StopFilter.MakeStopSet(stopwords))
+ {
+ }
+
+ /// <summary>
+ /// Builds an analyzer with the given stop words.
+ /// </summary>
+ [Obsolete("Use GermanAnalyzer(Version, ISet)")]
+ public GermanAnalyzer(Version matchVersion, IDictionary<string, string> stopwords)
+ : this(matchVersion, stopwords.Keys.ToArray())
+ {
+
+ }
+
+ /// <summary>
+ /// Builds an analyzer with the given stop words.
+ /// </summary>
+ [Obsolete("Use GermanAnalyzer(Version, ISet)")]
+ public GermanAnalyzer(Version matchVersion, FileInfo stopwords)
+ : this(matchVersion, WordlistLoader.GetWordSet(stopwords))
+ {
+ }
+
+ /// <summary>
+ /// Builds an exclusionlist from an array of Strings.
+ /// </summary>
+ [Obsolete("Use GermanAnalyzer(Version, ISet, ISet) instead")]
+ public void SetStemExclusionTable(String[] exclusionlist)
+ {
+ exclusionSet = StopFilter.MakeStopSet(exclusionlist);
+ SetPreviousTokenStream(null);
+ }
+
+ /// <summary>
+ /// Builds an exclusionlist from a IDictionary.
+ /// </summary>
+ [Obsolete("Use GermanAnalyzer(Version, ISet, ISet) instead")]
+ public void SetStemExclusionTable(IDictionary<string, string> exclusionlist)
+ {
+ exclusionSet = new HashSet<string>(exclusionlist.Keys);
+ SetPreviousTokenStream(null);
+ }
+
+ /// <summary>
+ /// Builds an exclusionlist from the words contained in the given file.
+ /// </summary>
+ [Obsolete("Use GermanAnalyzer(Version, ISet, ISet) instead")]
+ public void SetStemExclusionTable(FileInfo exclusionlist)
+ {
+ exclusionSet = WordlistLoader.GetWordSet(exclusionlist);
+ SetPreviousTokenStream(null);
+ }
+
+ /// <summary>
+ /// Creates a TokenStream which tokenizes all the text in the provided TextReader.
+ /// </summary>
+ /// <param name="fieldName"></param>
+ /// <param name="reader"></param>
+ /// <returns>A TokenStream build from a StandardTokenizer filtered with StandardFilter, StopFilter, GermanStemFilter</returns>
+ public override TokenStream TokenStream(String fieldName, TextReader reader)
+ {
+ TokenStream result = new StandardTokenizer(matchVersion, reader);
+ result = new StandardFilter(result);
+ result = new LowerCaseFilter(result);
+ result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion), result, stopSet);
+ result = new GermanStemFilter(result, exclusionSet);
+ return result;
+ }
+ }
}
\ No newline at end of file
Modified: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/De/GermanStemFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/De/GermanStemFilter.cs?rev=1204353&r1=1204352&r2=1204353&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/De/GermanStemFilter.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/De/GermanStemFilter.cs Mon Nov 21 04:44:55 2011
@@ -20,87 +20,89 @@
*/
using System;
+using System.Collections.Generic;
using System.IO;
using System.Collections;
+using Lucene.Net.Analysis.Tokenattributes;
namespace Lucene.Net.Analysis.De
{
- /// <summary>
- /// A filter that stems German words. It supports a table of words that should
- /// not be stemmed at all. The stemmer used can be changed at runtime after the
- /// filter object is created (as long as it is a GermanStemmer).
- /// </summary>
- public sealed class GermanStemFilter : TokenFilter
- {
- /// <summary>
- /// The actual token in the input stream.
- /// </summary>
- private Token token = null;
- private GermanStemmer stemmer = null;
- private Hashtable exclusions = null;
-
- public GermanStemFilter( TokenStream _in ) : base(_in)
- {
- stemmer = new GermanStemmer();
- }
-
- /// <summary>
- /// Builds a GermanStemFilter that uses an exclusiontable.
- /// </summary>
- /// <param name="_in"></param>
- /// <param name="exclusiontable"></param>
- public GermanStemFilter( TokenStream _in, Hashtable exclusiontable ): this(_in)
- {
- exclusions = exclusiontable;
- }
-
- /// <summary>
- /// </summary>
- /// <returns>Returns the next token in the stream, or null at EOS</returns>
- public override Token Next()
-
- {
- if ( ( token = input.Next() ) == null )
- {
- return null;
- }
- // Check the exclusiontable
- else if ( exclusions != null && exclusions.Contains( token.TermText() ) )
- {
- return token;
- }
- else
- {
- String s = stemmer.Stem( token.TermText() );
- // If not stemmed, dont waste the time creating a new token
- if ( !s.Equals( token.TermText() ) )
- {
- return new Token( s, token.StartOffset(),
- token.EndOffset(), token.Type() );
- }
- return token;
- }
- }
-
- /// <summary>
- /// Set a alternative/custom GermanStemmer for this filter.
- /// </summary>
- /// <param name="stemmer"></param>
- public void SetStemmer( GermanStemmer stemmer )
- {
- if ( stemmer != null )
- {
- this.stemmer = stemmer;
- }
- }
-
- /// <summary>
- /// Set an alternative exclusion list for this filter.
- /// </summary>
- /// <param name="exclusiontable"></param>
- public void SetExclusionTable( Hashtable exclusiontable )
- {
- exclusions = exclusiontable;
- }
- }
+ /// <summary>
+ /// A filter that stems German words. It supports a table of words that should
+ /// not be stemmed at all. The stemmer used can be changed at runtime after the
+ /// filter object is created (as long as it is a GermanStemmer).
+ /// </summary>
+ public sealed class GermanStemFilter : TokenFilter
+ {
+ /// <summary>
+ /// The actual token in the input stream.
+ /// </summary>
+ private GermanStemmer stemmer = null;
+ private ISet<string> exclusionSet = null;
+
+ private TermAttribute termAtt;
+
+ public GermanStemFilter(TokenStream _in)
+ : base(_in)
+ {
+ stemmer = new GermanStemmer();
+ termAtt = AddAttribute<TermAttribute>();
+ }
+
+ /// <summary>
+ /// Builds a GermanStemFilter that uses an exclusiontable.
+ /// </summary>
+ /// <param name="_in"></param>
+ /// <param name="exclusiontable"></param>
+ public GermanStemFilter(TokenStream _in, ISet<string> exclusiontable)
+ : this(_in)
+ {
+ exclusionSet = exclusiontable;
+ }
+
+ /// <returns>
+ /// Returns true for next token in the stream, or false at EOS
+ /// </returns>
+ public override bool IncrementToken()
+ {
+ if (input.IncrementToken())
+ {
+ String term = termAtt.Term();
+ // Check the exclusion table.
+ if (exclusionSet == null || !exclusionSet.Contains(term))
+ {
+ String s = stemmer.Stem(term);
+ // If not stemmed, don't waste the time adjusting the token.
+ if ((s != null) && !s.Equals(term))
+ termAtt.SetTermBuffer(s);
+ }
+ return true;
+ }
+ else
+ {
+ return false;
+ }
+ }
+
+ /// <summary>
+ /// Set a alternative/custom GermanStemmer for this filter.
+ /// </summary>
+ /// <param name="stemmer"></param>
+ public void SetStemmer(GermanStemmer stemmer)
+ {
+ if (stemmer != null)
+ {
+ this.stemmer = stemmer;
+ }
+ }
+
+ /// <summary>
+ /// Set an alternative exclusion list for this filter.
+ /// </summary>
+ /// <param name="exclusiontable"></param>
+ public void SetExclusionTable(ISet<string> exclusiontable)
+ {
+ exclusionSet = exclusiontable;
+ }
+ }
}
\ No newline at end of file
Added: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/El/GreekAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/El/GreekAnalyzer.cs?rev=1204353&view=auto
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/El/GreekAnalyzer.cs (added)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/El/GreekAnalyzer.cs Mon Nov 21 04:44:55 2011
@@ -0,0 +1,155 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Text;
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Standard;
+using Version = Lucene.Net.Util.Version;
+
+namespace Lucene.Net.Analyzers.El
+{
+ /**
+ * {@link Analyzer} for the Greek language.
+ * <p>
+ * Supports an external list of stopwords (words
+ * that will not be indexed at all).
+ * A default set of stopwords is used unless an alternative list is specified.
+ * </p>
+ *
+ * <p><b>NOTE</b>: This class uses the same {@link Version}
+ * dependent settings as {@link StandardAnalyzer}.</p>
+ */
+ public sealed class GreekAnalyzer : Analyzer
+ {
+ /**
+ * List of typical Greek stopwords.
+ */
+
+        private static readonly String[] GREEK_STOP_WORDS = {
+            "ο", "η", "το", "οι", "τα", "του", "τησ", "των", "τον", "την", "και",
+            "κι", "κ", "ειμαι", "εισαι", "ειναι", "ειμαστε", "ειστε", "στο", "στον",
+            "στη", "στην", "μα", "αλλα", "απο", "για", "προσ", "με", "σε", "ωσ",
+            "παρα", "αντι", "κατα", "μετα", "θα", "να", "δε", "δεν", "μη", "μην",
+            "επι", "ενω", "εαν", "αν", "τοτε", "που", "πωσ", "ποιοσ", "ποια", "ποιο",
+            "ποιοι", "ποιεσ", "ποιων", "ποιουσ", "αυτοσ", "αυτη", "αυτο", "αυτοι",
+            "αυτων", "αυτουσ", "αυτεσ", "αυτα", "εκεινοσ", "εκεινη", "εκεινο",
+            "εκεινοι", "εκεινεσ", "εκεινα", "εκεινων", "εκεινουσ", "οπωσ", "ομωσ",
+            "ισωσ", "οσο", "οτι"
+        };
+
+ /**
+ * Returns a set of default Greek-stopwords
+ * @return a set of default Greek-stopwords
+ */
+ public static ISet<string> GetDefaultStopSet()
+ {
+ return DefaultSetHolder.DEFAULT_SET;
+ }
+
+ private static class DefaultSetHolder
+ {
+ internal static ISet<string> DEFAULT_SET = CharArraySet.UnmodifiableSet(new CharArraySet(GREEK_STOP_WORDS, false));
+ }
+
+ /**
+ * Contains the stopwords used with the {@link StopFilter}.
+ */
+ private readonly ISet<string> stopSet;
+
+ private readonly Version matchVersion;
+
+ public GreekAnalyzer(Version matchVersion)
+ : this(matchVersion, DefaultSetHolder.DEFAULT_SET)
+ {
+ }
+
+ /**
+ * Builds an analyzer with the given stop words
+ *
+ * @param matchVersion
+ * lucene compatibility version
+ * @param stopwords
+ * a stopword set
+ */
+ public GreekAnalyzer(Version matchVersion, ISet<string> stopwords)
+ {
+ stopSet = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stopwords));
+ this.matchVersion = matchVersion;
+ }
+
+ /**
+ * Builds an analyzer with the given stop words.
+ * @param stopwords Array of stopwords to use.
+ * @deprecated use {@link #GreekAnalyzer(Version, Set)} instead
+ */
+ public GreekAnalyzer(Version matchVersion, params string[] stopwords)
+ : this(matchVersion, StopFilter.MakeStopSet(stopwords))
+ {
+ }
+
+ /**
+ * Builds an analyzer with the given stop words.
+ * @deprecated use {@link #GreekAnalyzer(Version, Set)} instead
+ */
+ public GreekAnalyzer(Version matchVersion, IDictionary<string, string> stopwords)
+ : this(matchVersion, stopwords.Keys.ToArray())
+ {
+ }
+
+ /**
+ * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
+ *
+ * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
+ * {@link GreekLowerCaseFilter} and {@link StopFilter}
+ */
+ public override TokenStream TokenStream(String fieldName, TextReader reader)
+ {
+ TokenStream result = new StandardTokenizer(matchVersion, reader);
+ result = new GreekLowerCaseFilter(result);
+ result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
+ result, stopSet);
+ return result;
+ }
+
+ private class SavedStreams
+ {
+ protected internal Tokenizer source;
+ protected internal TokenStream result;
+ };
+
+ /**
+ * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
+ * in the provided {@link Reader}.
+ *
+ * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
+ * {@link GreekLowerCaseFilter} and {@link StopFilter}
+ */
+ public override TokenStream ReusableTokenStream(String fieldName, TextReader reader)
+ {
+ SavedStreams streams = (SavedStreams)GetPreviousTokenStream();
+ if (streams == null)
+ {
+ streams = new SavedStreams();
+ streams.source = new StandardTokenizer(matchVersion, reader);
+ streams.result = new GreekLowerCaseFilter(streams.source);
+ streams.result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
+ streams.result, stopSet);
+ SetPreviousTokenStream(streams);
+ }
+ else
+ {
+ streams.source.Reset(reader);
+ }
+ return streams.result;
+ }
+ }
+}
Added: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/El/GreekLowerCaseFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/El/GreekLowerCaseFilter.cs?rev=1204353&view=auto
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/El/GreekLowerCaseFilter.cs (added)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/El/GreekLowerCaseFilter.cs Mon Nov 21 04:44:55 2011
@@ -0,0 +1,107 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+
+namespace Lucene.Net.Analyzers.El
+{
+ /**
+ * Normalizes token text to lower case, removes some Greek diacritics,
+ * and standardizes final sigma to sigma.
+ *
+ */
+ public sealed class GreekLowerCaseFilter : TokenFilter
+ {
+ private TermAttribute termAtt;
+
+ public GreekLowerCaseFilter(TokenStream _in)
+ : base(_in)
+ {
+ termAtt = AddAttribute<TermAttribute>();
+ }
+
+ public override bool IncrementToken()
+ {
+ if (input.IncrementToken())
+ {
+ char[] chArray = termAtt.TermBuffer();
+ int chLen = termAtt.TermLength();
+ // TODO: iterate codepoints to support supp. characters
+ for (int i = 0; i < chLen; i++)
+ {
+ chArray[i] = (char)lowerCase(chArray[i]);
+ }
+ return true;
+ }
+ else
+ {
+ return false;
+ }
+ }
+
+ private int lowerCase(int codepoint)
+ {
+ switch (codepoint)
+ {
+ /* There are two lowercase forms of sigma:
+ * U+03C2: small final sigma (end of word)
+ * U+03C3: small sigma (otherwise)
+ *
+ * Standardize both to U+03C3
+ */
+ case '\u03C2': /* small final sigma */
+ return '\u03C3'; /* small sigma */
+
+ /* Some greek characters contain diacritics.
+ * This filter removes these, converting to the lowercase base form.
+ */
+
+ case '\u0386': /* capital alpha with tonos */
+ case '\u03AC': /* small alpha with tonos */
+ return '\u03B1'; /* small alpha */
+
+ case '\u0388': /* capital epsilon with tonos */
+ case '\u03AD': /* small epsilon with tonos */
+ return '\u03B5'; /* small epsilon */
+
+ case '\u0389': /* capital eta with tonos */
+ case '\u03AE': /* small eta with tonos */
+ return '\u03B7'; /* small eta */
+
+ case '\u038A': /* capital iota with tonos */
+ case '\u03AA': /* capital iota with dialytika */
+ case '\u03AF': /* small iota with tonos */
+ case '\u03CA': /* small iota with dialytika */
+ case '\u0390': /* small iota with dialytika and tonos */
+ return '\u03B9'; /* small iota */
+
+ case '\u038E': /* capital upsilon with tonos */
+ case '\u03AB': /* capital upsilon with dialytika */
+ case '\u03CD': /* small upsilon with tonos */
+ case '\u03CB': /* small upsilon with dialytika */
+ case '\u03B0': /* small upsilon with dialytika and tonos */
+ return '\u03C5'; /* small upsilon */
+
+ case '\u038C': /* capital omicron with tonos */
+ case '\u03CC': /* small omicron with tonos */
+ return '\u03BF'; /* small omicron */
+
+ case '\u038F': /* capital omega with tonos */
+ case '\u03CE': /* small omega with tonos */
+ return '\u03C9'; /* small omega */
+
+ /* The previous implementation did the conversion below.
+ * Only implemented for backwards compatibility with old indexes.
+ */
+
+ case '\u03A2': /* reserved */
+ return '\u03C2'; /* small final sigma */
+
+ default:
+ return char.ToLower((char)codepoint);
+ }
+ }
+ }
+}
Added: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fa/PersianAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fa/PersianAnalyzer.cs?rev=1204353&view=auto
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fa/PersianAnalyzer.cs (added)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fa/PersianAnalyzer.cs Mon Nov 21 04:44:55 2011
@@ -0,0 +1,215 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Text;
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.AR;
+using Version = Lucene.Net.Util.Version;
+
+namespace Lucene.Net.Analyzers.Fa
+{
+ /**
+ * {@link Analyzer} for Persian.
+ * <p>
+ * This Analyzer uses {@link ArabicLetterTokenizer} which implies tokenizing around
+ * zero-width non-joiner in addition to whitespace. Some persian-specific variant forms (such as farsi
+ * yeh and keheh) are standardized. "Stemming" is accomplished via stopwords.
+ * </p>
+ */
+ public sealed class PersianAnalyzer : Analyzer
+ {
+
+ /**
+ * File containing default Persian stopwords.
+ *
+ * Default stopword list is from
+ * http://members.unine.ch/jacques.savoy/clef/index.html The stopword list is
+ * BSD-Licensed.
+ *
+ */
+ public readonly static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
+
+ /**
+ * Contains the stopwords used with the StopFilter.
+ */
+ private readonly ISet<string> stoptable;
+
+ /**
+ * The comment character in the stopwords file. All lines prefixed with this
+ * will be ignored
+ */
+ public static readonly String STOPWORDS_COMMENT = "#";
+
+ /**
+ * Returns an unmodifiable instance of the default stop-words set.
+ * @return an unmodifiable instance of the default stop-words set.
+ */
+ public static ISet<string> getDefaultStopSet()
+ {
+ return DefaultSetHolder.DEFAULT_STOP_SET;
+ }
+
+ /**
+ * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
+ * accesses the static final set the first time.;
+ */
+ private static class DefaultSetHolder
+ {
+ internal static readonly ISet<string> DEFAULT_STOP_SET;
+
+ static DefaultSetHolder()
+ {
+ try
+ {
+ DEFAULT_STOP_SET = LoadDefaultStopWordSet();
+ }
+ catch (IOException ex)
+ {
+ // default set should always be present as it is part of the
+ // distribution (JAR)
+ throw new Exception("Unable to load default stopword set");
+ }
+ }
+
+ static ISet<String> LoadDefaultStopWordSet()
+ {
+
+ var stream = System.Reflection.Assembly.GetAssembly(typeof(PersianAnalyzer)).GetManifestResourceStream("Lucene.Net.Analyzers.Fa." + DEFAULT_STOPWORD_FILE);
+ try
+ {
+ StreamReader reader = new StreamReader(stream, System.Text.Encoding.UTF8);
+ // make sure it is unmodifiable as we expose it in the outer class
+ return CharArraySet.UnmodifiableSet(new CharArraySet(WordlistLoader.GetWordSet(reader, STOPWORDS_COMMENT), true));
+ }
+ finally
+ {
+ stream.Close();
+ }
+ }
+ }
+
+ private readonly Version matchVersion;
+
+ /**
+ * Builds an analyzer with the default stop words:
+ * {@link #DEFAULT_STOPWORD_FILE}.
+ */
+ public PersianAnalyzer(Version matchVersion)
+ : this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET)
+ {
+
+ }
+
+ /**
+ * Builds an analyzer with the given stop words
+ *
+ * @param matchVersion
+ * lucene compatibility version
+ * @param stopwords
+ * a stopword set
+ */
+ public PersianAnalyzer(Version matchVersion, ISet<string> stopwords)
+ {
+ stoptable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stopwords));
+ this.matchVersion = matchVersion;
+ }
+
+ /**
+ * Builds an analyzer with the given stop words.
+ * @deprecated use {@link #PersianAnalyzer(Version, Set)} instead
+ */
+ public PersianAnalyzer(Version matchVersion, params string[] stopwords)
+ : this(matchVersion, StopFilter.MakeStopSet(stopwords))
+ {
+
+ }
+
+ /**
+ * Builds an analyzer with the given stop words.
+ * @deprecated use {@link #PersianAnalyzer(Version, Set)} instead
+ */
+ public PersianAnalyzer(Version matchVersion, IDictionary<string, string> stopwords)
+ : this(matchVersion, stopwords.Keys.ToArray())
+ {
+
+ }
+
+ /**
+ * Builds an analyzer with the given stop words. Lines can be commented out
+ * using {@link #STOPWORDS_COMMENT}
+ * @deprecated use {@link #PersianAnalyzer(Version, Set)} instead
+ */
+ public PersianAnalyzer(Version matchVersion, FileInfo stopwords)
+ : this(matchVersion, WordlistLoader.GetWordSet(stopwords, STOPWORDS_COMMENT))
+ {
+
+ }
+
+ /**
+ * Creates a {@link TokenStream} which tokenizes all the text in the provided
+ * {@link Reader}.
+ *
+ * @return A {@link TokenStream} built from a {@link ArabicLetterTokenizer}
+ * filtered with {@link LowerCaseFilter},
+ * {@link ArabicNormalizationFilter},
+ * {@link PersianNormalizationFilter} and Persian Stop words
+ */
+ public override TokenStream TokenStream(String fieldName, TextReader reader)
+ {
+ TokenStream result = new ArabicLetterTokenizer(reader);
+ result = new LowerCaseFilter(result);
+ result = new ArabicNormalizationFilter(result);
+ /* additional persian-specific normalization */
+ result = new PersianNormalizationFilter(result);
+ /*
+ * the order here is important: the stopword list is normalized with the
+ * above!
+ */
+ result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
+ result, stoptable);
+ return result;
+ }
+
+ private class SavedStreams
+ {
+ protected internal Tokenizer source;
+ protected internal TokenStream result;
+ }
+
+ /**
+ * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
+ * in the provided {@link Reader}.
+ *
+ * @return A {@link TokenStream} built from a {@link ArabicLetterTokenizer}
+ * filtered with {@link LowerCaseFilter},
+ * {@link ArabicNormalizationFilter},
+ * {@link PersianNormalizationFilter} and Persian Stop words
+ */
+ public override TokenStream ReusableTokenStream(String fieldName, TextReader reader)
+ {
+ SavedStreams streams = (SavedStreams)GetPreviousTokenStream();
+ if (streams == null)
+ {
+ streams = new SavedStreams();
+ streams.source = new ArabicLetterTokenizer(reader);
+ streams.result = new LowerCaseFilter(streams.source);
+ streams.result = new ArabicNormalizationFilter(streams.result);
+ /* additional persian-specific normalization */
+ streams.result = new PersianNormalizationFilter(streams.result);
+ /*
+ * the order here is important: the stopword list is normalized with the
+ * above!
+ */
+ streams.result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
+ streams.result, stoptable);
+ SetPreviousTokenStream(streams);
+ }
+ else
+ {
+ streams.source.Reset(reader);
+ }
+ return streams.result;
+ }
+ }
+}
\ No newline at end of file
Added: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fa/PersianNormalizationFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fa/PersianNormalizationFilter.cs?rev=1204353&view=auto
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fa/PersianNormalizationFilter.cs (added)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fa/PersianNormalizationFilter.cs Mon Nov 21 04:44:55 2011
@@ -0,0 +1,38 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+
+namespace Lucene.Net.Analyzers.Fa
+{
+ /**
+ * A {@link TokenFilter} that applies {@link PersianNormalizer} to normalize the
+ * orthography.
+ *
+ */
+
+public sealed class PersianNormalizationFilter : TokenFilter {
+
+ private readonly PersianNormalizer normalizer;
+ private readonly TermAttribute termAtt;
+
+ public PersianNormalizationFilter(TokenStream input)
+ :base(input)
+ {
+ normalizer = new PersianNormalizer();
+ termAtt = AddAttribute<TermAttribute>();
+ }
+
+ public override bool IncrementToken()
+{
+ if (input.IncrementToken()) {
+ int newlen = normalizer.Normalize(termAtt.TermBuffer(), termAtt.TermLength());
+ termAtt.SetTermLength(newlen);
+ return true;
+ }
+ return false;
+ }
+}
+}
Added: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fa/PersianNormalizer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fa/PersianNormalizer.cs?rev=1204353&view=auto
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fa/PersianNormalizer.cs (added)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fa/PersianNormalizer.cs Mon Nov 21 04:44:55 2011
@@ -0,0 +1,90 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Analyzers.Fa
+{
+/**
+ * Normalizer for Persian.
+ * <p>
+ * Normalization is done in-place for efficiency, operating on a termbuffer.
+ * <p>
+ * Normalization is defined as:
+ * <ul>
+ * <li>Normalization of various heh + hamza forms and heh goal to heh.
+ * <li>Normalization of farsi yeh and yeh barree to arabic yeh
+ * <li>Normalization of persian keheh to arabic kaf
+ * </ul>
+ *
+ */
+public class PersianNormalizer {
+ public const char YEH = '\u064A';
+
+ public const char FARSI_YEH = '\u06CC';
+
+ public const char YEH_BARREE = '\u06D2';
+
+ public const char KEHEH = '\u06A9';
+
+ public const char KAF = '\u0643';
+
+ public const char HAMZA_ABOVE = '\u0654';
+
+ public const char HEH_YEH = '\u06C0';
+
+ public const char HEH_GOAL = '\u06C1';
+
+ public const char HEH = '\u0647';
+
+ /**
+ * Normalize an input buffer of Persian text
+ *
+ * @param s input buffer
+ * @param len length of input buffer
+ * @return length of input buffer after normalization
+ */
+ public int Normalize(char[] s, int len) {
+
+ for (int i = 0; i < len; i++) {
+ switch (s[i]) {
+ case FARSI_YEH:
+ case YEH_BARREE:
+ s[i] = YEH;
+ break;
+ case KEHEH:
+ s[i] = KAF;
+ break;
+ case HEH_YEH:
+ case HEH_GOAL:
+ s[i] = HEH;
+ break;
+ case HAMZA_ABOVE: // necessary for HEH + HAMZA
+ len = Delete(s, i, len);
+ i--;
+ break;
+ default:
+ break;
+ }
+ }
+
+ return len;
+ }
+
+ /**
+ * Delete a character in-place
+ *
+ * @param s Input Buffer
+ * @param pos Position of character to delete
+ * @param len length of input buffer
+ * @return length of input buffer after deletion
+ */
+ protected int Delete(char[] s, int pos, int len) {
+ if (pos < len)
+ Array.Copy(s, pos + 1, s, pos, len - pos - 1);
+
+ return len - 1;
+ }
+
+}
+}
Added: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/FileDiffs.txt
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/FileDiffs.txt?rev=1204353&view=auto
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/FileDiffs.txt (added)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/FileDiffs.txt Mon Nov 21 04:44:55 2011
@@ -0,0 +1,155 @@
+TODO: make sure namespaces match Lucene's. (defaults to Analyzers instead of Analysis)
+TODO: Fix method naming
+TODO: Convert all javaDoc Comments to C# XML
+TODO: Normalize Line Endings
+TODO: Fix sb.ToString().Substring -> sb.ToString(int, int)
+TODO: RussianStemFilter.SetStemmer()
+
+analysis\ar\
+analysis\ar\ArabicAnalyzer.java - PORTED
+analysis\ar\ArabicLetterTokenizer.java - PORTED
+analysis\ar\ArabicNormalizationFilter.java - PORTED
+analysis\ar\ArabicNormalizer.java - PORTED
+analysis\ar\ArabicStemFilter.java - PORTED
+analysis\ar\ArabicStemmer.java - IDENTICAL
+analysis\ar\package.html - IDENTICAL
+
+analysis\br\
+analysis\br\BrazilianAnalyzer.java - PORTED
+analysis\br\BrazilianStemFilter.java - PORTED
+analysis\br\BrazilianStemmer.java - IDENTICAL
+analysis\br\package.html - IDENTICAL
+
+analysis\cjk\
+analysis\cjk\CJKAnalyzer.java - PORTED
+analysis\cjk\CJKTokenizer.java - PORTED
+analysis\cjk\package.html - IDENTICAL
+
+analysis\cn\
+analysis\cn\ChineseAnalyzer.java - PORTED
+analysis\cn\ChineseFilter.java - PORTED
+analysis\cn\ChineseTokenizer.java - PORTED
+analysis\cn\package.html - IDENTICAL
+
+analysis\compound\hyphenation\
+analysis\compound\hyphenation\ByteVector.java - ADDED
+analysis\compound\hyphenation\CharVector.java - ADDED
+analysis\compound\hyphenation\Hyphen.java - Text files are different
+analysis\compound\hyphenation\hyphenation.dtd - IDENTICAL
+analysis\compound\hyphenation\Hyphenation.java - Text files are different
+analysis\compound\hyphenation\HyphenationException.java - Text files are different
+analysis\compound\hyphenation\HyphenationTree.java - Text files are different
+analysis\compound\hyphenation\package.html - Text files are different
+analysis\compound\hyphenation\PatternConsumer.java - Text files are different
+analysis\compound\hyphenation\PatternParser.java - Text files are different
+analysis\compound\hyphenation\TernaryTree.java - Text files are different
+
+analysis\compound\
+analysis\compound\CompoundWordTokenFilterBase.java - PORTED
+analysis\compound\DictionaryCompoundWordTokenFilter.java - PORTED
+analysis\compound\HyphenationCompoundWordTokenFilter.java - Text files are different
+analysis\compound\package.html - IDENTICAL
+
+analysis\cz\
+analysis\cz\CzechAnalyzer.java - PORTED
+analysis\cz\package.html - IDENTICAL
+
+analysis\de\
+analysis\de\GermanAnalyzer.java - PORTED
+analysis\de\GermanStemFilter.java - PORTED
+analysis\de\GermanStemmer.java - PORTED
+analysis\de\package.html - IDENTICAL
+
+analysis\el\
+analysis\el\GreekAnalyzer.java - PORTED
+analysis\el\GreekCharsets.java - REMOVED IN 3.x
+analysis\el\GreekLowerCaseFilter.java - PORTED
+analysis\el\package.html - IDENTICAL
+
+analysis\fa\
+analysis\fa\package.html - IDENTICAL
+analysis\fa\PersianAnalyzer.java - PORTED
+analysis\fa\PersianNormalizationFilter.java - PORTED
+analysis\fa\PersianNormalizer.java - PORTED
+
+analysis\fr\
+analysis\fr\ElisionFilter.java - PORTED
+analysis\fr\FrenchAnalyzer.java - PORTED
+analysis\fr\FrenchStemFilter.java - PORTED
+analysis\fr\FrenchStemmer.java - PORTED
+analysis\fr\package.html - IDENTICAL
+
+analysis\miscellaneous\
+analysis\miscellaneous\EmptyTokenStream.java - PORTED
+analysis\miscellaneous\package.html - IDENTICAL
+analysis\miscellaneous\PatternAnalyzer.java - PORTED
+analysis\miscellaneous\PrefixAndSuffixAwareTokenFilter.java - PORTED
+analysis\miscellaneous\PrefixAwareTokenFilter.java - PORTED
+analysis\miscellaneous\SingleTokenTokenStream.java - PORTED
+
+analysis\ngram\
+analysis\ngram\EdgeNGramTokenFilter.java - PORTED
+analysis\ngram\EdgeNGramTokenizer.java - PORTED
+analysis\ngram\NGramTokenFilter.java - PORTED
+analysis\ngram\NGramTokenizer.java - PORTED
+analysis\ngram\package.html - IDENTICAL
+
+analysis\nl\
+analysis\nl\DutchAnalyzer.java - PORTED
+analysis\nl\DutchStemFilter.java - PORTED
+analysis\nl\DutchStemmer.java - PORTED
+analysis\nl\package.html - IDENTICAL
+analysis\nl\WordlistLoader.java - REMOVED IN 3.x
+
+analysis\payloads\
+analysis\payloads\AbstractEncoder.java - PORTED
+analysis\payloads\DelimitedPayloadTokenFilter.java - PORTED
+analysis\payloads\FloatEncoder.java - PORTED
+analysis\payloads\IdentityEncoder.java - PORTED
+analysis\payloads\IntegerEncoder.java - PORTED
+analysis\payloads\NumericPayloadTokenFilter.java - PORTED
+analysis\payloads\package.html - Text file is missing
+analysis\payloads\PayloadEncoder.java - PORTED
+analysis\payloads\PayloadHelper.java - IDENTICAL
+analysis\payloads\TokenOffsetPayloadTokenFilter.java - PORTED
+analysis\payloads\TypeAsPayloadTokenFilter.java - PORTED
+
+analysis\position\
+analysis\position\package.html - IDENTICAL
+analysis\position\PositionFilter.java - PORTED
+
+analysis\query\
+analysis\query\package.html - IDENTICAL
+analysis\query\QueryAutoStopWordAnalyzer.java - PORTED
+
+analysis\reverse\
+analysis\reverse\package.html - IDENTICAL
+analysis\reverse\ReverseStringFilter.java - PORTED
+
+analysis\ru\
+analysis\ru\package.html - IDENTICAL
+analysis\ru\RussianAnalyzer.java - PORTED
+analysis\ru\RussianCharsets.java - REMOVED IN 3.x
+analysis\ru\RussianLetterTokenizer.java - PORTED
+analysis\ru\RussianLowerCaseFilter.java - PORTED
+analysis\ru\RussianStemFilter.java - PORTED
+analysis\ru\RussianStemmer.java - PORTED
+
+analysis\shingle\
+analysis\shingle\package.html - IDENTICAL
+analysis\shingle\ShingleAnalyzerWrapper.java - PORTED
+analysis\shingle\ShingleFilter.java - PORTED
+analysis\shingle\ShingleMatrixFilter.java - PORTED
+analysis\sinks\
+analysis\sinks\DateRecognizerSinkFilter.java - PORTED
+analysis\sinks\DateRecognizerSinkTokenizer.java - REMOVED IN 3.x
+analysis\sinks\package.html - IDENTICAL
+analysis\sinks\TokenRangeSinkFilter.java - PORTED
+analysis\sinks\TokenRangeSinkTokenizer.java - REMOVED IN 3.x
+analysis\sinks\TokenTypeSinkFilter.java - PORTED
+analysis\sinks\TokenTypeSinkTokenizer.java - REMOVED IN 3.x
+
+analysis\th\
+analysis\th\package.html - IDENTICAL
+analysis\th\ThaiAnalyzer.java - PORTED
+analysis\th\ThaiWordFilter.java - PORTED WITH ISSUES - No BreakIterator. Won't compile; commented out
Added: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fr/ElisionFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fr/ElisionFilter.cs?rev=1204353&view=auto
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fr/ElisionFilter.cs (added)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fr/ElisionFilter.cs Mon Nov 21 04:44:55 2011
@@ -0,0 +1,120 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+
+namespace Lucene.Net.Analyzers.Fr
+{
+ /**
+ * Removes elisions from a {@link TokenStream}. For example, "l'avion" (the plane) will be
+ * tokenized as "avion" (plane).
+ * <p>
+ * Note that {@link StandardTokenizer} sees " ' " as a space, and cuts it out.
+ *
+ * @see <a href="http://fr.wikipedia.org/wiki/%C3%89lision">Elision in Wikipedia</a>
+ */
+ public sealed class ElisionFilter : TokenFilter
+ {
+ private CharArraySet articles = null;
+ private TermAttribute termAtt;
+
+ private static char[] apostrophes = { '\'', '\u2019' };
+
+ public void SetArticles(ISet<string> articles)
+ {
+ if (articles is CharArraySet)
+ this.articles = (CharArraySet)articles;
+ else
+ this.articles = new CharArraySet(articles, true);
+ }
+
+ /**
+ * Constructs an elision filter with standard stop words
+ */
+ internal ElisionFilter(TokenStream input)
+ : base(input)
+ {
+ this.articles = new CharArraySet(new[] { "l", "m", "t", "qu", "n", "s", "j" }, true);
+ termAtt = AddAttribute<TermAttribute>();
+ }
+
+ /**
+ * Constructs an elision filter with a Set of stop words
+ */
+ public ElisionFilter(TokenStream input, ISet<string> articles)
+ : base(input)
+ {
+ SetArticles(articles);
+ termAtt = AddAttribute<TermAttribute>();
+ }
+
+ /**
+ * Constructs an elision filter with an array of stop words
+ */
+ public ElisionFilter(TokenStream input, string[] articles)
+ : base(input)
+ {
+ this.articles = new CharArraySet(articles, true);
+ termAtt = AddAttribute<TermAttribute>();
+ }
+
+ /**
+ * Increments the {@link TokenStream} with a {@link TermAttribute} without elisioned start
+ */
+ public override sealed bool IncrementToken()
+ {
+ if (input.IncrementToken())
+ {
+ char[] termBuffer = termAtt.TermBuffer();
+ int termLength = termAtt.TermLength();
+
+ int minPoz = int.MaxValue;
+ for (int i = 0; i < apostrophes.Length; i++)
+ {
+ char apos = apostrophes[i];
+ // The equivalent of String.indexOf(ch)
+ for (int poz = 0; poz < termLength; poz++)
+ {
+ if (termBuffer[poz] == apos)
+ {
+ minPoz = Math.Min(poz, minPoz);
+ break;
+ }
+ }
+ }
+
+ // An apostrophe has been found. If the prefix is an article strip it off.
+ if (minPoz != int.MaxValue
+ && articles.Contains(termAtt.TermBuffer(), 0, minPoz))
+ {
+ termAtt.SetTermBuffer(termAtt.TermBuffer(), minPoz + 1, termAtt.TermLength() - (minPoz + 1));
+ }
+
+ return true;
+ }
+ else
+ {
+ return false;
+ }
+ }
+ }
+}
Modified: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fr/FrenchAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fr/FrenchAnalyzer.cs?rev=1204353&r1=1204352&r2=1204353&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fr/FrenchAnalyzer.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fr/FrenchAnalyzer.cs Mon Nov 21 04:44:55 2011
@@ -20,6 +20,7 @@
*/
using System;
+using System.Collections.Generic;
using System.IO;
using System.Text;
using System.Collections;
@@ -27,192 +28,235 @@ using System.Collections;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.De;
using Lucene.Net.Analysis.Standard;
+using Version = Lucene.Net.Util.Version;
namespace Lucene.Net.Analysis.Fr
{
- /* ====================================================================
- * The Apache Software License, Version 1.1
- *
- * Copyright (c) 2004 The Apache Software Foundation. All rights
- * reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * 3. The end-user documentation included with the redistribution,
- * if any, must include the following acknowledgment:
- * "This product includes software developed by the
- * Apache Software Foundation (http://www.apache.org/)."
- * Alternately, this acknowledgment may appear in the software itself,
- * if and wherever such third-party acknowledgments normally appear.
- *
- * 4. The names "Apache" and "Apache Software Foundation" and
- * "Apache Lucene" must not be used to endorse or promote products
- * derived from this software without prior written permission. For
- * written permission, please contact apache@apache.org.
- *
- * 5. Products derived from this software may not be called "Apache",
- * "Apache Lucene", nor may "Apache" appear in their name, without
- * prior written permission of the Apache Software Foundation.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
- * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- * ====================================================================
- *
- * This software consists of voluntary contributions made by many
- * individuals on behalf of the Apache Software Foundation. For more
- * information on the Apache Software Foundation, please see
- * <http://www.apache.org/>.
- */
-
- /// <summary>
- /// Analyzer for french language. Supports an external list of stopwords (words that
- /// will not be indexed at all) and an external list of exclusions (word that will
- /// not be stemmed, but indexed).
- /// A default set of stopwords is used unless an other list is specified, the
- /// exclusionlist is empty by default.
- ///
- /// <author>Patrick Talbot (based on Gerhard Schwarz work for German)</author>
- /// <version>$Id: FrenchAnalyzer.java,v 1.9 2004/10/17 11:41:40 dnaber Exp $</version>
- /// </summary>
- public sealed class FrenchAnalyzer : Analyzer
- {
-
- /// <summary>
- /// Extended list of typical french stopwords.
- /// </summary>
- public static String[] FRENCH_STOP_WORDS =
- {
- "a", "afin", "ai", "ainsi", "après", "attendu", "au", "aujourd", "auquel", "aussi",
- "autre", "autres", "aux", "auxquelles", "auxquels", "avait", "avant", "avec", "avoir",
- "c", "car", "ce", "ceci", "cela", "celle", "celles", "celui", "cependant", "certain",
- "certaine", "certaines", "certains", "ces", "cet", "cette", "ceux", "chez", "ci",
- "combien", "comme", "comment", "concernant", "contre", "d", "dans", "de", "debout",
- "dedans", "dehors", "delà", "depuis", "derrière", "des", "désormais", "desquelles",
- "desquels", "dessous", "dessus", "devant", "devers", "devra", "divers", "diverse",
- "diverses", "doit", "donc", "dont", "du", "duquel", "durant", "dès", "elle", "elles",
- "en", "entre", "environ", "est", "et", "etc", "etre", "eu", "eux", "excepté", "hormis",
- "hors", "hélas", "hui", "il", "ils", "j", "je", "jusqu", "jusque", "l", "la", "laquelle",
- "le", "lequel", "les", "lesquelles", "lesquels", "leur", "leurs", "lorsque", "lui", "là",
- "ma", "mais", "malgré", "me", "merci", "mes", "mien", "mienne", "miennes", "miens", "moi",
- "moins", "mon", "moyennant", "même", "mêmes", "n", "ne", "ni", "non", "nos", "notre",
- "nous", "néanmoins", "nôtre", "nôtres", "on", "ont", "ou", "outre", "où", "par", "parmi",
- "partant", "pas", "passé", "pendant", "plein", "plus", "plusieurs", "pour", "pourquoi",
- "proche", "près", "puisque", "qu", "quand", "que", "quel", "quelle", "quelles", "quels",
- "qui", "quoi", "quoique", "revoici", "revoilà", "s", "sa", "sans", "sauf", "se", "selon",
- "seront", "ses", "si", "sien", "sienne", "siennes", "siens", "sinon", "soi", "soit",
- "son", "sont", "sous", "suivant", "sur", "ta", "te", "tes", "tien", "tienne", "tiennes",
- "tiens", "toi", "ton", "tous", "tout", "toute", "toutes", "tu", "un", "une", "va", "vers",
- "voici", "voilà", "vos", "votre", "vous", "vu", "vôtre", "vôtres", "y", "à", "ça", "ès",
- "été", "être", "ô"
- };
-
- /// <summary>
- /// Contains the stopwords used with the StopFilter.
- /// </summary>
- private Hashtable stoptable = new Hashtable();
-
- /// <summary>
- /// Contains words that should be indexed but not stemmed.
- /// </summary>
- private Hashtable excltable = new Hashtable();
-
- /// <summary>
- /// Builds an analyzer.
- /// </summary>
- public FrenchAnalyzer()
- {
- stoptable = StopFilter.MakeStopSet( FRENCH_STOP_WORDS );
- }
-
- /// <summary>
- /// Builds an analyzer with the given stop words.
- /// </summary>
- public FrenchAnalyzer( String[] stopwords )
- {
- stoptable = StopFilter.MakeStopSet( stopwords );
- }
-
- /// <summary>
- /// Builds an analyzer with the given stop words.
- /// </summary>
- public FrenchAnalyzer( Hashtable stopwords )
- {
- stoptable = stopwords;
- }
-
- /// <summary>
- /// Builds an analyzer with the given stop words.
- /// </summary>
- public FrenchAnalyzer( FileInfo stopwords )
- {
- stoptable = WordlistLoader.GetWordtable( stopwords );
- }
-
- /// <summary>
- /// Builds an exclusionlist from an array of Strings.
- /// </summary>
- public void SetStemExclusionTable( String[] exclusionlist )
- {
- excltable = StopFilter.MakeStopSet( exclusionlist );
- }
-
- /// <summary>
- /// Builds an exclusionlist from a Hashtable.
- /// </summary>
- public void SetStemExclusionTable( Hashtable exclusionlist )
- {
- excltable = exclusionlist;
- }
-
- /// <summary>
- /// Builds an exclusionlist from the words contained in the given file.
- /// </summary>
- public void SetStemExclusionTable( FileInfo exclusionlist )
- {
- excltable = WordlistLoader.GetWordtable( exclusionlist );
- }
-
- /// <summary>
- /// Creates a TokenStream which tokenizes all the text in the provided Reader.
- /// </summary>
- /// <returns>
- /// A TokenStream build from a StandardTokenizer filtered with
- /// StandardFilter, StopFilter, FrenchStemFilter and LowerCaseFilter
- /// </returns>
- public override TokenStream TokenStream( String fieldName, TextReader reader )
- {
-
- if (fieldName==null) throw new ArgumentException("fieldName must not be null");
- if (reader==null) throw new ArgumentException("readermust not be null");
-
- TokenStream result = new StandardTokenizer( reader );
- result = new StandardFilter( result );
- result = new StopFilter( result, stoptable );
- result = new FrenchStemFilter( result, excltable );
- // Convert to lowercase after stemming!
- result = new LowerCaseFilter( result );
- return result;
- }
- }
-
-}
+ /**
+ * {@link Analyzer} for French language.
+ * <p>
+ * Supports an external list of stopwords (words that
+ * will not be indexed at all) and an external list of exclusions (word that will
+ * not be stemmed, but indexed).
+ * A default set of stopwords is used unless an alternative list is specified, but the
+ * exclusion list is empty by default.
+ * </p>
+ *
+ * <a name="version"/>
+ * <p>You must specify the required {@link Version}
+ * compatibility when creating FrenchAnalyzer:
+ * <ul>
+ * <li> As of 2.9, StopFilter preserves position
+ * increments
+ * </ul>
+ *
+ * <p><b>NOTE</b>: This class uses the same {@link Version}
+ * dependent settings as {@link StandardAnalyzer}.</p>
+ */
+ public sealed class FrenchAnalyzer : Analyzer
+ {
+
+ /**
+ * Extended list of typical French stopwords.
+ * @deprecated use {@link #getDefaultStopSet()} instead
+ */
+ // TODO make this private in 3.1
+ public readonly static String[] FRENCH_STOP_WORDS = {
+ "a", "afin", "ai", "ainsi", "après", "attendu", "au", "aujourd", "auquel", "aussi",
+ "autre", "autres", "aux", "auxquelles", "auxquels", "avait", "avant", "avec", "avoir",
+ "c", "car", "ce", "ceci", "cela", "celle", "celles", "celui", "cependant", "certain",
+ "certaine", "certaines", "certains", "ces", "cet", "cette", "ceux", "chez", "ci",
+ "combien", "comme", "comment", "concernant", "contre", "d", "dans", "de", "debout",
+ "dedans", "dehors", "delà", "depuis", "derrière", "des", "désormais", "desquelles",
+ "desquels", "dessous", "dessus", "devant", "devers", "devra", "divers", "diverse",
+ "diverses", "doit", "donc", "dont", "du", "duquel", "durant", "dès", "elle", "elles",
+ "en", "entre", "environ", "est", "et", "etc", "etre", "eu", "eux", "excepté", "hormis",
+ "hors", "hélas", "hui", "il", "ils", "j", "je", "jusqu", "jusque", "l", "la", "laquelle",
+ "le", "lequel", "les", "lesquelles", "lesquels", "leur", "leurs", "lorsque", "lui", "là",
+ "ma", "mais", "malgré", "me", "merci", "mes", "mien", "mienne", "miennes", "miens", "moi",
+ "moins", "mon", "moyennant", "même", "mêmes", "n", "ne", "ni", "non", "nos", "notre",
+ "nous", "néanmoins", "nôtre", "nôtres", "on", "ont", "ou", "outre", "où", "par", "parmi",
+ "partant", "pas", "passé", "pendant", "plein", "plus", "plusieurs", "pour", "pourquoi",
+ "proche", "près", "puisque", "qu", "quand", "que", "quel", "quelle", "quelles", "quels",
+ "qui", "quoi", "quoique", "revoici", "revoilà", "s", "sa", "sans", "sauf", "se", "selon",
+ "seront", "ses", "si", "sien", "sienne", "siennes", "siens", "sinon", "soi", "soit",
+ "son", "sont", "sous", "suivant", "sur", "ta", "te", "tes", "tien", "tienne", "tiennes",
+ "tiens", "toi", "ton", "tous", "tout", "toute", "toutes", "tu", "un", "une", "va", "vers",
+ "voici", "voilà", "vos", "votre", "vous", "vu", "vôtre", "vôtres", "y", "à", "ça", "ès",
+ "été", "être", "ô"
+ };
+
+ /**
+ * Contains the stopwords used with the {@link StopFilter}.
+ */
+ private readonly ISet<string> stoptable;
+ /**
+ * Contains words that should be indexed but not stemmed.
+ */
+ //TODO make this final in 3.0
+ private ISet<string> excltable = new HashSet<string>();
+
+ private readonly Version matchVersion;
+
+ /**
+ * Returns an unmodifiable instance of the default stop-words set.
+ * @return an unmodifiable instance of the default stop-words set.
+ */
+ public static ISet<string> GetDefaultStopSet()
+ {
+ return DefaultSetHolder.DEFAULT_STOP_SET;
+ }
+
+ static class DefaultSetHolder
+ {
+ internal static ISet<string> DEFAULT_STOP_SET = CharArraySet.UnmodifiableSet(new CharArraySet(FRENCH_STOP_WORDS, false));
+ }
+
+ /**
+ * Builds an analyzer with the default stop words ({@link #FRENCH_STOP_WORDS}).
+ */
+ public FrenchAnalyzer(Version matchVersion)
+ : this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET)
+ {
+
+ }
+
+ /**
+ * Builds an analyzer with the given stop words
+ *
+ * @param matchVersion
+ * lucene compatibility version
+ * @param stopwords
+ * a stopword set
+ */
+ public FrenchAnalyzer(Version matchVersion, ISet<string> stopwords)
+ : this(matchVersion, stopwords, CharArraySet.EMPTY_SET)
+ {
+ }
+
+ /**
+ * Builds an analyzer with the given stop words
+ *
+ * @param matchVersion
+ * lucene compatibility version
+ * @param stopwords
+ * a stopword set
+ * @param stemExclutionSet
+ * a stemming exclusion set
+ */
+ public FrenchAnalyzer(Version matchVersion, ISet<string> stopwords, ISet<string> stemExclutionSet)
+ {
+ this.matchVersion = matchVersion;
+ this.stoptable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stopwords));
+ this.excltable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stemExclutionSet));
+ }
+
+
+ /**
+ * Builds an analyzer with the given stop words.
+ * @deprecated use {@link #FrenchAnalyzer(Version, Set)} instead
+ */
+ public FrenchAnalyzer(Version matchVersion, params string[] stopwords)
+ : this(matchVersion, StopFilter.MakeStopSet(stopwords))
+ {
+
+ }
+
+ /**
+ * Builds an analyzer with the given stop words.
+ * @throws IOException
+ * @deprecated use {@link #FrenchAnalyzer(Version, Set)} instead
+ */
+ public FrenchAnalyzer(Version matchVersion, FileInfo stopwords)
+ : this(matchVersion, WordlistLoader.GetWordSet(stopwords))
+ {
+ }
+
+ /**
+ * Builds an exclusionlist from an array of Strings.
+ * @deprecated use {@link #FrenchAnalyzer(Version, Set, Set)} instead
+ */
+ public void SetStemExclusionTable(params string[] exclusionlist)
+ {
+ excltable = StopFilter.MakeStopSet(exclusionlist);
+ SetPreviousTokenStream(null); // force a new stemmer to be created
+ }
+
+ /**
+ * Builds an exclusionlist from a Map.
+ * @deprecated use {@link #FrenchAnalyzer(Version, Set, Set)} instead
+ */
+ public void SetStemExclusionTable(IDictionary<string, string> exclusionlist)
+ {
+ excltable = new HashSet<string>(exclusionlist.Keys);
+ SetPreviousTokenStream(null); // force a new stemmer to be created
+ }
+
+ /**
+ * Builds an exclusionlist from the words contained in the given file.
+ * @throws IOException
+ * @deprecated use {@link #FrenchAnalyzer(Version, Set, Set)} instead
+ */
+ public void SetStemExclusionTable(FileInfo exclusionlist)
+ {
+ excltable = new HashSet<string>(WordlistLoader.GetWordSet(exclusionlist));
+ SetPreviousTokenStream(null); // force a new stemmer to be created
+ }
+
+ /**
+ * Creates a {@link TokenStream} which tokenizes all the text in the provided
+ * {@link Reader}.
+ *
+ * @return A {@link TokenStream} built from a {@link StandardTokenizer}
+ * filtered with {@link StandardFilter}, {@link StopFilter},
+ * {@link FrenchStemFilter} and {@link LowerCaseFilter}
+ */
+ public override sealed TokenStream TokenStream(String fieldName, TextReader reader)
+ {
+ TokenStream result = new StandardTokenizer(matchVersion, reader);
+ result = new StandardFilter(result);
+ result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
+ result, stoptable);
+ result = new FrenchStemFilter(result, excltable);
+ // Convert to lowercase after stemming!
+ result = new LowerCaseFilter(result);
+ return result;
+ }
+
+ class SavedStreams
+ {
+ protected internal Tokenizer source;
+ protected internal TokenStream result;
+ };
+
+ /**
+ * Returns a (possibly reused) {@link TokenStream} which tokenizes all the
+ * text in the provided {@link Reader}.
+ *
+ * @return A {@link TokenStream} built from a {@link StandardTokenizer}
+ * filtered with {@link StandardFilter}, {@link StopFilter},
+ * {@link FrenchStemFilter} and {@link LowerCaseFilter}
+ */
+ public override TokenStream ReusableTokenStream(String fieldName, TextReader reader)
+ {
+ SavedStreams streams = (SavedStreams)GetPreviousTokenStream();
+ if (streams == null)
+ {
+ streams = new SavedStreams();
+ streams.source = new StandardTokenizer(matchVersion, reader);
+ streams.result = new StandardFilter(streams.source);
+ streams.result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
+ streams.result, stoptable);
+ streams.result = new FrenchStemFilter(streams.result, excltable);
+ // Convert to lowercase after stemming!
+ streams.result = new LowerCaseFilter(streams.result);
+ SetPreviousTokenStream(streams);
+ }
+ else
+ {
+ streams.source.Reset(reader);
+ }
+ return streams.result;
+ }
+ }
+}
\ No newline at end of file
Modified: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fr/FrenchStemFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fr/FrenchStemFilter.cs?rev=1204353&r1=1204352&r2=1204353&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fr/FrenchStemFilter.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fr/FrenchStemFilter.cs Mon Nov 21 04:44:55 2011
@@ -20,145 +20,94 @@
*/
using System;
+using System.Collections.Generic;
using System.IO;
using System.Text;
using System.Collections;
using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
namespace Lucene.Net.Analysis.Fr
{
- /* ====================================================================
- * The Apache Software License, Version 1.1
- *
- * Copyright (c) 2004 The Apache Software Foundation. All rights
- * reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * 3. The end-user documentation included with the redistribution,
- * if any, must include the following acknowledgment:
- * "This product includes software developed by the
- * Apache Software Foundation (http://www.apache.org/)."
- * Alternately, this acknowledgment may appear in the software itself,
- * if and wherever such third-party acknowledgments normally appear.
- *
- * 4. The names "Apache" and "Apache Software Foundation" and
- * "Apache Lucene" must not be used to endorse or promote products
- * derived from this software without prior written permission. For
- * written permission, please contact apache@apache.org.
- *
- * 5. Products derived from this software may not be called "Apache",
- * "Apache Lucene", nor may "Apache" appear in their name, without
- * prior written permission of the Apache Software Foundation.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
- * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- * ====================================================================
- *
- * This software consists of voluntary contributions made by many
- * individuals on behalf of the Apache Software Foundation. For more
- * information on the Apache Software Foundation, please see
- * <http://www.apache.org/>.
- */
-
- /// <summary>
- /// A filter that stemms french words. It supports a table of words that should
- /// not be stemmed at all. The used stemmer can be changed at runtime after the
- /// filter object is created (as long as it is a FrenchStemmer).
- ///
- /// <author>Patrick Talbot (based on Gerhard Schwarz work for German)</author>
- /// <version>$Id: FrenchAnalyzer.java,v 1.2 2004/01/23 20:54:47 ehatcher Exp $</version>
- /// </summary>
- public sealed class FrenchStemFilter : TokenFilter
- {
-
- /// <summary>
- /// The actual token in the input stream.
- /// </summary>
- private Token token = null;
- private FrenchStemmer stemmer = null;
- private Hashtable exclusions = null;
-
- public FrenchStemFilter( TokenStream _in ) : base(_in)
- {
- stemmer = new FrenchStemmer();
- }
-
- /// <summary>
- /// Builds a FrenchStemFilter that uses an exclusiontable.
- /// </summary>
- public FrenchStemFilter( TokenStream _in, Hashtable exclusiontable ) : this( _in )
- {
- exclusions = exclusiontable;
- }
-
- /// <summary>
- /// Returns the next token in the stream, or null at EOS
- /// </summary>
- /// <returns>
- /// Returns the next token in the stream, or null at EOS
- /// </returns>
- public override Token Next()
- {
- if ( ( token = input.Next() ) == null )
- {
- return null;
- }
- // Check the exclusiontable
- else if ( exclusions != null && exclusions.Contains( token.TermText() ) )
- {
- return token;
- }
- else
- {
- String s = stemmer.Stem( token.TermText() );
- // If not stemmed, dont waste the time creating a new token
- if ( !s.Equals( token.TermText() ) )
- {
- return new Token( s, 0, s.Length, token.Type() );
- }
- return token;
- }
- }
-
- /// <summary>
- /// Set a alternative/custom FrenchStemmer for this filter.
- /// </summary>
- public void SetStemmer( FrenchStemmer stemmer )
- {
- if ( stemmer != null )
- {
- this.stemmer = stemmer;
- }
- }
-
- /// <summary>
- /// Set an alternative exclusion list for this filter.
- /// </summary>
- public void SetExclusionTable( Hashtable exclusiontable )
- {
- exclusions = exclusiontable;
- }
- }
+ /**
+ * A {@link TokenFilter} that stems french words.
+ * <p>
+ * It supports a table of words that should
+ * not be stemmed at all. The used stemmer can be changed at runtime after the
+ * filter object is created (as long as it is a {@link FrenchStemmer}).
+ * </p>
+ * NOTE: This stemmer does not implement the Snowball algorithm correctly,
+ * especially involving case problems. It is recommended that you consider using
+ * the "French" stemmer in the snowball package instead. This stemmer will likely
+ * be deprecated in a future release.
+ */
+ public sealed class FrenchStemFilter : TokenFilter
+ {
+
+ /**
+ * The stemmer applied to each non-excluded token.
+ */
+ private FrenchStemmer stemmer = null;
+ /**
+ * Terms in this set pass through unstemmed; null means stem everything.
+ */
+ private ISet<string> exclusions = null;
+
+ private TermAttribute termAtt;
+
+ /**
+ * Builds a FrenchStemFilter over the given stream with no exclusion table.
+ */
+ public FrenchStemFilter(TokenStream _in)
+ : base(_in)
+ {
+
+ stemmer = new FrenchStemmer();
+ termAtt = AddAttribute<TermAttribute>();
+ }
+
+ /**
+ * Builds a FrenchStemFilter that uses an exclusion table.
+ */
+ public FrenchStemFilter(TokenStream _in, ISet<string> exclusiontable)
+ : this(_in)
+ {
+ exclusions = exclusiontable;
+ }
+
+ /**
+ * @return Returns true for the next token in the stream, or false at EOS
+ */
+ public override bool IncrementToken()
+ {
+ if (input.IncrementToken())
+ {
+ String term = termAtt.Term();
+
+ // Check the exclusion table
+ if (exclusions == null || !exclusions.Contains(term))
+ {
+ String s = stemmer.Stem(term);
+ // If not stemmed, don't waste the time adjusting the token.
+ if ((s != null) && !s.Equals(term))
+ termAtt.SetTermBuffer(s);
+ }
+ return true;
+ }
+ else
+ {
+ return false;
+ }
+ }
+ /**
+ * Set an alternative/custom {@link FrenchStemmer} for this filter.
+ * A null argument is ignored rather than clearing the current stemmer.
+ */
+ public void SetStemmer(FrenchStemmer stemmer)
+ {
+ if (stemmer != null)
+ {
+ this.stemmer = stemmer;
+ }
+ }
+ /**
+ * Set an alternative exclusion list for this filter.
+ * Passing null clears the exclusion table (everything gets stemmed),
+ * matching the behaviour of the previous Hashtable-based version
+ * instead of throwing a NullReferenceException on .Keys.
+ */
+ public void SetExclusionTable(IDictionary<string, string> exclusiontable)
+ {
+ exclusions = exclusiontable == null ? null : new HashSet<string>(exclusiontable.Keys);
+ }
+ }
}