You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by cc...@apache.org on 2011/11/21 05:44:59 UTC

[Lucene.Net] svn commit: r1204353 [4/9] - in /incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src: contrib/Analyzers/ contrib/Analyzers/AR/ contrib/Analyzers/BR/ contrib/Analyzers/CJK/ contrib/Analyzers/Cn/ contrib/Analyzers/Compound/ contrib/Analyzers/Compoun...

Modified: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/De/GermanAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/De/GermanAnalyzer.cs?rev=1204353&r1=1204352&r2=1204353&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/De/GermanAnalyzer.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/De/GermanAnalyzer.cs Mon Nov 21 04:44:55 2011
@@ -20,26 +20,31 @@
 */
 
 using System;
+using System.Collections.Generic;
 using System.IO;
 using System.Collections;
+using System.Linq;
 using Lucene.Net.Analysis.Standard;
 using Lucene.Net.Analysis;
+using Version = Lucene.Net.Util.Version;
 
 namespace Lucene.Net.Analysis.De
 {
-	/// <summary>
-	/// Analyzer for German language. Supports an external list of stopwords (words that
-	/// will not be indexed at all) and an external list of exclusions (word that will
-	/// not be stemmed, but indexed).
-	/// A default set of stopwords is used unless an alternative list is specified, the
-	/// exclusion list is empty by default.
-	/// </summary>
-	public class GermanAnalyzer : Analyzer
-	{
-		/// <summary>
-		/// List of typical german stopwords.
-		/// </summary>
-		private String[] GERMAN_STOP_WORDS = 
+    /// <summary>
+    /// Analyzer for German language. Supports an external list of stopwords (words that
+    /// will not be indexed at all) and an external list of exclusions (word that will
+    /// not be stemmed, but indexed).
+    /// A default set of stopwords is used unless an alternative list is specified, the
+    /// exclusion list is empty by default.
+    /// </summary>
+    public class GermanAnalyzer : Analyzer
+    {
+        /// <summary>
+        /// List of typical german stopwords.
+        /// </summary>
+        [Obsolete("Use GetDefaultStopSet() instead")]
+        //TODO: make this private in 3.1
+        private static readonly String[] GERMAN_STOP_WORDS = 
 		{
 			"einer", "eine", "eines", "einem", "einen",
 			"der", "die", "das", "dass", "daß",
@@ -55,92 +60,150 @@ namespace Lucene.Net.Analysis.De
 			"durch", "wegen"
 		};
 
-		/// <summary>
-		/// Contains the stopwords used with the StopFilter. 
-		/// </summary>
-		private Hashtable stoptable = new Hashtable();
-
-		/// <summary>
-		/// Contains words that should be indexed but not stemmed. 
-		/// </summary>
-		private Hashtable excltable = new Hashtable();
-
-		/// <summary>
-		/// Builds an analyzer. 
-		/// </summary>
-		public GermanAnalyzer()
-		{
-			stoptable = StopFilter.MakeStopSet( GERMAN_STOP_WORDS );
-		}
-
-		/// <summary>
-		/// Builds an analyzer with the given stop words. 
-		/// </summary>
-		/// <param name="stopwords"></param>
-		public GermanAnalyzer( String[] stopwords )
-		{
-			stoptable = StopFilter.MakeStopSet( stopwords );
-		}
-
-		/// <summary>
-		/// Builds an analyzer with the given stop words. 
-		/// </summary>
-		/// <param name="stopwords"></param>
-		public GermanAnalyzer( Hashtable stopwords )
-		{
-			stoptable = stopwords;
-		}
-
-		/// <summary>
-		/// Builds an analyzer with the given stop words. 
-		/// </summary>
-		/// <param name="stopwords"></param>
-		public GermanAnalyzer( FileInfo stopwords )
-		{
-			stoptable = WordlistLoader.GetWordtable( stopwords );
-		}
-
-		/// <summary>
-		/// Builds an exclusionlist from an array of Strings. 
-		/// </summary>
-		/// <param name="exclusionlist"></param>
-		public void SetStemExclusionTable( String[] exclusionlist )
-		{
-			excltable = StopFilter.MakeStopSet( exclusionlist );
-		}
-
-		/// <summary>
-		/// Builds an exclusionlist from a Hashtable. 
-		/// </summary>
-		/// <param name="exclusionlist"></param>
-		public void SetStemExclusionTable( Hashtable exclusionlist )
-		{
-			excltable = exclusionlist;
-		}
-
-		/// <summary>
-		/// Builds an exclusionlist from the words contained in the given file. 
-		/// </summary>
-		/// <param name="exclusionlist"></param>
-		public void SetStemExclusionTable(FileInfo exclusionlist)
-		{
-			excltable = WordlistLoader.GetWordtable(exclusionlist);
-		}
-
-		/// <summary>
-		/// Creates a TokenStream which tokenizes all the text in the provided TextReader. 
-		/// </summary>
-		/// <param name="fieldName"></param>
-		/// <param name="reader"></param>
-		/// <returns>A TokenStream build from a StandardTokenizer filtered with StandardFilter, StopFilter, GermanStemFilter</returns>
-		public override TokenStream TokenStream(String fieldName, TextReader reader)
-		{
-			TokenStream result = new StandardTokenizer( reader );
-			result = new StandardFilter( result );
-			result = new LowerCaseFilter(result);
-			result = new StopFilter( result, stoptable );
-			result = new GermanStemFilter( result, excltable );
-			return result;
-		}
-	}
+        /// <summary>
+        /// Returns a set of default German-stopwords 
+        /// </summary>
+        public static ISet<string> GetDefaultStopSet()
+        {
+            return DefaultSetHolder.DEFAULT_SET;
+        }
+
+        private static class DefaultSetHolder
+        {
+            internal static readonly ISet<string> DEFAULT_SET = CharArraySet.UnmodifiableSet(new CharArraySet(
+                                                                                                 GERMAN_STOP_WORDS,
+                                                                                                 false));
+        }
+
+        /// <summary>
+        /// Contains the stopwords used with the StopFilter. 
+        /// </summary>
+        //TODO: make this readonly in 3.1
+        private ISet<string> stopSet;
+
+        /// <summary>
+        /// Contains words that should be indexed but not stemmed. 
+        /// </summary>
+        //TODO: make this readonly in 3.1
+        private ISet<string> exclusionSet;
+
+        private Version matchVersion;
+
+        /// <summary>
+        /// Builds an analyzer with the default stop words:
+        /// <see cref="GetDefaultStopSet"/>
+        /// </summary>
+        [Obsolete("Use GermanAnalyzer(Version) instead")]
+        public GermanAnalyzer()
+            : this(Version.LUCENE_23)
+        {
+        }
+
+        /// <summary>
+        /// Builds an analyzer with the default stop words:
+        /// <see cref="GetDefaultStopSet"/>
+        /// </summary>
+        public GermanAnalyzer(Version matchVersion)
+            : this(matchVersion, DefaultSetHolder.DEFAULT_SET)
+        { }
+
+        /// <summary>
+        /// Builds an analyzer with the given stop words. 
+        /// </summary>
+        /// <param name="matchVersion">Lucene compatibility version</param>
+        /// <param name="stopwords">a stopword set</param>
+        public GermanAnalyzer(Version matchVersion, ISet<string> stopwords)
+            : this(matchVersion, stopwords, CharArraySet.EMPTY_SET)
+        {
+        }
+
+        /// <summary>
+        /// Builds an analyzer with the given stop words
+        /// </summary>
+        /// <param name="matchVersion">lucene compatibility version</param>
+        /// <param name="stopwords">a stopword set</param>
+        /// <param name="stemExclusionSet">a stemming exclusion set</param>
+        public GermanAnalyzer(Version matchVersion, ISet<string> stopwords, ISet<string> stemExclusionSet)
+        {
+            stopSet = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stopwords));
+            exclusionSet = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stemExclusionSet));
+            SetOverridesTokenStreamMethod(typeof(GermanAnalyzer));
+            this.matchVersion = matchVersion;
+        }
+
+        /// <summary>
+        /// Builds an analyzer with the given stop words. 
+        /// </summary>
+        /// <param name="stopwords"></param>
+        [Obsolete("use GermanAnalyzer(Version, Set) instead")]
+        public GermanAnalyzer(Version matchVersion, params string[] stopwords)
+            : this(matchVersion, StopFilter.MakeStopSet(stopwords))
+        {
+        }
+
+        /// <summary>
+        /// Builds an analyzer with the given stop words.
+        /// </summary>
+        [Obsolete("Use GermanAnalyzer(Version, ISet)")]
+        public GermanAnalyzer(Version matchVersion, IDictionary<string, string> stopwords)
+            : this(matchVersion, stopwords.Keys.ToArray())
+        {
+
+        }
+
+        /// <summary>
+        /// Builds an analyzer with the given stop words. 
+        /// </summary>
+        [Obsolete("Use GermanAnalyzer(Version, ISet)")]
+        public GermanAnalyzer(Version matchVersion, FileInfo stopwords)
+            : this(matchVersion, WordlistLoader.GetWordSet(stopwords))
+        {
+        }
+
+        /// <summary>
+        /// Builds an exclusionlist from an array of Strings. 
+        /// </summary>
+        [Obsolete("Use GermanAnalyzer(Version, ISet, ISet) instead")]
+        public void SetStemExclusionTable(String[] exclusionlist)
+        {
+            exclusionSet = StopFilter.MakeStopSet(exclusionlist);
+            SetPreviousTokenStream(null);
+        }
+
+        /// <summary>
+        /// Builds an exclusionlist from a IDictionary. 
+        /// </summary>
+        [Obsolete("Use GermanAnalyzer(Version, ISet, ISet) instead")]
+        public void SetStemExclusionTable(IDictionary<string, string> exclusionlist)
+        {
+            exclusionSet = new HashSet<string>(exclusionlist.Keys);
+            SetPreviousTokenStream(null);
+        }
+
+        /// <summary>
+        /// Builds an exclusionlist from the words contained in the given file. 
+        /// </summary>
+        [Obsolete("Use GermanAnalyzer(Version, ISet, ISet) instead")]
+        public void SetStemExclusionTable(FileInfo exclusionlist)
+        {
+            exclusionSet = WordlistLoader.GetWordSet(exclusionlist);
+            SetPreviousTokenStream(null);
+        }
+
+        /// <summary>
+        /// Creates a TokenStream which tokenizes all the text in the provided TextReader. 
+        /// </summary>
+        /// <param name="fieldName"></param>
+        /// <param name="reader"></param>
+        /// <returns>A TokenStream build from a StandardTokenizer filtered with StandardFilter, StopFilter, GermanStemFilter</returns>
+        public override TokenStream TokenStream(String fieldName, TextReader reader)
+        {
+            TokenStream result = new StandardTokenizer(matchVersion, reader);
+            result = new StandardFilter(result);
+            result = new LowerCaseFilter(result);
+            result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion), result, stopSet);
+            result = new GermanStemFilter(result, exclusionSet);
+            return result;
+        }
+    }
 }
\ No newline at end of file

Modified: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/De/GermanStemFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/De/GermanStemFilter.cs?rev=1204353&r1=1204352&r2=1204353&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/De/GermanStemFilter.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/De/GermanStemFilter.cs Mon Nov 21 04:44:55 2011
@@ -20,87 +20,89 @@
 */
 
 using System;
+using System.Collections.Generic;
 using System.IO;
 using System.Collections;
+using Lucene.Net.Analysis.Tokenattributes;
 
 namespace Lucene.Net.Analysis.De
 {
-	/// <summary>
-	/// A filter that stems German words. It supports a table of words that should
-	/// not be stemmed at all. The stemmer used can be changed at runtime after the
-	/// filter object is created (as long as it is a GermanStemmer).
-	/// </summary>
-	public sealed class GermanStemFilter : TokenFilter
-	{
-		/// <summary>
-		/// The actual token in the input stream.
-		/// </summary>
-		private Token token = null;
-		private GermanStemmer stemmer = null;
-		private Hashtable exclusions = null;
-    
-		public GermanStemFilter( TokenStream _in ) : base(_in)
-		{
-			stemmer = new GermanStemmer();
-		}
-    
-		/// <summary>
-		/// Builds a GermanStemFilter that uses an exclusiontable. 
-		/// </summary>
-		/// <param name="_in"></param>
-		/// <param name="exclusiontable"></param>
-		public GermanStemFilter( TokenStream _in, Hashtable exclusiontable ): this(_in)
-		{
-			exclusions = exclusiontable;
-		}
-    
-		/// <summary>
-		/// </summary>
-		/// <returns>Returns the next token in the stream, or null at EOS</returns>
-		public override Token Next()
-	
-		{
-			if ( ( token = input.Next() ) == null ) 
-			{
-				return null;
-			}
-				// Check the exclusiontable
-			else if ( exclusions != null && exclusions.Contains( token.TermText() ) ) 
-			{
-				return token;
-			}
-			else 
-			{
-				String s = stemmer.Stem( token.TermText() );
-				// If not stemmed, dont waste the time creating a new token
-				if ( !s.Equals( token.TermText() ) ) 
-				{
-					return new Token( s, token.StartOffset(),
-						token.EndOffset(), token.Type() );
-				}
-				return token;
-			}
-		}
-
-		/// <summary>
-		/// Set a alternative/custom GermanStemmer for this filter. 
-		/// </summary>
-		/// <param name="stemmer"></param>
-		public void SetStemmer( GermanStemmer stemmer )
-		{
-			if ( stemmer != null ) 
-			{
-				this.stemmer = stemmer;
-			}
-		}
-
-		/// <summary>
-		/// Set an alternative exclusion list for this filter. 
-		/// </summary>
-		/// <param name="exclusiontable"></param>
-		public void SetExclusionTable( Hashtable exclusiontable )
-		{
-			exclusions = exclusiontable;
-		}
-	}
+    /// <summary>
+    /// A filter that stems German words. It supports a table of words that should
+    /// not be stemmed at all. The stemmer used can be changed at runtime after the
+    /// filter object is created (as long as it is a GermanStemmer).
+    /// </summary>
+    public sealed class GermanStemFilter : TokenFilter
+    {
+        /// <summary>
+        /// The actual token in the input stream.
+        /// </summary>
+        private GermanStemmer stemmer = null;
+        private ISet<string> exclusionSet = null;
+
+        private TermAttribute termAtt;
+
+        public GermanStemFilter(TokenStream _in)
+            : base(_in)
+        {
+            stemmer = new GermanStemmer();
+            termAtt = AddAttribute<TermAttribute>();
+        }
+
+        /// <summary>
+        /// Builds a GermanStemFilter that uses an exclusiontable. 
+        /// </summary>
+        /// <param name="_in"></param>
+        /// <param name="exclusiontable"></param>
+        public GermanStemFilter(TokenStream _in, ISet<string> exclusiontable)
+            : this(_in)
+        {
+            exclusionSet = exclusiontable;
+        }
+
+        /// <returns>
+        /// Returns true for next token in the stream, or false at EOS
+        /// </returns>
+        public override bool IncrementToken()
+        {
+            if (input.IncrementToken())
+            {
+                String term = termAtt.Term();
+                // Check the exclusion table.
+                if (exclusionSet == null || !exclusionSet.Contains(term))
+                {
+                    String s = stemmer.Stem(term);
+                    // If not stemmed, don't waste the time adjusting the token.
+                    if ((s != null) && !s.Equals(term))
+                        termAtt.SetTermBuffer(s);
+                }
+                return true;
+            }
+            else
+            {
+                return false;
+            }
+        }
+
+        /// <summary>
+        /// Set a alternative/custom GermanStemmer for this filter. 
+        /// </summary>
+        /// <param name="stemmer"></param>
+        public void SetStemmer(GermanStemmer stemmer)
+        {
+            if (stemmer != null)
+            {
+                this.stemmer = stemmer;
+            }
+        }
+
+        /// <summary>
+        /// Set an alternative exclusion list for this filter. 
+        /// </summary>
+        /// <param name="exclusiontable"></param>
+        public void SetExclusionTable(ISet<string> exclusiontable)
+        {
+            exclusionSet = exclusiontable;
+        }
+    }
 }
\ No newline at end of file

Added: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/El/GreekAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/El/GreekAnalyzer.cs?rev=1204353&view=auto
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/El/GreekAnalyzer.cs (added)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/El/GreekAnalyzer.cs Mon Nov 21 04:44:55 2011
@@ -0,0 +1,155 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Text;
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Standard;
+using Version = Lucene.Net.Util.Version;
+
+namespace Lucene.Net.Analyzers.El
+{
+    /**
+     * {@link Analyzer} for the Greek language. 
+     * <p>
+     * Supports an external list of stopwords (words
+     * that will not be indexed at all).
+     * A default set of stopwords is used unless an alternative list is specified.
+     * </p>
+     *
+     * <p><b>NOTE</b>: This class uses the same {@link Version}
+     * dependent settings as {@link StandardAnalyzer}.</p>
+     */
+    public sealed class GreekAnalyzer : Analyzer
+    {
+        /**
+         * List of typical Greek stopwords.
+         */
+
+        private static readonly String[] GREEK_STOP_WORDS = {
+                                                                "ο", "η", "το", "οι", "τα", "του", "τησ", "των", "τον",
+                                                                "την", "και",
+                                                                "κι", "κ", "ειμαι", "εισαι", "ειναι", "ειμαστε", "ειστε"
+                                                                , "στο", "στον",
+                                                                "στη", "στην", "μα", "αλλα", "απο", "για", "προσ", "με",
+                                                                "σε", "ωσ",
+                                                                "παρα", "αντι", "κατα", "μετα", "θα", "να", "δε", "δεν",
+                                                                "μη", "μην",
+                                                                "επι", "ενω", "εαν", "αν", "τοτε", "που", "πωσ", "ποιοσ"
+                                                                , "ποια", "ποιο",
+                                                                "ποιοι", "ποιεσ", "ποιων", "ποιουσ", "αυτοσ", "αυτη",
+                                                                "αυτο", "αυτοι",
+                                                                "αυτων", "αυτουσ", "αυτεσ", "αυτα", "εκεινοσ", "εκεινη",
+                                                                "εκεινο",
+                                                                "εκεινοι", "εκεινεσ", "εκεινα", "εκεινων", "εκεινουσ",
+                                                                "οπωσ", "ομωσ",
+                                                                "ισωσ", "οσο", "οτι"
+                                                            };
+
+        /**
+         * Returns a set of default Greek-stopwords 
+         * @return a set of default Greek-stopwords 
+         */
+        public static ISet<string> GetDefaultStopSet()
+        {
+            return DefaultSetHolder.DEFAULT_SET;
+        }
+
+        private static class DefaultSetHolder
+        {
+            internal static ISet<string> DEFAULT_SET = CharArraySet.UnmodifiableSet(new CharArraySet(GREEK_STOP_WORDS, false));
+        }
+
+        /**
+         * Contains the stopwords used with the {@link StopFilter}.
+         */
+        private readonly ISet<string> stopSet;
+
+        private readonly Version matchVersion;
+
+        public GreekAnalyzer(Version matchVersion)
+            : this(matchVersion, DefaultSetHolder.DEFAULT_SET)
+        {
+        }
+
+        /**
+         * Builds an analyzer with the given stop words 
+         * 
+         * @param matchVersion
+         *          lucene compatibility version
+         * @param stopwords
+         *          a stopword set
+         */
+        public GreekAnalyzer(Version matchVersion, ISet<string> stopwords)
+        {
+            stopSet = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stopwords));
+            this.matchVersion = matchVersion;
+        }
+
+        /**
+         * Builds an analyzer with the given stop words.
+         * @param stopwords Array of stopwords to use.
+         * @deprecated use {@link #GreekAnalyzer(Version, Set)} instead
+         */
+        public GreekAnalyzer(Version matchVersion, params string[] stopwords)
+            : this(matchVersion, StopFilter.MakeStopSet(stopwords))
+        {
+        }
+
+        /**
+         * Builds an analyzer with the given stop words.
+         * @deprecated use {@link #GreekAnalyzer(Version, Set)} instead
+         */
+        public GreekAnalyzer(Version matchVersion, IDictionary<string, string> stopwords)
+            : this(matchVersion, stopwords.Keys.ToArray())
+        {
+        }
+
+        /**
+         * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
+         *
+         * @return  A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
+         *                  {@link GreekLowerCaseFilter} and {@link StopFilter}
+         */
+        public override TokenStream TokenStream(String fieldName, TextReader reader)
+        {
+            TokenStream result = new StandardTokenizer(matchVersion, reader);
+            result = new GreekLowerCaseFilter(result);
+            result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
+                                    result, stopSet);
+            return result;
+        }
+
+        private class SavedStreams
+        {
+            protected internal Tokenizer source;
+            protected internal TokenStream result;
+        };
+
+        /**
+         * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text 
+         * in the provided {@link Reader}.
+         *
+         * @return  A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
+         *                  {@link GreekLowerCaseFilter} and {@link StopFilter}
+         */
+        public override TokenStream ReusableTokenStream(String fieldName, TextReader reader)
+        {
+            SavedStreams streams = (SavedStreams)GetPreviousTokenStream();
+            if (streams == null)
+            {
+                streams = new SavedStreams();
+                streams.source = new StandardTokenizer(matchVersion, reader);
+                streams.result = new GreekLowerCaseFilter(streams.source);
+                streams.result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
+                                                streams.result, stopSet);
+                SetPreviousTokenStream(streams);
+            }
+            else
+            {
+                streams.source.Reset(reader);
+            }
+            return streams.result;
+        }
+    }
+}

Added: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/El/GreekLowerCaseFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/El/GreekLowerCaseFilter.cs?rev=1204353&view=auto
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/El/GreekLowerCaseFilter.cs (added)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/El/GreekLowerCaseFilter.cs Mon Nov 21 04:44:55 2011
@@ -0,0 +1,107 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+
+namespace Lucene.Net.Analyzers.El
+{
+    /**
+     * Normalizes token text to lower case, removes some Greek diacritics,
+     * and standardizes final sigma to sigma. 
+     *
+     */
+    public sealed class GreekLowerCaseFilter : TokenFilter
+    {
+        private TermAttribute termAtt;
+
+        public GreekLowerCaseFilter(TokenStream _in)
+            : base(_in)
+        {
+            termAtt = AddAttribute<TermAttribute>();
+        }
+
+        public override bool IncrementToken()
+        {
+            if (input.IncrementToken())
+            {
+                char[] chArray = termAtt.TermBuffer();
+                int chLen = termAtt.TermLength();
+                // TODO: iterate codepoints to support supp. characters
+                for (int i = 0; i < chLen; i++)
+                {
+                    chArray[i] = (char)lowerCase(chArray[i]);
+                }
+                return true;
+            }
+            else
+            {
+                return false;
+            }
+        }
+
+        private int lowerCase(int codepoint)
+        {
+            switch (codepoint)
+            {
+                /* There are two lowercase forms of sigma:
+                 *   U+03C2: small final sigma (end of word)
+                 *   U+03C3: small sigma (otherwise)
+                 *   
+                 * Standardize both to U+03C3
+                 */
+                case '\u03C2': /* small final sigma */
+                    return '\u03C3'; /* small sigma */
+
+                /* Some greek characters contain diacritics.
+                 * This filter removes these, converting to the lowercase base form.
+                 */
+
+                case '\u0386': /* capital alpha with tonos */
+                case '\u03AC': /* small alpha with tonos */
+                    return '\u03B1'; /* small alpha */
+
+                case '\u0388': /* capital epsilon with tonos */
+                case '\u03AD': /* small epsilon with tonos */
+                    return '\u03B5'; /* small epsilon */
+
+                case '\u0389': /* capital eta with tonos */
+                case '\u03AE': /* small eta with tonos */
+                    return '\u03B7'; /* small eta */
+
+                case '\u038A': /* capital iota with tonos */
+                case '\u03AA': /* capital iota with dialytika */
+                case '\u03AF': /* small iota with tonos */
+                case '\u03CA': /* small iota with dialytika */
+                case '\u0390': /* small iota with dialytika and tonos */
+                    return '\u03B9'; /* small iota */
+
+                case '\u038E': /* capital upsilon with tonos */
+                case '\u03AB': /* capital upsilon with dialytika */
+                case '\u03CD': /* small upsilon with tonos */
+                case '\u03CB': /* small upsilon with dialytika */
+                case '\u03B0': /* small upsilon with dialytika and tonos */
+                    return '\u03C5'; /* small upsilon */
+
+                case '\u038C': /* capital omicron with tonos */
+                case '\u03CC': /* small omicron with tonos */
+                    return '\u03BF'; /* small omicron */
+
+                case '\u038F': /* capital omega with tonos */
+                case '\u03CE': /* small omega with tonos */
+                    return '\u03C9'; /* small omega */
+
+                /* The previous implementation did the conversion below.
+                 * Only implemented for backwards compatibility with old indexes.
+                 */
+
+                case '\u03A2': /* reserved */
+                    return '\u03C2'; /* small final sigma */
+
+                default:
+                    return char.ToLower((char)codepoint);
+            }
+        }
+    }
+}

Added: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fa/PersianAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fa/PersianAnalyzer.cs?rev=1204353&view=auto
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fa/PersianAnalyzer.cs (added)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fa/PersianAnalyzer.cs Mon Nov 21 04:44:55 2011
@@ -0,0 +1,215 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Text;
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.AR;
+using Version = Lucene.Net.Util.Version;
+
+namespace Lucene.Net.Analyzers.Fa
+{
+    /**
+     * {@link Analyzer} for Persian.
+     * <p>
+     * This Analyzer uses {@link ArabicLetterTokenizer} which implies tokenizing around
+     * zero-width non-joiner in addition to whitespace. Some persian-specific variant forms (such as farsi
+     * yeh and keheh) are standardized. "Stemming" is accomplished via stopwords.
+     * </p>
+     */
+    public sealed class PersianAnalyzer : Analyzer
+    {
+
+        /**
+         * File containing default Persian stopwords.
+         * 
+         * Default stopword list is from
+         * http://members.unine.ch/jacques.savoy/clef/index.html The stopword list is
+         * BSD-Licensed.
+         * 
+         */
+        public readonly static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
+
+        /**
+         * Contains the stopwords used with the StopFilter.
+         */
+        private readonly ISet<string> stoptable;
+
+        /**
+         * The comment character in the stopwords file. All lines prefixed with this
+         * will be ignored
+         */
+        public static readonly String STOPWORDS_COMMENT = "#";
+
+        /**
+         * Returns an unmodifiable instance of the default stop-words set.
+         * @return an unmodifiable instance of the default stop-words set.
+         */
+        public static ISet<string> getDefaultStopSet()
+        {
+            return DefaultSetHolder.DEFAULT_STOP_SET;
+        }
+
+        /**
+         * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
+         * accesses the static final set the first time.
+         */
+        private static class DefaultSetHolder
+        {
+            internal static readonly ISet<string> DEFAULT_STOP_SET;
+
+            static DefaultSetHolder()
+            {
+                try
+                {
+                    DEFAULT_STOP_SET = LoadDefaultStopWordSet();
+                }
+                catch (IOException ex)
+                {
+                    // default set should always be present as it is part of the
+                    // distribution (JAR)
+                    // NOTE(review): 'ex' is discarded here; passing it as the inner
+                    // exception would preserve the root cause for diagnostics.
+                    throw new Exception("Unable to load default stopword set");
+                }
+            }
+
+            static ISet<String> LoadDefaultStopWordSet()
+            {
+
+                // NOTE(review): assumes the stopword file is embedded under the resource
+                // name "Lucene.Net.Analyzers.Fa.stopwords.txt" -- verify the build action.
+                var stream = System.Reflection.Assembly.GetAssembly(typeof(PersianAnalyzer)).GetManifestResourceStream("Lucene.Net.Analyzers.Fa." + DEFAULT_STOPWORD_FILE);
+                try
+                {
+                    StreamReader reader = new StreamReader(stream, System.Text.Encoding.UTF8);
+                    // make sure it is unmodifiable as we expose it in the outer class
+                    return CharArraySet.UnmodifiableSet(new CharArraySet(WordlistLoader.GetWordSet(reader, STOPWORDS_COMMENT), true));
+                }
+                finally
+                {
+                    // NOTE(review): 'stream' is null when the resource is missing, which
+                    // would surface as a NullReferenceException here -- confirm intent.
+                    stream.Close();
+                }
+            }
+        }
+
+        /** Lucene compatibility version; controls the StopFilter position-increment default. */
+        private readonly Version matchVersion;
+
+        /**
+         * Builds an analyzer with the default stop words:
+         * {@link #DEFAULT_STOPWORD_FILE}.
+         */
+        public PersianAnalyzer(Version matchVersion)
+            : this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET)
+        {
+
+        }
+
+        /**
+         * Builds an analyzer with the given stop words 
+         * 
+         * @param matchVersion
+         *          lucene compatibility version
+         * @param stopwords
+         *          a stopword set
+         */
+        public PersianAnalyzer(Version matchVersion, ISet<string> stopwords)
+        {
+            stoptable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stopwords));
+            this.matchVersion = matchVersion;
+        }
+
+        /**
+         * Builds an analyzer with the given stop words.
+         * @deprecated use {@link #PersianAnalyzer(Version, Set)} instead
+         */
+        public PersianAnalyzer(Version matchVersion, params string[] stopwords)
+            : this(matchVersion, StopFilter.MakeStopSet(stopwords))
+        {
+
+        }
+
+        /**
+         * Builds an analyzer with the given stop words.
+         * Only the dictionary keys are used; the values are ignored.
+         * @deprecated use {@link #PersianAnalyzer(Version, Set)} instead
+         */
+        public PersianAnalyzer(Version matchVersion, IDictionary<string, string> stopwords)
+            : this(matchVersion, stopwords.Keys.ToArray())
+        {
+
+        }
+
+        /**
+         * Builds an analyzer with the given stop words. Lines can be commented out
+         * using {@link #STOPWORDS_COMMENT}
+         * @deprecated use {@link #PersianAnalyzer(Version, Set)} instead
+         */
+        public PersianAnalyzer(Version matchVersion, FileInfo stopwords)
+            : this(matchVersion, WordlistLoader.GetWordSet(stopwords, STOPWORDS_COMMENT))
+        {
+
+        }
+
+        /**
+         * Creates a {@link TokenStream} which tokenizes all the text in the provided
+         * {@link Reader}.
+         * 
+         * @return A {@link TokenStream} built from a {@link ArabicLetterTokenizer}
+         *         filtered with {@link LowerCaseFilter}, 
+         *         {@link ArabicNormalizationFilter},
+         *         {@link PersianNormalizationFilter} and Persian Stop words
+         */
+        public override TokenStream TokenStream(String fieldName, TextReader reader)
+        {
+            TokenStream result = new ArabicLetterTokenizer(reader);
+            result = new LowerCaseFilter(result);
+            result = new ArabicNormalizationFilter(result);
+            /* additional persian-specific normalization */
+            result = new PersianNormalizationFilter(result);
+            /*
+             * the order here is important: the stopword list is normalized with the
+             * above!
+             */
+            result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
+                                    result, stoptable);
+            return result;
+        }
+
+        /** Per-thread holder for the reusable tokenizer/filter chain. */
+        private class SavedStreams
+        {
+            protected internal Tokenizer source;
+            protected internal TokenStream result;
+        }
+
+        /**
+         * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text 
+         * in the provided {@link Reader}.
+         * 
+         * @return A {@link TokenStream} built from a {@link ArabicLetterTokenizer}
+         *         filtered with {@link LowerCaseFilter}, 
+         *         {@link ArabicNormalizationFilter},
+         *         {@link PersianNormalizationFilter} and Persian Stop words
+         */
+        public override TokenStream ReusableTokenStream(String fieldName, TextReader reader)
+        {
+            SavedStreams streams = (SavedStreams)GetPreviousTokenStream();
+            if (streams == null)
+            {
+                streams = new SavedStreams();
+                streams.source = new ArabicLetterTokenizer(reader);
+                streams.result = new LowerCaseFilter(streams.source);
+                streams.result = new ArabicNormalizationFilter(streams.result);
+                /* additional persian-specific normalization */
+                streams.result = new PersianNormalizationFilter(streams.result);
+                /*
+                 * the order here is important: the stopword list is normalized with the
+                 * above!
+                 */
+                streams.result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
+                                                streams.result, stoptable);
+                SetPreviousTokenStream(streams);
+            }
+            else
+            {
+                // only the tokenizer is re-pointed at the new reader;
+                // the wrapping filter chain is reused as-is
+                streams.source.Reset(reader);
+            }
+            return streams.result;
+        }
+    }
+}
\ No newline at end of file

Added: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fa/PersianNormalizationFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fa/PersianNormalizationFilter.cs?rev=1204353&view=auto
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fa/PersianNormalizationFilter.cs (added)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fa/PersianNormalizationFilter.cs Mon Nov 21 04:44:55 2011
@@ -0,0 +1,38 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+
+namespace Lucene.Net.Analyzers.Fa
+{
+    /**
+ * A {@link TokenFilter} that applies {@link PersianNormalizer} to normalize the
+ * orthography.
+ * 
+ */
+
+public sealed class PersianNormalizationFilter : TokenFilter {
+
+  // single normalizer instance reused for every token of this stream
+  private readonly PersianNormalizer normalizer;
+  private readonly TermAttribute termAtt;
+
+  /** Wraps <code>input</code> and normalizes each token's term buffer in place. */
+  public PersianNormalizationFilter(TokenStream input) 
+      :base(input)
+  {
+    normalizer = new PersianNormalizer();
+    termAtt = AddAttribute<TermAttribute>();
+  }
+
+  /** Normalizes the next token in place; returns false when the input is exhausted. */
+  public override bool IncrementToken()
+{
+    if (input.IncrementToken()) {
+      int newlen = normalizer.Normalize(termAtt.TermBuffer(), termAtt.TermLength());
+      // normalization never grows the token (replacements are 1:1, deletions shrink it),
+      // so the existing buffer is reused and simply truncated to the new length
+      termAtt.SetTermLength(newlen);
+      return true;
+    } 
+    return false;
+  }
+}
+}

Added: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fa/PersianNormalizer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fa/PersianNormalizer.cs?rev=1204353&view=auto
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fa/PersianNormalizer.cs (added)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fa/PersianNormalizer.cs Mon Nov 21 04:44:55 2011
@@ -0,0 +1,90 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Analyzers.Fa
+{
+/**
+ * Normalizer for Persian.
+ * <p>
+ * Normalization is done in-place for efficiency, operating on a termbuffer.
+ * <p>
+ * Normalization is defined as:
+ * <ul>
+ * <li>Normalization of various heh + hamza forms and heh goal to heh.
+ * <li>Normalization of farsi yeh and yeh barree to arabic yeh
+ * <li>Normalization of persian keheh to arabic kaf
+ * </ul>
+ * 
+ */
+public class PersianNormalizer {
+  // Unicode code points used by Normalize() below
+  public const char YEH = '\u064A';
+
+  public const char FARSI_YEH = '\u06CC';
+
+  public const char YEH_BARREE = '\u06D2';
+
+  public const char KEHEH = '\u06A9';
+
+  public const char KAF = '\u0643';
+
+  public const char HAMZA_ABOVE = '\u0654';
+
+  public const char HEH_YEH = '\u06C0';
+
+  public const char HEH_GOAL = '\u06C1';
+
+  public const char HEH = '\u0647';
+
+  /**
+   * Normalize an input buffer of Persian text
+   * 
+   * Maps farsi yeh / yeh barree to arabic yeh, keheh to kaf, heh variants to heh,
+   * and deletes combining hamza-above characters.
+   * 
+   * @param s input buffer
+   * @param len length of input buffer
+   * @return length of input buffer after normalization
+   */
+  public int Normalize(char[] s, int len) {
+
+    for (int i = 0; i < len; i++) {
+      switch (s[i]) {
+      case FARSI_YEH:
+      case YEH_BARREE:
+        s[i] = YEH;
+        break;
+      case KEHEH:
+        s[i] = KAF;
+        break;
+      case HEH_YEH:
+      case HEH_GOAL:
+        s[i] = HEH;
+        break;
+      case HAMZA_ABOVE: // necessary for HEH + HAMZA
+        len = Delete(s, i, len);
+        // re-examine position i on the next pass, since Delete shifted a new char into it
+        i--;
+        break;
+      default:
+        break;
+      }
+    }
+
+    return len;
+  }
+
+  /**
+   * Delete a character in-place
+   * 
+   * @param s Input Buffer
+   * @param pos Position of character to delete
+   * @param len length of input buffer
+   * @return length of input buffer after deletion
+   */
+  protected int Delete(char[] s, int pos, int len) {
+    // shift the tail left by one; when pos == len - 1 the copy length is zero
+    if (pos < len)
+      Array.Copy(s, pos + 1, s, pos, len - pos - 1);
+    
+    return len - 1;
+  }
+
+}
+}

Added: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/FileDiffs.txt
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/FileDiffs.txt?rev=1204353&view=auto
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/FileDiffs.txt (added)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/FileDiffs.txt Mon Nov 21 04:44:55 2011
@@ -0,0 +1,155 @@
+TODO: make sure namespaces match Lucene's.  (defaults to Analyzers instead of Analysis)
+TODO: Fix method naming
+TODO: Convert all javaDoc Comments to C# XML
+TODO: Normalize Line Endings
+TODO: Fix sb.ToString().Substring t-> sb.ToString(int, int)
+TODO: RussianStemFilter.SetStemmer()
+
+analysis\ar\
+analysis\ar\ArabicAnalyzer.java - PORTED
+analysis\ar\ArabicLetterTokenizer.java - PORTED
+analysis\ar\ArabicNormalizationFilter.java - PORTED
+analysis\ar\ArabicNormalizer.java - PORTED
+analysis\ar\ArabicStemFilter.java - PORTED
+analysis\ar\ArabicStemmer.java - IDENTICAL
+analysis\ar\package.html - IDENTICAL
+
+analysis\br\
+analysis\br\BrazilianAnalyzer.java - PORTED
+analysis\br\BrazilianStemFilter.java - PORTED
+analysis\br\BrazilianStemmer.java - IDENTICAL
+analysis\br\package.html - IDENTICAL
+
+analysis\cjk\
+analysis\cjk\CJKAnalyzer.java - PORTED
+analysis\cjk\CJKTokenizer.java - PORTED 
+analysis\cjk\package.html - IDENTICAL
+
+analysis\cn\
+analysis\cn\ChineseAnalyzer.java - PORTED
+analysis\cn\ChineseFilter.java - PORTED
+analysis\cn\ChineseTokenizer.java - PORTED
+analysis\cn\package.html - IDENTICAL
+
+analysis\compound\hyphenation\
+analysis\compound\hyphenation\ByteVector.java - ADDED
+analysis\compound\hyphenation\CharVector.java - ADDED
+analysis\compound\hyphenation\Hyphen.java - Text files are different
+analysis\compound\hyphenation\hyphenation.dtd - IDENTICAL
+analysis\compound\hyphenation\Hyphenation.java - Text files are different
+analysis\compound\hyphenation\HyphenationException.java - Text files are different
+analysis\compound\hyphenation\HyphenationTree.java - Text files are different
+analysis\compound\hyphenation\package.html - Text files are different
+analysis\compound\hyphenation\PatternConsumer.java - Text files are different
+analysis\compound\hyphenation\PatternParser.java - Text files are different
+analysis\compound\hyphenation\TernaryTree.java - Text files are different
+
+analysis\compound\
+analysis\compound\CompoundWordTokenFilterBase.java - PORTED
+analysis\compound\DictionaryCompoundWordTokenFilter.java - PORTED
+analysis\compound\HyphenationCompoundWordTokenFilter.java - Text files are different
+analysis\compound\package.html - IDENTICAL
+
+analysis\cz\
+analysis\cz\CzechAnalyzer.java - PORTED
+analysis\cz\package.html - IDENTICAL
+
+analysis\de\
+analysis\de\GermanAnalyzer.java - PORTED
+analysis\de\GermanStemFilter.java - PORTED
+analysis\de\GermanStemmer.java - PORTED
+analysis\de\package.html - IDENTICAL
+
+analysis\el\
+analysis\el\GreekAnalyzer.java - PORTED
+analysis\el\GreekCharsets.java - REMOVED IN 3.x
+analysis\el\GreekLowerCaseFilter.java - PORTED
+analysis\el\package.html - IDENTICAL
+
+analysis\fa\
+analysis\fa\package.html - IDENTICAL
+analysis\fa\PersianAnalyzer.java - PORTED
+analysis\fa\PersianNormalizationFilter.java - PORTED
+analysis\fa\PersianNormalizer.java - PORTED
+
+analysis\fr\
+analysis\fr\ElisionFilter.java - PORTED
+analysis\fr\FrenchAnalyzer.java - PORTED
+analysis\fr\FrenchStemFilter.java - PORTED
+analysis\fr\FrenchStemmer.java - PORTED
+analysis\fr\package.html - IDENTICAL
+
+analysis\miscellaneous\
+analysis\miscellaneous\EmptyTokenStream.java - PORTED
+analysis\miscellaneous\package.html - IDENTICAL
+analysis\miscellaneous\PatternAnalyzer.java - PORTED
+analysis\miscellaneous\PrefixAndSuffixAwareTokenFilter.java - PORTED
+analysis\miscellaneous\PrefixAwareTokenFilter.java - PORTED
+analysis\miscellaneous\SingleTokenTokenStream.java - PORTED
+
+analysis\ngram\
+analysis\ngram\EdgeNGramTokenFilter.java - PORTED
+analysis\ngram\EdgeNGramTokenizer.java - PORTED
+analysis\ngram\NGramTokenFilter.java - PORTED
+analysis\ngram\NGramTokenizer.java - PORTED
+analysis\ngram\package.html - IDENTICAL
+
+analysis\nl\
+analysis\nl\DutchAnalyzer.java - PORTED
+analysis\nl\DutchStemFilter.java - PORTED
+analysis\nl\DutchStemmer.java - PORTED
+analysis\nl\package.html - IDENTICAL
+analysis\nl\WordlistLoader.java - REMOVED IN 3.x
+
+analysis\payloads\
+analysis\payloads\AbstractEncoder.java - PORTED
+analysis\payloads\DelimitedPayloadTokenFilter.java -  PORTED
+analysis\payloads\FloatEncoder.java -  PORTED
+analysis\payloads\IdentityEncoder.java -  PORTED
+analysis\payloads\IntegerEncoder.java -  PORTED
+analysis\payloads\NumericPayloadTokenFilter.java - PORTED
+analysis\payloads\package.html -  Text file is missing
+analysis\payloads\PayloadEncoder.java - PORTED
+analysis\payloads\PayloadHelper.java - IDENTICAL
+analysis\payloads\TokenOffsetPayloadTokenFilter.java - PORTED
+analysis\payloads\TypeAsPayloadTokenFilter.java - PORTED
+
+analysis\position\
+analysis\position\package.html - IDENTICAL
+analysis\position\PositionFilter.java - PORTED
+
+analysis\query\
+analysis\query\package.html - IDENTICAL
+analysis\query\QueryAutoStopWordAnalyzer.java - PORTED
+
+analysis\reverse\
+analysis\reverse\package.html - IDENTICAL
+analysis\reverse\ReverseStringFilter.java - PORTED
+
+analysis\ru\
+analysis\ru\package.html - IDENTICAL
+analysis\ru\RussianAnalyzer.java - PORTED
+analysis\ru\RussianCharsets.java - REMOVED IN 3.x
+analysis\ru\RussianLetterTokenizer.java - PORTED
+analysis\ru\RussianLowerCaseFilter.java - PORTED
+analysis\ru\RussianStemFilter.java - PORTED
+analysis\ru\RussianStemmer.java - PORTED
+
+analysis\shingle\
+analysis\shingle\package.html - IDENTICAL
+analysis\shingle\ShingleAnalyzerWrapper.java - PORTED
+analysis\shingle\ShingleFilter.java - PORTED
+analysis\shingle\ShingleMatrixFilter.java - PORTED
+
+analysis\sinks\
+analysis\sinks\DateRecognizerSinkFilter.java - PORTED
+analysis\sinks\DateRecognizerSinkTokenizer.java - REMOVED IN 3.x
+analysis\sinks\package.html - IDENTICAL
+analysis\sinks\TokenRangeSinkFilter.java - PORTED
+analysis\sinks\TokenRangeSinkTokenizer.java - REMOVED IN 3.x
+analysis\sinks\TokenTypeSinkFilter.java - PORTED
+analysis\sinks\TokenTypeSinkTokenizer.java - REMOVED IN 3.x
+
+analysis\th\
+analysis\th\package.html - IDENTICAL
+analysis\th\ThaiAnalyzer.java - PORTED
+analysis\th\ThaiWordFilter.java - PORTED WITH ISSUES - No BreakIterator.  Won't compile; commented out

Added: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fr/ElisionFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fr/ElisionFilter.cs?rev=1204353&view=auto
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fr/ElisionFilter.cs (added)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fr/ElisionFilter.cs Mon Nov 21 04:44:55 2011
@@ -0,0 +1,120 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+
+namespace Lucene.Net.Analyzers.Fr
+{
+    /**
+     * Removes elisions from a {@link TokenStream}. For example, "l'avion" (the plane) will be
+     * tokenized as "avion" (plane).
+     * <p>
+     * Note that {@link StandardTokenizer} sees " ' " as a space, and cuts it out.
+     * 
+     * @see <a href="http://fr.wikipedia.org/wiki/%C3%89lision">Elision in Wikipedia</a>
+     */
+    public sealed class ElisionFilter : TokenFilter
+    {
+        // set of article prefixes to strip (second CharArraySet argument is
+        // presumably the ignore-case flag -- verify against CharArraySet)
+        private CharArraySet articles = null;
+        private TermAttribute termAtt;
+
+        // both the ASCII apostrophe and the typographic right single quote
+        private static char[] apostrophes = { '\'', '’' };
+
+        /** Replaces the article set; reuses the set directly when it is already a CharArraySet. */
+        public void SetArticles(ISet<string> articles)
+        {
+            if (articles is CharArraySet)
+                this.articles = (CharArraySet)articles;
+            else
+                this.articles = new CharArraySet(articles, true);
+        }
+
+        /**
+         * Constructs an elision filter with standard stop words
+         * (the common French elided articles/pronouns).
+         */
+        internal ElisionFilter(TokenStream input)
+            : base(input)
+        {
+            this.articles = new CharArraySet(new[] { "l", "m", "t", "qu", "n", "s", "j" }, true);
+            termAtt = AddAttribute<TermAttribute>();
+        }
+
+        /**
+         * Constructs an elision filter with a Set of stop words
+         */
+        public ElisionFilter(TokenStream input, ISet<string> articles)
+            : base(input)
+        {
+            SetArticles(articles);
+            termAtt = AddAttribute<TermAttribute>();
+        }
+
+        /**
+         * Constructs an elision filter with an array of stop words
+         */
+        public ElisionFilter(TokenStream input, string[] articles)
+            : base(input)
+        {
+            this.articles = new CharArraySet(articles, true);
+            termAtt = AddAttribute<TermAttribute>();
+        }
+
+        /**
+         * Increments the {@link TokenStream} with a {@link TermAttribute} without elisioned start
+         */
+        public override sealed bool IncrementToken()
+        {
+            if (input.IncrementToken())
+            {
+                char[] termBuffer = termAtt.TermBuffer();
+                int termLength = termAtt.TermLength();
+
+                // find the earliest occurrence of any apostrophe variant in the token
+                int minPoz = int.MaxValue;
+                for (int i = 0; i < apostrophes.Length; i++)
+                {
+                    char apos = apostrophes[i];
+                    // The equivalent of String.indexOf(ch)
+                    for (int poz = 0; poz < termLength; poz++)
+                    {
+                        if (termBuffer[poz] == apos)
+                        {
+                            minPoz = Math.Min(poz, minPoz);
+                            break;
+                        }
+                    }
+                }
+
+                // An apostrophe has been found. If the prefix is an article strip it off.
+                // (tokens whose prefix is not a known article pass through unchanged)
+                if (minPoz != int.MaxValue
+                    && articles.Contains(termAtt.TermBuffer(), 0, minPoz))
+                {
+                    termAtt.SetTermBuffer(termAtt.TermBuffer(), minPoz + 1, termAtt.TermLength() - (minPoz + 1));
+                }
+
+                return true;
+            }
+            else
+            {
+                return false;
+            }
+        }
+    }
+}

Modified: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fr/FrenchAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fr/FrenchAnalyzer.cs?rev=1204353&r1=1204352&r2=1204353&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fr/FrenchAnalyzer.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fr/FrenchAnalyzer.cs Mon Nov 21 04:44:55 2011
@@ -20,6 +20,7 @@
 */
 
 using System;
+using System.Collections.Generic;
 using System.IO;
 using System.Text;
 using System.Collections;
@@ -27,192 +28,235 @@ using System.Collections;
 using Lucene.Net.Analysis;
 using Lucene.Net.Analysis.De;
 using Lucene.Net.Analysis.Standard;
+using Version = Lucene.Net.Util.Version;
 
 namespace Lucene.Net.Analysis.Fr
 {
-	/* ====================================================================
-	 * The Apache Software License, Version 1.1
-	 *
-	 * Copyright (c) 2004 The Apache Software Foundation.  All rights
-	 * reserved.
-	 *
-	 * Redistribution and use in source and binary forms, with or without
-	 * modification, are permitted provided that the following conditions
-	 * are met:
-	 *
-	 * 1. Redistributions of source code must retain the above copyright
-	 *    notice, this list of conditions and the following disclaimer.
-	 *
-	 * 2. Redistributions in binary form must reproduce the above copyright
-	 *    notice, this list of conditions and the following disclaimer in
-	 *    the documentation and/or other materials provided with the
-	 *    distribution.
-	 *
-	 * 3. The end-user documentation included with the redistribution,
-	 *    if any, must include the following acknowledgment:
-	 *       "This product includes software developed by the
-	 *        Apache Software Foundation (http://www.apache.org/)."
-	 *    Alternately, this acknowledgment may appear in the software itself,
-	 *    if and wherever such third-party acknowledgments normally appear.
-	 *
-	 * 4. The names "Apache" and "Apache Software Foundation" and
-	 *    "Apache Lucene" must not be used to endorse or promote products
-	 *    derived from this software without prior written permission. For
-	 *    written permission, please contact apache@apache.org.
-	 *
-	 * 5. Products derived from this software may not be called "Apache",
-	 *    "Apache Lucene", nor may "Apache" appear in their name, without
-	 *    prior written permission of the Apache Software Foundation.
-	 *
-	 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
-	 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-	 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-	 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
-	 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-	 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-	 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-	 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-	 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-	 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-	 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-	 * SUCH DAMAGE.
-	 * ====================================================================
-	 *
-	 * This software consists of voluntary contributions made by many
-	 * individuals on behalf of the Apache Software Foundation.  For more
-	 * information on the Apache Software Foundation, please see
-	 * <http://www.apache.org/>.
-	 */
-
-	/// <summary>
-	/// Analyzer for french language. Supports an external list of stopwords (words that
-	/// will not be indexed at all) and an external list of exclusions (word that will
-	/// not be stemmed, but indexed).
-	/// A default set of stopwords is used unless an other list is specified, the
-	/// exclusionlist is empty by default.
-	/// 
-	/// <author>Patrick Talbot (based on Gerhard Schwarz work for German)</author>
-	/// <version>$Id: FrenchAnalyzer.java,v 1.9 2004/10/17 11:41:40 dnaber Exp $</version>
-	/// </summary>
-	public sealed class FrenchAnalyzer : Analyzer 
-	{
-
-		/// <summary>
-		/// Extended list of typical french stopwords.
-		/// </summary>
-		public static String[] FRENCH_STOP_WORDS = 
-				 {
-					 "a", "afin", "ai", "ainsi", "après", "attendu", "au", "aujourd", "auquel", "aussi",
-					 "autre", "autres", "aux", "auxquelles", "auxquels", "avait", "avant", "avec", "avoir",
-					 "c", "car", "ce", "ceci", "cela", "celle", "celles", "celui", "cependant", "certain",
-					 "certaine", "certaines", "certains", "ces", "cet", "cette", "ceux", "chez", "ci",
-					 "combien", "comme", "comment", "concernant", "contre", "d", "dans", "de", "debout",
-					 "dedans", "dehors", "delà", "depuis", "derrière", "des", "désormais", "desquelles",
-					 "desquels", "dessous", "dessus", "devant", "devers", "devra", "divers", "diverse",
-					 "diverses", "doit", "donc", "dont", "du", "duquel", "durant", "dès", "elle", "elles",
-					 "en", "entre", "environ", "est", "et", "etc", "etre", "eu", "eux", "excepté", "hormis",
-					 "hors", "hélas", "hui", "il", "ils", "j", "je", "jusqu", "jusque", "l", "la", "laquelle",
-					 "le", "lequel", "les", "lesquelles", "lesquels", "leur", "leurs", "lorsque", "lui", "là",
-					 "ma", "mais", "malgré", "me", "merci", "mes", "mien", "mienne", "miennes", "miens", "moi",
-					 "moins", "mon", "moyennant", "même", "mêmes", "n", "ne", "ni", "non", "nos", "notre",
-					 "nous", "néanmoins", "nôtre", "nôtres", "on", "ont", "ou", "outre", "où", "par", "parmi",
-					 "partant", "pas", "passé", "pendant", "plein", "plus", "plusieurs", "pour", "pourquoi",
-					 "proche", "près", "puisque", "qu", "quand", "que", "quel", "quelle", "quelles", "quels",
-					 "qui", "quoi", "quoique", "revoici", "revoilà", "s", "sa", "sans", "sauf", "se", "selon",
-					 "seront", "ses", "si", "sien", "sienne", "siennes", "siens", "sinon", "soi", "soit",
-					 "son", "sont", "sous", "suivant", "sur", "ta", "te", "tes", "tien", "tienne", "tiennes",
-					 "tiens", "toi", "ton", "tous", "tout", "toute", "toutes", "tu", "un", "une", "va", "vers",
-					 "voici", "voilà", "vos", "votre", "vous", "vu", "vôtre", "vôtres", "y", "à", "ça", "ès",
-					 "été", "être", "ô"
-				 };
-
-		/// <summary>
-		/// Contains the stopwords used with the StopFilter.
-		/// </summary>
-		private Hashtable stoptable = new Hashtable();
-
-		/// <summary>
-		/// Contains words that should be indexed but not stemmed.
-		/// </summary>
-		private Hashtable excltable = new Hashtable();
-
-		/// <summary>
-		/// Builds an analyzer.
-		/// </summary>
-		public FrenchAnalyzer() 
-		{
-			stoptable = StopFilter.MakeStopSet( FRENCH_STOP_WORDS );
-		}
-
-		/// <summary>
-		/// Builds an analyzer with the given stop words.
-		/// </summary>
-		public FrenchAnalyzer( String[] stopwords ) 
-		{
-			stoptable = StopFilter.MakeStopSet( stopwords );
-		}
-
-		/// <summary>
-		/// Builds an analyzer with the given stop words.
-		/// </summary>
-		public FrenchAnalyzer( Hashtable stopwords ) 
-		{
-			stoptable = stopwords;
-		}
-
-		/// <summary>
-		/// Builds an analyzer with the given stop words.
-		/// </summary>
-		public FrenchAnalyzer( FileInfo stopwords ) 
-		{
-			stoptable = WordlistLoader.GetWordtable( stopwords );
-		}
-
-		/// <summary>
-		/// Builds an exclusionlist from an array of Strings.
-		/// </summary>
-		public void SetStemExclusionTable( String[] exclusionlist ) 
-		{
-			excltable = StopFilter.MakeStopSet( exclusionlist );
-		}
-
-		/// <summary>
-		/// Builds an exclusionlist from a Hashtable.
-		/// </summary>
-		public void SetStemExclusionTable( Hashtable exclusionlist ) 
-		{
-			excltable = exclusionlist;
-		}
-
-		/// <summary>
-		/// Builds an exclusionlist from the words contained in the given file.
-		/// </summary>
-		public void SetStemExclusionTable( FileInfo exclusionlist ) 
-		{
-			excltable = WordlistLoader.GetWordtable( exclusionlist );
-		}
-
-		/// <summary>
-		/// Creates a TokenStream which tokenizes all the text in the provided Reader.
-		/// </summary>
-		/// <returns>
-		/// A TokenStream build from a StandardTokenizer filtered with
-		/// 	StandardFilter, StopFilter, FrenchStemFilter and LowerCaseFilter
-		/// </returns>
-		public override TokenStream TokenStream( String fieldName, TextReader reader ) 
-		{
-		
-			if (fieldName==null) throw new ArgumentException("fieldName must not be null");
-			if (reader==null) throw new ArgumentException("readermust not be null");
-				
-			TokenStream result = new StandardTokenizer( reader );
-			result = new StandardFilter( result );
-			result = new StopFilter( result, stoptable );
-			result = new FrenchStemFilter( result, excltable );
-			// Convert to lowercase after stemming!
-			result = new LowerCaseFilter( result );
-			return result;
-		}
-	}
-
-}
+    /**
+ * {@link Analyzer} for French language. 
+ * <p>
+ * Supports an external list of stopwords (words that
+ * will not be indexed at all) and an external list of exclusions (word that will
+ * not be stemmed, but indexed).
+ * A default set of stopwords is used unless an alternative list is specified, but the
+ * exclusion list is empty by default.
+ * </p>
+ *
+ * <a name="version"/>
+ * <p>You must specify the required {@link Version}
+ * compatibility when creating FrenchAnalyzer:
+ * <ul>
+ *   <li> As of 2.9, StopFilter preserves position
+ *        increments
+ * </ul>
+ *
+ * <p><b>NOTE</b>: This class uses the same {@link Version}
+ * dependent settings as {@link StandardAnalyzer}.</p>
+ */
+    public sealed class FrenchAnalyzer : Analyzer
+    {
+
+        /**
+         * Extended list of typical French stopwords.
+         * @deprecated use {@link #getDefaultStopSet()} instead
+         */
+        // TODO make this private in 3.1
+        public readonly static String[] FRENCH_STOP_WORDS = {
+    "a", "afin", "ai", "ainsi", "après", "attendu", "au", "aujourd", "auquel", "aussi",
+    "autre", "autres", "aux", "auxquelles", "auxquels", "avait", "avant", "avec", "avoir",
+    "c", "car", "ce", "ceci", "cela", "celle", "celles", "celui", "cependant", "certain",
+    "certaine", "certaines", "certains", "ces", "cet", "cette", "ceux", "chez", "ci",
+    "combien", "comme", "comment", "concernant", "contre", "d", "dans", "de", "debout",
+    "dedans", "dehors", "delà", "depuis", "derrière", "des", "désormais", "desquelles",
+    "desquels", "dessous", "dessus", "devant", "devers", "devra", "divers", "diverse",
+    "diverses", "doit", "donc", "dont", "du", "duquel", "durant", "dès", "elle", "elles",
+    "en", "entre", "environ", "est", "et", "etc", "etre", "eu", "eux", "excepté", "hormis",
+    "hors", "hélas", "hui", "il", "ils", "j", "je", "jusqu", "jusque", "l", "la", "laquelle",
+    "le", "lequel", "les", "lesquelles", "lesquels", "leur", "leurs", "lorsque", "lui", "là",
+    "ma", "mais", "malgré", "me", "merci", "mes", "mien", "mienne", "miennes", "miens", "moi",
+    "moins", "mon", "moyennant", "même", "mêmes", "n", "ne", "ni", "non", "nos", "notre",
+    "nous", "néanmoins", "nôtre", "nôtres", "on", "ont", "ou", "outre", "où", "par", "parmi",
+    "partant", "pas", "passé", "pendant", "plein", "plus", "plusieurs", "pour", "pourquoi",
+    "proche", "près", "puisque", "qu", "quand", "que", "quel", "quelle", "quelles", "quels",
+    "qui", "quoi", "quoique", "revoici", "revoilà", "s", "sa", "sans", "sauf", "se", "selon",
+    "seront", "ses", "si", "sien", "sienne", "siennes", "siens", "sinon", "soi", "soit",
+    "son", "sont", "sous", "suivant", "sur", "ta", "te", "tes", "tien", "tienne", "tiennes",
+    "tiens", "toi", "ton", "tous", "tout", "toute", "toutes", "tu", "un", "une", "va", "vers",
+    "voici", "voilà", "vos", "votre", "vous", "vu", "vôtre", "vôtres", "y", "à", "ça", "ès",
+    "été", "être", "ô"
+  };
+
+        /**
+         * Contains the stopwords used with the {@link StopFilter}.
+         */
+        private readonly ISet<string> stoptable;
+        /**
+         * Contains words that should be indexed but not stemmed.
+         */
+        //TODO make this final in 3.0
+        private ISet<string> excltable = new HashSet<string>();
+
+        private readonly Version matchVersion;
+
+        /**
+         * Returns an unmodifiable instance of the default stop-words set.
+         * @return an unmodifiable instance of the default stop-words set.
+         */
+        public static ISet<string> GetDefaultStopSet()
+        {
+            return DefaultSetHolder.DEFAULT_STOP_SET;
+        }
+
+        static class DefaultSetHolder
+        {
+            internal static ISet<string> DEFAULT_STOP_SET = CharArraySet.UnmodifiableSet(new CharArraySet(FRENCH_STOP_WORDS, false));
+        }
+
+        /**
+         * Builds an analyzer with the default stop words ({@link #FRENCH_STOP_WORDS}).
+         */
+        public FrenchAnalyzer(Version matchVersion)
+            : this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET)
+        {
+
+        }
+
+        /**
+         * Builds an analyzer with the given stop words
+         * 
+         * @param matchVersion
+         *          lucene compatibility version
+         * @param stopwords
+         *          a stopword set
+         */
+        public FrenchAnalyzer(Version matchVersion, ISet<string> stopwords)
+            : this(matchVersion, stopwords, CharArraySet.EMPTY_SET)
+        {
+        }
+
+        /**
+         * Builds an analyzer with the given stop words
+         * 
+         * @param matchVersion
+         *          lucene compatibility version
+         * @param stopwords
+         *          a stopword set
+         * @param stemExclutionSet
+         *          a stemming exclusion set
+         */
+        public FrenchAnalyzer(Version matchVersion, ISet<string> stopwords, ISet<string> stemExclutionSet)
+        {
+            this.matchVersion = matchVersion;
+            this.stoptable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stopwords));
+            this.excltable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stemExclutionSet));
+        }
+
+
+        /**
+         * Builds an analyzer with the given stop words.
+         * @deprecated use {@link #FrenchAnalyzer(Version, Set)} instead
+         */
+        public FrenchAnalyzer(Version matchVersion, params string[] stopwords)
+            : this(matchVersion, StopFilter.MakeStopSet(stopwords))
+        {
+
+        }
+
+        /**
+         * Builds an analyzer with the given stop words.
+         * @throws IOException
+         * @deprecated use {@link #FrenchAnalyzer(Version, Set)} instead
+         */
+        public FrenchAnalyzer(Version matchVersion, FileInfo stopwords)
+            : this(matchVersion, WordlistLoader.GetWordSet(stopwords))
+        {
+        }
+
+        /**
+         * Builds an exclusionlist from an array of Strings.
+         * @deprecated use {@link #FrenchAnalyzer(Version, Set, Set)} instead
+         */
+        public void SetStemExclusionTable(params string[] exclusionlist)
+        {
+            excltable = StopFilter.MakeStopSet(exclusionlist);
+            SetPreviousTokenStream(null); // force a new stemmer to be created
+        }
+
+        /**
+         * Builds an exclusionlist from a Map.
+         * @deprecated use {@link #FrenchAnalyzer(Version, Set, Set)} instead
+         */
+        public void SetStemExclusionTable(IDictionary<string, string> exclusionlist)
+        {
+            excltable = new HashSet<string>(exclusionlist.Keys);
+            SetPreviousTokenStream(null); // force a new stemmer to be created
+        }
+
+        /**
+         * Builds an exclusionlist from the words contained in the given file.
+         * @throws IOException
+         * @deprecated use {@link #FrenchAnalyzer(Version, Set, Set)} instead
+         */
+        public void SetStemExclusionTable(FileInfo exclusionlist)
+        {
+            excltable = new HashSet<string>(WordlistLoader.GetWordSet(exclusionlist));
+            SetPreviousTokenStream(null); // force a new stemmer to be created
+        }
+
+        /**
+         * Creates a {@link TokenStream} which tokenizes all the text in the provided
+         * {@link Reader}.
+         *
+         * @return A {@link TokenStream} built from a {@link StandardTokenizer} 
+         *         filtered with {@link StandardFilter}, {@link StopFilter}, 
+         *         {@link FrenchStemFilter} and {@link LowerCaseFilter}
+         */
+        public override sealed TokenStream TokenStream(String fieldName, TextReader reader)
+        {
+            TokenStream result = new StandardTokenizer(matchVersion, reader);
+            result = new StandardFilter(result);
+            result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
+                                    result, stoptable);
+            result = new FrenchStemFilter(result, excltable);
+            // Convert to lowercase after stemming!
+            result = new LowerCaseFilter(result);
+            return result;
+        }
+
+        class SavedStreams
+        {
+            protected internal Tokenizer source;
+            protected internal TokenStream result;
+        };
+
+        /**
+         * Returns a (possibly reused) {@link TokenStream} which tokenizes all the 
+         * text in the provided {@link Reader}.
+         *
+         * @return A {@link TokenStream} built from a {@link StandardTokenizer} 
+         *         filtered with {@link StandardFilter}, {@link StopFilter}, 
+         *         {@link FrenchStemFilter} and {@link LowerCaseFilter}
+         */
+        public override TokenStream ReusableTokenStream(String fieldName, TextReader reader)
+        {
+            SavedStreams streams = (SavedStreams)GetPreviousTokenStream();
+            if (streams == null)
+            {
+                streams = new SavedStreams();
+                streams.source = new StandardTokenizer(matchVersion, reader);
+                streams.result = new StandardFilter(streams.source);
+                streams.result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
+                                                streams.result, stoptable);
+                streams.result = new FrenchStemFilter(streams.result, excltable);
+                // Convert to lowercase after stemming!
+                streams.result = new LowerCaseFilter(streams.result);
+                SetPreviousTokenStream(streams);
+            }
+            else
+            {
+                streams.source.Reset(reader);
+            }
+            return streams.result;
+        }
+    }
+}
\ No newline at end of file

Modified: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fr/FrenchStemFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fr/FrenchStemFilter.cs?rev=1204353&r1=1204352&r2=1204353&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fr/FrenchStemFilter.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fr/FrenchStemFilter.cs Mon Nov 21 04:44:55 2011
@@ -20,145 +20,94 @@
 */
 
 using System;
+using System.Collections.Generic;
 using System.IO;
 using System.Text;
 using System.Collections;
 
 using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
 
 namespace Lucene.Net.Analysis.Fr
 {
-	/* ====================================================================
-	 * The Apache Software License, Version 1.1
-	 *
-	 * Copyright (c) 2004 The Apache Software Foundation.  All rights
-	 * reserved.
-	 *
-	 * Redistribution and use in source and binary forms, with or without
-	 * modification, are permitted provided that the following conditions
-	 * are met:
-	 *
-	 * 1. Redistributions of source code must retain the above copyright
-	 *    notice, this list of conditions and the following disclaimer.
-	 *
-	 * 2. Redistributions in binary form must reproduce the above copyright
-	 *    notice, this list of conditions and the following disclaimer in
-	 *    the documentation and/or other materials provided with the
-	 *    distribution.
-	 *
-	 * 3. The end-user documentation included with the redistribution,
-	 *    if any, must include the following acknowledgment:
-	 *       "This product includes software developed by the
-	 *        Apache Software Foundation (http://www.apache.org/)."
-	 *    Alternately, this acknowledgment may appear in the software itself,
-	 *    if and wherever such third-party acknowledgments normally appear.
-	 *
-	 * 4. The names "Apache" and "Apache Software Foundation" and
-	 *    "Apache Lucene" must not be used to endorse or promote products
-	 *    derived from this software without prior written permission. For
-	 *    written permission, please contact apache@apache.org.
-	 *
-	 * 5. Products derived from this software may not be called "Apache",
-	 *    "Apache Lucene", nor may "Apache" appear in their name, without
-	 *    prior written permission of the Apache Software Foundation.
-	 *
-	 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
-	 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-	 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-	 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
-	 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-	 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-	 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-	 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-	 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-	 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-	 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-	 * SUCH DAMAGE.
-	 * ====================================================================
-	 *
-	 * This software consists of voluntary contributions made by many
-	 * individuals on behalf of the Apache Software Foundation.  For more
-	 * information on the Apache Software Foundation, please see
-	 * <http://www.apache.org/>.
-	 */
-
-	/// <summary>
-	/// A filter that stemms french words. It supports a table of words that should
-	/// not be stemmed at all. The used stemmer can be changed at runtime after the
-	/// filter object is created (as long as it is a FrenchStemmer).
-	/// 
-	/// <author>Patrick Talbot (based on Gerhard Schwarz work for German)</author>
-	/// <version>$Id: FrenchAnalyzer.java,v 1.2 2004/01/23 20:54:47 ehatcher Exp $</version>
-	/// </summary>
-	public sealed class FrenchStemFilter : TokenFilter 
-	{
-
-		/// <summary>
-		/// The actual token in the input stream.
-		/// </summary>
-		private Token token = null;
-		private FrenchStemmer stemmer = null;
-		private Hashtable exclusions = null;
-
-		public FrenchStemFilter( TokenStream _in ) : base(_in)
-		{
-			stemmer = new FrenchStemmer();
-		}
-
-		/// <summary>
-		/// Builds a FrenchStemFilter that uses an exclusiontable.
-		/// </summary>
-		public FrenchStemFilter( TokenStream _in, Hashtable exclusiontable ) : 	this( _in )
-		{
-			exclusions = exclusiontable;
-		}
-
-		/// <summary>
-		/// Returns the next token in the stream, or null at EOS
-		/// </summary>
-		/// <returns>
-		/// Returns the next token in the stream, or null at EOS
-		/// </returns>
-		public override Token Next()
-		{
-			if ( ( token = input.Next() ) == null ) 
-			{
-				return null;
-			}
-				// Check the exclusiontable
-			else if ( exclusions != null && exclusions.Contains( token.TermText() ) ) 
-			{
-				return token;
-			}
-			else 
-			{
-				String s = stemmer.Stem( token.TermText() );
-				// If not stemmed, dont waste the time creating a new token
-				if ( !s.Equals( token.TermText() ) ) 
-				{
-					return new Token( s, 0, s.Length, token.Type() );
-				}
-				return token;
-			}
-		}
-
-		/// <summary>
-		/// Set a alternative/custom FrenchStemmer for this filter.
-		/// </summary>
-		public void SetStemmer( FrenchStemmer stemmer ) 
-		{
-			if ( stemmer != null ) 
-			{
-				this.stemmer = stemmer;
-			}
-		}
-
-		/// <summary>
-		/// Set an alternative exclusion list for this filter.
-		/// </summary>
-		public void SetExclusionTable( Hashtable exclusiontable ) 
-		{
-			exclusions = exclusiontable;
-		}
-	}
+    /**
+ * A {@link TokenFilter} that stems French words.
+ * <p>
+ * It supports a table of words that should
+ * not be stemmed at all. The used stemmer can be changed at runtime after the
+ * filter object is created (as long as it is a {@link FrenchStemmer}).
+ * </p>
+ * NOTE: This stemmer does not implement the Snowball algorithm correctly,
+ * especially involving case problems. It is recommended that you consider using
+ * the "French" stemmer in the snowball package instead. This stemmer will likely
+ * be deprecated in a future release.
+ */
+    public sealed class FrenchStemFilter : TokenFilter
+    {
+
+        /**
+         * The actual token in the input stream.
+         */
+        private FrenchStemmer stemmer = null;
+        private ISet<string> exclusions = null;
+
+        private TermAttribute termAtt;
+
+        public FrenchStemFilter(TokenStream _in)
+            : base(_in)
+        {
+
+            stemmer = new FrenchStemmer();
+            termAtt = AddAttribute<TermAttribute>();
+        }
+
+
+        public FrenchStemFilter(TokenStream _in, ISet<string> exclusiontable)
+            : this(_in)
+        {
+            exclusions = exclusiontable;
+        }
+
+        /**
+         * @return  Returns true for the next token in the stream, or false at EOS
+         */
+        public override bool IncrementToken()
+        {
+            if (input.IncrementToken())
+            {
+                String term = termAtt.Term();
+
+                // Check the exclusion table
+                if (exclusions == null || !exclusions.Contains(term))
+                {
+                    String s = stemmer.Stem(term);
+                    // If not stemmed, don't waste the time adjusting the token.
+                    if ((s != null) && !s.Equals(term))
+                        termAtt.SetTermBuffer(s);
+                }
+                return true;
+            }
+            else
+            {
+                return false;
+            }
+        }
+        /**
+         * Set an alternative/custom {@link FrenchStemmer} for this filter.
+         */
+        public void SetStemmer(FrenchStemmer stemmer)
+        {
+            if (stemmer != null)
+            {
+                this.stemmer = stemmer;
+            }
+        }
+        /**
+         * Set an alternative exclusion list for this filter.
+         */
+        public void SetExclusionTable(IDictionary<string, string> exclusiontable)
+        {
+            exclusions = new HashSet<string>(exclusiontable.Keys);
+        }
+    }
 }