You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by cc...@apache.org on 2012/02/28 23:43:28 UTC
[Lucene.Net] svn commit: r1294875 [5/45] - in /incubator/lucene.net/trunk: ./ build/
build/vs2010/contrib/ build/vs2010/test/ doc/ src/ src/contrib/Analyzers/
src/contrib/Analyzers/AR/ src/contrib/Analyzers/BR/
src/contrib/Analyzers/CJK/ src/contrib/Analyzers/Cn/ s...
Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/Nl/DutchAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/Nl/DutchAnalyzer.cs?rev=1294875&r1=1294874&r2=1294875&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/Nl/DutchAnalyzer.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/Nl/DutchAnalyzer.cs Tue Feb 28 22:43:08 2012
@@ -20,198 +20,269 @@
*/
using System;
+using System.Collections.Generic;
using System.IO;
using System.Collections;
using Lucene.Net.Analysis.Standard;
+using Lucene.Net.Support;
+using Version = Lucene.Net.Util.Version;
namespace Lucene.Net.Analysis.Nl
{
-
- /* ====================================================================
- * The Apache Software License, Version 1.1
- *
- * Copyright (c) 2001 The Apache Software Foundation. All rights
- * reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * 3. The end-user documentation included with the redistribution,
- * if any, must include the following acknowledgment:
- * "This product includes software developed by the
- * Apache Software Foundation (http://www.apache.org/)."
- * Alternately, this acknowledgment may appear in the software itself,
- * if and wherever such third-party acknowledgments normally appear.
- *
- * 4. The names "Apache" and "Apache Software Foundation" and
- * "Apache Lucene" must not be used to endorse or promote products
- * derived from this software without prior written permission. For
- * written permission, please contact apache@apache.org.
- *
- * 5. Products derived from this software may not be called "Apache",
- * "Apache Lucene", nor may "Apache" appear in their name, without
- * prior written permission of the Apache Software Foundation.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
- * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- * ====================================================================
- *
- * This software consists of voluntary contributions made by many
- * individuals on behalf of the Apache Software Foundation. For more
- * information on the Apache Software Foundation, please see
- * <http://www.apache.org/>.
- */
-
- /// <summary>
- /// Analyzer for Dutch language. Supports an external list of stopwords (words that
- /// will not be indexed at all), an external list of exclusions (word that will
- /// not be stemmed, but indexed) and an external list of word-stem pairs that overrule
- /// the algorithm (dictionary stemming).
- /// A default set of stopwords is used unless an alternative list is specified, the
- /// exclusion list is empty by default.
- /// <version>$Id: DutchAnalyzer.java,v 1.1 2004/03/09 14:55:08 otis Exp $</version>
- /// </summary>
- /// <author>Edwin de Jonge</author>
- public class DutchAnalyzer : Analyzer
- {
- /// <summary>
- /// List of typical german stopwords.
- /// </summary>
- public static string[] DUTCH_STOP_WORDS =
- {
- "de","en","van","ik","te","dat","die","in","een",
- "hij","het","niet","zijn","is","was","op","aan","met","als","voor","had",
- "er","maar","om","hem","dan","zou","of","wat","mijn","men","dit","zo",
- "door","over","ze","zich","bij","ook","tot","je","mij","uit","der","daar",
- "haar","naar","heb","hoe","heeft","hebben","deze","u","want","nog","zal",
- "me","zij","nu","ge","geen","omdat","iets","worden","toch","al","waren",
- "veel","meer","doen","toen","moet","ben","zonder","kan","hun","dus",
- "alles","onder","ja","eens","hier","wie","werd","altijd","doch","wordt",
- "wezen","kunnen","ons","zelf","tegen","na","reeds","wil","kon","niets",
- "uw","iemand","geweest","andere"
- };
- /// <summary>
- /// Contains the stopwords used with the StopFilter.
- /// </summary>
- private Hashtable stoptable = new Hashtable();
-
- /// <summary>
- /// Contains words that should be indexed but not stemmed.
- /// </summary>
- private Hashtable excltable = new Hashtable();
-
- private Hashtable _stemdict = new Hashtable();
-
- /// <summary>
- /// Builds an analyzer.
- /// </summary>
- public DutchAnalyzer()
- {
- stoptable = StopFilter.MakeStopSet( DUTCH_STOP_WORDS );
- _stemdict.Add("fiets","fiets"); //otherwise fiet
- _stemdict.Add("bromfiets","bromfiets"); //otherwise bromfiet
- _stemdict.Add("ei","eier");
- _stemdict.Add("kind","kinder");
- }
-
- /// <summary>
- /// Builds an analyzer with the given stop words.
- /// </summary>
- /// <param name="stopwords"></param>
- public DutchAnalyzer( String[] stopwords )
- {
- stoptable = StopFilter.MakeStopSet( stopwords );
- }
-
- /// <summary>
- /// Builds an analyzer with the given stop words.
- /// </summary>
- /// <param name="stopwords"></param>
- public DutchAnalyzer( Hashtable stopwords )
- {
- stoptable = stopwords;
- }
-
- /// <summary>
- /// Builds an analyzer with the given stop words.
- /// </summary>
- /// <param name="stopwords"></param>
- public DutchAnalyzer( FileInfo stopwords )
- {
- stoptable = WordlistLoader.GetWordtable( stopwords );
- }
-
- /// <summary>
- /// Builds an exclusionlist from an array of Strings.
- /// </summary>
- /// <param name="exclusionlist"></param>
- public void SetStemExclusionTable( String[] exclusionlist )
- {
- excltable = StopFilter.MakeStopSet( exclusionlist );
- }
-
- /// <summary>
- /// Builds an exclusionlist from a Hashtable.
- /// </summary>
- /// <param name="exclusionlist"></param>
- public void SetStemExclusionTable( Hashtable exclusionlist )
- {
- excltable = exclusionlist;
- }
-
- /// <summary>
- /// Builds an exclusionlist from the words contained in the given file.
- /// </summary>
- /// <param name="exclusionlist"></param>
- public void SetStemExclusionTable(FileInfo exclusionlist)
- {
- excltable = WordlistLoader.GetWordtable(exclusionlist);
- }
-
- /// <summary>
- /// Reads a stemdictionary file , that overrules the stemming algorithm
- /// This is a textfile that contains per line
- /// word\tstem
- /// i.e: tabseperated
- /// </summary>
- /// <param name="stemdict"></param>
- public void SetStemDictionary(FileInfo stemdict)
- {
- _stemdict = WordlistLoader.GetStemDict(stemdict);
- }
-
- /// <summary>
- /// Creates a TokenStream which tokenizes all the text in the provided TextReader.
- /// </summary>
- /// <param name="fieldName"></param>
- /// <param name="reader"></param>
- /// <returns>A TokenStream build from a StandardTokenizer filtered with StandardFilter, StopFilter, GermanStemFilter</returns>
- public override TokenStream TokenStream(String fieldName, TextReader reader)
- {
- TokenStream result = new StandardTokenizer( reader );
- result = new StandardFilter( result );
- result = new StopFilter( result, stoptable );
- result = new DutchStemFilter( result, excltable, _stemdict);
- return result;
- }
- }
+ /**
+ * {@link Analyzer} for Dutch language.
+ * <p>
+ * Supports an external list of stopwords (words that
+ * will not be indexed at all), an external list of exclusions (words that will
+ * not be stemmed, but indexed) and an external list of word-stem pairs that overrule
+ * the algorithm (dictionary stemming).
+ * A default set of stopwords is used unless an alternative list is specified, but the
+ * exclusion list is empty by default.
+ * </p>
+ *
+ * <p><b>NOTE</b>: This class uses the same {@link Version}
+ * dependent settings as {@link StandardAnalyzer}.</p>
+ */
+ public class DutchAnalyzer : Analyzer
+ {
+ /**
+ * List of typical Dutch stopwords.
+ * @deprecated use {@link #getDefaultStopSet()} instead
+ */
+ public static readonly String[] DUTCH_STOP_WORDS =
+ {
+ "de", "en", "van", "ik", "te", "dat", "die", "in", "een",
+ "hij", "het", "niet", "zijn", "is", "was", "op", "aan", "met", "als", "voor", "had",
+ "er", "maar", "om", "hem", "dan", "zou", "of", "wat", "mijn", "men", "dit", "zo",
+ "door", "over", "ze", "zich", "bij", "ook", "tot", "je", "mij", "uit", "der", "daar",
+ "haar", "naar", "heb", "hoe", "heeft", "hebben", "deze", "u", "want", "nog", "zal",
+ "me", "zij", "nu", "ge", "geen", "omdat", "iets", "worden", "toch", "al", "waren",
+ "veel", "meer", "doen", "toen", "moet", "ben", "zonder", "kan", "hun", "dus",
+ "alles", "onder", "ja", "eens", "hier", "wie", "werd", "altijd", "doch", "wordt",
+ "wezen", "kunnen", "ons", "zelf", "tegen", "na", "reeds", "wil", "kon", "niets",
+ "uw", "iemand", "geweest", "andere"
+ };
+ /**
+ * Returns an unmodifiable instance of the default stop-words set.
+ * @return an unmodifiable instance of the default stop-words set.
+ */
+ public static ISet<string> getDefaultStopSet()
+ {
+ return DefaultSetHolder.DEFAULT_STOP_SET;
+ }
+
+ static class DefaultSetHolder
+ {
+ internal static readonly ISet<string> DEFAULT_STOP_SET = CharArraySet
+ .UnmodifiableSet(new CharArraySet(DUTCH_STOP_WORDS, false));
+ }
+
+
+ /**
+ * Contains the stopwords used with the StopFilter.
+ */
+ private readonly ISet<string> stoptable;
+
+ /**
+ * Contains words that should be indexed but not stemmed.
+ */
+ private ISet<string> excltable = new HashSet<string>();
+
+ private IDictionary<String, String> stemdict = new HashMap<String, String>();
+ private readonly Version matchVersion;
+
+ /**
+ * Builds an analyzer with the default stop words ({@link #DUTCH_STOP_WORDS})
+ * and a few default entries for the stem dictionary.
+ *
+ */
+ public DutchAnalyzer(Version matchVersion)
+ : this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET)
+ {
+ stemdict.Add("fiets", "fiets"); //otherwise fiet
+ stemdict.Add("bromfiets", "bromfiets"); //otherwise bromfiet
+ stemdict.Add("ei", "eier");
+ stemdict.Add("kind", "kinder");
+ }
+
+ public DutchAnalyzer(Version matchVersion, ISet<string> stopwords)
+ : this(matchVersion, stopwords, CharArraySet.EMPTY_SET)
+ {
+
+ }
+
+ public DutchAnalyzer(Version matchVersion, ISet<string> stopwords, ISet<string> stemExclusionTable)
+ {
+ stoptable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stopwords));
+ excltable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stemExclusionTable));
+ this.matchVersion = matchVersion;
+ SetOverridesTokenStreamMethod<DutchAnalyzer>();
+ }
+
+ /**
+ * Builds an analyzer with the given stop words.
+ *
+ * @param matchVersion
+ * @param stopwords
+ * @deprecated use {@link #DutchAnalyzer(Version, Set)} instead
+ */
+ public DutchAnalyzer(Version matchVersion, params string[] stopwords)
+ : this(matchVersion, StopFilter.MakeStopSet(stopwords))
+ {
+
+ }
+
+ /**
+ * Builds an analyzer with the given stop words.
+ *
+ * @param stopwords
+ * @deprecated use {@link #DutchAnalyzer(Version, Set)} instead
+ */
+ public DutchAnalyzer(Version matchVersion, HashSet<string> stopwords)
+ : this(matchVersion, (ISet<string>)stopwords)
+ {
+
+ }
+
+ /**
+ * Builds an analyzer with the given stop words.
+ *
+ * @param stopwords
+ * @deprecated use {@link #DutchAnalyzer(Version, Set)} instead
+ */
+ public DutchAnalyzer(Version matchVersion, FileInfo stopwords)
+ {
+ // this is completely broken!
+ SetOverridesTokenStreamMethod<DutchAnalyzer>();
+ try
+ {
+ stoptable = WordlistLoader.GetWordSet(stopwords);
+ }
+ catch (IOException e)
+ {
+ // TODO: throw IOException
+ throw new Exception("", e);
+ }
+ this.matchVersion = matchVersion;
+ }
+
+ /**
+ * Builds an exclusionlist from an array of Strings.
+ *
+ * @param exclusionlist
+ * @deprecated use {@link #DutchAnalyzer(Version, Set, Set)} instead
+ */
+ public void SetStemExclusionTable(params string[] exclusionlist)
+ {
+ excltable = StopFilter.MakeStopSet(exclusionlist);
+ SetPreviousTokenStream(null); // force a new stemmer to be created
+ }
+
+ /**
+ * Builds an exclusion list from a HashSet.
+ * @deprecated use {@link #DutchAnalyzer(Version, Set, Set)} instead
+ */
+ public void SetStemExclusionTable(HashSet<string> exclusionlist)
+ {
+ excltable = exclusionlist;
+ SetPreviousTokenStream(null); // force a new stemmer to be created
+ }
+
+ /**
+ * Builds an exclusionlist from the words contained in the given file.
+ * @deprecated use {@link #DutchAnalyzer(Version, Set, Set)} instead
+ */
+ public void SetStemExclusionTable(FileInfo exclusionlist)
+ {
+ try
+ {
+ excltable = WordlistLoader.GetWordSet(exclusionlist);
+ SetPreviousTokenStream(null); // force a new stemmer to be created
+ }
+ catch (IOException e)
+ {
+ // TODO: throw IOException
+ throw new Exception("", e);
+ }
+ }
+
+ /**
+ * Reads a stem dictionary file that overrules the stemming algorithm.
+ * This is a text file that contains, per line,
+ * <tt>word<b>\t</b>stem</tt>, i.e. two tab-separated words.
+ */
+ public void SetStemDictionary(FileInfo stemdictFile)
+ {
+ try
+ {
+ stemdict = WordlistLoader.GetStemDict(stemdictFile);
+ SetPreviousTokenStream(null); // force a new stemmer to be created
+ }
+ catch (IOException e)
+ {
+ // TODO: throw IOException
+ throw new Exception(string.Empty, e);
+ }
+ }
+
+ /**
+ * Creates a {@link TokenStream} which tokenizes all the text in the
+ * provided {@link TextReader}.
+ *
+ * @return A {@link TokenStream} built from a {@link StandardTokenizer}
+ * filtered with {@link StandardFilter}, {@link StopFilter},
+ * and {@link DutchStemFilter}
+ */
+ public override TokenStream TokenStream(String fieldName, TextReader reader)
+ {
+ TokenStream result = new StandardTokenizer(matchVersion, reader);
+ result = new StandardFilter(result);
+ result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
+ result, stoptable);
+ result = new DutchStemFilter(result, excltable, stemdict);
+ return result;
+ }
+
+ class SavedStreams
+ {
+ protected internal Tokenizer source;
+ protected internal TokenStream result;
+ };
+
+ /**
+ * Returns a (possibly reused) {@link TokenStream} which tokenizes all the
+ * text in the provided {@link TextReader}.
+ *
+ * @return A {@link TokenStream} built from a {@link StandardTokenizer}
+ * filtered with {@link StandardFilter}, {@link StopFilter},
+ * and {@link DutchStemFilter}
+ */
+ public override TokenStream ReusableTokenStream(String fieldName, TextReader reader)
+ {
+ if (overridesTokenStreamMethod)
+ {
+ // LUCENE-1678: force fallback to tokenStream() if we
+ // have been subclassed and that subclass overrides
+ // tokenStream but not reusableTokenStream
+ return TokenStream(fieldName, reader);
+ }
+
+ SavedStreams streams = (SavedStreams)GetPreviousTokenStream();
+ if (streams == null)
+ {
+ streams = new SavedStreams();
+ streams.source = new StandardTokenizer(matchVersion, reader);
+ streams.result = new StandardFilter(streams.source);
+ streams.result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
+ streams.result, stoptable);
+ streams.result = new DutchStemFilter(streams.result, excltable, stemdict);
+ SetPreviousTokenStream(streams);
+ }
+ else
+ {
+ streams.source.Reset(reader);
+ }
+ return streams.result;
+ }
+ }
}
\ No newline at end of file
Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/Nl/DutchStemFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/Nl/DutchStemFilter.cs?rev=1294875&r1=1294874&r2=1294875&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/Nl/DutchStemFilter.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/Nl/DutchStemFilter.cs Tue Feb 28 22:43:08 2012
@@ -20,167 +20,113 @@
*/
using System;
+using System.Collections.Generic;
using System.IO;
using System.Collections;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Support;
namespace Lucene.Net.Analysis.Nl
{
-
- /* ====================================================================
- * The Apache Software License, Version 1.1
- *
- * Copyright (c) 2001 The Apache Software Foundation. All rights
- * reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * 3. The end-user documentation included with the redistribution,
- * if any, must include the following acknowledgment:
- * "This product includes software developed by the
- * Apache Software Foundation (http://www.apache.org/)."
- * Alternately, this acknowledgment may appear in the software itself,
- * if and wherever such third-party acknowledgments normally appear.
- *
- * 4. The names "Apache" and "Apache Software Foundation" and
- * "Apache Lucene" must not be used to endorse or promote products
- * derived from this software without prior written permission. For
- * written permission, please contact apache@apache.org.
- *
- * 5. Products derived from this software may not be called "Apache",
- * "Apache Lucene", nor may "Apache" appear in their name, without
- * prior written permission of the Apache Software Foundation.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
- * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- * ====================================================================
- *
- * This software consists of voluntary contributions made by many
- * individuals on behalf of the Apache Software Foundation. For more
- * information on the Apache Software Foundation, please see
- * <http://www.apache.org/>.
- */
-
- /// <summary>
- /// A filter that stems Dutch words. It supports a table of words that should
- /// not be stemmed at all. The stemmer used can be changed at runtime after the
- /// filter object is created (as long as it is a DutchStemmer).
- ///
- /// <version>$Id: DutchStemFilter.java,v 1.1 2004/03/09 14:55:08 otis Exp $</version>
- /// </summary>
- /// <author>Edwin de Jonge</author>
- public sealed class DutchStemFilter : TokenFilter
- {
- /// <summary>
- /// The actual token in the input stream.
- /// </summary>
- private Token token = null;
- private DutchStemmer stemmer = null;
- private Hashtable exclusions = null;
-
- public DutchStemFilter( TokenStream _in ) : base(_in)
- {
- stemmer = new DutchStemmer();
- }
-
- /// <summary>
- /// Builds a DutchStemFilter that uses an exclusiontable.
- /// </summary>
- /// <param name="_in"></param>
- /// <param name="exclusiontable"></param>
- public DutchStemFilter( TokenStream _in, Hashtable exclusiontable ): this(_in)
- {
- exclusions = exclusiontable;
- }
-
- /// <summary>
- ///
- /// </summary>
- /// <param name="_in"></param>
- /// <param name="exclusiontable"></param>
- /// <param name="stemdictionary">Dictionary of word stem pairs, that overrule the algorithm</param>
- public DutchStemFilter( TokenStream _in, Hashtable exclusiontable , Hashtable stemdictionary): this(_in, exclusiontable)
- {
- stemmer.SetStemDictionary(stemdictionary);
- }
-
- /// <summary>
- /// </summary>
- /// <returns>Returns the next token in the stream, or null at EOS</returns>
- public override Token Next()
-
- {
- if ( ( token = input.Next() ) == null )
- {
- return null;
- }
- // Check the exclusiontable
- else if ( exclusions != null && exclusions.Contains( token.TermText() ) )
- {
- return token;
- }
- else
- {
- String s = stemmer.Stem( token.TermText() );
- // If not stemmed, dont waste the time creating a new token
- if ( !s.Equals( token.TermText() ) )
- {
- return new Token( s, token.StartOffset(),
- token.EndOffset(), token.Type() );
- }
- return token;
- }
- }
-
- /// <summary>
- /// Set a alternative/custom DutchStemmer for this filter.
- /// </summary>
- /// <param name="stemmer"></param>
- public void SetStemmer( DutchStemmer stemmer )
- {
- if ( stemmer != null )
- {
- this.stemmer = stemmer;
- }
- }
-
- /// <summary>
- /// Set an alternative exclusion list for this filter.
- /// </summary>
- /// <param name="exclusiontable"></param>
- public void SetExclusionTable( Hashtable exclusiontable )
- {
- exclusions = exclusiontable;
- }
-
- /// <summary>
- /// Set dictionary for stemming, this dictionary overrules the algorithm,
- /// so you can correct for a particular unwanted word-stem pair.
- /// </summary>
- /// <param name="dict"></param>
- public void SetStemDictionary(Hashtable dict)
- {
- if (stemmer != null)
- stemmer.SetStemDictionary(dict);
- }
- }
+ /**
+ * A {@link TokenFilter} that stems Dutch words.
+ * <p>
+ * It supports a table of words that should
+ * not be stemmed at all. The stemmer used can be changed at runtime after the
+ * filter object is created (as long as it is a {@link DutchStemmer}).
+ * </p>
+ * NOTE: This stemmer does not implement the Snowball algorithm correctly,
+ * specifically doubled consonants. It is recommended that you consider using
+ * the "Dutch" stemmer in the snowball package instead. This stemmer will likely
+ * be deprecated in a future release.
+ */
+ public sealed class DutchStemFilter : TokenFilter
+ {
+ /**
+ * The stemmer used to stem each term that is not in the exclusion table.
+ */
+ private DutchStemmer stemmer = null;
+ private ISet<string> exclusions = null;
+
+ private TermAttribute termAtt;
+
+ public DutchStemFilter(TokenStream _in)
+ : base(_in)
+ {
+ stemmer = new DutchStemmer();
+ termAtt = AddAttribute<TermAttribute>();
+ }
+
+ /**
+ * Builds a DutchStemFilter that uses an exclusion table.
+ */
+ public DutchStemFilter(TokenStream _in, ISet<string> exclusiontable)
+ : this(_in)
+ {
+ exclusions = exclusiontable;
+ }
+
+ /**
+ * @param stemdictionary Dictionary of word stem pairs, that overrule the algorithm
+ */
+ public DutchStemFilter(TokenStream _in, ISet<string> exclusiontable, IDictionary<string, string> stemdictionary)
+ : this(_in, exclusiontable)
+ {
+ stemmer.SetStemDictionary(stemdictionary);
+ }
+
+ /**
+ * Advances the input to the next token and stems its term unless it is excluded; returns false at end of stream.
+ */
+ public override bool IncrementToken()
+ {
+ if (input.IncrementToken())
+ {
+ String term = termAtt.Term();
+
+ // Check the exclusion table.
+ if (exclusions == null || !exclusions.Contains(term))
+ {
+ String s = stemmer.Stem(term);
+ // If not stemmed, don't waste the time adjusting the token.
+ if ((s != null) && !s.Equals(term))
+ termAtt.SetTermBuffer(s);
+ }
+ return true;
+ }
+ else
+ {
+ return false;
+ }
+ }
+
+ /**
+ * Set an alternative/custom {@link DutchStemmer} for this filter.
+ */
+ public void SetStemmer(DutchStemmer stemmer)
+ {
+ if (stemmer != null)
+ {
+ this.stemmer = stemmer;
+ }
+ }
+
+ /**
+ * Set an alternative exclusion list for this filter.
+ */
+ public void SetExclusionTable(HashSet<string> exclusiontable)
+ {
+ exclusions = exclusiontable;
+ }
+
+ /**
+ * Set dictionary for stemming, this dictionary overrules the algorithm,
+ * so you can correct for a particular unwanted word-stem pair.
+ */
+ public void SetStemDictionary(IDictionary<string, string> dict)
+ {
+ if (stemmer != null)
+ stemmer.SetStemDictionary(dict);
+ }
+ }
}
\ No newline at end of file
Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/Nl/DutchStemmer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/Nl/DutchStemmer.cs?rev=1294875&r1=1294874&r2=1294875&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/Nl/DutchStemmer.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/Nl/DutchStemmer.cs Tue Feb 28 22:43:08 2012
@@ -23,484 +23,440 @@ using System;
using System.IO;
using System.Text;
using System.Collections;
+using System.Collections.Generic;
namespace Lucene.Net.Analysis.Nl
{
-
- /* ====================================================================
- * The Apache Software License, Version 1.1
- *
- * Copyright (c) 2001 The Apache Software Foundation. All rights
- * reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * 3. The end-user documentation included with the redistribution,
- * if any, must include the following acknowledgment:
- * "This product includes software developed by the
- * Apache Software Foundation (http://www.apache.org/)."
- * Alternately, this acknowledgment may appear in the software itself,
- * if and wherever such third-party acknowledgments normally appear.
- *
- * 4. The names "Apache" and "Apache Software Foundation" and
- * "Apache Lucene" must not be used to endorse or promote products
- * derived from this software without prior written permission. For
- * written permission, please contact apache@apache.org.
- *
- * 5. Products derived from this software may not be called "Apache",
- * "Apache Lucene", nor may "Apache" appear in their name, without
- * prior written permission of the Apache Software Foundation.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
- * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- * ====================================================================
- *
- * This software consists of voluntary contributions made by many
- * individuals on behalf of the Apache Software Foundation. For more
- * information on the Apache Software Foundation, please see
- * <http://www.apache.org/>.
- */
-
- /// <summary>
- /// A stemmer for Dutch words. The algorithm is an implementation of
- /// the <see c="http://snowball.tartarus.org/dutch/stemmer.html">dutch stemming</see>
- /// algorithm in snowball. Snowball is a project of Martin Porter (does Porter Stemmer ring a bell?):
- ///
- /// @version $Id: DutchStemmer.java,v 1.1 2004/03/09 14:55:08 otis Exp $
- /// </summary>
- /// <author>Edwin de Jonge (ejne@cbs.nl)</author>
- public class DutchStemmer
- {
- /// <summary>
- /// Buffer for the terms while stemming them.
- /// </summary>
- private StringBuilder sb = new StringBuilder();
- private bool _removedE;
- private Hashtable _stemDict;
-
-
- private int _R1;
- private int _R2;
-
- /// <summary>
- /// Stemms the given term to an unique <tt>discriminator</tt>.
- /// </summary>
- /// <param name="term">The term that should be stemmed.</param>
- /// <returns>Discriminator for <tt>term</tt></returns>
- //TODO convert to internal
- public string Stem( String term )
- {
- term = term.ToLower();
- if ( !IsStemmable( term ) )
- return term;
- if (_stemDict != null && _stemDict.Contains(term))
- return _stemDict[term] as string;
- // Reset the StringBuilder.
- sb.Remove(0, sb.Length);
- sb.Insert(0, term);
- // Stemming starts here...
- Substitute(sb);
- StoreYandI(sb);
- _R1 = GetRIndex(sb, 0);
- _R1 = Math.Max(3,_R1);
- Step1(sb);
- Step2(sb);
- _R2 = GetRIndex(sb, _R1);
- Step3a(sb);
- Step3b(sb);
- Step4(sb);
- ReStoreYandI(sb);
- return sb.ToString();
- }
-
- private bool enEnding(StringBuilder sb)
- {
- string[] enend = new string[]{"ene","en"};
- foreach(string end in enend)
- {
- string s = sb.ToString();
- int index = s.Length - end.Length;
- if ( s.EndsWith(end) &&
- index >= _R1 &&
- IsValidEnEnding(sb,index-1)
- )
- {
- sb.Remove(index, end.Length);
- UnDouble(sb,index);
- return true;
- }
- }
- return false;
- }
-
-
- private void Step1(StringBuilder sb)
- {
- if (_R1 >= sb.Length)
- return;
-
- string s = sb.ToString();
- int lengthR1 = sb.Length - _R1;
- int index;
-
- if (s.EndsWith("heden"))
- {
- sb.Replace("heden","heid", _R1, lengthR1);
- return;
- }
-
- if (enEnding(sb))
- return;
-
- if (s.EndsWith("se") &&
- (index = s.Length - 2) >= _R1 &&
- IsValidSEnding(sb, index -1)
- )
- {
- sb.Remove(index, 2);
- return;
- }
- if (s.EndsWith("s") &&
- (index = s.Length - 1) >= _R1 &&
- IsValidSEnding(sb, index - 1))
- {
- sb.Remove(index, 1);
- }
- }
-
- /// <summary>
- /// Delete suffix e if in R1 and
- /// preceded by a non-vowel, and then undouble the ending
- /// </summary>
- /// <param name="sb">string being stemmed</param>
- private void Step2(StringBuilder sb)
- {
- _removedE = false;
- if (_R1 >= sb.Length)
- return;
- string s = sb.ToString();
- int index = s.Length - 1;
- if ( index >= _R1 &&
- s.EndsWith("e") &&
- !IsVowel(sb[index-1]))
- {
- sb.Remove(index,1);
- UnDouble(sb);
- _removedE = true;
- }
- }
-
- /// <summary>
- /// Delete "heid"
- /// </summary>
- /// <param name="sb">string being stemmed</param>
- private void Step3a(StringBuilder sb)
- {
- if (_R2 >= sb.Length)
- return;
- string s = sb.ToString();
- int index = s.Length - 4;
- if (s.EndsWith("heid")&& index >= _R2 && sb[index - 1] != 'c')
- {
- sb.Remove(index,4); //remove heid
- enEnding(sb);
- }
- }
-
- /// <summary>
- /// <p>A d-suffix, or derivational suffix, enables a new word,
- /// often with a different grammatical category, or with a different
- /// sense, to be built from another word. Whether a d-suffix can be
- /// attached is discovered not from the rules of grammar, but by
- /// referring to a dictionary. So in English, ness can be added to
- /// certain adjectives to form corresponding nouns (littleness,
- /// kindness, foolishness ...) but not to all adjectives
- /// (not for example, to big, cruel, wise ...) d-suffixes can be
- /// used to change meaning, often in rather exotic ways.</p>
- /// Remove "ing", "end", "ig", "lijk", "baar" and "bar"
- /// </summary>
- /// <param name="sb">string being stemmed</param>
- private void Step3b(StringBuilder sb)
- {
- if (_R2 >= sb.Length)
- return;
- string s = sb.ToString();
- int index;
-
- if ((s.EndsWith("end") || s.EndsWith("ing")) &&
- (index = s.Length - 3) >= _R2
- )
- {
- sb.Remove(index,3);
- if (sb[index - 2] == 'i' &&
- sb[index - 1] == 'g')
- {
- if (sb[index - 3] != 'e' & index-2 >= _R2)
- {
- index -= 2;
- sb.Remove(index,2);
- }
- }
- else
- {
- UnDouble(sb,index);
- }
- return;
- }
- if ( s.EndsWith("ig") &&
- (index = s.Length - 2) >= _R2
- )
- {
- if (sb[index - 1] != 'e')
- sb.Remove(index, 2);
- return;
- }
- if (s.EndsWith("lijk") &&
- (index = s.Length - 4) >= _R2
- )
- {
- sb.Remove(index, 4);
- Step2(sb);
- return;
- }
- if (s.EndsWith("baar") &&
- (index = s.Length - 4) >= _R2
- )
- {
- sb.Remove(index, 4);
- return;
- }
- if (s.EndsWith("bar") &&
- (index = s.Length - 3) >= _R2
- )
- {
- if (_removedE)
- sb.Remove(index, 3);
- return;
- }
- }
-
- /// <summary>
- /// undouble vowel
- /// If the words ends CVD, where C is a non-vowel, D is a non-vowel other than I, and V is double a, e, o or u, remove one of the vowels from V (for example, maan -> man, brood -> brod).
- /// </summary>
- /// <param name="sb">string being stemmed</param>
- private void Step4(StringBuilder sb)
- {
- if (sb.Length < 4)
- return;
- string end = sb.ToString(sb.Length - 4,4);
- char c = end[0];
- char v1 = end[1];
- char v2 = end[2];
- char d = end[3];
- if (v1 == v2 &&
- d != 'I' &&
- v1 != 'i' &&
- IsVowel(v1) &&
- !IsVowel(d) &&
- !IsVowel(c))
- {
- sb.Remove(sb.Length - 2, 1);
- }
- }
-
- /// <summary>
- /// Checks if a term could be stemmed.
- /// </summary>
- /// <param name="term"></param>
- /// <returns>true if, and only if, the given term consists in letters.</returns>
- private bool IsStemmable( String term )
- {
- for ( int c = 0; c < term.Length; c++ )
- {
- if ( !Char.IsLetter(term[c])) return false;
- }
- return true;
- }
-
- /// <summary>
- /// Substitute ä, ë, ï, ö, ü, á , é, Ã, ó, ú
- /// </summary>
- /// <param name="buffer"></param>
- private void Substitute( StringBuilder buffer )
- {
- for ( int i = 0; i < buffer.Length; i++ )
- {
- switch (buffer[i])
- {
- case 'ä':
- case 'á':
- {
- buffer[i] = 'a';
- break;
- }
- case 'ë':
- case 'é':
- {
- buffer[i] = 'e';
- break;
- }
- case 'ü':
- case 'ú':
- {
- buffer[i] = 'u';
- break;
- }
- case 'ï':
- case 'i':
- {
- buffer[i] = 'i';
- break;
- }
- case 'ö':
- case 'ó':
- {
- buffer[i] = 'o';
- break;
- }
- }
- }
- }
-
-// private bool IsValidSEnding(StringBuilder sb)
-// {
-// return IsValidSEnding(sb,sb.Length - 1);
-// }
-
- private bool IsValidSEnding(StringBuilder sb, int index)
- {
- char c = sb[index];
- if (IsVowel(c) || c == 'j')
- return false;
- return true;
- }
-
-// private bool IsValidEnEnding(StringBuilder sb)
-// {
-// return IsValidEnEnding(sb,sb.Length - 1);
-// }
-
- private bool IsValidEnEnding(StringBuilder sb, int index)
- {
- char c = sb[index];
- if (IsVowel(c))
- return false;
- if (c < 3)
- return false;
- // ends with "gem"?
- if (c == 'm' && sb[index - 2] == 'g' && sb[index-1] == 'e')
- return false;
- return true;
- }
-
- private void UnDouble(StringBuilder sb)
- {
- UnDouble(sb, sb.Length);
- }
-
- private void UnDouble(StringBuilder sb, int endIndex)
- {
- string s = sb.ToString(0, endIndex);
- if (s.EndsWith("kk") || s.EndsWith("tt") || s.EndsWith("dd") || s.EndsWith("nn") || s.EndsWith("mm") || s.EndsWith("ff"))
- {
- sb.Remove(endIndex-1,1);
- }
- }
-
- private int GetRIndex(StringBuilder sb, int start)
- {
- if (start == 0)
- start = 1;
- int i = start;
- for (; i < sb.Length; i++)
- {
- //first non-vowel preceded by a vowel
- if (!IsVowel(sb[i]) && IsVowel(sb[i-1]))
- {
- return i + 1;
- }
- }
- return i + 1;
- }
-
- private void StoreYandI(StringBuilder sb)
- {
- if (sb[0] == 'y')
- sb[0] = 'Y';
- //char c;
- int last = sb.Length - 1;
- for (int i = 1; i < last; i++)
- {
- switch (sb[i])
- {
- case 'i':
- {
- if (IsVowel(sb[i-1]) &&
- IsVowel(sb[i+1])
- )
- sb[i] = 'I';
- break;
- }
- case 'y':
- {
- if (IsVowel(sb[i-1]))
- sb[i] = 'Y';
- break;
- }
- }
- }
- if (last > 0 && sb[last]=='y' && IsVowel(sb[last-1]))
- sb[last]='Y';
- }
-
- private void ReStoreYandI(StringBuilder sb)
- {
- sb.Replace("I","i");
- sb.Replace("Y","y");
- }
-
- private bool IsVowel(char c)
- {
- switch (c)
- {
- case 'e':
- case 'a':
- case 'o':
- case 'i':
- case 'u':
- case 'y':
- case 'è':
- {
- return true;
- }
- }
- return false;
- }
-
- internal void SetStemDictionary(Hashtable dict)
- {
- _stemDict = dict;
- }
- }
+ /**
+ * A stemmer for Dutch words.
+ * <p>
+ * The algorithm is an implementation of
+ * the <a href="http://snowball.tartarus.org/algorithms/dutch/stemmer.html">dutch stemming</a>
+ * algorithm in Martin Porter's snowball project.
+ * </p>
+ */
+
+ public class DutchStemmer
+ {
+ /**
+ * Buffer for the terms while stemming them.
+ */
+ private StringBuilder sb = new StringBuilder();
+ private bool _removedE;
+ private IDictionary<string, string> _stemDict;
+
+ private int _R1;
+ private int _R2;
+
+ //TODO convert to internal
+ /*
+  * Stems the given term to an unique <tt>discriminator</tt>.
+  * The term is lower-cased first. A term containing a non-letter is returned
+  * lower-cased but otherwise untouched, and an entry in the stem dictionary
+  * takes precedence over the algorithm.
+  *
+  * @param term  The term that should be stemmed.
+  * @return Discriminator for <tt>term</tt>; the dictionary value (possibly null)
+  *         when the stem dictionary contains the term.
+  */
+ public String Stem(String term)
+ {
+     // NOTE(review): ToLower() uses the current culture -- confirm this is intended.
+     term = term.ToLower();
+     if (!isStemmable(term))
+         return term;
+
+     // A dictionary entry overrules the algorithm. One TryGetValue replaces the
+     // former ContainsKey plus two indexer reads; a null value stored for the
+     // key is still returned as null.
+     String overruled;
+     if (_stemDict != null && _stemDict.TryGetValue(term, out overruled))
+         return overruled;
+
+     // An empty term has nothing to stem. Without this guard storeYandI would
+     // read sb[0] from an empty buffer and throw.
+     if (term.Length == 0)
+         return term;
+
+     // Reset the StringBuilder.
+     sb.Clear();
+     sb.Insert(0, term);
+     // Stemming starts here...
+     substitute(sb);
+     storeYandI(sb);
+     // R1 is never allowed to start before position 3.
+     _R1 = Math.Max(3, getRIndex(sb, 0));
+     step1(sb);
+     step2(sb);
+     // R2 is located on the buffer as it stands after steps 1 and 2.
+     _R2 = getRIndex(sb, _R1);
+     step3a(sb);
+     step3b(sb);
+     step4(sb);
+     reStoreYandI(sb);
+     return sb.ToString();
+ }
+
+ /**
+  * Tries to strip a trailing "ene" or "en" that starts at or after _R1 and
+  * follows a valid en-ending; the remaining ending is then undoubled.
+  *
+  * @param sb String being stemmed
+  * @return true if a suffix was removed
+  */
+ private bool enEnding(StringBuilder sb)
+ {
+     // "ene" is tried before "en".
+     foreach (String suffix in new String[] { "ene", "en" })
+     {
+         String text = sb.ToString();
+         int cut = text.Length - suffix.Length;
+         if (!text.EndsWith(suffix) || cut < _R1 || !isValidEnEnding(sb, cut - 1))
+             continue;
+
+         sb.Remove(cut, suffix.Length);
+         unDouble(sb, cut);
+         return true;
+     }
+     return false;
+ }
+
+
+ /**
+  * Step 1: rewrites "heden" to "heid" inside R1, otherwise removes an
+  * en-ending, otherwise removes a trailing "se" or "s" that lies in R1 and
+  * follows a valid s-ending.
+  *
+  * @param sb String being stemmed
+  */
+ private void step1(StringBuilder sb)
+ {
+     if (_R1 >= sb.Length)
+         return;
+
+     String word = sb.ToString();
+
+     if (word.EndsWith("heden"))
+     {
+         // Only the text from _R1 onwards is rewritten.
+         int tailLength = sb.Length - _R1;
+         String rewritten = sb.ToString(_R1, tailLength).Replace("heden", "heid");
+         sb.Remove(_R1, tailLength).Insert(_R1, rewritten);
+         return;
+     }
+
+     if (enEnding(sb))
+         return;
+
+     int cut = word.Length - 2;
+     if (word.EndsWith("se") && cut >= _R1 && isValidSEnding(sb, cut - 1))
+     {
+         sb.Remove(cut, 2);
+         return;
+     }
+
+     cut = word.Length - 1;
+     if (word.EndsWith("s") && cut >= _R1 && isValidSEnding(sb, cut - 1))
+         sb.Remove(cut, 1);
+ }
+
+ /**
+  * Step 2: removes a final "e" when it lies in R1 and follows a non-vowel,
+  * then undoubles the ending. Records in _removedE whether the "e" was removed.
+  *
+  * @param sb String being stemmed
+  */
+ private void step2(StringBuilder sb)
+ {
+     _removedE = false;
+     if (_R1 >= sb.Length)
+         return;
+
+     int last = sb.Length - 1;
+     bool removable = last >= _R1
+         && sb.ToString().EndsWith("e")
+         && !isVowel(sb[last - 1]);
+     if (!removable)
+         return;
+
+     sb.Remove(last, 1);
+     unDouble(sb);
+     _removedE = true;
+ }
+
+ /**
+  * Step 3a: removes a final "heid" when it lies in R2 and is not preceded
+  * by 'c', then tries to remove an en-ending from what is left.
+  *
+  * @param sb String being stemmed
+  */
+ private void step3a(StringBuilder sb)
+ {
+     if (_R2 >= sb.Length)
+         return;
+
+     int cut = sb.Length - 4;
+     if (!sb.ToString().EndsWith("heid") || cut < _R2 || sb[cut - 1] == 'c')
+         return;
+
+     sb.Remove(cut, 4); // drop "heid"
+     enEnding(sb);
+ }
+
+ /**
+  * Step 3b: removes the derivational suffixes "ing", "end", "ig", "lijk",
+  * "baar" and "bar" when they lie in R2. Only the first suffix that matches
+  * is handled:
+  * - "end"/"ing": removed; if "ig" is left at the end, that is removed too
+  *   when it lies in R2 and is not preceded by 'e'; otherwise the ending is
+  *   undoubled.
+  * - "ig": removed unless preceded by 'e'.
+  * - "lijk": removed, then step 2 is repeated.
+  * - "baar": removed.
+  * - "bar": removed only if step 2 removed an "e".
+  *
+  * @param sb String being stemmed
+  */
+ private void step3b(StringBuilder sb)
+ {
+     if (_R2 >= sb.Length)
+         return;
+
+     String word = sb.ToString();
+     int length = word.Length;
+     int cut;
+
+     if ((word.EndsWith("end") || word.EndsWith("ing")) && length - 3 >= _R2)
+     {
+         cut = length - 3;
+         sb.Remove(cut, 3);
+         bool leavesIg = sb[cut - 2] == 'i' && sb[cut - 1] == 'g';
+         if (!leavesIg)
+         {
+             unDouble(sb, cut);
+         }
+         else if (sb[cut - 3] != 'e' && cut - 2 >= _R2)
+         {
+             sb.Remove(cut - 2, 2);
+         }
+         return;
+     }
+
+     if (word.EndsWith("ig") && length - 2 >= _R2)
+     {
+         cut = length - 2;
+         if (sb[cut - 1] != 'e')
+             sb.Remove(cut, 2);
+         return;
+     }
+
+     if (word.EndsWith("lijk") && length - 4 >= _R2)
+     {
+         sb.Remove(length - 4, 4);
+         step2(sb);
+         return;
+     }
+
+     if (word.EndsWith("baar") && length - 4 >= _R2)
+     {
+         sb.Remove(length - 4, 4);
+         return;
+     }
+
+     if (word.EndsWith("bar") && length - 3 >= _R2)
+     {
+         if (_removedE)
+             sb.Remove(length - 3, 3);
+         return;
+     }
+ }
+
+ /**
+  * Step 4: undouble vowel.
+  * If the word ends CVD, where C is a non-vowel, D is a non-vowel other than I,
+  * and V is a doubled vowel other than i, one of the two vowels is removed
+  * (for example, maan -> man, brood -> brod).
+  *
+  * @param sb String being stemmed
+  */
+ private void step4(StringBuilder sb)
+ {
+     int n = sb.Length;
+     if (n < 4)
+         return;
+
+     char before = sb[n - 4];
+     char first = sb[n - 3];
+     char second = sb[n - 2];
+     char after = sb[n - 1];
+
+     bool doubledVowel = first == second && first != 'i' && isVowel(first);
+     bool closedByConsonants = after != 'I' && !isVowel(after) && !isVowel(before);
+     if (doubledVowel && closedByConsonants)
+         sb.Remove(n - 2, 1);
+ }
+
+ /**
+  * Checks if a term could be stemmed.
+  *
+  * @return true if, and only if, every character of the term is a letter
+  *         (an empty term therefore also yields true).
+  */
+ private bool isStemmable(String term)
+ {
+     foreach (char ch in term)
+     {
+         if (!char.IsLetter(ch))
+             return false;
+     }
+     return true;
+ }
+
+ /**
+  * Replaces the accented vowels ä, ë, ï, ö, ü, á, é, í, ó, ú in the buffer by
+  * their unaccented base letter. Other characters are left untouched.
+  *
+  * The accented characters are written as Unicode escapes so the source does
+  * not depend on the file encoding. The former redundant 'i' case (which mapped
+  * 'i' onto itself) now handles í, as the list above requires.
+  *
+  * @param buffer String being stemmed; modified in place
+  */
+ private void substitute(StringBuilder buffer)
+ {
+     for (int i = 0; i < buffer.Length; i++)
+     {
+         switch (buffer[i])
+         {
+             case '\u00E4': // ä
+             case '\u00E1': // á
+                 buffer[i] = 'a';
+                 break;
+             case '\u00EB': // ë
+             case '\u00E9': // é
+                 buffer[i] = 'e';
+                 break;
+             case '\u00FC': // ü
+             case '\u00FA': // ú
+                 buffer[i] = 'u';
+                 break;
+             case '\u00EF': // ï
+             case '\u00ED': // í
+                 buffer[i] = 'i';
+                 break;
+             case '\u00F6': // ö
+             case '\u00F3': // ó
+                 buffer[i] = 'o';
+                 break;
+         }
+     }
+ }
+
+ /*private bool isValidSEnding(StringBuilder sb) {
+ return isValidSEnding(sb, sb.Length - 1);
+ }*/
+
+ /**
+  * A valid s-ending is a character that is neither a vowel nor 'j'.
+  *
+  * @param sb    String being stemmed
+  * @param index position of the character to check
+  */
+ private bool isValidSEnding(StringBuilder sb, int index)
+ {
+     char preceding = sb[index];
+     return !(isVowel(preceding) || preceding == 'j');
+ }
+
+ /*private bool isValidEnEnding(StringBuilder sb) {
+ return isValidEnEnding(sb, sb.Length - 1);
+ }*/
+
+ /**
+ * Decides whether the character at <tt>index</tt> may precede an "en"/"ene"
+ * suffix that is about to be removed: it must not be a vowel, and it must not
+ * be the final 'm' of "gem".
+ */
+ private bool isValidEnEnding(StringBuilder sb, int index)
+ {
+ char c = sb[index];
+ if (isVowel(c))
+ return false;
+ // NOTE(review): this compares the character itself with 3, which no letter
+ // satisfies, so the branch looks unreachable for stemmable input. It may have
+ // been meant as a position or length check -- confirm before changing.
+ if (c < 3)
+ return false;
+ // ends with "gem"?
+ if (c == 'm' && sb[index - 2] == 'g' && sb[index - 1] == 'e')
+ return false;
+ return true;
+ }
+
+ /**
+  * Undoubles the very end of the buffer; see unDouble(StringBuilder, int).
+  */
+ private void unDouble(StringBuilder sb)
+ {
+     unDouble(sb, sb.Length);
+ }
+
+ /**
+  * If the first <tt>endIndex</tt> characters of the buffer end in kk, tt, dd,
+  * nn, mm or ff, removes the second letter of that pair.
+  *
+  * @param sb       String being stemmed
+  * @param endIndex exclusive end of the prefix to inspect
+  */
+ private void unDouble(StringBuilder sb, int endIndex)
+ {
+     String head = sb.ToString(0, endIndex);
+     foreach (String pair in new String[] { "kk", "tt", "dd", "nn", "mm", "ff" })
+     {
+         if (head.EndsWith(pair))
+         {
+             sb.Remove(endIndex - 1, 1);
+             return;
+         }
+     }
+ }
+
+ /**
+  * Scans from <tt>start</tt> (a start of 0 is treated as 1) for the first
+  * non-vowel that directly follows a vowel and returns the position after it.
+  * When no such character exists the result is one past the buffer length.
+  *
+  * @param sb    String being stemmed
+  * @param start position at which the scan begins
+  */
+ private int getRIndex(StringBuilder sb, int start)
+ {
+     int pos = (start == 0) ? 1 : start;
+     while (pos < sb.Length)
+     {
+         // first non-vowel preceded by a vowel
+         if (!isVowel(sb[pos]) && isVowel(sb[pos - 1]))
+             break;
+         pos++;
+     }
+     return pos + 1;
+ }
+
+ /**
+  * Upper-cases the 'y' and 'i' characters that must not count as vowels
+  * (isVowel only recognises the lower-case letters): an initial 'y', a 'y'
+  * after a vowel, and an 'i' between two vowels. reStoreYandI undoes this.
+  *
+  * @param sb String being stemmed; must not be empty
+  */
+ private void storeYandI(StringBuilder sb)
+ {
+     if (sb[0] == 'y')
+         sb[0] = 'Y';
+
+     int last = sb.Length - 1;
+     for (int pos = 1; pos < last; pos++)
+     {
+         char ch = sb[pos];
+         if (ch == 'i')
+         {
+             if (isVowel(sb[pos - 1]) && isVowel(sb[pos + 1]))
+                 sb[pos] = 'I';
+         }
+         else if (ch == 'y' && isVowel(sb[pos - 1]))
+         {
+             sb[pos] = 'Y';
+         }
+     }
+
+     // The final character has no successor, so only the 'y' rule applies.
+     if (last > 0 && sb[last] == 'y' && isVowel(sb[last - 1]))
+         sb[last] = 'Y';
+ }
+
+ /**
+  * Turns every 'I' and 'Y' in the buffer back into lower case.
+  *
+  * @param sb String being stemmed; modified in place
+  */
+ private void reStoreYandI(StringBuilder sb)
+ {
+     String restored = sb.ToString().Replace("I", "i").Replace("Y", "y");
+     sb.Length = 0;
+     sb.Append(restored);
+ }
+
+ /**
+  * Tells whether <tt>c</tt> is one of the vowels a, e, i, o, u, y or è.
+  * Upper-case letters are not vowels here.
+  *
+  * The è is written as a Unicode escape so the literal does not depend on the
+  * encoding of the source file.
+  */
+ private bool isVowel(char c)
+ {
+     switch (c)
+     {
+         case 'e':
+         case 'a':
+         case 'o':
+         case 'i':
+         case 'u':
+         case 'y':
+         case '\u00E8': // è
+             return true;
+         default:
+             return false;
+     }
+ }
+
+ /**
+ * Sets the dictionary of word/stem pairs that Stem consults before running
+ * the algorithm; passing null removes the override.
+ */
+ protected internal void SetStemDictionary(IDictionary<string, string> dict)
+ {
+ _stemDict = dict;
+ }
+ }
}
\ No newline at end of file
Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/Payloads/PayloadHelper.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/Payloads/PayloadHelper.cs?rev=1294875&r1=1294874&r2=1294875&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/Payloads/PayloadHelper.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/Payloads/PayloadHelper.cs Tue Feb 28 22:43:08 2012
@@ -15,6 +15,8 @@
* limitations under the License.
*/
+using Lucene.Net.Support;
+
namespace Lucene.Net.Analyzers.Payloads
{
/// <summary>
@@ -29,7 +31,7 @@ namespace Lucene.Net.Analyzers.Payloads
public static byte[] EncodeFloat(float payload, byte[] data, int offset)
{
- return EncodeInt(SupportClass.Single.FloatToIntBits(payload), data, offset);
+ return EncodeInt(Single.FloatToIntBits(payload), data, offset);
}
public static byte[] EncodeInt(int payload)
@@ -66,7 +68,7 @@ namespace Lucene.Net.Analyzers.Payloads
/// <returns>The float that was encoded</returns>
public static float DecodeFloat(byte[] bytes, int offset)
{
- return SupportClass.Single.IntBitsToFloat(DecodeInt(bytes, offset));
+ return Single.IntBitsToFloat(DecodeInt(bytes, offset));
}
public static int DecodeInt(byte[] bytes, int offset)
Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/Properties/AssemblyInfo.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/Properties/AssemblyInfo.cs?rev=1294875&r1=1294874&r2=1294875&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/Properties/AssemblyInfo.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/Properties/AssemblyInfo.cs Tue Feb 28 22:43:08 2012
@@ -53,5 +53,12 @@ using System.Runtime.InteropServices;
// You can specify all the values or you can default the Build and Revision Numbers
// by using the '*' as shown below:
// [assembly: AssemblyVersion("1.0.*")]
-[assembly: AssemblyVersion("2.9.2.1")]
-[assembly: AssemblyFileVersion("2.9.2.1")]
+[assembly: AssemblyVersion("3.0.3")]
+[assembly: AssemblyFileVersion("3.0.3")]
+
+// for testing
+[assembly: InternalsVisibleTo("Lucene.Net.Contrib.Analyzers.Test, PublicKey=002400000480000094000000060200000024000052534131000400000100010075a07ce602f88e" +
+ "f263c7db8cb342c58ebd49ecdcc210fac874260b0213fb929ac3dcaf4f5b39744b800f99073eca" +
+ "72aebfac5f7284e1d5f2c82012a804a140f06d7d043d83e830cdb606a04da2ad5374cc92c0a495" +
+ "08437802fb4f8fb80a05e59f80afb99f4ccd0dfe44065743543c4b053b669509d29d332cd32a0c" +
+ "b1e97e84")]
\ No newline at end of file
Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/Ru/RussianAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/Ru/RussianAnalyzer.cs?rev=1294875&r1=1294874&r2=1294875&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/Ru/RussianAnalyzer.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/Ru/RussianAnalyzer.cs Tue Feb 28 22:43:08 2012
@@ -1,4 +1,4 @@
-/*
+/*
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
@@ -20,253 +20,153 @@
*/
using System;
+using System.Collections.Generic;
+using System.Linq;
using System.Text;
using System.IO;
using System.Collections;
using Lucene.Net.Analysis;
+using Version = Lucene.Net.Util.Version;
namespace Lucene.Net.Analysis.Ru
{
- /// <summary>
- /// Analyzer for Russian language. Supports an external list of stopwords (words that
- /// will not be indexed at all).
- /// A default set of stopwords is used unless an alternative list is specified.
- /// </summary>
- public sealed class RussianAnalyzer : Analyzer
- {
- // letters
- private static char A = (char)0;
- private static char B = (char)1;
- private static char V = (char)2;
- private static char G = (char)3;
- private static char D = (char)4;
- private static char E = (char)5;
- private static char ZH = (char)6;
- private static char Z = (char)7;
- private static char I = (char)8;
- private static char I_ = (char)9;
- private static char K = (char)10;
- private static char L = (char)11;
- private static char M = (char)12;
- private static char N = (char)13;
- private static char O = (char)14;
- private static char P = (char)15;
- private static char R = (char)16;
- private static char S = (char)17;
- private static char T = (char)18;
- private static char U = (char)19;
- //private static char F = (char)20;
- private static char X = (char)21;
- //private static char TS = (char)22;
- private static char CH = (char)23;
- private static char SH = (char)24;
- private static char SHCH = (char)25;
- //private static char HARD = (char)26;
- private static char Y = (char)27;
- private static char SOFT = (char)28;
- private static char AE = (char)29;
- private static char IU = (char)30;
- private static char IA = (char)31;
-
- /// <summary>
- /// List of typical Russian stopwords.
- /// </summary>
- private static char[][] RUSSIAN_STOP_WORDS = {
- new char[] {A},
- new char[] {B, E, Z},
- new char[] {B, O, L, E, E},
- new char[] {B, Y},
- new char[] {B, Y, L},
- new char[] {B, Y, L, A},
- new char[] {B, Y, L, I},
- new char[] {B, Y, L, O},
- new char[] {B, Y, T, SOFT},
- new char[] {V},
- new char[] {V, A, M},
- new char[] {V, A, S},
- new char[] {V, E, S, SOFT},
- new char[] {V, O},
- new char[] {V, O, T},
- new char[] {V, S, E},
- new char[] {V, S, E, G, O},
- new char[] {V, S, E, X},
- new char[] {V, Y},
- new char[] {G, D, E},
- new char[] {D, A},
- new char[] {D, A, ZH, E},
- new char[] {D, L, IA},
- new char[] {D, O},
- new char[] {E, G, O},
- new char[] {E, E},
- new char[] {E, I_,},
- new char[] {E, IU},
- new char[] {E, S, L, I},
- new char[] {E, S, T, SOFT},
- new char[] {E, SHCH, E},
- new char[] {ZH, E},
- new char[] {Z, A},
- new char[] {Z, D, E, S, SOFT},
- new char[] {I},
- new char[] {I, Z},
- new char[] {I, L, I},
- new char[] {I, M},
- new char[] {I, X},
- new char[] {K},
- new char[] {K, A, K},
- new char[] {K, O},
- new char[] {K, O, G, D, A},
- new char[] {K, T, O},
- new char[] {L, I},
- new char[] {L, I, B, O},
- new char[] {M, N, E},
- new char[] {M, O, ZH, E, T},
- new char[] {M, Y},
- new char[] {N, A},
- new char[] {N, A, D, O},
- new char[] {N, A, SH},
- new char[] {N, E},
- new char[] {N, E, G, O},
- new char[] {N, E, E},
- new char[] {N, E, T},
- new char[] {N, I},
- new char[] {N, I, X},
- new char[] {N, O},
- new char[] {N, U},
- new char[] {O},
- new char[] {O, B},
- new char[] {O, D, N, A, K, O},
- new char[] {O, N},
- new char[] {O, N, A},
- new char[] {O, N, I},
- new char[] {O, N, O},
- new char[] {O, T},
- new char[] {O, CH, E, N, SOFT},
- new char[] {P, O},
- new char[] {P, O, D},
- new char[] {P, R, I},
- new char[] {S},
- new char[] {S, O},
- new char[] {T, A, K},
- new char[] {T, A, K, ZH, E},
- new char[] {T, A, K, O, I_},
- new char[] {T, A, M},
- new char[] {T, E},
- new char[] {T, E, M},
- new char[] {T, O},
- new char[] {T, O, G, O},
- new char[] {T, O, ZH, E},
- new char[] {T, O, I_},
- new char[] {T, O, L, SOFT, K, O},
- new char[] {T, O, M},
- new char[] {T, Y},
- new char[] {U},
- new char[] {U, ZH, E},
- new char[] {X, O, T, IA},
- new char[] {CH, E, G, O},
- new char[] {CH, E, I_},
- new char[] {CH, E, M},
- new char[] {CH, T, O},
- new char[] {CH, T, O, B, Y},
- new char[] {CH, SOFT, E},
- new char[] {CH, SOFT, IA},
- new char[] {AE, T, A},
- new char[] {AE, T, I},
- new char[] {AE, T, O},
- new char[] {IA}
- };
-
- /// <summary>
- /// Contains the stopwords used with the StopFilter.
- /// </summary>
- private Hashtable stoptable = new Hashtable();
-
- /// <summary>
- /// Charset for Russian letters.
- /// Represents encoding for 32 lowercase Russian letters.
- /// Predefined charsets can be taken from RussianCharSets class
- /// </summary>
- private char[] charset;
-
- /// <summary>
- /// Builds an analyzer.
- /// </summary>
- public RussianAnalyzer()
- {
- this.charset = RussianCharsets.UnicodeRussian;
- stoptable = StopFilter.MakeStopSet(MakeStopWords(RussianCharsets.UnicodeRussian));
- }
-
- /// <summary>
- /// Builds an analyzer.
- /// </summary>
- /// <param name="charset"></param>
- public RussianAnalyzer(char[] charset)
- {
- this.charset = charset;
- stoptable = StopFilter.MakeStopSet(MakeStopWords(charset));
- }
-
- /// <summary>
- /// Builds an analyzer with the given stop words.
- /// </summary>
- /// <param name="charset"></param>
- /// <param name="stopwords"></param>
- public RussianAnalyzer(char[] charset, String[] stopwords)
- {
- this.charset = charset;
- stoptable = StopFilter.MakeStopSet(stopwords);
- }
-
- /// <summary>
- /// Takes russian stop words and translates them to a String array, using
- /// the given charset
- /// </summary>
- /// <param name="charset"></param>
- /// <returns></returns>
- private static String[] MakeStopWords(char[] charset)
- {
- String[] res = new String[RUSSIAN_STOP_WORDS.Length];
- for (int i = 0; i < res.Length; i++)
- {
- char[] theStopWord = RUSSIAN_STOP_WORDS[i];
- // translate the word,using the charset
- StringBuilder theWord = new StringBuilder();
- for (int j = 0; j < theStopWord.Length; j++)
- {
- theWord.Append(charset[theStopWord[j]]);
- }
- res[i] = theWord.ToString();
- }
- return res;
- }
-
- /// <summary>
- /// Builds an analyzer with the given stop words.
- /// </summary>
- /// <param name="charset"></param>
- /// <param name="stopwords"></param>
- public RussianAnalyzer(char[] charset, Hashtable stopwords)
- {
- this.charset = charset;
- stoptable = stopwords;
- }
-
- /// <summary>
- /// Creates a TokenStream which tokenizes all the text in the provided TextReader.
- /// </summary>
- /// <param name="fieldName"></param>
- /// <param name="reader"></param>
- /// <returns>
- /// A TokenStream build from a RussianLetterTokenizer filtered with
- /// RussianLowerCaseFilter, StopFilter, and RussianStemFilter
- /// </returns>
- public override TokenStream TokenStream(String fieldName, TextReader reader)
- {
- TokenStream result = new RussianLetterTokenizer(reader, charset);
- result = new RussianLowerCaseFilter(result, charset);
- result = new StopFilter(result, stoptable);
- result = new RussianStemFilter(result, charset);
- return result;
- }
- }
+ /// <summary>
+ /// Analyzer for Russian language. Supports an external list of stopwords (words that
+ /// will not be indexed at all).
+ /// A default set of stopwords is used unless an alternative list is specified.
+ /// </summary>
+ public sealed class RussianAnalyzer : Analyzer
+ {
+ /// <summary>
+ /// List of typical Russian stopwords.
+ /// The words are the same, in the same order, as the letter-index table this
+ /// list replaces.
+ /// </summary>
+ private static readonly String[] RUSSIAN_STOP_WORDS = {
+     "а", "без", "более", "бы", "был", "была", "были",
+     "было", "быть", "в",
+     "вам", "вас", "весь", "во", "вот", "все", "всего",
+     "всех", "вы", "где",
+     "да", "даже", "для", "до", "его", "ее", "ей", "ею",
+     "если", "есть",
+     "еще", "же", "за", "здесь", "и", "из", "или", "им",
+     "их", "к", "как",
+     "ко", "когда", "кто", "ли", "либо", "мне", "может",
+     "мы", "на", "надо",
+     "наш", "не", "него", "нее", "нет", "ни", "них", "но",
+     "ну", "о", "об",
+     "однако", "он", "она", "они", "оно", "от", "очень",
+     "по", "под", "при",
+     "с", "со", "так", "также", "такой", "там", "те", "тем",
+     "то", "того",
+     "тоже", "той", "только", "том", "ты", "у", "уже",
+     "хотя", "чего", "чей",
+     "чем", "что", "чтобы", "чье", "чья", "эта", "эти",
+     "это", "я"
+ };
+
+ /// <summary>
+ /// Holds the default stop set, built from RUSSIAN_STOP_WORDS and wrapped as unmodifiable.
+ /// </summary>
+ private static class DefaultSetHolder
+ {
+ internal static readonly ISet<string> DEFAULT_STOP_SET = CharArraySet.UnmodifiableSet(new CharArraySet(RUSSIAN_STOP_WORDS, false));
+ }
+
+ /// <summary>
+ /// Contains the stopwords used with the StopFilter.
+ /// </summary>
+ private readonly ISet<string> stopSet;
+
+ private readonly Version matchVersion;
+
+
+ /// <summary>
+ /// Builds an analyzer that uses the default stop set (DefaultSetHolder.DEFAULT_STOP_SET).
+ /// </summary>
+ public RussianAnalyzer(Version matchVersion)
+ : this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET)
+ {
+ }
+
+ /**
+ * Builds an analyzer with the given stop words. The words are turned into a
+ * stop set with StopFilter.MakeStopSet and passed to the set-based constructor.
+ * @deprecated use the (Version, ISet<string>) constructor instead
+ */
+ public RussianAnalyzer(Version matchVersion, params string[] stopwords)
+ : this(matchVersion, StopFilter.MakeStopSet(stopwords))
+ {
+
+ }
+
+ /**
+ * Builds an analyzer with the given stop words. The set is copied and the
+ * copy is wrapped as unmodifiable before it is stored.
+ *
+ * @param matchVersion
+ * lucene compatibility version
+ * @param stopwords
+ * a stopword set
+ */
+ public RussianAnalyzer(Version matchVersion, ISet<string> stopwords)
+ {
+ stopSet = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stopwords));
+ this.matchVersion = matchVersion;
+ }
+
+ /**
+ * Builds an analyzer with the given stop words. Only the keys of the
+ * dictionary are used as stop words; the values are ignored.
+ * @deprecated use the (Version, ISet<string>) constructor instead
+ */
+ public RussianAnalyzer(Version matchVersion, IDictionary<string, string> stopwords)
+ : this(matchVersion, stopwords.Keys.ToArray())
+ {
+ }
+
+ /**
+  * Creates a {@link TokenStream} which tokenizes all the text in the
+  * provided {@link Reader}.
+  *
+  * @return A {@link TokenStream} built from a
+  *         {@link RussianLetterTokenizer} filtered with
+  *         {@link LowerCaseFilter}, {@link StopFilter},
+  *         and {@link RussianStemFilter}
+  */
+ public override TokenStream TokenStream(String fieldName, TextReader reader)
+ {
+     TokenStream letters = new RussianLetterTokenizer(reader);
+     TokenStream lowered = new LowerCaseFilter(letters);
+     TokenStream withoutStopWords = new StopFilter(
+         StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
+         lowered, stopSet);
+     return new RussianStemFilter(withoutStopWords);
+ }
+
+ /// <summary>
+ /// Pair cached by ReusableTokenStream: the tokenizer that gets reset with a new
+ /// reader, and the end of the filter chain that is handed back to the caller.
+ /// </summary>
+ private class SavedStreams
+ {
+ protected internal Tokenizer source;
+ protected internal TokenStream result;
+ };
+
+ /**
+  * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
+  * in the provided {@link Reader}. The chain is built on first use and stored
+  * via SetPreviousTokenStream; later calls only reset the stored tokenizer
+  * with the new reader.
+  *
+  * @return A {@link TokenStream} built from a
+  *         {@link RussianLetterTokenizer} filtered with
+  *         {@link LowerCaseFilter}, {@link StopFilter},
+  *         and {@link RussianStemFilter}
+  */
+ public override TokenStream ReusableTokenStream(String fieldName, TextReader reader)
+ {
+     SavedStreams cached = (SavedStreams)GetPreviousTokenStream();
+     if (cached != null)
+     {
+         // Reuse the stored chain: only the tokenizer needs the new reader.
+         cached.source.Reset(reader);
+         return cached.result;
+     }
+
+     cached = new SavedStreams();
+     cached.source = new RussianLetterTokenizer(reader);
+     TokenStream chain = new LowerCaseFilter(cached.source);
+     chain = new StopFilter(
+         StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
+         chain, stopSet);
+     cached.result = new RussianStemFilter(chain);
+     SetPreviousTokenStream(cached);
+     return cached.result;
+ }
+ }
}
\ No newline at end of file
Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/Ru/RussianLetterTokenizer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/Ru/RussianLetterTokenizer.cs?rev=1294875&r1=1294874&r2=1294875&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/Ru/RussianLetterTokenizer.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/Ru/RussianLetterTokenizer.cs Tue Feb 28 22:43:08 2012
@@ -22,42 +22,41 @@
using System;
using System.IO;
using Lucene.Net.Analysis;
+using Lucene.Net.Util;
namespace Lucene.Net.Analysis.Ru
{
- /// <summary>
- /// A RussianLetterTokenizer is a tokenizer that extends LetterTokenizer by additionally looking up letters
- /// in a given "russian charset". The problem with LeterTokenizer is that it uses Character.isLetter() method,
- /// which doesn't know how to detect letters in encodings like CP1252 and KOI8
- /// (well-known problems with 0xD7 and 0xF7 chars)
- /// </summary>
- public class RussianLetterTokenizer : CharTokenizer
- {
- /// <summary>
- /// Construct a new LetterTokenizer.
- /// </summary>
- private char[] charset;
+ /// <summary>
+ /// A RussianLetterTokenizer is a <see cref="Tokenizer"/> that behaves like <see cref="LetterTokenizer"/>
+ /// but additionally accepts the basic Latin digits 0-9 as token characters.
+ /// </summary>
+ public class RussianLetterTokenizer : CharTokenizer
+ {
+ /// <summary>
+ /// Creates a tokenizer over the given reader.
+ /// </summary>
+ /// <param name="_in">Reader handed unchanged to the base <see cref="CharTokenizer"/> constructor.</param>
+ public RussianLetterTokenizer(TextReader _in) : base(_in)
+ {
+ }
- public RussianLetterTokenizer(TextReader _in, char[] charset) : base(_in)
- {
- this.charset = charset;
- }
+ /// <summary>
+ /// Creates a tokenizer over the given reader, passing an existing <see cref="AttributeSource"/>
+ /// to the base class.
+ /// </summary>
+ /// <param name="source">Attribute source handed unchanged to the base <see cref="CharTokenizer"/> constructor.</param>
+ /// <param name="_in">Reader handed unchanged to the base <see cref="CharTokenizer"/> constructor.</param>
+ public RussianLetterTokenizer(AttributeSource source, TextReader _in) : base(source, _in)
+ {
+ }
- /// <summary>
- /// Collects only characters which satisfy Char.IsLetter(char).
- /// </summary>
- /// <param name="c"></param>
- /// <returns></returns>
- protected override bool IsTokenChar(char c)
- {
- if (Char.IsLetter(c))
- return true;
- for (int i = 0; i < charset.Length; i++)
- {
- if (c == charset[i])
- return true;
- }
- return false;
- }
- }
+ /// <summary>
+ /// Creates a tokenizer over the given reader, passing an attribute factory to the base class.
+ /// </summary>
+ /// <param name="factory">Attribute factory handed unchanged to the base <see cref="CharTokenizer"/> constructor.</param>
+ /// <param name="__in">Reader handed unchanged to the base <see cref="CharTokenizer"/> constructor.</param>
+ public RussianLetterTokenizer(AttributeSource.AttributeFactory factory, TextReader __in) : base(factory, __in)
+ {
+ }
+
+ /// <summary>
+ /// Decides whether a character belongs to a token: any character for which
+ /// <see cref="char.IsLetter(char)"/> is true, plus the digits '0' through '9'.
+ /// </summary>
+ /// <param name="c">Character to test.</param>
+ /// <returns><c>true</c> if <paramref name="c"/> is a letter or one of '0'-'9'; otherwise <c>false</c>.</returns>
+ protected override bool IsTokenChar(char c)
+ {
+     bool isBasicDigit = '0' <= c && c <= '9';
+     return char.IsLetter(c) || isBasicDigit;
+ }
+ }
}
\ No newline at end of file
Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/Ru/RussianLowerCaseFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/Ru/RussianLowerCaseFilter.cs?rev=1294875&r1=1294874&r2=1294875&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/Ru/RussianLowerCaseFilter.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/Ru/RussianLowerCaseFilter.cs Tue Feb 28 22:43:08 2012
@@ -21,41 +21,40 @@
using System;
using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
namespace Lucene.Net.Analysis.Ru
{
- /// <summary>
- /// Normalizes token text to lower case, analyzing given ("russian") charset.
- /// </summary>
- public sealed class RussianLowerCaseFilter : TokenFilter
- {
- char[] charset;
-
- public RussianLowerCaseFilter(TokenStream _in, char[] charset) : base(_in)
- {
- this.charset = charset;
- }
-
- public override Token Next()
- {
- Token t = input.Next();
-
- if (t == null)
- return null;
-
- String txt = t.TermText();
-
- char[] chArray = txt.ToCharArray();
- for (int i = 0; i < chArray.Length; i++)
- {
- chArray[i] = RussianCharsets.ToLowerCase(chArray[i], charset);
- }
-
- String newTxt = new String(chArray);
- // create new token
- Token newToken = new Token(newTxt, t.StartOffset(), t.EndOffset());
-
- return newToken;
- }
- }
+ /// <summary>
+ /// Normalizes token text to lower case.
+ /// </summary>
+ /// <remarks>
+ /// Each character of the term buffer is lower-cased in place with
+ /// <see cref="char.ToLowerInvariant(char)"/>, so the produced terms do not depend on
+ /// the culture of the calling thread (for example, 'I' is not mapped to a dotless i
+ /// when the current culture is Turkish).
+ /// </remarks>
+ [Obsolete("Use LowerCaseFilter instead, which has the same functionality. This filter will be removed in Lucene 4.0")]
+ public sealed class RussianLowerCaseFilter : TokenFilter
+ {
+     // Term attribute obtained from AddAttribute in the constructor; its buffer is modified in place.
+     private readonly TermAttribute termAtt;
+
+     /// <summary>
+     /// Creates a lower-casing filter on top of <paramref name="_in"/>.
+     /// </summary>
+     /// <param name="_in">Token stream to wrap; passed to the base <see cref="TokenFilter"/> constructor.</param>
+     public RussianLowerCaseFilter(TokenStream _in)
+         : base(_in)
+     {
+         termAtt = AddAttribute<TermAttribute>();
+     }
+
+     /// <summary>
+     /// Advances the wrapped stream by one token and lower-cases that token's term text.
+     /// </summary>
+     /// <returns><c>true</c> if the wrapped stream produced a token; <c>false</c> once it is exhausted.</returns>
+     public sealed override bool IncrementToken()
+     {
+         if (!input.IncrementToken())
+         {
+             return false;
+         }
+
+         // Only the first TermLength() characters of the buffer are processed.
+         char[] buffer = termAtt.TermBuffer();
+         int length = termAtt.TermLength();
+         for (int i = 0; i < length; i++)
+         {
+             // Invariant casing keeps the result independent of Thread.CurrentCulture.
+             buffer[i] = char.ToLowerInvariant(buffer[i]);
+         }
+         return true;
+     }
+ }
}
\ No newline at end of file
Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/Ru/RussianStemFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/Ru/RussianStemFilter.cs?rev=1294875&r1=1294874&r2=1294875&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/Ru/RussianStemFilter.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/Ru/RussianStemFilter.cs Tue Feb 28 22:43:08 2012
@@ -21,59 +21,65 @@
using System;
using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
namespace Lucene.Net.Analysis.Ru
{
- /// <summary>
- /// A filter that stems Russian words. The implementation was inspired by GermanStemFilter.
- /// The input should be filtered by RussianLowerCaseFilter before passing it to RussianStemFilter,
- /// because RussianStemFilter only works with lowercase part of any "russian" charset.
- /// </summary>
- public sealed class RussianStemFilter : TokenFilter
- {
- /// <summary>
- /// The actual token in the input stream.
- /// </summary>
- private Token token = null;
- private RussianStemmer stemmer = null;
-
- public RussianStemFilter(TokenStream _in, char[] charset) : base(_in)
- {
- stemmer = new RussianStemmer(charset);
- }
-
- /// <summary>
- ///
- /// </summary>
- /// <returns>Returns the next token in the stream, or null at EOS</returns>
- public override Token Next()
- {
- if ((token = input.Next()) == null)
- {
- return null;
- }
- else
- {
- String s = stemmer.Stem(token.TermText());
- if (!s.Equals(token.TermText()))
- {
- return new Token(s, token.StartOffset(), token.EndOffset(),
- token.Type());
- }
- return token;
- }
- }
-
- /// <summary>
- /// Set a alternative/custom RussianStemmer for this filter.
- /// </summary>
- /// <param name="stemmer"></param>
- public void SetStemmer(RussianStemmer stemmer)
- {
- if (stemmer != null)
- {
- this.stemmer = stemmer;
- }
- }
- }
+ /// <summary>
+ /// A <see cref="TokenFilter"/> that stems Russian words.
+ /// </summary>
+ /// <remarks>
+ /// The implementation was inspired by GermanStemFilter. The input should be
+ /// filtered by <see cref="LowerCaseFilter"/> before it is passed to
+ /// RussianStemFilter, because RussianStemFilter only works with lowercase characters.
+ /// </remarks>
+ public sealed class RussianStemFilter : TokenFilter
+ {
+     // Stemmer applied to the term text of every token.
+     private RussianStemmer stemmer;
+
+     // Term attribute whose text is read and, when the stem differs, overwritten.
+     private TermAttribute termAtt;
+
+     /// <summary>
+     /// Creates a stemming filter on top of <paramref name="_in"/> using a new <see cref="RussianStemmer"/>.
+     /// </summary>
+     /// <param name="_in">Token stream to wrap; passed to the base <see cref="TokenFilter"/> constructor.</param>
+     public RussianStemFilter(TokenStream _in)
+         : base(_in)
+     {
+         stemmer = new RussianStemmer();
+         termAtt = AddAttribute<TermAttribute>();
+     }
+
+     /// <summary>
+     /// Advances the wrapped stream by one token and replaces its term text with the
+     /// stem when the stemmer returns a non-null value that differs from the term.
+     /// </summary>
+     /// <returns><c>true</c> if the wrapped stream produced a token; <c>false</c> once it is exhausted.</returns>
+     public sealed override bool IncrementToken()
+     {
+         if (!input.IncrementToken())
+         {
+             return false;
+         }
+
+         string original = termAtt.Term();
+         string stemmed = stemmer.Stem(original);
+         if (stemmed != null && stemmed != original)
+         {
+             termAtt.SetTermBuffer(stemmed);
+         }
+         return true;
+     }
+
+
+     // I don't get the point of this. All methods in java are private, so they can't be
+     // overridden...You can't really subclass any of its behavior. I've commented it out,
+     // as it doesn't compile as is. - cc
+     ////**
+     // * Set an alternative/custom {@link RussianStemmer} for this filter.
+     // */
+     //public void SetStemmer(RussianStemmer stemmer)
+     //{
+     //    if (stemmer != null)
+     //    {
+     //        this.stemmer = stemmer;
+     //    }
+     //}
+ }
}
\ No newline at end of file