You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by cc...@apache.org on 2011/11/21 05:44:59 UTC
[Lucene.Net] svn commit: r1204353 [6/9] - in
/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src:
contrib/Analyzers/ contrib/Analyzers/AR/ contrib/Analyzers/BR/
contrib/Analyzers/CJK/ contrib/Analyzers/Cn/ contrib/Analyzers/Compound/
contrib/Analyzers/Compoun...
Modified: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/NGram/EdgeNGramTokenizer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/NGram/EdgeNGramTokenizer.cs?rev=1204353&r1=1204352&r2=1204353&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/NGram/EdgeNGramTokenizer.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/NGram/EdgeNGramTokenizer.cs Mon Nov 21 04:44:55 2011
@@ -1,4 +1,4 @@
-/*
+/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
@@ -32,7 +32,7 @@ namespace Lucene.Net.Analysis.NGram
* MaxGram can't be larger than 1024 because of limitation.
* </p>
*/
- public class EdgeNGramTokenizer : Tokenizer
+ public sealed class EdgeNGramTokenizer : Tokenizer
{
public static Side DEFAULT_SIDE = Side.FRONT;
public static int DEFAULT_MAX_GRAM_SIZE = 1;
@@ -41,38 +41,8 @@ namespace Lucene.Net.Analysis.NGram
private TermAttribute termAtt;
private OffsetAttribute offsetAtt;
- // Replace this with an enum when the Java 1.5 upgrade is made, the impl will be simplified
/** Specifies which side of the input the n-gram should be generated from */
- public class Side
- {
- private string label;
-
- /** Get the n-gram from the front of the input */
- public static Side FRONT = new Side("front");
-
- /** Get the n-gram from the end of the input */
- public static Side BACK = new Side("back");
-
- // Private ctor
- private Side(string label) { this.label = label; }
-
-
- public string getLabel() { return label; }
-
- // Get the appropriate Side from a string
- public static Side getSide(string sideName)
- {
- if (FRONT.getLabel().Equals(sideName))
- {
- return FRONT;
- }
- else if (BACK.getLabel().Equals(sideName))
- {
- return BACK;
- }
- return null;
- }
- }
+ // Moved Side enum from this class to external definition
private int minGram;
private int maxGram;
@@ -138,7 +108,7 @@ namespace Lucene.Net.Analysis.NGram
* <param name="maxGram">the largest n-gram to generate</param>
*/
public EdgeNGramTokenizer(TextReader input, string sideLabel, int minGram, int maxGram)
- : this(input, Side.getSide(sideLabel), minGram, maxGram)
+ : this(input, SideExtensions.GetSide(sideLabel), minGram, maxGram)
{
}
@@ -153,7 +123,7 @@ namespace Lucene.Net.Analysis.NGram
* <param name="maxGram">the largest n-gram to generate</param>
*/
public EdgeNGramTokenizer(AttributeSource source, TextReader input, string sideLabel, int minGram, int maxGram)
- : this(source, input, Side.getSide(sideLabel), minGram, maxGram)
+ : this(source, input, SideExtensions.GetSide(sideLabel), minGram, maxGram)
{
}
@@ -168,7 +138,7 @@ namespace Lucene.Net.Analysis.NGram
* <param name="maxGram">the largest n-gram to generate</param>
*/
public EdgeNGramTokenizer(AttributeFactory factory, TextReader input, string sideLabel, int minGram, int maxGram) :
- this(factory, input, Side.getSide(sideLabel), minGram, maxGram)
+ this(factory, input, SideExtensions.GetSide(sideLabel), minGram, maxGram)
{
}
@@ -193,8 +163,8 @@ namespace Lucene.Net.Analysis.NGram
this.maxGram = maxGram;
this.side = side;
- this.termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute));
- this.offsetAtt = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
+ this.termAtt = AddAttribute<TermAttribute>();
+ this.offsetAtt = AddAttribute<OffsetAttribute>();
}
@@ -240,22 +210,6 @@ namespace Lucene.Net.Analysis.NGram
this.offsetAtt.SetOffset(finalOffset, finalOffset);
}
- /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
- * not be overridden. Delegates to the backwards compatibility layer. */
- [System.Obsolete("Will be removed in Lucene 3.0. This method is final, as it should not be overridden. Delegates to the backwards compatibility layer.")]
- public override Token Next(Token reusableToken)
- {
- return base.Next(reusableToken);
- }
-
- /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
- * not be overridden. Delegates to the backwards compatibility layer. */
- [System.Obsolete("Will be removed in Lucene 3.0. This method is final, as it should not be overridden. Delegates to the backwards compatibility layer.")]
- public override Token Next()
- {
- return base.Next();
- }
-
public override void Reset(TextReader input)
{
base.Reset(input);
Modified: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/NGram/NGramTokenFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/NGram/NGramTokenFilter.cs?rev=1204353&r1=1204352&r2=1204353&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/NGram/NGramTokenFilter.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/NGram/NGramTokenFilter.cs Mon Nov 21 04:44:55 2011
@@ -1,4 +1,4 @@
-/*
+/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
@@ -24,11 +24,10 @@ using Lucene.Net.Util;
namespace Lucene.Net.Analysis.NGram
{
-
/**
* Tokenizes the input into n-grams of the given size(s).
*/
- public class NGramTokenFilter : TokenFilter
+ public sealed class NGramTokenFilter : TokenFilter
{
public static int DEFAULT_MIN_NGRAM_SIZE = 1;
public static int DEFAULT_MAX_NGRAM_SIZE = 2;
@@ -65,8 +64,8 @@ namespace Lucene.Net.Analysis.NGram
this.minGram = minGram;
this.maxGram = maxGram;
- this.termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute));
- this.offsetAtt = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
+ this.termAtt = AddAttribute<TermAttribute>();
+ this.offsetAtt = AddAttribute<OffsetAttribute>();
}
/**
@@ -116,22 +115,6 @@ namespace Lucene.Net.Analysis.NGram
}
}
- /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
- * not be overridden. Delegates to the backwards compatibility layer. */
- [System.Obsolete("Will be removed in Lucene 3.0. This method is final, as it should not be overridden. Delegates to the backwards compatibility layer.")]
- public override Token Next(Token reusableToken)
- {
- return base.Next(reusableToken);
- }
-
- /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
- * not be overridden. Delegates to the backwards compatibility layer. */
- [System.Obsolete("Will be removed in Lucene 3.0. This method is final, as it should not be overridden. Delegates to the backwards compatibility layer.")]
- public override Token Next()
- {
- return base.Next();
- }
-
public override void Reset()
{
base.Reset();
Modified: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/NGram/NGramTokenizer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/NGram/NGramTokenizer.cs?rev=1204353&r1=1204352&r2=1204353&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/NGram/NGramTokenizer.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/NGram/NGramTokenizer.cs Mon Nov 21 04:44:55 2011
@@ -1,4 +1,4 @@
-/*
+/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
@@ -28,7 +28,7 @@ namespace Lucene.Net.Analysis.NGram
/**
* Tokenizes the input into n-grams of the given size(s).
*/
- public class NGramTokenizer : Tokenizer
+ public sealed class NGramTokenizer : Tokenizer
{
public static int DEFAULT_MIN_NGRAM_SIZE = 1;
public static int DEFAULT_MAX_NGRAM_SIZE = 2;
@@ -104,8 +104,8 @@ namespace Lucene.Net.Analysis.NGram
this.minGram = minGram;
this.maxGram = maxGram;
- this.termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute));
- this.offsetAtt = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
+ this.termAtt = AddAttribute<TermAttribute>();
+ this.offsetAtt = AddAttribute<OffsetAttribute>();
}
/** Returns the next token in the stream, or null at EOS. */
@@ -145,22 +145,6 @@ namespace Lucene.Net.Analysis.NGram
this.offsetAtt.SetOffset(finalOffset, finalOffset);
}
- /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
- * not be overridden. Delegates to the backwards compatibility layer. */
- [System.Obsolete("Will be removed in Lucene 3.0. This method is final, as it should not be overridden. Delegates to the backwards compatibility layer.")]
- public override Token Next(Token reusableToken)
- {
- return base.Next(reusableToken);
- }
-
- /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
- * not be overridden. Delegates to the backwards compatibility layer. */
- [System.Obsolete("Will be removed in Lucene 3.0. This method is final, as it should not be overridden. Delegates to the backwards compatibility layer.")]
- public override Token Next()
- {
- return base.Next();
- }
-
public override void Reset(TextReader input)
{
base.Reset(input);
Modified: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Nl/DutchAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Nl/DutchAnalyzer.cs?rev=1204353&r1=1204352&r2=1204353&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Nl/DutchAnalyzer.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Nl/DutchAnalyzer.cs Mon Nov 21 04:44:55 2011
@@ -20,198 +20,269 @@
*/
using System;
+using System.Collections.Generic;
using System.IO;
using System.Collections;
using Lucene.Net.Analysis.Standard;
+using Lucene.Net.Support;
+using Version = Lucene.Net.Util.Version;
namespace Lucene.Net.Analysis.Nl
{
-
- /* ====================================================================
- * The Apache Software License, Version 1.1
- *
- * Copyright (c) 2001 The Apache Software Foundation. All rights
- * reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * 3. The end-user documentation included with the redistribution,
- * if any, must include the following acknowledgment:
- * "This product includes software developed by the
- * Apache Software Foundation (http://www.apache.org/)."
- * Alternately, this acknowledgment may appear in the software itself,
- * if and wherever such third-party acknowledgments normally appear.
- *
- * 4. The names "Apache" and "Apache Software Foundation" and
- * "Apache Lucene" must not be used to endorse or promote products
- * derived from this software without prior written permission. For
- * written permission, please contact apache@apache.org.
- *
- * 5. Products derived from this software may not be called "Apache",
- * "Apache Lucene", nor may "Apache" appear in their name, without
- * prior written permission of the Apache Software Foundation.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
- * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- * ====================================================================
- *
- * This software consists of voluntary contributions made by many
- * individuals on behalf of the Apache Software Foundation. For more
- * information on the Apache Software Foundation, please see
- * <http://www.apache.org/>.
- */
-
- /// <summary>
- /// Analyzer for Dutch language. Supports an external list of stopwords (words that
- /// will not be indexed at all), an external list of exclusions (word that will
- /// not be stemmed, but indexed) and an external list of word-stem pairs that overrule
- /// the algorithm (dictionary stemming).
- /// A default set of stopwords is used unless an alternative list is specified, the
- /// exclusion list is empty by default.
- /// <version>$Id: DutchAnalyzer.java,v 1.1 2004/03/09 14:55:08 otis Exp $</version>
- /// </summary>
- /// <author>Edwin de Jonge</author>
- public class DutchAnalyzer : Analyzer
- {
- /// <summary>
- /// List of typical german stopwords.
- /// </summary>
- public static string[] DUTCH_STOP_WORDS =
- {
- "de","en","van","ik","te","dat","die","in","een",
- "hij","het","niet","zijn","is","was","op","aan","met","als","voor","had",
- "er","maar","om","hem","dan","zou","of","wat","mijn","men","dit","zo",
- "door","over","ze","zich","bij","ook","tot","je","mij","uit","der","daar",
- "haar","naar","heb","hoe","heeft","hebben","deze","u","want","nog","zal",
- "me","zij","nu","ge","geen","omdat","iets","worden","toch","al","waren",
- "veel","meer","doen","toen","moet","ben","zonder","kan","hun","dus",
- "alles","onder","ja","eens","hier","wie","werd","altijd","doch","wordt",
- "wezen","kunnen","ons","zelf","tegen","na","reeds","wil","kon","niets",
- "uw","iemand","geweest","andere"
- };
- /// <summary>
- /// Contains the stopwords used with the StopFilter.
- /// </summary>
- private Hashtable stoptable = new Hashtable();
-
- /// <summary>
- /// Contains words that should be indexed but not stemmed.
- /// </summary>
- private Hashtable excltable = new Hashtable();
-
- private Hashtable _stemdict = new Hashtable();
-
- /// <summary>
- /// Builds an analyzer.
- /// </summary>
- public DutchAnalyzer()
- {
- stoptable = StopFilter.MakeStopSet( DUTCH_STOP_WORDS );
- _stemdict.Add("fiets","fiets"); //otherwise fiet
- _stemdict.Add("bromfiets","bromfiets"); //otherwise bromfiet
- _stemdict.Add("ei","eier");
- _stemdict.Add("kind","kinder");
- }
-
- /// <summary>
- /// Builds an analyzer with the given stop words.
- /// </summary>
- /// <param name="stopwords"></param>
- public DutchAnalyzer( String[] stopwords )
- {
- stoptable = StopFilter.MakeStopSet( stopwords );
- }
-
- /// <summary>
- /// Builds an analyzer with the given stop words.
- /// </summary>
- /// <param name="stopwords"></param>
- public DutchAnalyzer( Hashtable stopwords )
- {
- stoptable = stopwords;
- }
-
- /// <summary>
- /// Builds an analyzer with the given stop words.
- /// </summary>
- /// <param name="stopwords"></param>
- public DutchAnalyzer( FileInfo stopwords )
- {
- stoptable = WordlistLoader.GetWordtable( stopwords );
- }
-
- /// <summary>
- /// Builds an exclusionlist from an array of Strings.
- /// </summary>
- /// <param name="exclusionlist"></param>
- public void SetStemExclusionTable( String[] exclusionlist )
- {
- excltable = StopFilter.MakeStopSet( exclusionlist );
- }
-
- /// <summary>
- /// Builds an exclusionlist from a Hashtable.
- /// </summary>
- /// <param name="exclusionlist"></param>
- public void SetStemExclusionTable( Hashtable exclusionlist )
- {
- excltable = exclusionlist;
- }
-
- /// <summary>
- /// Builds an exclusionlist from the words contained in the given file.
- /// </summary>
- /// <param name="exclusionlist"></param>
- public void SetStemExclusionTable(FileInfo exclusionlist)
- {
- excltable = WordlistLoader.GetWordtable(exclusionlist);
- }
-
- /// <summary>
- /// Reads a stemdictionary file , that overrules the stemming algorithm
- /// This is a textfile that contains per line
- /// word\tstem
- /// i.e: tabseperated
- /// </summary>
- /// <param name="stemdict"></param>
- public void SetStemDictionary(FileInfo stemdict)
- {
- _stemdict = WordlistLoader.GetStemDict(stemdict);
- }
-
- /// <summary>
- /// Creates a TokenStream which tokenizes all the text in the provided TextReader.
- /// </summary>
- /// <param name="fieldName"></param>
- /// <param name="reader"></param>
- /// <returns>A TokenStream build from a StandardTokenizer filtered with StandardFilter, StopFilter, GermanStemFilter</returns>
- public override TokenStream TokenStream(String fieldName, TextReader reader)
- {
- TokenStream result = new StandardTokenizer( reader );
- result = new StandardFilter( result );
- result = new StopFilter( result, stoptable );
- result = new DutchStemFilter( result, excltable, _stemdict);
- return result;
- }
- }
+ /**
+ * {@link Analyzer} for Dutch language.
+ * <p>
+ * Supports an external list of stopwords (words that
+ * will not be indexed at all), an external list of exclusions (word that will
+ * not be stemmed, but indexed) and an external list of word-stem pairs that overrule
+ * the algorithm (dictionary stemming).
+ * A default set of stopwords is used unless an alternative list is specified, but the
+ * exclusion list is empty by default.
+ * </p>
+ *
+ * <p><b>NOTE</b>: This class uses the same {@link Version}
+ * dependent settings as {@link StandardAnalyzer}.</p>
+ */
+ public class DutchAnalyzer : Analyzer
+ {
+ /**
+ * List of typical Dutch stopwords.
+ * @deprecated use {@link #getDefaultStopSet()} instead
+ */
+ public static readonly String[] DUTCH_STOP_WORDS =
+ {
+ "de", "en", "van", "ik", "te", "dat", "die", "in", "een",
+ "hij", "het", "niet", "zijn", "is", "was", "op", "aan", "met", "als", "voor", "had",
+ "er", "maar", "om", "hem", "dan", "zou", "of", "wat", "mijn", "men", "dit", "zo",
+ "door", "over", "ze", "zich", "bij", "ook", "tot", "je", "mij", "uit", "der", "daar",
+ "haar", "naar", "heb", "hoe", "heeft", "hebben", "deze", "u", "want", "nog", "zal",
+ "me", "zij", "nu", "ge", "geen", "omdat", "iets", "worden", "toch", "al", "waren",
+ "veel", "meer", "doen", "toen", "moet", "ben", "zonder", "kan", "hun", "dus",
+ "alles", "onder", "ja", "eens", "hier", "wie", "werd", "altijd", "doch", "wordt",
+ "wezen", "kunnen", "ons", "zelf", "tegen", "na", "reeds", "wil", "kon", "niets",
+ "uw", "iemand", "geweest", "andere"
+ };
+ /**
+ * Returns an unmodifiable instance of the default stop-words set.
+ * @return an unmodifiable instance of the default stop-words set.
+ */
+ public static ISet<string> getDefaultStopSet()
+ {
+ return DefaultSetHolder.DEFAULT_STOP_SET;
+ }
+
+ static class DefaultSetHolder
+ {
+ internal static readonly ISet<string> DEFAULT_STOP_SET = CharArraySet
+ .UnmodifiableSet(new CharArraySet(DUTCH_STOP_WORDS, false));
+ }
+
+
+ /**
+ * Contains the stopwords used with the StopFilter.
+ */
+ private readonly ISet<string> stoptable;
+
+ /**
+ * Contains words that should be indexed but not stemmed.
+ */
+ private ISet<string> excltable = new HashSet<string>();
+
+ private IDictionary<String, String> stemdict = new HashMap<String, String>();
+ private readonly Version matchVersion;
+
+ /**
+ * Builds an analyzer with the default stop words ({@link #DUTCH_STOP_WORDS})
+ * and a few default entries for the stem exclusion table.
+ *
+ */
+ public DutchAnalyzer(Version matchVersion)
+ : this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET)
+ {
+ stemdict.Add("fiets", "fiets"); //otherwise fiet
+ stemdict.Add("bromfiets", "bromfiets"); //otherwise bromfiet
+ stemdict.Add("ei", "eier");
+ stemdict.Add("kind", "kinder");
+ }
+
+ public DutchAnalyzer(Version matchVersion, ISet<string> stopwords)
+ : this(matchVersion, stopwords, CharArraySet.EMPTY_SET)
+ {
+
+ }
+
+ public DutchAnalyzer(Version matchVersion, ISet<string> stopwords, ISet<string> stemExclusionTable)
+ {
+ stoptable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stopwords));
+ excltable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stemExclusionTable));
+ this.matchVersion = matchVersion;
+ SetOverridesTokenStreamMethod(typeof(DutchAnalyzer));
+ }
+
+ /**
+ * Builds an analyzer with the given stop words.
+ *
+ * @param matchVersion
+ * @param stopwords
+ * @deprecated use {@link #DutchAnalyzer(Version, Set)} instead
+ */
+ public DutchAnalyzer(Version matchVersion, params string[] stopwords)
+ : this(matchVersion, StopFilter.MakeStopSet(stopwords))
+ {
+
+ }
+
+ /**
+ * Builds an analyzer with the given stop words.
+ *
+ * @param stopwords
+ * @deprecated use {@link #DutchAnalyzer(Version, Set)} instead
+ */
+ public DutchAnalyzer(Version matchVersion, HashSet<string> stopwords)
+ : this(matchVersion, (ISet<string>)stopwords)
+ {
+
+ }
+
+ /**
+ * Builds an analyzer with the given stop words.
+ *
+ * @param stopwords
+ * @deprecated use {@link #DutchAnalyzer(Version, Set)} instead
+ */
+ public DutchAnalyzer(Version matchVersion, FileInfo stopwords)
+ {
+ // this is completely broken!
+ SetOverridesTokenStreamMethod(typeof(DutchAnalyzer));
+ try
+ {
+ stoptable = WordlistLoader.GetWordSet(stopwords);
+ }
+ catch (IOException e)
+ {
+ // TODO: throw IOException
+ throw new Exception("", e);
+ }
+ this.matchVersion = matchVersion;
+ }
+
+ /**
+ * Builds an exclusionlist from an array of Strings.
+ *
+ * @param exclusionlist
+ * @deprecated use {@link #DutchAnalyzer(Version, Set, Set)} instead
+ */
+ public void setStemExclusionTable(params string[] exclusionlist)
+ {
+ excltable = StopFilter.MakeStopSet(exclusionlist);
+ SetPreviousTokenStream(null); // force a new stemmer to be created
+ }
+
+ /**
+ * Builds an exclusionlist from a Hashtable.
+ * @deprecated use {@link #DutchAnalyzer(Version, Set, Set)} instead
+ */
+ public void setStemExclusionTable(HashSet<string> exclusionlist)
+ {
+ excltable = exclusionlist;
+ SetPreviousTokenStream(null); // force a new stemmer to be created
+ }
+
+ /**
+ * Builds an exclusionlist from the words contained in the given file.
+ * @deprecated use {@link #DutchAnalyzer(Version, Set, Set)} instead
+ */
+ public void setStemExclusionTable(FileInfo exclusionlist)
+ {
+ try
+ {
+ excltable = WordlistLoader.GetWordSet(exclusionlist);
+ SetPreviousTokenStream(null); // force a new stemmer to be created
+ }
+ catch (IOException e)
+ {
+ // TODO: throw IOException
+ throw new Exception("", e);
+ }
+ }
+
+ /**
+ * Reads a stemdictionary file , that overrules the stemming algorithm
+ * This is a textfile that contains per line
+ * <tt>word<b>\t</b>stem</tt>, i.e: two tab seperated words
+ */
+ public void setStemDictionary(FileInfo stemdictFile)
+ {
+ try
+ {
+ stemdict = WordlistLoader.GetStemDict(stemdictFile);
+ SetPreviousTokenStream(null); // force a new stemmer to be created
+ }
+ catch (IOException e)
+ {
+ // TODO: throw IOException
+ throw new Exception(string.Empty, e);
+ }
+ }
+
+ /**
+ * Creates a {@link TokenStream} which tokenizes all the text in the
+ * provided {@link Reader}.
+ *
+ * @return A {@link TokenStream} built from a {@link StandardTokenizer}
+ * filtered with {@link StandardFilter}, {@link StopFilter},
+ * and {@link DutchStemFilter}
+ */
+ public override TokenStream TokenStream(String fieldName, TextReader reader)
+ {
+ TokenStream result = new StandardTokenizer(matchVersion, reader);
+ result = new StandardFilter(result);
+ result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
+ result, stoptable);
+ result = new DutchStemFilter(result, excltable, stemdict);
+ return result;
+ }
+
+ class SavedStreams
+ {
+ protected internal Tokenizer source;
+ protected internal TokenStream result;
+ };
+
+ /**
+ * Returns a (possibly reused) {@link TokenStream} which tokenizes all the
+ * text in the provided {@link Reader}.
+ *
+ * @return A {@link TokenStream} built from a {@link StandardTokenizer}
+ * filtered with {@link StandardFilter}, {@link StopFilter},
+ * and {@link DutchStemFilter}
+ */
+ public override TokenStream ReusableTokenStream(String fieldName, TextReader reader)
+ {
+ if (overridesTokenStreamMethod)
+ {
+ // LUCENE-1678: force fallback to tokenStream() if we
+ // have been subclassed and that subclass overrides
+ // tokenStream but not reusableTokenStream
+ return TokenStream(fieldName, reader);
+ }
+
+ SavedStreams streams = (SavedStreams)GetPreviousTokenStream();
+ if (streams == null)
+ {
+ streams = new SavedStreams();
+ streams.source = new StandardTokenizer(matchVersion, reader);
+ streams.result = new StandardFilter(streams.source);
+ streams.result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
+ streams.result, stoptable);
+ streams.result = new DutchStemFilter(streams.result, excltable, stemdict);
+ SetPreviousTokenStream(streams);
+ }
+ else
+ {
+ streams.source.Reset(reader);
+ }
+ return streams.result;
+ }
+ }
}
\ No newline at end of file
Modified: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Nl/DutchStemFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Nl/DutchStemFilter.cs?rev=1204353&r1=1204352&r2=1204353&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Nl/DutchStemFilter.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Nl/DutchStemFilter.cs Mon Nov 21 04:44:55 2011
@@ -20,167 +20,113 @@
*/
using System;
+using System.Collections.Generic;
using System.IO;
using System.Collections;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Support;
namespace Lucene.Net.Analysis.Nl
{
-
- /* ====================================================================
- * The Apache Software License, Version 1.1
- *
- * Copyright (c) 2001 The Apache Software Foundation. All rights
- * reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * 3. The end-user documentation included with the redistribution,
- * if any, must include the following acknowledgment:
- * "This product includes software developed by the
- * Apache Software Foundation (http://www.apache.org/)."
- * Alternately, this acknowledgment may appear in the software itself,
- * if and wherever such third-party acknowledgments normally appear.
- *
- * 4. The names "Apache" and "Apache Software Foundation" and
- * "Apache Lucene" must not be used to endorse or promote products
- * derived from this software without prior written permission. For
- * written permission, please contact apache@apache.org.
- *
- * 5. Products derived from this software may not be called "Apache",
- * "Apache Lucene", nor may "Apache" appear in their name, without
- * prior written permission of the Apache Software Foundation.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
- * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- * ====================================================================
- *
- * This software consists of voluntary contributions made by many
- * individuals on behalf of the Apache Software Foundation. For more
- * information on the Apache Software Foundation, please see
- * <http://www.apache.org/>.
- */
-
- /// <summary>
- /// A filter that stems Dutch words. It supports a table of words that should
- /// not be stemmed at all. The stemmer used can be changed at runtime after the
- /// filter object is created (as long as it is a DutchStemmer).
- ///
- /// <version>$Id: DutchStemFilter.java,v 1.1 2004/03/09 14:55:08 otis Exp $</version>
- /// </summary>
- /// <author>Edwin de Jonge</author>
- public sealed class DutchStemFilter : TokenFilter
- {
- /// <summary>
- /// The actual token in the input stream.
- /// </summary>
- private Token token = null;
- private DutchStemmer stemmer = null;
- private Hashtable exclusions = null;
-
- public DutchStemFilter( TokenStream _in ) : base(_in)
- {
- stemmer = new DutchStemmer();
- }
-
- /// <summary>
- /// Builds a DutchStemFilter that uses an exclusiontable.
- /// </summary>
- /// <param name="_in"></param>
- /// <param name="exclusiontable"></param>
- public DutchStemFilter( TokenStream _in, Hashtable exclusiontable ): this(_in)
- {
- exclusions = exclusiontable;
- }
-
- /// <summary>
- ///
- /// </summary>
- /// <param name="_in"></param>
- /// <param name="exclusiontable"></param>
- /// <param name="stemdictionary">Dictionary of word stem pairs, that overrule the algorithm</param>
- public DutchStemFilter( TokenStream _in, Hashtable exclusiontable , Hashtable stemdictionary): this(_in, exclusiontable)
- {
- stemmer.SetStemDictionary(stemdictionary);
- }
-
- /// <summary>
- /// </summary>
- /// <returns>Returns the next token in the stream, or null at EOS</returns>
- public override Token Next()
-
- {
- if ( ( token = input.Next() ) == null )
- {
- return null;
- }
- // Check the exclusiontable
- else if ( exclusions != null && exclusions.Contains( token.TermText() ) )
- {
- return token;
- }
- else
- {
- String s = stemmer.Stem( token.TermText() );
- // If not stemmed, dont waste the time creating a new token
- if ( !s.Equals( token.TermText() ) )
- {
- return new Token( s, token.StartOffset(),
- token.EndOffset(), token.Type() );
- }
- return token;
- }
- }
-
- /// <summary>
- /// Set a alternative/custom DutchStemmer for this filter.
- /// </summary>
- /// <param name="stemmer"></param>
- public void SetStemmer( DutchStemmer stemmer )
- {
- if ( stemmer != null )
- {
- this.stemmer = stemmer;
- }
- }
-
- /// <summary>
- /// Set an alternative exclusion list for this filter.
- /// </summary>
- /// <param name="exclusiontable"></param>
- public void SetExclusionTable( Hashtable exclusiontable )
- {
- exclusions = exclusiontable;
- }
-
- /// <summary>
- /// Set dictionary for stemming, this dictionary overrules the algorithm,
- /// so you can correct for a particular unwanted word-stem pair.
- /// </summary>
- /// <param name="dict"></param>
- public void SetStemDictionary(Hashtable dict)
- {
- if (stemmer != null)
- stemmer.SetStemDictionary(dict);
- }
- }
+ /**
+ * A {@link TokenFilter} that stems Dutch words.
+ * <p>
+ * It supports a table of words that should
+ * not be stemmed at all. The stemmer used can be changed at runtime after the
+ * filter object is created (as long as it is a {@link DutchStemmer}).
+ * </p>
+ * NOTE: This stemmer does not implement the Snowball algorithm correctly,
+ * specifically doubled consonants. It is recommended that you consider using
+ * the "Dutch" stemmer in the snowball package instead. This stemmer will likely
+ * be deprecated in a future release.
+ */
+ public sealed class DutchStemFilter : TokenFilter
+ {
+ /**
+ * The actual token in the input stream.
+ */
+ private DutchStemmer stemmer = null;
+ private ISet<string> exclusions = null;
+
+ private TermAttribute termAtt;
+
+ public DutchStemFilter(TokenStream _in)
+ : base(_in)
+ {
+ stemmer = new DutchStemmer();
+ termAtt = AddAttribute<TermAttribute>();
+ }
+
+ /**
+ * Builds a DutchStemFilter that uses an exclusion table.
+ */
+ public DutchStemFilter(TokenStream _in, ISet<string> exclusiontable)
+ : this(_in)
+ {
+ exclusions = exclusiontable;
+ }
+
+ /**
+ * @param stemdictionary Dictionary of word stem pairs, that overrule the algorithm
+ */
+ public DutchStemFilter(TokenStream _in, ISet<string> exclusiontable, IDictionary<string, string> stemdictionary)
+ : this(_in, exclusiontable)
+ {
+ stemmer.SetStemDictionary(stemdictionary);
+ }
+
+ /**
+ * Returns the next token in the stream, or null at EOS
+ */
+ public override bool IncrementToken()
+ {
+ if (input.IncrementToken())
+ {
+ String term = termAtt.Term();
+
+ // Check the exclusion table.
+ if (exclusions == null || !exclusions.Contains(term))
+ {
+ String s = stemmer.Stem(term);
+ // If not stemmed, don't waste the time adjusting the token.
+ if ((s != null) && !s.Equals(term))
+ termAtt.SetTermBuffer(s);
+ }
+ return true;
+ }
+ else
+ {
+ return false;
+ }
+ }
+
+ /**
+ * Set a alternative/custom {@link DutchStemmer} for this filter.
+ */
+ public void SetStemmer(DutchStemmer stemmer)
+ {
+ if (stemmer != null)
+ {
+ this.stemmer = stemmer;
+ }
+ }
+
+ /**
+ * Set an alternative exclusion list for this filter.
+ */
+ public void SetExclusionTable(HashSet<string> exclusiontable)
+ {
+ exclusions = exclusiontable;
+ }
+
+ /**
+ * Set dictionary for stemming, this dictionary overrules the algorithm,
+ * so you can correct for a particular unwanted word-stem pair.
+ */
+ public void SetStemDictionary(IDictionary<string, string> dict)
+ {
+ if (stemmer != null)
+ stemmer.SetStemDictionary(dict);
+ }
+ }
}
\ No newline at end of file
Modified: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Nl/DutchStemmer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Nl/DutchStemmer.cs?rev=1204353&r1=1204352&r2=1204353&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Nl/DutchStemmer.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Nl/DutchStemmer.cs Mon Nov 21 04:44:55 2011
@@ -23,484 +23,439 @@ using System;
using System.IO;
using System.Text;
using System.Collections;
+using System.Collections.Generic;
namespace Lucene.Net.Analysis.Nl
{
-
- /* ====================================================================
- * The Apache Software License, Version 1.1
- *
- * Copyright (c) 2001 The Apache Software Foundation. All rights
- * reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * 3. The end-user documentation included with the redistribution,
- * if any, must include the following acknowledgment:
- * "This product includes software developed by the
- * Apache Software Foundation (http://www.apache.org/)."
- * Alternately, this acknowledgment may appear in the software itself,
- * if and wherever such third-party acknowledgments normally appear.
- *
- * 4. The names "Apache" and "Apache Software Foundation" and
- * "Apache Lucene" must not be used to endorse or promote products
- * derived from this software without prior written permission. For
- * written permission, please contact apache@apache.org.
- *
- * 5. Products derived from this software may not be called "Apache",
- * "Apache Lucene", nor may "Apache" appear in their name, without
- * prior written permission of the Apache Software Foundation.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
- * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- * ====================================================================
- *
- * This software consists of voluntary contributions made by many
- * individuals on behalf of the Apache Software Foundation. For more
- * information on the Apache Software Foundation, please see
- * <http://www.apache.org/>.
- */
-
- /// <summary>
- /// A stemmer for Dutch words. The algorithm is an implementation of
- /// the <see c="http://snowball.tartarus.org/dutch/stemmer.html">dutch stemming</see>
- /// algorithm in snowball. Snowball is a project of Martin Porter (does Porter Stemmer ring a bell?):
- ///
- /// @version $Id: DutchStemmer.java,v 1.1 2004/03/09 14:55:08 otis Exp $
- /// </summary>
- /// <author>Edwin de Jonge (ejne@cbs.nl)</author>
- public class DutchStemmer
- {
- /// <summary>
- /// Buffer for the terms while stemming them.
- /// </summary>
- private StringBuilder sb = new StringBuilder();
- private bool _removedE;
- private Hashtable _stemDict;
-
-
- private int _R1;
- private int _R2;
-
- /// <summary>
- /// Stemms the given term to an unique <tt>discriminator</tt>.
- /// </summary>
- /// <param name="term">The term that should be stemmed.</param>
- /// <returns>Discriminator for <tt>term</tt></returns>
- //TODO convert to internal
- public string Stem( String term )
- {
- term = term.ToLower();
- if ( !IsStemmable( term ) )
- return term;
- if (_stemDict != null && _stemDict.Contains(term))
- return _stemDict[term] as string;
- // Reset the StringBuilder.
- sb.Remove(0, sb.Length);
- sb.Insert(0, term);
- // Stemming starts here...
- Substitute(sb);
- StoreYandI(sb);
- _R1 = GetRIndex(sb, 0);
- _R1 = Math.Max(3,_R1);
- Step1(sb);
- Step2(sb);
- _R2 = GetRIndex(sb, _R1);
- Step3a(sb);
- Step3b(sb);
- Step4(sb);
- ReStoreYandI(sb);
- return sb.ToString();
- }
-
- private bool enEnding(StringBuilder sb)
- {
- string[] enend = new string[]{"ene","en"};
- foreach(string end in enend)
- {
- string s = sb.ToString();
- int index = s.Length - end.Length;
- if ( s.EndsWith(end) &&
- index >= _R1 &&
- IsValidEnEnding(sb,index-1)
- )
- {
- sb.Remove(index, end.Length);
- UnDouble(sb,index);
- return true;
- }
- }
- return false;
- }
-
-
- private void Step1(StringBuilder sb)
- {
- if (_R1 >= sb.Length)
- return;
-
- string s = sb.ToString();
- int lengthR1 = sb.Length - _R1;
- int index;
-
- if (s.EndsWith("heden"))
- {
- sb.Replace("heden","heid", _R1, lengthR1);
- return;
- }
-
- if (enEnding(sb))
- return;
-
- if (s.EndsWith("se") &&
- (index = s.Length - 2) >= _R1 &&
- IsValidSEnding(sb, index -1)
- )
- {
- sb.Remove(index, 2);
- return;
- }
- if (s.EndsWith("s") &&
- (index = s.Length - 1) >= _R1 &&
- IsValidSEnding(sb, index - 1))
- {
- sb.Remove(index, 1);
- }
- }
-
- /// <summary>
- /// Delete suffix e if in R1 and
- /// preceded by a non-vowel, and then undouble the ending
- /// </summary>
- /// <param name="sb">string being stemmed</param>
- private void Step2(StringBuilder sb)
- {
- _removedE = false;
- if (_R1 >= sb.Length)
- return;
- string s = sb.ToString();
- int index = s.Length - 1;
- if ( index >= _R1 &&
- s.EndsWith("e") &&
- !IsVowel(sb[index-1]))
- {
- sb.Remove(index,1);
- UnDouble(sb);
- _removedE = true;
- }
- }
-
- /// <summary>
- /// Delete "heid"
- /// </summary>
- /// <param name="sb">string being stemmed</param>
- private void Step3a(StringBuilder sb)
- {
- if (_R2 >= sb.Length)
- return;
- string s = sb.ToString();
- int index = s.Length - 4;
- if (s.EndsWith("heid")&& index >= _R2 && sb[index - 1] != 'c')
- {
- sb.Remove(index,4); //remove heid
- enEnding(sb);
- }
- }
-
- /// <summary>
- /// <p>A d-suffix, or derivational suffix, enables a new word,
- /// often with a different grammatical category, or with a different
- /// sense, to be built from another word. Whether a d-suffix can be
- /// attached is discovered not from the rules of grammar, but by
- /// referring to a dictionary. So in English, ness can be added to
- /// certain adjectives to form corresponding nouns (littleness,
- /// kindness, foolishness ...) but not to all adjectives
- /// (not for example, to big, cruel, wise ...) d-suffixes can be
- /// used to change meaning, often in rather exotic ways.</p>
- /// Remove "ing", "end", "ig", "lijk", "baar" and "bar"
- /// </summary>
- /// <param name="sb">string being stemmed</param>
- private void Step3b(StringBuilder sb)
- {
- if (_R2 >= sb.Length)
- return;
- string s = sb.ToString();
- int index;
-
- if ((s.EndsWith("end") || s.EndsWith("ing")) &&
- (index = s.Length - 3) >= _R2
- )
- {
- sb.Remove(index,3);
- if (sb[index - 2] == 'i' &&
- sb[index - 1] == 'g')
- {
- if (sb[index - 3] != 'e' & index-2 >= _R2)
- {
- index -= 2;
- sb.Remove(index,2);
- }
- }
- else
- {
- UnDouble(sb,index);
- }
- return;
- }
- if ( s.EndsWith("ig") &&
- (index = s.Length - 2) >= _R2
- )
- {
- if (sb[index - 1] != 'e')
- sb.Remove(index, 2);
- return;
- }
- if (s.EndsWith("lijk") &&
- (index = s.Length - 4) >= _R2
- )
- {
- sb.Remove(index, 4);
- Step2(sb);
- return;
- }
- if (s.EndsWith("baar") &&
- (index = s.Length - 4) >= _R2
- )
- {
- sb.Remove(index, 4);
- return;
- }
- if (s.EndsWith("bar") &&
- (index = s.Length - 3) >= _R2
- )
- {
- if (_removedE)
- sb.Remove(index, 3);
- return;
- }
- }
-
- /// <summary>
- /// undouble vowel
- /// If the words ends CVD, where C is a non-vowel, D is a non-vowel other than I, and V is double a, e, o or u, remove one of the vowels from V (for example, maan -> man, brood -> brod).
- /// </summary>
- /// <param name="sb">string being stemmed</param>
- private void Step4(StringBuilder sb)
- {
- if (sb.Length < 4)
- return;
- string end = sb.ToString(sb.Length - 4,4);
- char c = end[0];
- char v1 = end[1];
- char v2 = end[2];
- char d = end[3];
- if (v1 == v2 &&
- d != 'I' &&
- v1 != 'i' &&
- IsVowel(v1) &&
- !IsVowel(d) &&
- !IsVowel(c))
- {
- sb.Remove(sb.Length - 2, 1);
- }
- }
-
- /// <summary>
- /// Checks if a term could be stemmed.
- /// </summary>
- /// <param name="term"></param>
- /// <returns>true if, and only if, the given term consists in letters.</returns>
- private bool IsStemmable( String term )
- {
- for ( int c = 0; c < term.Length; c++ )
- {
- if ( !Char.IsLetter(term[c])) return false;
- }
- return true;
- }
-
- /// <summary>
- /// Substitute ä, ë, ï, ö, ü, á , é, Ã, ó, ú
- /// </summary>
- /// <param name="buffer"></param>
- private void Substitute( StringBuilder buffer )
- {
- for ( int i = 0; i < buffer.Length; i++ )
- {
- switch (buffer[i])
- {
- case 'ä':
- case 'á':
- {
- buffer[i] = 'a';
- break;
- }
- case 'ë':
- case 'é':
- {
- buffer[i] = 'e';
- break;
- }
- case 'ü':
- case 'ú':
- {
- buffer[i] = 'u';
- break;
- }
- case 'ï':
- case 'i':
- {
- buffer[i] = 'i';
- break;
- }
- case 'ö':
- case 'ó':
- {
- buffer[i] = 'o';
- break;
- }
- }
- }
- }
-
-// private bool IsValidSEnding(StringBuilder sb)
-// {
-// return IsValidSEnding(sb,sb.Length - 1);
-// }
-
- private bool IsValidSEnding(StringBuilder sb, int index)
- {
- char c = sb[index];
- if (IsVowel(c) || c == 'j')
- return false;
- return true;
- }
-
-// private bool IsValidEnEnding(StringBuilder sb)
-// {
-// return IsValidEnEnding(sb,sb.Length - 1);
-// }
-
- private bool IsValidEnEnding(StringBuilder sb, int index)
- {
- char c = sb[index];
- if (IsVowel(c))
- return false;
- if (c < 3)
- return false;
- // ends with "gem"?
- if (c == 'm' && sb[index - 2] == 'g' && sb[index-1] == 'e')
- return false;
- return true;
- }
-
- private void UnDouble(StringBuilder sb)
- {
- UnDouble(sb, sb.Length);
- }
-
- private void UnDouble(StringBuilder sb, int endIndex)
- {
- string s = sb.ToString(0, endIndex);
- if (s.EndsWith("kk") || s.EndsWith("tt") || s.EndsWith("dd") || s.EndsWith("nn") || s.EndsWith("mm") || s.EndsWith("ff"))
- {
- sb.Remove(endIndex-1,1);
- }
- }
-
- private int GetRIndex(StringBuilder sb, int start)
- {
- if (start == 0)
- start = 1;
- int i = start;
- for (; i < sb.Length; i++)
- {
- //first non-vowel preceded by a vowel
- if (!IsVowel(sb[i]) && IsVowel(sb[i-1]))
- {
- return i + 1;
- }
- }
- return i + 1;
- }
-
- private void StoreYandI(StringBuilder sb)
- {
- if (sb[0] == 'y')
- sb[0] = 'Y';
- //char c;
- int last = sb.Length - 1;
- for (int i = 1; i < last; i++)
- {
- switch (sb[i])
- {
- case 'i':
- {
- if (IsVowel(sb[i-1]) &&
- IsVowel(sb[i+1])
- )
- sb[i] = 'I';
- break;
- }
- case 'y':
- {
- if (IsVowel(sb[i-1]))
- sb[i] = 'Y';
- break;
- }
- }
- }
- if (last > 0 && sb[last]=='y' && IsVowel(sb[last-1]))
- sb[last]='Y';
- }
-
- private void ReStoreYandI(StringBuilder sb)
- {
- sb.Replace("I","i");
- sb.Replace("Y","y");
- }
-
- private bool IsVowel(char c)
- {
- switch (c)
- {
- case 'e':
- case 'a':
- case 'o':
- case 'i':
- case 'u':
- case 'y':
- case 'è':
- {
- return true;
- }
- }
- return false;
- }
-
- internal void SetStemDictionary(Hashtable dict)
- {
- _stemDict = dict;
- }
- }
+ /**
+ * A stemmer for Dutch words.
+ * <p>
+ * The algorithm is an implementation of
+ * the <a href="http://snowball.tartarus.org/algorithms/dutch/stemmer.html">dutch stemming</a>
+ * algorithm in Martin Porter's snowball project.
+ * </p>
+ */
+
+ public class DutchStemmer
+ {
+ /**
+ * Buffer for the terms while stemming them.
+ */
+ private StringBuilder sb = new StringBuilder();
+ private bool _removedE;
+ private IDictionary<string, string> _stemDict;
+
+ private int _R1;
+ private int _R2;
+
+ //TODO convert to internal
+ /*
+ * Stems the given term to a unique <tt>discriminator</tt>.
+ *
+ * @param term The term that should be stemmed.
+ * @return Discriminator for <tt>term</tt>; the term itself when it is not
+ * stemmable, or the dictionary override when one is present.
+ */
+ public String Stem(String term)
+ {
+ // Lower-case with the invariant culture so stemming is not affected by
+ // the current thread culture (e.g. Turkish dotless-i casing).
+ term = term.ToLowerInvariant();
+ if (!isStemmable(term))
+ return term;
+ // The user dictionary overrules the algorithm. A single TryGetValue
+ // replaces the previous ContainsKey + indexer double lookup; the old
+ // "is String" test was redundant for an IDictionary<string, string>.
+ if (_stemDict != null)
+ {
+ string overruled;
+ if (_stemDict.TryGetValue(term, out overruled))
+ return overruled;
+ }
+ // Reset the shared working buffer.
+ sb.Clear();
+ sb.Insert(0, term);
+ // Stemming starts here...
+ substitute(sb);
+ storeYandI(sb);
+ _R1 = getRIndex(sb, 0);
+ _R1 = Math.Max(3, _R1);
+ step1(sb);
+ step2(sb);
+ _R2 = getRIndex(sb, _R1);
+ step3a(sb);
+ step3b(sb);
+ step4(sb);
+ reStoreYandI(sb);
+ return sb.ToString();
+ }
+
+ /**
+ * Removes the endings "ene"/"en" when inside R1 and preceded by a valid
+ * en-ending, then undoubles the resulting consonant ending.
+ *
+ * @return true if an ending was removed
+ */
+ private bool enEnding(StringBuilder sb)
+ {
+ String[] enend = new String[] { "ene", "en" };
+ for (int i = 0; i < enend.Length; i++)
+ {
+ String end = enend[i];
+ String s = sb.ToString();
+ int index = s.Length - end.Length;
+ if (s.EndsWith(end) &&
+ index >= _R1 &&
+ isValidEnEnding(sb, index - 1)
+ )
+ {
+ // StringBuilder.Remove takes (startIndex, length) — not the
+ // Java-style (start, end) the port passed, which removed the
+ // wrong span or threw.
+ sb.Remove(index, end.Length);
+ unDouble(sb, index);
+ return true;
+ }
+ }
+ return false;
+ }
+
+
+ /**
+ * Step 1: remove standard suffixes ("heden" -> "heid", "en(e)", "s(e)")
+ * that fall inside the R1 region.
+ */
+ private void step1(StringBuilder sb)
+ {
+ if (_R1 >= sb.Length)
+ return;
+
+ String s = sb.ToString();
+ int lengthR1 = sb.Length - _R1;
+ int index;
+
+ if (s.EndsWith("heden"))
+ {
+ // Replace "heden" with "heid" within the R1 region. The previous
+ // Remove/Insert pair passed Java-style (start, end) arguments to
+ // methods that take (start, length) and threw
+ // ArgumentOutOfRangeException.
+ sb.Replace("heden", "heid", _R1, lengthR1);
+ return;
+ }
+
+ if (enEnding(sb))
+ return;
+
+ if (s.EndsWith("se") &&
+ (index = s.Length - 2) >= _R1 &&
+ isValidSEnding(sb, index - 1)
+ )
+ {
+ // Remove takes (startIndex, length): drop the two-char "se" suffix.
+ sb.Remove(index, 2);
+ return;
+ }
+ if (s.EndsWith("s") &&
+ (index = s.Length - 1) >= _R1 &&
+ isValidSEnding(sb, index - 1))
+ {
+ sb.Remove(index, 1);
+ }
+ }
+
+ /**
+ * Remove suffix e if in R1 and
+ * preceded by a non-vowel, and then undouble the ending
+ *
+ * @param sb String being stemmed
+ */
+ private void step2(StringBuilder sb)
+ {
+ _removedE = false;
+ if (_R1 >= sb.Length)
+ return;
+ String s = sb.ToString();
+ int index = s.Length - 1;
+ if (index >= _R1 &&
+ s.EndsWith("e") &&
+ !isVowel(sb[index - 1]))
+ {
+ // Remove takes (startIndex, length); the old (index, index + 1)
+ // call passed a Java-style end offset and threw.
+ sb.Remove(index, 1);
+ unDouble(sb);
+ _removedE = true;
+ }
+ }
+
+ /**
+ * Remove "heid" when inside R2 and not preceded by 'c'.
+ *
+ * @param sb String being stemmed
+ */
+ private void step3a(StringBuilder sb)
+ {
+ if (_R2 >= sb.Length)
+ return;
+ String s = sb.ToString();
+ int index = s.Length - 4;
+ if (s.EndsWith("heid") && index >= _R2 && sb[index - 1] != 'c')
+ {
+ // Remove takes (startIndex, length): drop the four chars of "heid".
+ sb.Remove(index, 4);
+ enEnding(sb);
+ }
+ }
+
+ /**
+ * <p>A d-suffix, or derivational suffix, enables a new word,
+ * often with a different grammatical category, or with a different
+ * sense, to be built from another word. Whether a d-suffix can be
+ * attached is discovered not from the rules of grammar, but by
+ * referring to a dictionary. So in English, ness can be added to
+ * certain adjectives to form corresponding nouns (littleness,
+ * kindness, foolishness ...) but not to all adjectives
+ * (not for example, to big, cruel, wise ...) d-suffixes can be
+ * used to change meaning, often in rather exotic ways.</p>
+ * Remove "ing", "end", "ig", "lijk", "baar" and "bar"
+ *
+ * @param sb String being stemmed
+ */
+ private void step3b(StringBuilder sb)
+ {
+ if (_R2 >= sb.Length)
+ return;
+ String s = sb.ToString();
+ int index = 0;
+
+ // StringBuilder.Remove takes (startIndex, length) throughout this
+ // method; the previous Java-style (start, end) arguments removed the
+ // wrong span or threw ArgumentOutOfRangeException.
+ if ((s.EndsWith("end") || s.EndsWith("ing")) &&
+ (index = s.Length - 3) >= _R2)
+ {
+ sb.Remove(index, 3);
+ if (sb[index - 2] == 'i' &&
+ sb[index - 1] == 'g')
+ {
+ if (sb[index - 3] != 'e' & index - 2 >= _R2)
+ {
+ index -= 2;
+ sb.Remove(index, 2);
+ }
+ }
+ else
+ {
+ unDouble(sb, index);
+ }
+ return;
+ }
+ if (s.EndsWith("ig") &&
+ (index = s.Length - 2) >= _R2
+ )
+ {
+ if (sb[index - 1] != 'e')
+ sb.Remove(index, 2);
+ return;
+ }
+ if (s.EndsWith("lijk") &&
+ (index = s.Length - 4) >= _R2
+ )
+ {
+ sb.Remove(index, 4);
+ step2(sb);
+ return;
+ }
+ if (s.EndsWith("baar") &&
+ (index = s.Length - 4) >= _R2
+ )
+ {
+ sb.Remove(index, 4);
+ return;
+ }
+ if (s.EndsWith("bar") &&
+ (index = s.Length - 3) >= _R2
+ )
+ {
+ if (_removedE)
+ sb.Remove(index, 3);
+ return;
+ }
+ }
+
+ /**
+ * undouble vowel
+ * If the words ends CVD, where C is a non-vowel, D is a non-vowel other than I, and V is double a, e, o or u, remove one of the vowels from V (for example, maan -> man, brood -> brod).
+ *
+ * @param sb String being stemmed
+ */
+ private void step4(StringBuilder sb)
+ {
+ if (sb.Length < 4)
+ return;
+ // ToString takes (startIndex, length): grab the last four characters.
+ // The old (sb.Length - 4, sb.Length) call threw ArgumentOutOfRangeException.
+ String end = sb.ToString(sb.Length - 4, 4);
+ char c = end[0];
+ char v1 = end[1];
+ char v2 = end[2];
+ char d = end[3];
+ if (v1 == v2 &&
+ d != 'I' &&
+ v1 != 'i' &&
+ isVowel(v1) &&
+ !isVowel(d) &&
+ !isVowel(c))
+ {
+ // Remove one character of the doubled vowel (maan -> man).
+ sb.Remove(sb.Length - 2, 1);
+ }
+ }
+
+ /**
+ * Checks if a term could be stemmed.
+ *
+ * @return true if, and only if, the given term consists in letters.
+ */
+ private bool isStemmable(String term)
+ {
+ // A term is stemmable when it consists exclusively of letters.
+ foreach (char ch in term)
+ {
+ if (!char.IsLetter(ch))
+ return false;
+ }
+ return true;
+ }
+
+ /**
+ * Substitute ä, ë, ï, ö, ü, á, é, í, ó, ú
+ */
+ private void substitute(StringBuilder buffer)
+ {
+ // Fold accented vowels to their unaccented base form before stemming.
+ for (int i = 0; i < buffer.Length; i++)
+ {
+ switch (buffer[i])
+ {
+ case 'ä':
+ case 'á':
+ {
+ buffer[i] = 'a';
+ break;
+ }
+ case 'ë':
+ case 'é':
+ {
+ buffer[i] = 'e';
+ break;
+ }
+ case 'ü':
+ case 'ú':
+ {
+ buffer[i] = 'u';
+ break;
+ }
+ case 'ï':
+ case 'i':
+ {
+ // NOTE(review): the 'i' -> 'i' branch is a no-op; the second
+ // label was presumably meant to be 'í' — confirm against the
+ // snowball Dutch reference.
+ buffer[i] = 'i';
+ break;
+ }
+ case 'ö':
+ case 'ó':
+ {
+ buffer[i] = 'o';
+ break;
+ }
+ }
+ }
+ }
+
+ /*private bool isValidSEnding(StringBuilder sb) {
+ return isValidSEnding(sb, sb.Length - 1);
+ }*/
+
+ private bool isValidSEnding(StringBuilder sb, int index)
+ {
+ // A valid s-ending is any consonant except 'j'.
+ char ch = sb[index];
+ return !isVowel(ch) && ch != 'j';
+ }
+
+ /*private bool isValidEnEnding(StringBuilder sb) {
+ return isValidEnEnding(sb, sb.Length - 1);
+ }*/
+
+ private bool isValidEnEnding(StringBuilder sb, int index)
+ {
+ char c = sb[index];
+ if (isVowel(c))
+ return false;
+ // Guard the look-behind below. The old test compared the *character*
+ // against 3 ("c < 3"), which is never true for a letter, so short
+ // words could index sb[index - 2] out of range.
+ if (index < 2)
+ return false;
+ // ends with "gem"?
+ if (c == 'm' && sb[index - 2] == 'g' && sb[index - 1] == 'e')
+ return false;
+ return true;
+ }
+
+ private void unDouble(StringBuilder sb)
+ {
+ // Undouble at the very end of the buffer.
+ unDouble(sb, sb.Length);
+ }
+
+ /**
+ * If the prefix ending at endIndex ends in a doubled consonant, drop one
+ * of the pair (e.g. "kk" -> "k").
+ */
+ private void unDouble(StringBuilder sb, int endIndex)
+ {
+ String s = sb.ToString(0, endIndex);
+ if (s.EndsWith("kk") || s.EndsWith("tt") || s.EndsWith("dd") || s.EndsWith("nn") || s.EndsWith("mm") || s.EndsWith("ff"))
+ {
+ // Remove takes (startIndex, length): delete exactly one character.
+ // The old (endIndex - 1, endIndex) call passed a Java-style end
+ // offset and threw ArgumentOutOfRangeException.
+ sb.Remove(endIndex - 1, 1);
+ }
+ }
+
+ private int getRIndex(StringBuilder sb, int start)
+ {
+ // Returns the index just past the first non-vowel that follows a vowel,
+ // scanning from 'start' (the snowball R-region boundary).
+ if (start == 0)
+ start = 1;
+ int i = start;
+ for (; i < sb.Length; i++)
+ {
+ //first non-vowel preceded by a vowel
+ if (!isVowel(sb[i]) && isVowel(sb[i - 1]))
+ {
+ return i + 1;
+ }
+ }
+ // No such position found: the region is empty (index past the end).
+ return i + 1;
+ }
+
+ private void storeYandI(StringBuilder sb)
+ {
+ // Mark consonant-acting 'y' and vowel-surrounded 'i' by upper-casing
+ // them; reStoreYandI lowers them again after stemming.
+ if (sb[0] == 'y')
+ sb[0] = 'Y';
+
+ int last = sb.Length - 1;
+
+ for (int i = 1; i < last; i++)
+ {
+ switch (sb[i])
+ {
+ case 'i':
+ {
+ if (isVowel(sb[i - 1]) &&
+ isVowel(sb[i + 1])
+ )
+ sb[i] = 'I';
+ break;
+ }
+ case 'y':
+ {
+ if (isVowel(sb[i - 1]))
+ sb[i] = 'Y';
+ break;
+ }
+ }
+ }
+ // Final character: 'y' after a vowel is also marked.
+ if (last > 0 && sb[last] == 'y' && isVowel(sb[last - 1]))
+ sb[last] = 'Y';
+ }
+
+ /**
+ * Lowers the 'I'/'Y' markers set by storeYandI.
+ */
+ private void reStoreYandI(StringBuilder sb)
+ {
+ // Replace in place; StringBuilder.Replace avoids the intermediate
+ // string copy and Remove/Insert round-trip of the previous version.
+ sb.Replace("I", "i");
+ sb.Replace("Y", "y");
+ }
+
+ /**
+ * True for the vowels recognised by the algorithm: a, e, i, o, u, y, è.
+ */
+ private bool isVowel(char c)
+ {
+ switch (c)
+ {
+ case 'a':
+ case 'e':
+ case 'i':
+ case 'o':
+ case 'u':
+ case 'y':
+ case 'è':
+ return true;
+ default:
+ return false;
+ }
+ }
+
+ /**
+ * Install the word/stem override dictionary consulted first by Stem().
+ */
+ protected internal void SetStemDictionary(IDictionary<string, string> dict)
+ {
+ _stemDict = dict;
+ }
+ }
}
\ No newline at end of file
Added: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Payloads/AbstractEncoder.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Payloads/AbstractEncoder.cs?rev=1204353&view=auto
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Payloads/AbstractEncoder.cs (added)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Payloads/AbstractEncoder.cs Mon Nov 21 04:44:55 2011
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using Lucene.Net.Index;
+
+namespace Lucene.Net.Analyzers.Payloads
+{
+ /// <summary>
+ /// Base class for payload encoders.
+ /// </summary>
+ public abstract class AbstractEncoder : PayloadEncoder
+ {
+ // Convenience overload: encode the whole buffer.
+ public Payload Encode(char[] buffer)
+ {
+ return Encode(buffer, 0, buffer.Length);
+ }
+
+ // Subclasses define how the character range becomes payload bytes.
+ public abstract Payload Encode(char[] buffer, int offset, int length);
+ }
+}
Added: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Payloads/DelimitedPayloadTokenFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Payloads/DelimitedPayloadTokenFilter.cs?rev=1204353&view=auto
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Payloads/DelimitedPayloadTokenFilter.cs (added)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Payloads/DelimitedPayloadTokenFilter.cs Mon Nov 21 04:44:55 2011
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+
+namespace Lucene.Net.Analyzers.Payloads
+{
+ /// <summary>
+ /// Characters before the delimiter are the "token", those after are the payload.
+ /// <p/>
+ /// For example, if the delimiter is '|', then for the string "foo|bar", foo is the token
+ /// and "bar" is a payload.
+ /// <p/>
+ /// Note, you can also include a {@link org.apache.lucene.analysis.payloads.PayloadEncoder} to convert the
+ /// payload in an appropriate way (from characters to bytes).
+ /// <p/>
+ /// Note make sure your Tokenizer doesn't split on the delimiter, or this won't work
+ /// </summary>
+ /// <seealso cref="PayloadEncoder"/>
+ public sealed class DelimitedPayloadTokenFilter : TokenFilter
+ {
+ public static readonly char DEFAULT_DELIMITER = '|';
+ internal char delimiter = DEFAULT_DELIMITER;
+ internal TermAttribute termAtt;
+ internal PayloadAttribute payAtt;
+ internal PayloadEncoder encoder;
+
+ /// <summary>
+ /// Construct a token stream filtering the given input, using the default
+ /// '|' delimiter and an identity (char-to-byte) payload encoder.
+ /// </summary>
+ internal DelimitedPayloadTokenFilter(TokenStream input)
+ : this(input, DEFAULT_DELIMITER, new IdentityEncoder())
+ {
+
+ }
+
+
+ public DelimitedPayloadTokenFilter(TokenStream input, char delimiter, PayloadEncoder encoder)
+ : base(input)
+ {
+ termAtt = AddAttribute<TermAttribute>();
+ payAtt = AddAttribute<PayloadAttribute>();
+ this.delimiter = delimiter;
+ this.encoder = encoder;
+ }
+
+ // Splits the term at the first delimiter: the text before it stays the
+ // term, the text after it is encoded and attached as the payload.
+ public override bool IncrementToken()
+ {
+ bool result = false;
+ if (input.IncrementToken())
+ {
+ char[] buffer = termAtt.TermBuffer();
+ int length = termAtt.TermLength();
+ //look for the delimiter
+ bool seen = false;
+ for (int i = 0; i < length; i++)
+ {
+ if (buffer[i] == delimiter)
+ {
+ termAtt.SetTermBuffer(buffer, 0, i);
+ payAtt.SetPayload(encoder.Encode(buffer, i + 1, (length - (i + 1))));
+ seen = true;
+ break;//at this point, we know the whole piece, so we can exit. If we don't see the delimiter, then the termAtt is the same
+ }
+ }
+ if (seen == false)
+ {
+ //no delimiter: clear any payload left over from a previous token
+ payAtt.SetPayload(null);
+ }
+ result = true;
+ }
+ return result;
+ }
+ }
+}
Added: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Payloads/FloatEncoder.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Payloads/FloatEncoder.cs?rev=1204353&view=auto
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Payloads/FloatEncoder.cs (added)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Payloads/FloatEncoder.cs Mon Nov 21 04:44:55 2011
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.Collections.Generic;
+using System.Globalization;
+using System.Linq;
+using System.Text;
+using Lucene.Net.Index;
+
+namespace Lucene.Net.Analyzers.Payloads
+{
+ /// <summary>
+ /// Encode a character array Float as a {@link org.apache.lucene.index.Payload}.
+ /// </summary>
+ /// <seealso cref="PayloadHelper.EncodeFloat(float, byte[], int)"/>
+ public class FloatEncoder : AbstractEncoder, PayloadEncoder
+ {
+ /// <summary>
+ /// Parses the character range as a float and encodes it as payload bytes.
+ /// </summary>
+ public override Payload Encode(char[] buffer, int offset, int length)
+ {
+ Payload result = new Payload();
+ // Parse with the invariant culture so payload data is not misread
+ // under locales that use ',' as the decimal separator.
+ float payload = float.Parse(new string(buffer, offset, length), CultureInfo.InvariantCulture); // TODO: improve this so that we don't have to new Strings
+ byte[] bytes = PayloadHelper.EncodeFloat(payload);
+ result.SetData(bytes);
+ return result;
+ }
+ }
+}
Added: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Payloads/IdentityEncoder.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Payloads/IdentityEncoder.cs?rev=1204353&view=auto
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Payloads/IdentityEncoder.cs (added)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Payloads/IdentityEncoder.cs Mon Nov 21 04:44:55 2011
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using Lucene.Net.Index;
+
+namespace Lucene.Net.Analyzers.Payloads
+{
+ /// <summary>
+ /// Does nothing other than convert the char array to a byte array using the specified encoding.
+ /// </summary>
+ public class IdentityEncoder : AbstractEncoder, PayloadEncoder
+ {
+
+ protected internal Encoding encoding = Encoding.UTF8;
+ // NOTE(review): EncodingName (below) is the human-readable display name;
+ // WebName ("utf-8") may be what was intended — confirm against callers.
+ protected internal String encodingName = "UTF-8"; //argh, stupid 1.4
+
+ public IdentityEncoder()
+ {
+ }
+
+ public IdentityEncoder(Encoding encoding)
+ {
+ this.encoding = encoding;
+ encodingName = encoding.EncodingName;
+ }
+
+
+ public override Payload Encode(char[] buffer, int offset, int length)
+ {
+ //what's the most efficient way to get a byte [] from a char[] array
+ //Do we have to go through String?
+ String tmp = new String(buffer, offset, length);
+ Payload result = null;//Can we avoid allocating by knowing where using the new API?
+ try
+ {
+ result = new Payload(encoding.GetBytes(tmp));
+ }
+ catch (EncoderFallbackException e)
+ {
+ //should never hit this, since we get the name from the Charset
+ // NOTE(review): with the default replacement fallback GetBytes does
+ // not throw; this path (returning null) is only reachable with a
+ // custom exception fallback — confirm null is an acceptable payload.
+ }
+
+ return result;
+ }
+ }
+}
\ No newline at end of file
Added: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Payloads/IntegerEncoder.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Payloads/IntegerEncoder.cs?rev=1204353&view=auto
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Payloads/IntegerEncoder.cs (added)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Payloads/IntegerEncoder.cs Mon Nov 21 04:44:55 2011
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using Lucene.Net.Index;
+using Lucene.Net.Util;
+
+namespace Lucene.Net.Analyzers.Payloads
+{
+ /// <summary>
+ /// Encode a character array Integer as a {@link org.apache.lucene.index.Payload}.
+ /// </summary>
+ /// <seealso cref="PayloadHelper.EncodeInt(int, byte[], int)"/>
+ public class IntegerEncoder : AbstractEncoder, PayloadEncoder
+ {
+ /// <summary>
+ /// Parses the character range as an int and encodes it as payload bytes.
+ /// </summary>
+ public override Payload Encode(char[] buffer, int offset, int length)
+ {
+ Payload result = new Payload();
+ int payload = ArrayUtil.ParseInt(buffer, offset, length);//TODO: improve this so that we don't have to new Strings
+ byte[] bytes = PayloadHelper.EncodeInt(payload);
+ result.SetData(bytes);
+ return result;
+ }
+ }
+}
Added: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Payloads/NumericPayloadTokenFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Payloads/NumericPayloadTokenFilter.cs?rev=1204353&view=auto
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Payloads/NumericPayloadTokenFilter.cs (added)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Payloads/NumericPayloadTokenFilter.cs Mon Nov 21 04:44:55 2011
@@ -0,0 +1,46 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Index;
+
+namespace Lucene.Net.Analyzers.Payloads
+{
+ /// <summary>
+ /// Assigns a payload to a token based on the <see cref="Token.Type()"/>
+ /// </summary>
+ public class NumericPayloadTokenFilter : TokenFilter
+ {
+ private String typeMatch;
+ private Payload thePayload;
+
+ private PayloadAttribute payloadAtt;
+ private TypeAttribute typeAtt;
+
+ public NumericPayloadTokenFilter(TokenStream input, float payload, String typeMatch)
+ : base(input)
+ {
+ //Need to encode the payload
+ thePayload = new Payload(PayloadHelper.EncodeFloat(payload));
+ this.typeMatch = typeMatch;
+ payloadAtt = AddAttribute<PayloadAttribute>();
+ typeAtt = AddAttribute<TypeAttribute>();
+ }
+
+ public sealed override bool IncrementToken()
+ {
+ if (input.IncrementToken())
+ {
+ if (typeAtt.Type().Equals(typeMatch))
+ payloadAtt.SetPayload(thePayload);
+ return true;
+ }
+ else
+ {
+ return false;
+ }
+ }
+ }
+}
\ No newline at end of file
Added: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Payloads/PayloadEncoder.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Payloads/PayloadEncoder.cs?rev=1204353&view=auto
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Payloads/PayloadEncoder.cs (added)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Payloads/PayloadEncoder.cs Mon Nov 21 04:44:55 2011
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using Lucene.Net.Index;
+
+namespace Lucene.Net.Analyzers.Payloads
+{
+ /// <summary>
+ /// Mainly for use with the DelimitedPayloadTokenFilter, converts char buffers to Payload
+ /// <p/>
+ /// NOTE: this interface is subject to change
+ /// </summary>
    public interface PayloadEncoder
    {
        /// <summary>
        /// Convert an entire char array to a <see cref="Payload"/>.
        /// </summary>
        /// <param name="buffer">The characters to encode.</param>
        /// <returns>An encoded <see cref="Payload"/></returns>
        Payload Encode(char[] buffer);

        /// <summary>
        /// Convert a slice of a char array to a <see cref="Payload"/>
        /// </summary>
        /// <param name="buffer">The characters to encode.</param>
        /// <param name="offset">Index of the first character of the slice.</param>
        /// <param name="length">Number of characters in the slice.</param>
        /// <returns>An encoded <see cref="Payload"/></returns>
        Payload Encode(char[] buffer, int offset, int length);
    }
+}
Modified: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Payloads/PayloadHelper.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Payloads/PayloadHelper.cs?rev=1204353&r1=1204352&r2=1204353&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Payloads/PayloadHelper.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Payloads/PayloadHelper.cs Mon Nov 21 04:44:55 2011
@@ -15,6 +15,8 @@
* limitations under the License.
*/
+using Lucene.Net.Support;
+
namespace Lucene.Net.Analyzers.Payloads
{
/// <summary>
@@ -29,7 +31,7 @@ namespace Lucene.Net.Analyzers.Payloads
public static byte[] EncodeFloat(float payload, byte[] data, int offset)
{
- return EncodeInt(SupportClass.Single.FloatToIntBits(payload), data, offset);
+ return EncodeInt(Single.FloatToIntBits(payload), data, offset);
}
public static byte[] EncodeInt(int payload)
@@ -66,7 +68,7 @@ namespace Lucene.Net.Analyzers.Payloads
/// <returns>The float that was encoded</returns>
public static float DecodeFloat(byte[] bytes, int offset)
{
- return SupportClass.Single.IntBitsToFloat(DecodeInt(bytes, offset));
+ return Single.IntBitsToFloat(DecodeInt(bytes, offset));
}
public static int DecodeInt(byte[] bytes, int offset)
Added: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Payloads/TokenOffsetPayloadTokenFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Payloads/TokenOffsetPayloadTokenFilter.cs?rev=1204353&view=auto
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Payloads/TokenOffsetPayloadTokenFilter.cs (added)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Payloads/TokenOffsetPayloadTokenFilter.cs Mon Nov 21 04:44:55 2011
@@ -0,0 +1,45 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Index;
+
+namespace Lucene.Net.Analyzers.Payloads
+{
+ /// <summary>
+ /// Adds the <see cref="Token.SetStartOffset(int)"/>
+ /// and <see cref="Token.SetEndOffset(int)"/>
+ /// First 4 bytes are the start
+ /// </summary>
    /// <summary>
    /// Stores each token's start and end offsets in its payload: the first
    /// 4 bytes of the payload hold the start offset and the next 4 bytes
    /// hold the end offset, both written with
    /// <see cref="PayloadHelper.EncodeInt(int, byte[], int)"/>.
    /// </summary>
    public class TokenOffsetPayloadTokenFilter : TokenFilter
    {
        // Source of the start/end offsets copied into each token's payload.
        protected OffsetAttribute offsetAtt;
        // Target attribute that receives the encoded payload.
        protected PayloadAttribute payAtt;

        public TokenOffsetPayloadTokenFilter(TokenStream input)
            : base(input)
        {
            offsetAtt = AddAttribute<OffsetAttribute>();
            payAtt = AddAttribute<PayloadAttribute>();
        }

        public sealed override bool IncrementToken()
        {
            if (input.IncrementToken())
            {
                // 8-byte payload: [startOffset at 0..3][endOffset at 4..7]
                byte[] data = new byte[8];
                PayloadHelper.EncodeInt(offsetAtt.StartOffset(), data, 0);
                PayloadHelper.EncodeInt(offsetAtt.EndOffset(), data, 4);
                Payload payload = new Payload(data);
                payAtt.SetPayload(payload);
                return true;
            }
            else
            {
                return false;
            }
        }
    }
+}