You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@lucenenet.apache.org by Bär,
Arian <ar...@schema.de> on 2009/04/06 14:04:43 UTC
SnowballFilter speed improvement
Hi,
I'm using Lucene.Net along with Snowball stemming to index text from a database. The class Lucene.Net.Analysis.Snowball.SnowballFilter uses the reflection API and the Invoke method to call the Stem methods of the Snowball stemmers. I have written a Snowball filter that instead creates a delegate once and then uses that delegate to stem each word. This approach improves the indexing speed of my indexing program by about 10%. I would be happy if you included this code in Lucene.Net.
With kind Regards,
Arian
Code:
using System;
using Lucene.Net.Analysis;
using SF.Snowball;
using SF.Snowball.Ext;
namespace Index.Search.Analyzers
{
/// <summary>A filter that stems words using a Snowball-generated stemmer.
///
/// Available stemmers are listed in the <c>SF.Snowball.Ext</c> namespace. The name
/// of a stemmer is the part of the class name before "Stemmer", e.g., the stemmer
/// in <c>EnglishStemmer</c> is named "English".
///
/// The stemmer's parameterless <c>Stem</c> method is bound to a delegate once, in
/// the constructor, so stemming a token does not go through
/// <c>MethodInfo.Invoke</c>. If stemming a token throws, the failure is written to
/// the console and the token's original text is passed through unchanged.
/// </summary>
public class FailOverSnowballFilter : TokenFilter
{
    /// <summary>Shape of the stemmer's parameterless, bool-returning <c>Stem</c> method.</summary>
    private delegate bool StemDelegate();

    /// <summary>Class name of the stemmer (language name + "Stemmer"); used for lookup and in log output.</summary>
    private readonly string stemmerName;

    /// <summary>The stemmer instance the delegate is bound to.</summary>
    private readonly SnowballProgram stemmer;

    /// <summary>Delegate bound to <c>stemmer.Stem()</c>.</summary>
    private readonly StemDelegate stem;

    /// <summary>Construct a stemmer for a certain language.</summary>
    /// <param name="in_Renamed">the input tokens to stem</param>
    /// <param name="name">the language name of a stemmer, e.g. "English"</param>
    /// <exception cref="System.SystemException">
    /// The stemmer type could not be instantiated from the "Snowball.Net" assembly,
    /// or its <c>Stem</c> method could not be bound. The original exception is
    /// available as <c>InnerException</c>.
    /// </exception>
    public FailOverSnowballFilter(TokenStream in_Renamed, System.String name)
        : base(in_Renamed)
    {
        stemmerName = name + "Stemmer";
        try
        {
            stemmer = (SnowballProgram)Activator.CreateInstance("Snowball.Net", "SF.Snowball.Ext." + stemmerName).Unwrap();

            // Look up the public parameterless Stem() on the concrete stemmer type.
            System.Reflection.MethodInfo stemMethod = stemmer.GetType().GetMethod("Stem", Type.EmptyTypes);

            // A missing method (null) makes CreateDelegate throw, which is wrapped below.
            stem = (StemDelegate)Delegate.CreateDelegate(typeof(StemDelegate), stemmer, stemMethod);
        }
        catch (System.Exception e)
        {
            // Same exception type and message as before, but keep the cause attached.
            throw new System.SystemException(e.ToString(), e);
        }
    }

    /// <summary>Returns the next input Token, after being stemmed.</summary>
    /// <returns>
    /// A new token carrying the stemmed text together with the source token's
    /// offsets, type and position increment; <c>null</c> when the input is exhausted.
    /// </returns>
    public override Token Next()
    {
        Token token = input.Next();
        if (token == null)
        {
            return null;
        }

        string originalText = token.TermText();
        stemmer.SetCurrent(originalText);

        bool stemFailed = false;
        try
        {
            stem();
        }
        catch (System.Exception e)
        {
            stemFailed = true;
            Console.WriteLine(string.Format( "{0} was not able to stemm token \"{1}\", using token directly.\n {2}", stemmerName, originalText, e.ToString()));
        }

        // After a failed Stem() the stemmer's buffer may be partially rewritten,
        // so fall back to the untouched term text, as the log message promises.
        string termText = stemFailed ? originalText : stemmer.GetCurrent();

        Token newToken = new Token(termText, token.StartOffset(), token.EndOffset(), token.Type());
        newToken.SetPositionIncrement(token.GetPositionIncrement());
        return newToken;
    }
}
}
---------------------------------------------------------------------------
An- und Abmeldung zur SCHEMA Mailingliste unter http://www.schema.de/mail
---------------------------------------------------------------------------
RE: SnowballFilter speed improvement
Posted by George Aroush <ge...@aroush.net>.
Hi Arian,
In order for us to accept code, please use JIRA to submit a patch.
Lucene.Net JIRA is here: https://issues.apache.org/jira/browse/LUCENENET
Regards,
-- George
> -----Original Message-----
> From: Bär, Arian [mailto:arian.baer@schema.de]
> Sent: Monday, April 06, 2009 8:05 AM
> To: lucene-net-dev@incubator.apache.org
> Subject: SnowballFilter speed improvment
>
> Hi,
>
> I'm using Lucene.Net along with snowball stemming to index
> text from a database. The class
> Lucene.Net.Analysis.Snowball.SnowballFilter uses the
> reflection API and the invoke method to call the stem methods
> of snowball. I have written a Snowball filter which creates a
> delegate and uses this delegate to stem the words afterwards.
> This approach improves the indexing speed of my indexing
> program by about 10%. I would be happy if you include this
> code into lucene.net.
>
> With kind Regards,
> Arian
>
> Code:
>
> using System;
> using Lucene.Net.Analysis;
> using SF.Snowball;
> using SF.Snowball.Ext;
>
> namespace Index.Search.Analyzers
> {
>
> /// <summary>A filter that stems words using a
> Snowball-generated stemmer.
> ///
> /// Available stemmers are listed in {@link
> SF.Snowball.Ext}. The name of a
> /// stemmer is the part of the class name before
> "Stemmer", e.g., the stemmer in
> /// {@link EnglishStemmer} is named "English".
> /// </summary>
>
> public class FailOverSnowballFilter : TokenFilter
> {
> private static readonly System.Object[]
> EMPTY_ARGS = new System.Object[0];
> string stemmerName = string.Empty;
>
> private delegate bool BoolVoidDelegate();
> private BoolVoidDelegate tehMeth0d;
>
> private SnowballProgram stemmer;
> private System.Reflection.MethodInfo stemMethod;
>
> /// <summary>Construct a stemmer for a certain language.
> ///
> /// </summary>
> /// <param name="in">the input tokens to stem
> /// </param>
> /// <param name="name">the language name of a stemmer
> /// </param>
> public FailOverSnowballFilter(TokenStream
> in_Renamed, System.String name)
> : base(in_Renamed)
> {
> stemmerName = name + "Stemmer";
> try
> {
> stemmer =
> (SnowballProgram)Activator.CreateInstance("Snowball.Net",
> "SF.Snowball.Ext." + stemmerName).Unwrap();
>
> stemMethod =
> stemmer.GetType().GetMethod("Stem", (new Type[0] == null) ?
> new Type[0] : (Type[])new Type[0]);
> tehMeth0d =
> (BoolVoidDelegate)Delegate.CreateDelegate(typeof(BoolVoidDeleg
> ate), stemmer, stemMethod);
> }
> catch (System.Exception e)
> {
> throw new
> System.SystemException(e.ToString());
> }
> }
>
> /// <summary>Returns the next input Token,
> after being stemmed </summary>
> public override Token Next()
> {
> Token token = input.Next();
> if (token == null)
> return null;
> stemmer.SetCurrent(token.TermText());
> try
> {
> tehMeth0d();
> //stemMethod.Invoke(stemmer,
> (System.Object[])EMPTY_ARGS);
> }
> catch (System.Exception e)
> {
>
> Console.WriteLine(string.Format( "{0} was not able to stemm
> token \"{1}\", using token directly.\n {2}", stemmerName,
> token.TermText(), e.ToString()));
> }
>
> Token newToken = new
> Token(stemmer.GetCurrent(), token.StartOffset(),
> token.EndOffset(), token.Type());
>
> newToken.SetPositionIncrement(token.GetPositionIncrement());
> return newToken;
> }
> }
> }
>
>
> --------------------------------------------------------------
> -------------
>
> An- und Abmeldung zur SCHEMA Mailingliste unter
> http://www.schema.de/mail
>
> --------------------------------------------------------------
> -------------
>
>