You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@lucenenet.apache.org by Bär, Arian <ar...@schema.de> on 2009/04/06 14:04:43 UTC

SnowballFilter speed improvement

Hi,

I'm using Lucene.Net along with Snowball stemming to index text from a database. The class Lucene.Net.Analysis.Snowball.SnowballFilter uses the reflection API and the Invoke method to call the Stem method of the Snowball stemmer for every token. I have written a Snowball filter that instead creates a delegate once and then uses this delegate to stem the words. This approach improves the speed of my indexing program by about 10%. I would be happy if you included this code in Lucene.Net.

With kind Regards,
Arian

Code:

using System;
using Lucene.Net.Analysis;
using SF.Snowball;
using SF.Snowball.Ext;

namespace Index.Search.Analyzers
{

	/// <summary>A filter that stems words using a Snowball-generated stemmer.
	/// 
	/// Available stemmers are listed in <c>SF.Snowball.Ext</c>.  The name of a
	/// stemmer is the part of the class name before "Stemmer", e.g., the stemmer in
	/// <c>EnglishStemmer</c> is named "English".
	/// 
	/// The stemmer's <c>Stem</c> method is looked up via reflection once, in the
	/// constructor, and bound to a delegate; each token is then stemmed through
	/// that delegate instead of a reflective <c>Invoke</c> call. If stemming a
	/// token throws, the failure is written to the console and the token's
	/// original text is passed through unchanged.
	/// </summary>
	public class FailOverSnowballFilter : TokenFilter
	{
		/// <summary>Shape of the parameterless, bool-returning <c>Stem</c> method.</summary>
		private delegate bool StemDelegate();

		/// <summary>Stemmer class name, i.e. the language name plus "Stemmer"; used for lookup and in log output.</summary>
		private readonly string stemmerName = string.Empty;

		/// <summary>The stemmer instance that holds the word currently being processed.</summary>
		private readonly SnowballProgram stemmer;

		/// <summary>Delegate bound to <c>stemmer.Stem()</c>, created once in the constructor.</summary>
		private readonly StemDelegate stem;

		/// <summary>Construct a stemmer for a certain language.
		/// 
		/// </summary>
		/// <param name="in_Renamed">the input tokens to stem
		/// </param>
		/// <param name="name">the language name of a stemmer
		/// </param>
		/// <exception cref="System.SystemException">
		/// The stemmer type could not be instantiated or its <c>Stem</c> method
		/// could not be bound; the original failure is available as
		/// <c>InnerException</c>.
		/// </exception>
		public FailOverSnowballFilter(TokenStream in_Renamed, System.String name)
			: base(in_Renamed)
		{
			stemmerName = name + "Stemmer";
			try
			{
				stemmer = (SnowballProgram)Activator.CreateInstance("Snowball.Net", "SF.Snowball.Ext." + stemmerName).Unwrap();

				// Resolve the parameterless Stem() on the concrete stemmer type and bind
				// it to this instance, so Next() pays no per-token reflection cost.
				System.Reflection.MethodInfo stemMethod = stemmer.GetType().GetMethod("Stem", Type.EmptyTypes);
				stem = (StemDelegate)Delegate.CreateDelegate(typeof(StemDelegate), stemmer, stemMethod);
			}
			catch (System.Exception e)
			{
				// Same exception type and message as before, but keep the original
				// exception attached so callers can inspect the real cause.
				throw new System.SystemException(e.ToString(), e);
			}
		}

		/// <summary>Returns the next input Token, after being stemmed </summary>
		/// <returns>
		/// A new token carrying the stemmed text together with the offsets, type and
		/// position increment of the input token, or <c>null</c> when the input
		/// stream returns <c>null</c>.
		/// </returns>
		public override Token Next()
		{
			Token token = input.Next();
			if (token == null)
			{
				return null;
			}

			string termText = token.TermText();
			// Fail-over default: if stemming throws, the unmodified term is emitted.
			string stemmedText = termText;

			stemmer.SetCurrent(termText);
			try
			{
				stem();
				// Only read the stemmer's buffer after Stem() completed; after an
				// exception the buffer may hold a partially transformed word.
				stemmedText = stemmer.GetCurrent();
			}
			catch (System.Exception e)
			{
				Console.WriteLine(string.Format( "{0} was not able to stemm token \"{1}\", using token directly.\n {2}", stemmerName, termText, e.ToString()));
			}

			Token newToken = new Token(stemmedText, token.StartOffset(), token.EndOffset(), token.Type());
			newToken.SetPositionIncrement(token.GetPositionIncrement());
			return newToken;
		}
	}
}


---------------------------------------------------------------------------

 An- und Abmeldung zur SCHEMA Mailingliste unter http://www.schema.de/mail

---------------------------------------------------------------------------



RE: SnowballFilter speed improvement

Posted by George Aroush <ge...@aroush.net>.
Hi Arian,

In order for us to accept code, please use JIRA to submit a patch.
Lucene.Net JIRA is here: https://issues.apache.org/jira/browse/LUCENENET

Regards,

-- George 

> -----Original Message-----
> From: Bär, Arian [mailto:arian.baer@schema.de] 
> Sent: Monday, April 06, 2009 8:05 AM
> To: lucene-net-dev@incubator.apache.org
> Subject: SnowballFilter speed improvment
> 
> Hi,
> 
> I'm using Lucene.Net along with snowball stemming to index 
> text from a database. The class 
> Lucene.Net.Analysis.Snowball.SnowballFilter uses the 
> reflection API and the invoke method to call the stem methods 
> of snowball. I have written a Snowball filter which creates a 
> delegate and uses this delegate to stem the words afterwards. 
> This approach improves the indexing speed of my indexing 
> program by about 10%. I would be happy if you include this 
> code into lucene.net.
> 
> With kind Regards,
> Arian
> 
> Code:
> 
> using System;
> using Lucene.Net.Analysis;
> using SF.Snowball;
> using SF.Snowball.Ext;
> 
> namespace Index.Search.Analyzers
> {
> 
> 	/// <summary>A filter that stems words using a 
> Snowball-generated stemmer.
> 	/// 
> 	/// Available stemmers are listed in {@link 
> SF.Snowball.Ext}.  The name of a
> 	/// stemmer is the part of the class name before 
> "Stemmer", e.g., the stemmer in
> 	/// {@link EnglishStemmer} is named "English".
> 	/// </summary>
> 
> 	public class FailOverSnowballFilter : TokenFilter
> 	{
> 		private static readonly System.Object[] 
> EMPTY_ARGS = new System.Object[0];
> 		string stemmerName = string.Empty;
> 
> 		private delegate bool BoolVoidDelegate();
> 		private BoolVoidDelegate tehMeth0d;
> 
> 		private SnowballProgram stemmer;
> 		private System.Reflection.MethodInfo stemMethod;
> 
> 		/// <summary>Construct a stemmer for a certain language.
> 		/// 
> 		/// </summary>
> 		/// <param name="in">the input tokens to stem
> 		/// </param>
> 		/// <param name="name">the language name of a stemmer
> 		/// </param>
> 		public FailOverSnowballFilter(TokenStream 
> in_Renamed, System.String name)
> 			: base(in_Renamed)
> 		{
> 			stemmerName = name + "Stemmer";
> 			try
> 			{
> 				stemmer = 
> (SnowballProgram)Activator.CreateInstance("Snowball.Net", 
> "SF.Snowball.Ext." + stemmerName).Unwrap();
> 
> 				stemMethod = 
> stemmer.GetType().GetMethod("Stem", (new Type[0] == null) ? 
> new Type[0] : (Type[])new Type[0]);
> 				tehMeth0d = 
> (BoolVoidDelegate)Delegate.CreateDelegate(typeof(BoolVoidDeleg
> ate), stemmer, stemMethod);
> 			}
> 			catch (System.Exception e)
> 			{
> 				throw new 
> System.SystemException(e.ToString());
> 			}
> 		}
> 
> 		/// <summary>Returns the next input Token, 
> after being stemmed </summary>
> 		public override Token Next()
> 		{
> 			Token token = input.Next();
> 			if (token == null)
> 				return null;
> 			stemmer.SetCurrent(token.TermText());
> 			try
> 			{
> 				tehMeth0d();
> 				//stemMethod.Invoke(stemmer, 
> (System.Object[])EMPTY_ARGS);
> 			}
> 			catch (System.Exception e)
> 			{
> 				
> Console.WriteLine(string.Format( "{0} was not able to stemm 
> token \"{1}\", using token directly.\n {2}", stemmerName, 
> token.TermText(), e.ToString()));				
> 			}
> 
> 			Token newToken = new 
> Token(stemmer.GetCurrent(), token.StartOffset(), 
> token.EndOffset(), token.Type());
> 			
> newToken.SetPositionIncrement(token.GetPositionIncrement());
> 			return newToken;
> 		}
> 	}
> }
> 
> 
> --------------------------------------------------------------
> -------------
> 
>  An- und Abmeldung zur SCHEMA Mailingliste unter 
> http://www.schema.de/mail
> 
> --------------------------------------------------------------
> -------------
> 
>