You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by di...@apache.org on 2011/02/10 22:17:45 UTC
svn commit: r1069573 [1/3] - in /incubator/lucene.net:
tags/Lucene.Net_2_9_2/contrib/Analyzers/
tags/Lucene.Net_2_9_2/contrib/Analyzers/BR/
tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/
tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net...
Author: digy
Date: Thu Feb 10 21:17:43 2011
New Revision: 1069573
URL: http://svn.apache.org/viewvc?rev=1069573&view=rev
Log:
Rearrangement of contrib/Analyzers + Arabic Analyzer for 2.9.2 tag and trunk
Added:
incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/
incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/AR/
incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicAnalyzer.cs
incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicLetterTokenizer.cs
incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicNormalizationFilter.cs
incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicNormalizer.cs
incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicStemFilter.cs
incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicStemmer.cs
incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicStopWords.txt (with props)
incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/BR/
incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/BR/BrazilianAnalyzer.cs
incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/BR/BrazilianStemFilter.cs
incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/BR/BrazilianStemmer.cs
incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/Lucene.Net.Analyzers.csproj
incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/Lucene.Net.Analyzers.csproj.user
incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/Properties/
incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/Properties/AssemblyInfo.cs
incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Test/
incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Test/AR/
incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Test/AR/TestArabicAnalyzer.cs
incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Test/AR/TestArabicNormalizationFilter.cs
incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Test/AR/TestArabicStemFilter.cs
incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Test/Properties/
incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Test/Properties/AssemblyInfo.cs
incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Test/Test.csproj
incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/
incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/AR/
incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicAnalyzer.cs
incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicLetterTokenizer.cs
incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicNormalizationFilter.cs
incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicNormalizer.cs
incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicStemFilter.cs
incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicStemmer.cs
incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicStopWords.txt (with props)
incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/BR/
incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/BR/BrazilianAnalyzer.cs
incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/BR/BrazilianStemFilter.cs
incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/BR/BrazilianStemmer.cs
incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Lucene.Net.Analyzers.csproj
incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Lucene.Net.Analyzers.csproj.user
incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Properties/
incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Properties/AssemblyInfo.cs
incubator/lucene.net/trunk/C#/contrib/Analyzers/Test/
incubator/lucene.net/trunk/C#/contrib/Analyzers/Test/AR/
incubator/lucene.net/trunk/C#/contrib/Analyzers/Test/AR/TestArabicAnalyzer.cs
incubator/lucene.net/trunk/C#/contrib/Analyzers/Test/AR/TestArabicNormalizationFilter.cs
incubator/lucene.net/trunk/C#/contrib/Analyzers/Test/AR/TestArabicStemFilter.cs
incubator/lucene.net/trunk/C#/contrib/Analyzers/Test/Properties/
incubator/lucene.net/trunk/C#/contrib/Analyzers/Test/Properties/AssemblyInfo.cs
incubator/lucene.net/trunk/C#/contrib/Analyzers/Test/Test.csproj
Removed:
incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/BR/
incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers.csproj
incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Properties/
incubator/lucene.net/trunk/C#/contrib/Analyzers/BR/
incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers.csproj
incubator/lucene.net/trunk/C#/contrib/Analyzers/Properties/
Modified:
incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers.sln
incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers.sln
Modified: incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers.sln
URL: http://svn.apache.org/viewvc/incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers.sln?rev=1069573&r1=1069572&r2=1069573&view=diff
==============================================================================
--- incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers.sln (original)
+++ incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers.sln Thu Feb 10 21:17:43 2011
@@ -1,7 +1,9 @@

-Microsoft Visual Studio Solution File, Format Version 9.00
-# Visual C# Express 2005
-Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net.Analyzers", "Lucene.Net.Analyzers.csproj", "{A4AF790F-900A-48D2-85A7-B948E5214C16}"
+Microsoft Visual Studio Solution File, Format Version 10.00
+# Visual C# Express 2008
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net.Analyzers", "Lucene.Net.Analyzers\Lucene.Net.Analyzers.csproj", "{4286E961-9143-4821-B46D-3D39D3736386}"
+EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Test", "Test\Test.csproj", "{67D27628-F1D5-4499-9818-B669731925C8}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
@@ -9,10 +11,14 @@ Global
Release|Any CPU = Release|Any CPU
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
- {A4AF790F-900A-48D2-85A7-B948E5214C16}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
- {A4AF790F-900A-48D2-85A7-B948E5214C16}.Debug|Any CPU.Build.0 = Debug|Any CPU
- {A4AF790F-900A-48D2-85A7-B948E5214C16}.Release|Any CPU.ActiveCfg = Release|Any CPU
- {A4AF790F-900A-48D2-85A7-B948E5214C16}.Release|Any CPU.Build.0 = Release|Any CPU
+ {4286E961-9143-4821-B46D-3D39D3736386}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ {4286E961-9143-4821-B46D-3D39D3736386}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ {4286E961-9143-4821-B46D-3D39D3736386}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ {4286E961-9143-4821-B46D-3D39D3736386}.Release|Any CPU.Build.0 = Release|Any CPU
+ {67D27628-F1D5-4499-9818-B669731925C8}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ {67D27628-F1D5-4499-9818-B669731925C8}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ {67D27628-F1D5-4499-9818-B669731925C8}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ {67D27628-F1D5-4499-9818-B669731925C8}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
Added: incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicAnalyzer.cs?rev=1069573&view=auto
==============================================================================
--- incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicAnalyzer.cs (added)
+++ incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicAnalyzer.cs Thu Feb 10 21:17:43 2011
@@ -0,0 +1,202 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System.IO;
+using System.Collections;
+
+using Lucene.Net.Analysis;
+using Lucene.Net.Util;
+
+namespace Lucene.Net.Analysis.AR
+{
+ /**
+ * {@link Analyzer} for Arabic.
+ * <p>
+ * This analyzer implements light-stemming as specified by:
+ * <i>
+ * Light Stemming for Arabic Information Retrieval
+ * </i>
+ * http://www.mtholyoke.edu/~lballest/Pubs/arab_stem05.pdf
+ * <p>
+ * The analysis package contains three primary components:
+ * <ul>
+ * <li>{@link ArabicNormalizationFilter}: Arabic orthographic normalization.
+ * <li>{@link ArabicStemFilter}: Arabic light stemming
+ * <li>Arabic stop words file: a set of default Arabic stop words.
+ * </ul>
+ *
+ */
+ public class ArabicAnalyzer : Analyzer
+ {
+
+     /**
+      * Name of the embedded resource file containing the default Arabic stopwords.
+      *
+      * Default stopword list is from http://members.unine.ch/jacques.savoy/clef/index.html
+      * The stopword list is BSD-Licensed.
+      */
+     public static string DEFAULT_STOPWORD_FILE = "ArabicStopWords.txt";
+
+     /**
+      * Contains the stopwords used with the StopFilter.
+      */
+     private Hashtable stoptable = new Hashtable();
+
+     /**
+      * The comment character in the stopwords file. All lines prefixed with this will be ignored
+      */
+     public static string STOPWORDS_COMMENT = "#";
+
+     /**
+      * Version passed to StopFilter.GetEnablePositionIncrementsVersionDefault when the
+      * StopFilter is created.
+      */
+     private Version matchVersion;
+
+     /**
+      * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
+      *
+      * @deprecated Use {@link #ArabicAnalyzer(Version)} instead
+      */
+     public ArabicAnalyzer() : this(Version.LUCENE_24)
+     {
+     }
+
+     /**
+      * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
+      *
+      * @param matchVersion version used to pick the StopFilter position-increment default
+      * @throws FileNotFoundException if the embedded stopword resource cannot be found
+      */
+     public ArabicAnalyzer(Version matchVersion)
+     {
+         this.matchVersion = matchVersion;
+         stoptable = LoadDefaultStopSet();
+     }
+
+     /**
+      * Builds an analyzer with the given stop words.
+      *
+      * @deprecated Use {@link #ArabicAnalyzer(Version, String[])} instead
+      */
+     public ArabicAnalyzer(string[] stopwords) : this(Version.LUCENE_24, stopwords)
+     {
+     }
+
+     /**
+      * Builds an analyzer with the given stop words.
+      *
+      * @param matchVersion version used to pick the StopFilter position-increment default
+      * @param stopwords words to remove from the token stream
+      */
+     public ArabicAnalyzer(Version matchVersion, string[] stopwords)
+     {
+         stoptable = StopFilter.MakeStopSet(stopwords);
+         this.matchVersion = matchVersion;
+     }
+
+     /**
+      * Builds an analyzer with the given stop words.
+      *
+      * @deprecated Use {@link #ArabicAnalyzer(Version, Hashtable)} instead
+      */
+     public ArabicAnalyzer(Hashtable stopwords) : this(Version.LUCENE_24, stopwords)
+     {
+     }
+
+     /**
+      * Builds an analyzer with the given stop words.
+      * The table is copied, so later changes to the caller's table do not affect this analyzer.
+      *
+      * @param matchVersion version used to pick the StopFilter position-increment default
+      * @param stopwords words to remove from the token stream
+      */
+     public ArabicAnalyzer(Version matchVersion, Hashtable stopwords)
+     {
+         stoptable = new Hashtable(stopwords);
+         this.matchVersion = matchVersion;
+     }
+
+     //DIGY
+     ///**
+     // * Builds an analyzer with the given stop words. Lines can be commented out using {@link #STOPWORDS_COMMENT}
+     // *
+     // * @deprecated Use {@link #ArabicAnalyzer(Version, File)} instead
+     // */
+     //public ArabicAnalyzer(File stopwords)
+     //{
+     //    this(Version.LUCENE_24, stopwords);
+     //}
+
+     ///**
+     // * Builds an analyzer with the given stop words. Lines can be commented out using {@link #STOPWORDS_COMMENT}
+     // */
+     //public ArabicAnalyzer(Version matchVersion, File stopwords)
+     //{
+     //    stoptable = WordlistLoader.getWordSet(stopwords, STOPWORDS_COMMENT);
+     //    this.matchVersion = matchVersion;
+     //}
+
+     /**
+      * Reads the default stop words from the resource embedded in this assembly.
+      * <p>
+      * Lines starting with {@link #STOPWORDS_COMMENT} and blank lines are skipped, entries are
+      * trimmed, and a word that occurs more than once is stored only once.
+      *
+      * @return table mapping every stop word to itself
+      * @throws FileNotFoundException if the embedded resource is missing
+      */
+     private static Hashtable LoadDefaultStopSet()
+     {
+         string resourceName = "Lucene.Net.Analyzers.AR." + DEFAULT_STOPWORD_FILE;
+
+         // Resolve against the assembly that declares ArabicAnalyzer: GetType() on a subclass
+         // living in another assembly would look in the wrong place and yield a null stream.
+         Stream stream = typeof(ArabicAnalyzer).Assembly.GetManifestResourceStream(resourceName);
+         if (stream == null)
+         {
+             throw new FileNotFoundException("Embedded stopword resource not found: " + resourceName, resourceName);
+         }
+
+         Hashtable words = new Hashtable();
+         using (StreamReader reader = new StreamReader(stream))
+         {
+             string line;
+             while ((line = reader.ReadLine()) != null)
+             {
+                 if (!string.IsNullOrEmpty(STOPWORDS_COMMENT)
+                     && line.StartsWith(STOPWORDS_COMMENT, System.StringComparison.Ordinal))
+                 {
+                     continue;
+                 }
+
+                 string word = line.Trim();
+                 if (word.Length == 0)
+                 {
+                     continue;
+                 }
+
+                 // The indexer overwrites instead of throwing on a duplicate key, unlike Add().
+                 words[word] = word;
+             }
+         }
+         return words;
+     }
+
+     /**
+      * Wraps the given tokenizer with the filters shared by {@link #TokenStream} and
+      * {@link #ReusableTokenStream}: {@link LowerCaseFilter}, {@link StopFilter},
+      * {@link ArabicNormalizationFilter} and {@link ArabicStemFilter}, in that order.
+      */
+     private TokenStream BuildFilterChain(Tokenizer source)
+     {
+         TokenStream result = new LowerCaseFilter(source);
+         result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion), result, stoptable);
+         result = new ArabicNormalizationFilter(result);
+         result = new ArabicStemFilter(result);
+         return result;
+     }
+
+     /**
+      * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
+      *
+      * @return A {@link TokenStream} built from an {@link ArabicLetterTokenizer} filtered with
+      * {@link LowerCaseFilter}, {@link StopFilter}, {@link ArabicNormalizationFilter}
+      * and {@link ArabicStemFilter}.
+      */
+     public override TokenStream TokenStream(string fieldName, TextReader reader)
+     {
+         return BuildFilterChain(new ArabicLetterTokenizer(reader));
+     }
+
+     /**
+      * Holder for the tokenizer and the end of the filter chain, stored through
+      * SetPreviousTokenStream so that ReusableTokenStream can reuse them.
+      */
+     private class SavedStreams
+     {
+         internal Tokenizer Source;
+         internal TokenStream Result;
+     };
+
+     /**
+      * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
+      * in the provided {@link Reader}.
+      *
+      * @return A {@link TokenStream} built from an {@link ArabicLetterTokenizer} filtered with
+      * {@link LowerCaseFilter}, {@link StopFilter}, {@link ArabicNormalizationFilter}
+      * and {@link ArabicStemFilter}.
+      */
+     public override TokenStream ReusableTokenStream(string fieldName, TextReader reader)
+     {
+         SavedStreams streams = (SavedStreams)GetPreviousTokenStream();
+         if (streams == null)
+         {
+             // First use: build the chain once and remember it.
+             streams = new SavedStreams();
+             streams.Source = new ArabicLetterTokenizer(reader);
+             streams.Result = BuildFilterChain(streams.Source);
+             SetPreviousTokenStream(streams);
+         }
+         else
+         {
+             // Subsequent use: point the saved tokenizer at the new reader.
+             streams.Source.Reset(reader);
+         }
+         return streams.Result;
+     }
+ }
+}
\ No newline at end of file
Added: incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicLetterTokenizer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicLetterTokenizer.cs?rev=1069573&view=auto
==============================================================================
--- incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicLetterTokenizer.cs (added)
+++ incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicLetterTokenizer.cs Thu Feb 10 21:17:43 2011
@@ -0,0 +1,63 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System.IO;
+using System.Collections;
+
+using Lucene.Net.Analysis;
+using Lucene.Net.Util;
+
+namespace Lucene.Net.Analysis.AR
+{
+
+ /**
+ * Tokenizer that breaks text into runs of letters and diacritics.
+ * <p>
+ * The problem with the standard Letter tokenizer is that it fails on diacritics.
+ * Handling similar to this is necessary for Indic Scripts, Hebrew, Thaana, etc.
+ * </p>
+ *
+ */
+ public class ArabicLetterTokenizer : LetterTokenizer
+ {
+     /** Creates a tokenizer that reads from the given reader. */
+     public ArabicLetterTokenizer(TextReader @in)
+         : base(@in)
+     {
+     }
+
+     /** Creates a tokenizer that reads from the given reader, passing the given AttributeSource to the base class. */
+     public ArabicLetterTokenizer(AttributeSource source, TextReader @in)
+         : base(source, @in)
+     {
+     }
+
+     /** Creates a tokenizer that reads from the given reader, passing the given AttributeFactory to the base class. */
+     public ArabicLetterTokenizer(AttributeFactory factory, TextReader @in)
+         : base(factory, @in)
+     {
+     }
+
+     /**
+      * Accepts every character the base LetterTokenizer accepts, plus any character
+      * whose Unicode category is NonSpacingMark.
+      * @see org.apache.lucene.analysis.LetterTokenizer#isTokenChar(char)
+      */
+     protected override bool IsTokenChar(char c)
+     {
+         if (base.IsTokenChar(c))
+         {
+             return true;
+         }
+
+         System.Globalization.UnicodeCategory category = char.GetUnicodeCategory(c);
+         return category == System.Globalization.UnicodeCategory.NonSpacingMark;
+     }
+ }
+}
\ No newline at end of file
Added: incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicNormalizationFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicNormalizationFilter.cs?rev=1069573&view=auto
==============================================================================
--- incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicNormalizationFilter.cs (added)
+++ incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicNormalizationFilter.cs Thu Feb 10 21:17:43 2011
@@ -0,0 +1,61 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System.IO;
+using System.Collections;
+
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
+
+
+namespace Lucene.Net.Analysis.AR
+{
+
+ /**
+ * A {@link TokenFilter} that applies {@link ArabicNormalizer} to normalize the orthography.
+ *
+ */
+
+ public class ArabicNormalizationFilter : TokenFilter
+ {
+     /** Normalizer applied to the term buffer of every token. */
+     protected ArabicNormalizer normalizer = null;
+
+     /** Term attribute of this stream; its buffer is rewritten in place. */
+     private TermAttribute termAtt;
+
+     /**
+      * Wraps the given stream.
+      *
+      * @param input stream whose tokens are normalized
+      */
+     public ArabicNormalizationFilter(TokenStream input)
+         : base(input)
+     {
+         normalizer = new ArabicNormalizer();
+         termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute));
+     }
+
+     /**
+      * Advances the wrapped stream and normalizes the current term in place.
+      *
+      * @return false when the wrapped stream has no more tokens, true otherwise
+      */
+     public override bool IncrementToken()
+     {
+         if (!input.IncrementToken())
+         {
+             return false;
+         }
+
+         char[] buffer = termAtt.TermBuffer();
+         int length = termAtt.TermLength();
+         termAtt.SetTermLength(normalizer.Normalize(buffer, length));
+         return true;
+     }
+ }
+}
\ No newline at end of file
Added: incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicNormalizer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicNormalizer.cs?rev=1069573&view=auto
==============================================================================
--- incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicNormalizer.cs (added)
+++ incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicNormalizer.cs Thu Feb 10 21:17:43 2011
@@ -0,0 +1,117 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.IO;
+using System.Collections;
+
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
+
+
+namespace Lucene.Net.Analysis.AR
+{
+ /**
+ * Normalizer for Arabic.
+ * <p>
+ * Normalization is done in-place for efficiency, operating on a termbuffer.
+ * <p>
+ * Normalization is defined as:
+ * <ul>
+ * <li> Normalization of hamza with alef seat to a bare alef.
+ * <li> Normalization of teh marbuta to heh
+ * <li> Normalization of dotless yeh (alef maksura) to yeh.
+ * <li> Removal of Arabic diacritics (the harakat)
+ * <li> Removal of tatweel (stretching character).
+ * </ul>
+ *
+ */
+ public class ArabicNormalizer
+ {
+     public static char ALEF = '\u0627';
+     public static char ALEF_MADDA = '\u0622';
+     public static char ALEF_HAMZA_ABOVE = '\u0623';
+     public static char ALEF_HAMZA_BELOW = '\u0625';
+
+     public static char YEH = '\u064A';
+     public static char DOTLESS_YEH = '\u0649';
+
+     public static char TEH_MARBUTA = '\u0629';
+     public static char HEH = '\u0647';
+
+     public static char TATWEEL = '\u0640';
+
+     public static char FATHATAN = '\u064B';
+     public static char DAMMATAN = '\u064C';
+     public static char KASRATAN = '\u064D';
+     public static char FATHA = '\u064E';
+     public static char DAMMA = '\u064F';
+     public static char KASRA = '\u0650';
+     public static char SHADDA = '\u0651';
+     public static char SUKUN = '\u0652';
+
+     /**
+      * Normalize an input buffer of Arabic text in place.
+      * <p>
+      * The alef variants become ALEF, DOTLESS_YEH becomes YEH, TEH_MARBUTA becomes HEH,
+      * and TATWEEL and the diacritics are removed.
+      *
+      * @param s input buffer
+      * @param len length of input buffer
+      * @return length of input buffer after normalization
+      */
+     public int Normalize(char[] s, int len)
+     {
+         int index = 0;
+         while (index < len)
+         {
+             char current = s[index];
+
+             if (current == ALEF_MADDA || current == ALEF_HAMZA_ABOVE || current == ALEF_HAMZA_BELOW)
+             {
+                 current = ALEF;
+             }
+             if (current == DOTLESS_YEH)
+             {
+                 current = YEH;
+             }
+             if (current == TEH_MARBUTA)
+             {
+                 current = HEH;
+             }
+             s[index] = current;
+
+             if (IsRemovable(current))
+             {
+                 // Stay on the same index: the next character has just moved into it.
+                 len = Delete(s, index, len);
+                 continue;
+             }
+
+             index++;
+         }
+
+         return len;
+     }
+
+     /**
+      * Returns true for TATWEEL and for each of the diacritics that Normalize removes.
+      */
+     private static bool IsRemovable(char c)
+     {
+         return c == TATWEEL
+             || c == KASRATAN || c == DAMMATAN || c == FATHATAN
+             || c == FATHA || c == DAMMA || c == KASRA
+             || c == SHADDA || c == SUKUN;
+     }
+
+     /**
+      * Delete a character in-place by shifting the characters after it one position left.
+      *
+      * @param s Input Buffer
+      * @param pos Position of character to delete
+      * @param len length of input buffer
+      * @return length of input buffer after deletion
+      */
+     protected int Delete(char[] s, int pos, int len)
+     {
+         int shrunk = len - 1;
+         if (pos < len)
+         {
+             Array.Copy(s, pos + 1, s, pos, shrunk - pos);
+         }
+         return shrunk;
+     }
+ }
+}
\ No newline at end of file
Added: incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicStemFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicStemFilter.cs?rev=1069573&view=auto
==============================================================================
--- incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicStemFilter.cs (added)
+++ incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicStemFilter.cs Thu Feb 10 21:17:43 2011
@@ -0,0 +1,62 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.IO;
+using System.Collections;
+
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
+
+
+namespace Lucene.Net.Analysis.AR
+{
+
+
+ /**
+ * A {@link TokenFilter} that applies {@link ArabicStemmer} to stem Arabic words.
+ *
+ */
+
+ public class ArabicStemFilter : TokenFilter
+ {
+     /** Stemmer applied to the term buffer of every token. */
+     protected ArabicStemmer stemmer = null;
+
+     /** Term attribute of this stream; its buffer is rewritten in place. */
+     private TermAttribute termAtt;
+
+     /**
+      * Wraps the given stream.
+      *
+      * @param input stream whose tokens are stemmed
+      */
+     public ArabicStemFilter(TokenStream input)
+         : base(input)
+     {
+         stemmer = new ArabicStemmer();
+         termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute));
+     }
+
+     /**
+      * Advances the wrapped stream and stems the current term in place.
+      *
+      * @return false when the wrapped stream has no more tokens, true otherwise
+      */
+     public override bool IncrementToken()
+     {
+         if (!input.IncrementToken())
+         {
+             return false;
+         }
+
+         char[] buffer = termAtt.TermBuffer();
+         int length = termAtt.TermLength();
+         termAtt.SetTermLength(stemmer.Stem(buffer, length));
+         return true;
+     }
+ }
+}
\ No newline at end of file
Added: incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicStemmer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicStemmer.cs?rev=1069573&view=auto
==============================================================================
--- incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicStemmer.cs (added)
+++ incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicStemmer.cs Thu Feb 10 21:17:43 2011
@@ -0,0 +1,208 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.IO;
+using System.Collections;
+
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
+
+
+namespace Lucene.Net.Analysis.AR
+{
+
+
+ /**
+ * Stemmer for Arabic.
+ * <p>
+ * Stemming is done in-place for efficiency, operating on a termbuffer.
+ * <p>
+ * Stemming is defined as:
+ * <ul>
+ * <li> Removal of attached definite article, conjunction, and prepositions.
+ * <li> Stemming of common suffixes.
+ * </ul>
+ *
+ */
+ public class ArabicStemmer
+ {
+     public static char ALEF = '\u0627';
+     public static char BEH = '\u0628';
+     public static char TEH_MARBUTA = '\u0629';
+     public static char TEH = '\u062A';
+     public static char FEH = '\u0641';
+     public static char KAF = '\u0643';
+     public static char LAM = '\u0644';
+     public static char NOON = '\u0646';
+     public static char HEH = '\u0647';
+     public static char WAW = '\u0648';
+     public static char YEH = '\u064A';
+
+     /** Prefixes tried in order by StemPrefix; only the first match is removed. */
+     public static char[][] prefixes = {
+         new char[] { ALEF, LAM },
+         new char[] { WAW, ALEF, LAM },
+         new char[] { BEH, ALEF, LAM },
+         new char[] { KAF, ALEF, LAM },
+         new char[] { FEH, ALEF, LAM },
+         new char[] { LAM, LAM },
+         new char[] { WAW },
+     };
+
+     /** Suffixes tried in order by StemSuffix; every match is removed. */
+     public static char[][] suffixes = {
+         new char[] { HEH, ALEF },
+         new char[] { ALEF, NOON },
+         new char[] { ALEF, TEH },
+         new char[] { WAW, NOON },
+         new char[] { YEH, NOON },
+         new char[] { YEH, HEH },
+         new char[] { YEH, TEH_MARBUTA },
+         new char[] { HEH },
+         new char[] { TEH_MARBUTA },
+         new char[] { YEH },
+     };
+
+     /**
+      * Stem an input buffer of Arabic text: first the prefix, then the suffixes.
+      *
+      * @param s input buffer
+      * @param len length of input buffer
+      * @return length of input buffer after stemming
+      */
+     public int Stem(char[] s, int len)
+     {
+         return StemSuffix(s, StemPrefix(s, len));
+     }
+
+     /**
+      * Stem a prefix off an Arabic word. At most one prefix is removed.
+      * @param s input buffer
+      * @param len length of input buffer
+      * @return new length of input buffer after stemming.
+      */
+     public int StemPrefix(char[] s, int len)
+     {
+         foreach (char[] candidate in prefixes)
+         {
+             if (StartsWith(s, len, candidate))
+             {
+                 return DeleteN(s, 0, len, candidate.Length);
+             }
+         }
+         return len;
+     }
+
+     /**
+      * Stem suffix(es) off an Arabic word. Every suffix in the list is tried once, in order,
+      * against the word as shortened so far.
+      * @param s input buffer
+      * @param len length of input buffer
+      * @return new length of input buffer after stemming
+      */
+     public int StemSuffix(char[] s, int len)
+     {
+         foreach (char[] candidate in suffixes)
+         {
+             if (EndsWith(s, len, candidate))
+             {
+                 len = DeleteN(s, len - candidate.Length, len, candidate.Length);
+             }
+         }
+         return len;
+     }
+
+     /**
+      * Returns true if the prefix matches and can be stemmed
+      * @param s input buffer
+      * @param len length of input buffer
+      * @param prefix prefix to check
+      * @return true if the prefix matches and can be stemmed
+      */
+     bool StartsWith(char[] s, int len, char[] prefix)
+     {
+         // A one-character prefix must leave 3 characters behind, any other prefix 2.
+         int required = (prefix.Length == 1) ? 4 : prefix.Length + 2;
+         if (len < required)
+         {
+             return false;
+         }
+
+         for (int k = 0; k < prefix.Length; k++)
+         {
+             if (s[k] != prefix[k])
+             {
+                 return false;
+             }
+         }
+         return true;
+     }
+
+     /**
+      * Returns true if the suffix matches and can be stemmed
+      * @param s input buffer
+      * @param len length of input buffer
+      * @param suffix suffix to check
+      * @return true if the suffix matches and can be stemmed
+      */
+     bool EndsWith(char[] s, int len, char[] suffix)
+     {
+         // Index where the suffix would start; at least 2 characters must remain before it.
+         int start = len - suffix.Length;
+         if (start < 2)
+         {
+             return false;
+         }
+
+         for (int k = 0; k < suffix.Length; k++)
+         {
+             if (s[start + k] != suffix[k])
+             {
+                 return false;
+             }
+         }
+         return true;
+     }
+
+     /**
+      * Delete n characters in-place by deleting the character at pos n times.
+      *
+      * @param s Input Buffer
+      * @param pos Position of character to delete
+      * @param len Length of input buffer
+      * @param nChars number of characters to delete
+      * @return length of input buffer after deletion
+      */
+     protected int DeleteN(char[] s, int pos, int len, int nChars)
+     {
+         int remaining = nChars;
+         while (remaining > 0)
+         {
+             len = Delete(s, pos, len);
+             remaining--;
+         }
+         return len;
+     }
+
+     /**
+      * Delete a character in-place by shifting the characters after it one position left.
+      *
+      * @param s Input Buffer
+      * @param pos Position of character to delete
+      * @param len length of input buffer
+      * @return length of input buffer after deletion
+      */
+     protected int Delete(char[] s, int pos, int len)
+     {
+         int shrunk = len - 1;
+         if (pos < len)
+         {
+             Array.Copy(s, pos + 1, s, pos, shrunk - pos);
+         }
+         return shrunk;
+     }
+ }
+}
\ No newline at end of file
Added: incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicStopWords.txt
URL: http://svn.apache.org/viewvc/incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicStopWords.txt?rev=1069573&view=auto
==============================================================================
Binary file - no diff available.
Propchange: incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicStopWords.txt
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/BR/BrazilianAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/BR/BrazilianAnalyzer.cs?rev=1069573&view=auto
==============================================================================
--- incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/BR/BrazilianAnalyzer.cs (added)
+++ incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/BR/BrazilianAnalyzer.cs Thu Feb 10 21:17:43 2011
@@ -0,0 +1,140 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System.Collections;
+
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Standard;
+using System.IO;
+
+/**
+ * Analyzer for Brazilian Portuguese. Supports an external list of stopwords (words that
+ * will not be indexed at all) and an external list of exclusions (words that will
+ * not be stemmed, but indexed).
+ *
+ */
+namespace Lucene.Net.Analysis.BR
+{
+ public sealed class BrazilianAnalyzer : Analyzer
+ {
+     /**
+      * Default stopword list used by the parameterless constructor.
+      *
+      * NOTE(review): "nas" and "pelas" each appear twice and "propios" looks
+      * like a misspelling; the list is kept unchanged here because altering it
+      * would change which terms get indexed - confirm before editing.
+      */
+     public static string[] BRAZILIAN_STOP_WORDS = {
+       "a","ainda","alem","ambas","ambos","antes",
+       "ao","aonde","aos","apos","aquele","aqueles",
+       "as","assim","com","como","contra","contudo",
+       "cuja","cujas","cujo","cujos","da","das","de",
+       "dela","dele","deles","demais","depois","desde",
+       "desta","deste","dispoe","dispoem","diversa",
+       "diversas","diversos","do","dos","durante","e",
+       "ela","elas","ele","eles","em","entao","entre",
+       "essa","essas","esse","esses","esta","estas",
+       "este","estes","ha","isso","isto","logo","mais",
+       "mas","mediante","menos","mesma","mesmas","mesmo",
+       "mesmos","na","nas","nao","nas","nem","nesse","neste",
+       "nos","o","os","ou","outra","outras","outro","outros",
+       "pelas","pelas","pelo","pelos","perante","pois","por",
+       "porque","portanto","proprio","propios","quais","qual",
+       "qualquer","quando","quanto","que","quem","quer","se",
+       "seja","sem","sendo","seu","seus","sob","sobre","sua",
+       "suas","tal","tambem","teu","teus","toda","todas","todo",
+       "todos","tua","tuas","tudo","um","uma","umas","uns"};
+
+     /** Stopword set handed to the StopFilter. */
+     private Hashtable stoptable = new Hashtable();
+
+     /** Words handed to the BrazilianStemFilter as its exclusion table. */
+     private Hashtable excltable = new Hashtable();
+
+     /**
+      * Creates an analyzer that uses {@link #BRAZILIAN_STOP_WORDS}.
+      */
+     public BrazilianAnalyzer()
+         : this(BRAZILIAN_STOP_WORDS)
+     {
+     }
+
+     /**
+      * Creates an analyzer whose stopwords come from the given array.
+      */
+     public BrazilianAnalyzer(string[] stopwords)
+     {
+         this.stoptable = StopFilter.MakeStopSet(stopwords);
+     }
+
+     /**
+      * Creates an analyzer that uses the given table as its stopword set
+      * (the reference is stored, not copied).
+      */
+     public BrazilianAnalyzer(Hashtable stopwords)
+     {
+         this.stoptable = stopwords;
+     }
+
+     /**
+      * Creates an analyzer whose stopwords are loaded from the given file
+      * through WordlistLoader.
+      */
+     public BrazilianAnalyzer(FileInfo stopwords)
+     {
+         this.stoptable = WordlistLoader.GetWordSet(stopwords);
+     }
+
+     /**
+      * Replaces the stem-exclusion table with one built from an array of words.
+      */
+     public void SetStemExclusionTable(string[] exclusionlist)
+     {
+         this.excltable = StopFilter.MakeStopSet(exclusionlist);
+     }
+
+     /**
+      * Replaces the stem-exclusion table with the given table
+      * (the reference is stored, not copied).
+      */
+     public void SetStemExclusionTable(Hashtable exclusionlist)
+     {
+         this.excltable = exclusionlist;
+     }
+
+     /**
+      * Replaces the stem-exclusion table with the words loaded from the given
+      * file through WordlistLoader.
+      */
+     public void SetStemExclusionTable(FileInfo exclusionlist)
+     {
+         this.excltable = WordlistLoader.GetWordSet(exclusionlist);
+     }
+
+     /**
+      * Builds the analysis chain for the text supplied by the reader.
+      *
+      * @return a StandardTokenizer wrapped, in this order, by LowerCaseFilter,
+      *         StandardFilter, StopFilter (current stopword set) and
+      *         BrazilianStemFilter (current exclusion table).
+      */
+     public override TokenStream TokenStream(string fieldName, TextReader reader)
+     {
+         TokenStream tokens = new StandardTokenizer(reader);
+         TokenStream lowered = new LowerCaseFilter(tokens);
+         TokenStream standardized = new StandardFilter(lowered);
+         TokenStream withoutStopwords = new StopFilter(standardized, stoptable);
+         return new BrazilianStemFilter(withoutStopwords, excltable);
+     }
+ }
+}
Added: incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/BR/BrazilianStemFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/BR/BrazilianStemFilter.cs?rev=1069573&view=auto
==============================================================================
--- incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/BR/BrazilianStemFilter.cs (added)
+++ incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/BR/BrazilianStemFilter.cs Thu Feb 10 21:17:43 2011
@@ -0,0 +1,74 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using Lucene.Net.Analysis;
+using System.Collections;
+
+
+/**
+ * Based on GermanStemFilter
+ *
+ */
+namespace Lucene.Net.Analysis.BR
+{
+
+ public sealed class BrazilianStemFilter : TokenFilter
+ {
+     /** Stemmer applied to the term text of every token that is not excluded. */
+     private BrazilianStemmer stemmer = null;
+
+     /** Terms that must pass through unstemmed; null when no table was given. */
+     private Hashtable exclusions = null;
+
+     /**
+      * Creates a filter over the given stream without an exclusion table.
+      */
+     public BrazilianStemFilter(TokenStream input)
+         : base(input)
+     {
+         this.stemmer = new BrazilianStemmer();
+     }
+
+     /**
+      * Creates a filter over the given stream; terms contained in
+      * exclusiontable are left unstemmed.
+      */
+     public BrazilianStemFilter(TokenStream input, Hashtable exclusiontable)
+         : this(input)
+     {
+         this.exclusions = exclusiontable;
+     }
+
+     /**
+      * Pulls the next token from the wrapped stream and, unless its term is
+      * in the exclusion table, replaces the term with its stem.
+      *
+      * @return the (possibly modified) token, or null at end of stream.
+      */
+     public override Token Next(Token reusableToken)
+     {
+         System.Diagnostics.Trace.Assert(reusableToken != null);
+
+         Token current = input.Next(reusableToken);
+         if (current == null)
+         {
+             return null;
+         }
+
+         string original = current.TermText();
+
+         // Excluded terms are returned untouched.
+         bool excluded = (exclusions != null) && exclusions.Contains(original);
+         if (excluded)
+         {
+             return current;
+         }
+
+         string stemmed = stemmer.Stem(original);
+
+         // A null stem or an unchanged term leaves the token as it is.
+         if ((stemmed == null) || stemmed.Equals(original))
+         {
+             return current;
+         }
+
+         current.SetTermBuffer(stemmed.ToCharArray(), 0, stemmed.Length);
+         return current;
+     }
+ }
+}
\ No newline at end of file
Added: incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/BR/BrazilianStemmer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/BR/BrazilianStemmer.cs?rev=1069573&view=auto
==============================================================================
--- incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/BR/BrazilianStemmer.cs (added)
+++ incubator/lucene.net/tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/BR/BrazilianStemmer.cs Thu Feb 10 21:17:43 2011
@@ -0,0 +1,1264 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A stemmer for Brazilian words.
+ */
+namespace Lucene.Net.Analysis.BR
+{
+
+ public class BrazilianStemmer
+ {
+
+ /**
+ * Working state of the stemmer. All five fields are overwritten by
+ * Stem() and read by the step methods and by Log().
+ */
+ // Debug label: the raw input term, ";" and the normalized term, as built in Stem().
+ private string TERM;
+ // "Changed term": the normalized word produced by createCT() and then shortened by the steps.
+ private string CT;
+ // Region R1 of CT as computed by getR1(CT); may be null.
+ private string R1;
+ // Region R2, computed by applying getR1() to R1; may be null.
+ private string R2;
+ // Region RV of CT as computed by getRV(CT); may be null.
+ private string RV;
+
+
+ // Creates a stemmer; no state is initialised until Stem() is called.
+ public BrazilianStemmer()
+ {
+ }
+
+ /**
+  * Stems the given term.
+  *
+  * The term is first normalized into CT by createCT(); the regions R1, R2
+  * and RV are then derived from CT and the suffix-removal steps are applied.
+  *
+  * @param term The term that should be stemmed.
+  * @return null when the normalized term is not indexable (see isIndexable),
+  *         the normalized term itself when it contains a non-letter
+  *         character, otherwise the stemmed form.
+  */
+ public string Stem(string term)
+ {
+     // Normalize the input into CT.
+     createCT(term);
+
+     if (!isIndexable(CT))
+         return null;
+
+     if (!isStemmable(CT))
+         return CT;
+
+     R1 = getR1(CT);
+     R2 = getR1(R1);
+     RV = getRV(CT);
+     TERM = term + ";" + CT;
+
+     // step2 only runs when step1 removed nothing.
+     bool suffixRemoved = step1() || step2();
+
+     if (suffixRemoved)
+         step3();
+     else
+         step4();
+
+     step5();
+
+     return CT;
+ }
+
+ /**
+  * Tells whether a term can be run through the stemming steps.
+  *
+  * @return true if, and only if, every character of the term is a letter
+  *         (an empty term therefore yields true).
+  */
+ private bool isStemmable(string term)
+ {
+     foreach (char ch in term)
+     {
+         // A single non-letter character disqualifies the whole term.
+         if (!char.IsLetter(ch))
+             return false;
+     }
+     return true;
+ }
+
+ /**
+  * Tells whether a term has an acceptable length for indexing.
+  *
+  * @return true when the term is longer than 2 and shorter than 30 characters
+  */
+ private bool isIndexable(string term)
+ {
+     int length = term.Length;
+     return (length > 2) && (length < 30);
+ }
+
+ /**
+  * Tells whether a character is one of the lowercase, unaccented vowels
+  * 'a', 'e', 'i', 'o', 'u'.
+  *
+  * @return true if the character is a vowel
+  */
+ private bool isVowel(char value)
+ {
+     switch (value)
+     {
+         case 'a':
+         case 'e':
+         case 'i':
+         case 'o':
+         case 'u':
+             return true;
+         default:
+             return false;
+     }
+ }
+
+ /**
+  * Computes region R1 of a word.
+  *
+  * R1 is the part of the word after the first non-vowel that follows a
+  * vowel. The scan stops before the last character, so no region is
+  * reported when the vowel or the following non-vowel would have to be
+  * the final character.
+  *
+  * @return the R1 substring, or null when value is null or no such
+  *         position is found
+  */
+ private string getR1(string value)
+ {
+     if (value == null)
+     {
+         return null;
+     }
+
+     int last = value.Length - 1;
+     int pos = 0;
+
+     // Advance to the first vowel.
+     while ((pos < last) && !isVowel(value[pos]))
+     {
+         pos++;
+     }
+     if (pos >= last)
+     {
+         return null;
+     }
+
+     // Advance past the vowel run to the first non-vowel.
+     while ((pos < last) && isVowel(value[pos]))
+     {
+         pos++;
+     }
+     if (pos >= last)
+     {
+         return null;
+     }
+
+     return value.Substring(pos + 1);
+ }
+
+ /**
+  * Computes region RV of a word.
+  *
+  * Three rules are tried in order; the first one that finds a position wins:
+  *  1. the second letter is a consonant: RV starts after the next vowel
+  *     found from the third letter on;
+  *  2. the first two letters are vowels: RV starts after the next consonant
+  *     found from the third letter on;
+  *  3. the word has more than three letters: RV starts after the third letter.
+  * In rules 1 and 2 the scan stops before the last character.
+  *
+  * @return the RV substring, or null when value is null or no rule applies
+  */
+ private string getRV(string value)
+ {
+     if (value == null)
+     {
+         return null;
+     }
+
+     int last = value.Length - 1;
+     int pos;
+
+     // Rule 1: consonant in second position, look for the next vowel.
+     if ((last > 0) && !isVowel(value[1]))
+     {
+         pos = 2;
+         while ((pos < last) && !isVowel(value[pos]))
+         {
+             pos++;
+         }
+         if (pos < last)
+         {
+             return value.Substring(pos + 1);
+         }
+     }
+
+     // Rule 2: word starts with two vowels, look for the next consonant.
+     if ((last > 1) && isVowel(value[0]) && isVowel(value[1]))
+     {
+         pos = 2;
+         while ((pos < last) && isVowel(value[pos]))
+         {
+             pos++;
+         }
+         if (pos < last)
+         {
+             return value.Substring(pos + 1);
+         }
+     }
+
+     // Rule 3: fall back to everything after the third letter.
+     return (last > 2) ? value.Substring(3) : null;
+ }
+
+ /**
+  * Normalizes a term:
+  *  1) converts it to lowercase;
+  *  2) folds accented vowels to their plain form
+  *     (a-acute/circumflex/tilde -> a, e-acute/circumflex -> e, i-acute -> i,
+  *      o-acute/circumflex/tilde -> o, u-acute/diaeresis -> u);
+  *  3) folds c-cedilla -> c and n-tilde -> n.
+  * Every other character is copied unchanged.
+  *
+  * The accented characters are written as \u escapes so that the
+  * comparison cannot be broken by the source file being re-encoded (the
+  * previous i-acute literal had degraded to an uppercase 'A-tilde', which
+  * can never occur after lowercasing, so i-acute was never folded).
+  *
+  * @return null if value is null, otherwise the normalized string
+  */
+ private string changeTerm(string value)
+ {
+     if (value == null)
+     {
+         return null;
+     }
+
+     string lowered = value.ToLower();
+     System.Text.StringBuilder folded = new System.Text.StringBuilder(lowered.Length);
+
+     foreach (char c in lowered)
+     {
+         switch (c)
+         {
+             case '\u00E1': // a acute
+             case '\u00E2': // a circumflex
+             case '\u00E3': // a tilde
+                 folded.Append('a');
+                 break;
+             case '\u00E9': // e acute
+             case '\u00EA': // e circumflex
+                 folded.Append('e');
+                 break;
+             case '\u00ED': // i acute
+                 folded.Append('i');
+                 break;
+             case '\u00F3': // o acute
+             case '\u00F4': // o circumflex
+             case '\u00F5': // o tilde
+                 folded.Append('o');
+                 break;
+             case '\u00FA': // u acute
+             case '\u00FC': // u diaeresis
+                 folded.Append('u');
+                 break;
+             case '\u00E7': // c cedilla
+                 folded.Append('c');
+                 break;
+             case '\u00F1': // n tilde
+                 folded.Append('n');
+                 break;
+             default:
+                 folded.Append(c);
+                 break;
+         }
+     }
+
+     return folded.ToString();
+ }
+
+ /**
+  * Tells whether a string ends with the given suffix (ordinal comparison).
+  *
+  * @return true if value ends with suffix; false if either argument is
+  *         null or the suffix is longer than the value
+  */
+ private bool suffix(string value, string suffix)
+ {
+     if (value == null || suffix == null)
+     {
+         return false;
+     }
+
+     if (value.Length < suffix.Length)
+     {
+         return false;
+     }
+
+     return value.EndsWith(suffix, System.StringComparison.Ordinal);
+ }
+
+ /**
+  * Replaces a trailing toReplace with changeTo.
+  *
+  * @return value unchanged when any argument is null or when removing
+  *         toReplace leaves the string unchanged; otherwise the string with
+  *         the suffix swapped for changeTo
+  */
+ private string replaceSuffix(string value, string toReplace, string changeTo)
+ {
+     if (value == null || toReplace == null || changeTo == null)
+     {
+         return value;
+     }
+
+     string stripped = removeSuffix(value, toReplace);
+
+     // Nothing was stripped, so there is nothing to replace.
+     return value.Equals(stripped) ? value : stripped + changeTo;
+ }
+
+ /**
+  * Strips a trailing toRemove from value.
+  *
+  * @return value without the suffix, or value unchanged when either
+  *         argument is null or value does not end with toRemove
+  */
+ private string removeSuffix(string value, string toRemove)
+ {
+     bool removable = (value != null) && (toRemove != null) && suffix(value, toRemove);
+     if (!removable)
+     {
+         return value;
+     }
+
+     int keep = value.Length - toRemove.Length;
+     return value.Substring(0, keep);
+ }
+
+ /**
+  * Tells whether value ends with _suffix and the text immediately before
+  * that suffix ends with preceded.
+  *
+  * @return true if the suffix is present and preceded as requested; false
+  *         otherwise, including when any argument is null
+  */
+ private bool suffixPreceded(string value, string _suffix, string preceded)
+ {
+     if (value == null || _suffix == null || preceded == null)
+     {
+         return false;
+     }
+
+     if (!suffix(value, _suffix))
+     {
+         return false;
+     }
+
+     // The suffix is known to be present, so cut it off directly.
+     string head = value.Substring(0, value.Length - _suffix.Length);
+     return suffix(head, preceded);
+ }
+
+
+
+
+ /**
+  * Creates CT (the "changed term"): the term normalized by changeTerm(),
+  * with at most one punctuation character trimmed from the start and at
+  * most one from the end. Trimming is skipped whenever CT is shorter than
+  * two characters.
+  *
+  * Note: changeTerm() returns null for a null term, in which case the
+  * CT.Length access below throws.
+  */
+ private void createCT(string term)
+ {
+     // Characters trimmed from either end: " ' - , ; . ? !
+     const string trimmable = "\"'-,;.?!";
+
+     CT = changeTerm(term);
+
+     if (CT.Length < 2)
+     {
+         return;
+     }
+
+     // Drop one leading punctuation character.
+     if (trimmable.IndexOf(CT[0]) >= 0)
+     {
+         CT = CT.Substring(1);
+     }
+
+     if (CT.Length < 2)
+     {
+         return;
+     }
+
+     // Drop one trailing punctuation character.
+     int lastIndex = CT.Length - 1;
+     if (trimmable.IndexOf(CT[lastIndex]) >= 0)
+     {
+         CT = CT.Substring(0, lastIndex);
+     }
+ }
+
+
+ /**
+  * Rules for step1(), tried in this order (longer endings first).
+  * Each entry is { ending, region, replacement }:
+  *  - ending:      suffix that CT must end with;
+  *  - region:      "R1", "R2" or "RV" - the region that must also end with
+  *                 the ending; "RV" rules additionally require the ending
+  *                 to be preceded by "e" in CT;
+  *  - replacement: text appended after the ending is removed ("" = delete).
+  */
+ private static readonly string[][] STEP1_RULES = {
+     new string[] { "uciones", "R2", "u" },
+     new string[] { "imentos", "R2", "" },
+     new string[] { "amentos", "R2", "" },
+     new string[] { "adores", "R2", "" },
+     new string[] { "adoras", "R2", "" },
+     new string[] { "logias", "R2", "log" },
+     new string[] { "encias", "R2", "ente" },
+     new string[] { "amente", "R1", "" },
+     new string[] { "idades", "R2", "" },
+     new string[] { "acoes", "R2", "" },
+     new string[] { "imento", "R2", "" },
+     new string[] { "amento", "R2", "" },
+     new string[] { "adora", "R2", "" },
+     new string[] { "ismos", "R2", "" },
+     new string[] { "istas", "R2", "" },
+     new string[] { "logia", "R2", "log" },
+     new string[] { "ucion", "R2", "u" },
+     new string[] { "encia", "R2", "ente" },
+     new string[] { "mente", "R2", "" },
+     new string[] { "idade", "R2", "" },
+     new string[] { "acao", "R2", "" },
+     new string[] { "ezas", "R2", "" },
+     new string[] { "icos", "R2", "" },
+     new string[] { "icas", "R2", "" },
+     new string[] { "ismo", "R2", "" },
+     new string[] { "avel", "R2", "" },
+     new string[] { "ivel", "R2", "" },
+     new string[] { "ista", "R2", "" },
+     new string[] { "osos", "R2", "" },
+     new string[] { "osas", "R2", "" },
+     new string[] { "ador", "R2", "" },
+     new string[] { "ivas", "R2", "" },
+     new string[] { "ivos", "R2", "" },
+     new string[] { "iras", "RV", "ir" },
+     new string[] { "eza", "R2", "" },
+     new string[] { "ico", "R2", "" },
+     new string[] { "ica", "R2", "" },
+     new string[] { "oso", "R2", "" },
+     new string[] { "osa", "R2", "" },
+     new string[] { "iva", "R2", "" },
+     new string[] { "ivo", "R2", "" },
+     new string[] { "ira", "RV", "ir" }
+ };
+
+ /**
+  * Standard suffix removal.
+  *
+  * Walks STEP1_RULES in order and applies the first rule whose ending is
+  * found both at the end of CT and at the end of the rule's region; the
+  * ending is then removed from CT and the rule's replacement appended.
+  *
+  * Fix: the "logias" rule previously discarded the result of the
+  * replacement, so CT stayed unchanged although true was returned.
+  *
+  * @return false if no ending was removed
+  */
+ private bool step1()
+ {
+     if (CT == null) return false;
+
+     foreach (string[] rule in STEP1_RULES)
+     {
+         string ending = rule[0];
+         if (!suffix(CT, ending))
+         {
+             continue;
+         }
+
+         bool inRegion;
+         switch (rule[1])
+         {
+             case "R1":
+                 inRegion = suffix(R1, ending);
+                 break;
+             case "RV":
+                 // "eira"/"eiras": the ending must follow an "e".
+                 inRegion = suffix(RV, ending) && suffixPreceded(CT, ending, "e");
+                 break;
+             default:
+                 inRegion = suffix(R2, ending);
+                 break;
+         }
+
+         if (inRegion)
+         {
+             CT = removeSuffix(CT, ending) + rule[2];
+             return true;
+         }
+     }
+
+     // no ending was removed by step1
+     return false;
+ }
+
+
+ /**
+  * Verb endings tried by step2(), in this order (longest first, so the
+  * longest matching ending wins).
+  */
+ private static readonly string[] STEP2_SUFFIXES = {
+     // length 7
+     "issemos", "essemos", "assemos", "ariamos", "eriamos", "iriamos",
+     // length 6
+     "iremos", "eremos", "aremos", "avamos", "iramos", "eramos", "aramos",
+     "asseis", "esseis", "isseis", "arieis", "erieis", "irieis",
+     // length 5
+     "irmos", "iamos", "armos", "ermos", "areis", "ereis", "ireis",
+     "asses", "esses", "isses", "astes", "assem", "essem", "issem",
+     "ardes", "erdes", "irdes", "ariam", "eriam", "iriam",
+     "arias", "erias", "irias", "estes", "istes", "aveis",
+     // length 4
+     "aria", "eria", "iria", "asse", "esse", "isse", "aste", "este", "iste",
+     "arei", "erei", "irei", "aram", "eram", "iram", "avam",
+     "arem", "erem", "irem", "ando", "endo", "indo", "arao", "erao", "irao",
+     "adas", "idas", "aras", "eras", "iras", "avas", "ares", "eres", "ires",
+     "ados", "idos", "amos", "emos", "imos", "ieis",
+     // length 3
+     "ada", "ida", "ara", "era", "ava", "iam", "ado", "ido",
+     "ias", "ais", "eis", "ira", "ear",
+     // length 2
+     "ia", "ei", "am", "em", "ar", "er", "ir", "as", "es", "is",
+     "eu", "iu", "ou"
+ };
+
+ /**
+  * Verb suffixes.
+  *
+  * Searches RV for the longest ending listed in STEP2_SUFFIXES and, if
+  * one is found, deletes it from CT.
+  *
+  * Fix: the previous version tested RV for "ira" but then removed "ava",
+  * so "ava" was never stripped, "ira" was never stripped, and the later
+  * "ira" rule was unreachable. "ava" is now tested and removed on its own
+  * and "ira" is handled by its own entry. Entries that were listed twice
+  * ("areis", "iras", "iu") appear once; the second copies were unreachable.
+  *
+  * @return false if no ending was removed
+  */
+ private bool step2()
+ {
+     if (RV == null) return false;
+
+     foreach (string ending in STEP2_SUFFIXES)
+     {
+         if (suffix(RV, ending))
+         {
+             CT = removeSuffix(CT, ending);
+             return true;
+         }
+     }
+
+     // no ending was removed by step2
+     return false;
+ }
+
+ /**
+  * Deletes a trailing 'i' from CT when RV ends with "ci"
+  * (an 'i' in RV preceded by 'c').
+  */
+ private void step3()
+ {
+     if (RV == null)
+     {
+         return;
+     }
+
+     // suffixPreceded() already verifies that RV ends with "i".
+     if (suffixPreceded(RV, "i", "c"))
+     {
+         CT = removeSuffix(CT, "i");
+     }
+ }
+
+ /**
+  * Residual suffix.
+  *
+  * If RV ends with one of "os", "a", "i", "o" (tested in that order), the
+  * first match is deleted from CT. Accented variants are not listed
+  * because CT has already had its accents folded by changeTerm().
+  */
+ private void step4()
+ {
+     if (RV == null)
+     {
+         return;
+     }
+
+     string[] residualEndings = { "os", "a", "i", "o" };
+     foreach (string ending in residualEndings)
+     {
+         if (suffix(RV, ending))
+         {
+             CT = removeSuffix(CT, ending);
+             return;
+         }
+     }
+ }
+
+ /**
+  * If RV ends with 'e', deletes that 'e' from CT; when the 'e' is
+  * preceded in RV by "gu" (or "ci"), the 'u' (or 'i') is deleted from CT
+  * as well.
+  *
+  * No cedilla handling happens here: changeTerm() has already folded
+  * c-cedilla to 'c' before the steps run.
+  */
+ private void step5()
+ {
+     if (RV == null || !suffix(RV, "e"))
+     {
+         return;
+     }
+
+     // The trailing 'e' goes in every case.
+     CT = removeSuffix(CT, "e");
+
+     // RV is not modified above, so it can still be inspected here.
+     if (suffixPreceded(RV, "e", "gu"))
+     {
+         CT = removeSuffix(CT, "u");
+     }
+     else if (suffixPreceded(RV, "e", "ci"))
+     {
+         CT = removeSuffix(CT, "i");
+     }
+ }
+
+ /**
+  * Builds a one-line dump of the stemmer's current state for logging and
+  * debugging.
+  *
+  * @return a string listing TERM, CT, RV, R1 and R2 (null fields show as
+  *         empty text)
+  */
+ public string Log()
+ {
+     System.Text.StringBuilder dump = new System.Text.StringBuilder();
+     dump.Append(" (TERM = ").Append(TERM).Append(")");
+     dump.Append(" (CT = ").Append(CT).Append(")");
+     dump.Append(" (RV = ").Append(RV).Append(")");
+     dump.Append(" (R1 = ").Append(R1).Append(")");
+     dump.Append(" (R2 = ").Append(R2).Append(")");
+     return dump.ToString();
+ }
+
+ }
+
+}
\ No newline at end of file