You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by di...@apache.org on 2011/02/10 22:17:45 UTC
svn commit: r1069573 [3/3] - in /incubator/lucene.net:
tags/Lucene.Net_2_9_2/contrib/Analyzers/
tags/Lucene.Net_2_9_2/contrib/Analyzers/BR/
tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net.Analyzers/
tags/Lucene.Net_2_9_2/contrib/Analyzers/Lucene.Net...
Added: incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/BR/BrazilianStemmer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Analyzers/Lucene.Net.Analyzers/BR/BrazilianStemmer.cs?rev=1069573&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/BR/BrazilianStemmer.cs (added)
+++ incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/BR/BrazilianStemmer.cs Thu Feb 10 21:17:43 2011
@@ -0,0 +1,1264 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A stemmer for Brazilian words.
+ */
+namespace Lucene.Net.Analysis.BR
+{
+
+ public class BrazilianStemmer
+ {
+
+ /**
+ * Working state of the most recent Stem() call. Stem() overwrites
+ * these fields as it runs and Log() reports them.
+ * NOTE(review): because the state lives in instance fields, a single
+ * instance is presumably not safe for concurrent Stem() calls - confirm
+ * against callers.
+ */
+ // Original term and changed term joined by ';' (assigned in Stem, read by Log).
+ private string TERM;
+ // Changed term: the normalized form built by createCT and shortened by the steps.
+ private string CT;
+ // getR1(CT), computed once in Stem before the steps run.
+ private string R1;
+ // getR1(R1), i.e. the R1 region of R1.
+ private string R2;
+ // getRV(CT), computed once in Stem before the steps run.
+ private string RV;
+
+
+ // Nothing to initialize: every field is assigned by Stem().
+ public BrazilianStemmer()
+ {
+ }
+
+        /**
+         * Stems the given term to a unique <tt>discriminator</tt>.
+         *
+         * The term is first normalized by createCT. A normalized term that
+         * is not indexable yields null; one that contains non-letter
+         * characters is returned normalized but unstemmed. Otherwise the
+         * regions R1, R2 and RV are computed once from the normalized term
+         * and steps 1-5 are applied to it.
+         *
+         * @param term The term that should be stemmed.
+         * @return Discriminator for <tt>term</tt>, or null when <tt>term</tt>
+         *         is null or its normalized form is not indexable.
+         */
+        public string Stem(string term)
+        {
+            // A null term used to surface as a NullReferenceException inside
+            // createCT; report it the same way as any non-indexable input.
+            if (term == null)
+            {
+                return null;
+            }
+
+            // creates CT
+            createCT(term);
+
+            if (!isIndexable(CT))
+            {
+                return null;
+            }
+            if (!isStemmable(CT))
+            {
+                return CT;
+            }
+
+            // The regions are derived from the unstemmed CT and are not
+            // recomputed while the steps shorten CT.
+            R1 = getR1(CT);
+            R2 = getR1(R1);
+            RV = getRV(CT);
+            TERM = term + ";" + CT;
+
+            // step2 (verb suffixes) only runs when step1 removed nothing.
+            bool altered = step1() || step2();
+
+            if (altered)
+            {
+                step3();
+            }
+            else
+            {
+                step4();
+            }
+
+            step5();
+
+            return CT;
+        }
+
+        /**
+         * Tells whether a term can be run through the stemming steps.
+         *
+         * @param term the normalized term to inspect
+         * @return true if, and only if, every character of the term is a
+         *         letter (an empty term therefore passes).
+         */
+        private bool isStemmable(string term)
+        {
+            foreach (char letter in term)
+            {
+                // A single non-letter character disqualifies the whole term.
+                if (!char.IsLetter(letter))
+                {
+                    return false;
+                }
+            }
+
+            return true;
+        }
+
+        /**
+         * Tells whether a term is worth indexing, judged by length alone.
+         *
+         * @param term the normalized term to inspect
+         * @return true if the term is longer than 2 and shorter than 30
+         *         characters
+         */
+        private bool isIndexable(string term)
+        {
+            int length = term.Length;
+            return length > 2 && length < 30;
+        }
+
+        /**
+         * Tells whether a character is one of the lower-case, unaccented
+         * vowels 'a', 'e', 'i', 'o', 'u'.
+         *
+         * @param value the character to test
+         * @return true if it is a vowel
+         */
+        private bool isVowel(char value)
+        {
+            switch (value)
+            {
+                case 'a':
+                case 'e':
+                case 'i':
+                case 'o':
+                case 'u':
+                    return true;
+                default:
+                    return false;
+            }
+        }
+
+        /**
+         * Gets R1
+         *
+         * R1 - is the region after the first non-vowel following a vowel.
+         *
+         * The scan never looks at the last character, so when the vowel or
+         * the following non-vowel is not found before it, null is returned
+         * instead of an empty region.
+         *
+         * @param value the string to take the region from (may be null)
+         * @return null or a string representing R1
+         */
+        private string getR1(string value)
+        {
+            // be-safe !!!
+            if (value == null)
+            {
+                return null;
+            }
+
+            int last = value.Length - 1;
+            int pos = 0;
+
+            // skip ahead to the 1st vowel
+            while (pos < last && !isVowel(value[pos]))
+            {
+                pos++;
+            }
+            if (pos >= last)
+            {
+                return null;
+            }
+
+            // skip ahead to the 1st non-vowel after it
+            while (pos < last && isVowel(value[pos]))
+            {
+                pos++;
+            }
+            if (pos >= last)
+            {
+                return null;
+            }
+
+            return value.Substring(pos + 1);
+        }
+
+        /**
+         * Gets RV
+         *
+         * RV - IF the second letter is a consonant, RV is the region after
+         *      the next following vowel,
+         *
+         *      OR if the first two letters are vowels, RV is the region
+         *      after the next consonant,
+         *
+         *      AND otherwise RV is the region after the third letter.
+         *
+         * When the vowel/consonant looked for in the first two cases is not
+         * found before the last character, the third rule is used as the
+         * fallback. Strings of three characters or fewer have no RV.
+         *
+         * @param value the string to take the region from (may be null)
+         * @return null or a string representing RV
+         */
+        private string getRV(string value)
+        {
+            // be-safe !!!
+            if (value == null)
+            {
+                return null;
+            }
+
+            int last = value.Length - 1;
+
+            if (last > 0)
+            {
+                int pos = 2;
+
+                if (!isVowel(value[1]))
+                {
+                    // second letter is a consonant: look for the next vowel
+                    while (pos < last && !isVowel(value[pos]))
+                    {
+                        pos++;
+                    }
+                    if (pos < last)
+                    {
+                        return value.Substring(pos + 1);
+                    }
+                }
+                else if (last > 1 && isVowel(value[0]))
+                {
+                    // first two letters are vowels: look for the next consonant
+                    while (pos < last && isVowel(value[pos]))
+                    {
+                        pos++;
+                    }
+                    if (pos < last)
+                    {
+                        return value.Substring(pos + 1);
+                    }
+                }
+            }
+
+            // otherwise (or as fallback) RV is the region after the third letter
+            return last > 2 ? value.Substring(3) : null;
+        }
+
+        /**
+         * Normalizes a term before stemming:
+         * 1) Turn to lowercase
+         * 2) Remove accents (acute, circumflex, tilde, diaeresis on u)
+         * 3) ç -> c ; ñ -> n
+         *
+         * The accented characters are written as \u escapes so that the
+         * mapping cannot be damaged by the encoding the file is saved in
+         * (the 'í' literal had been corrupted that way and never matched).
+         *
+         * @param value the term to normalize (may be null)
+         * @return null if value is null, otherwise the transformed string
+         */
+        private string changeTerm(string value)
+        {
+            // be-safe !!!
+            if (value == null)
+            {
+                return null;
+            }
+
+            // Invariant casing keeps the result independent of the thread
+            // culture (e.g. Turkish would lower 'I' to a dotless i).
+            string lowered = value.ToLowerInvariant();
+            System.Text.StringBuilder folded = new System.Text.StringBuilder(lowered.Length);
+
+            foreach (char c in lowered)
+            {
+                switch (c)
+                {
+                    case '\u00E1': // a with acute
+                    case '\u00E2': // a with circumflex
+                    case '\u00E3': // a with tilde
+                        folded.Append('a');
+                        break;
+                    case '\u00E9': // e with acute
+                    case '\u00EA': // e with circumflex
+                        folded.Append('e');
+                        break;
+                    case '\u00ED': // i with acute
+                        folded.Append('i');
+                        break;
+                    case '\u00F3': // o with acute
+                    case '\u00F4': // o with circumflex
+                    case '\u00F5': // o with tilde
+                        folded.Append('o');
+                        break;
+                    case '\u00FA': // u with acute
+                    case '\u00FC': // u with diaeresis
+                        folded.Append('u');
+                        break;
+                    case '\u00E7': // c with cedilla
+                        folded.Append('c');
+                        break;
+                    case '\u00F1': // n with tilde
+                        folded.Append('n');
+                        break;
+                    default:
+                        folded.Append(c);
+                        break;
+                }
+            }
+
+            return folded.ToString();
+        }
+
+        /**
+         * Check if a string ends with a suffix (exact, case-sensitive
+         * character comparison).
+         *
+         * @param value  the string to inspect (may be null)
+         * @param suffix the ending to look for (may be null)
+         * @return true if the string ends with the specified suffix;
+         *         false if either argument is null
+         */
+        private bool suffix(string value, string suffix)
+        {
+            // be-safe !!!
+            if (value == null || suffix == null)
+            {
+                return false;
+            }
+
+            // An ordinal EndsWith also covers the "suffix longer than value" case.
+            return value.EndsWith(suffix, System.StringComparison.Ordinal);
+        }
+
+        /**
+         * Replace a string suffix by another
+         *
+         * @param value     the string to rewrite
+         * @param toReplace the ending to take off
+         * @param changeTo  the text to put in its place
+         * @return the rewritten string, or value unchanged when any argument
+         *         is null or stripping toReplace leaves value as it was
+         */
+        private string replaceSuffix(string value, string toReplace, string changeTo)
+        {
+            // be-safe !!!
+            if (value == null || toReplace == null || changeTo == null)
+            {
+                return value;
+            }
+
+            string stripped = removeSuffix(value, toReplace);
+
+            // Nothing was stripped, so there is nothing to replace.
+            return value.Equals(stripped) ? value : stripped + changeTo;
+        }
+
+        /**
+         * Remove a string suffix
+         *
+         * @param value    the string to shorten
+         * @param toRemove the ending to take off
+         * @return the string without the suffix, or value unchanged when an
+         *         argument is null or value does not end with toRemove
+         */
+        private string removeSuffix(string value, string toRemove)
+        {
+            if (value != null && toRemove != null && suffix(value, toRemove))
+            {
+                return value.Substring(0, value.Length - toRemove.Length);
+            }
+
+            // be-safe !!! (null argument or no such ending)
+            return value;
+        }
+
+        /**
+         * See if a suffix is preceded by a string
+         *
+         * @param value    the string to inspect
+         * @param _suffix  the ending value must have
+         * @param preceded the text that must come directly before that ending
+         * @return true if value ends with _suffix and the part before it
+         *         ends with preceded; false if any argument is null
+         */
+        private bool suffixPreceded(string value, string _suffix, string preceded)
+        {
+            // be-safe !!!
+            if (value == null || _suffix == null || preceded == null)
+            {
+                return false;
+            }
+
+            if (!suffix(value, _suffix))
+            {
+                return false;
+            }
+
+            // Drop the ending and test what is left.
+            string head = value.Substring(0, value.Length - _suffix.Length);
+            return suffix(head, preceded);
+        }
+
+
+
+
+        /**
+         * Creates CT (changed term): the term normalized by changeTerm, with
+         * at most one punctuation character removed from the front and one
+         * from the back. Terms shorter than two characters are left as
+         * changeTerm produced them.
+         *
+         * @param term the term to normalize
+         */
+        private void createCT(string term)
+        {
+            // Characters dropped from either end: " ' - , ; . ? !
+            const string punctuation = "\"'-,;.?!";
+
+            CT = changeTerm(term);
+
+            if (CT.Length < 2)
+            {
+                return;
+            }
+
+            // if the first character is punctuation, remove it
+            if (punctuation.IndexOf(CT[0]) >= 0)
+            {
+                CT = CT.Substring(1);
+            }
+
+            if (CT.Length < 2)
+            {
+                return;
+            }
+
+            // if the last character is punctuation, remove it
+            int lastIndex = CT.Length - 1;
+            if (punctuation.IndexOf(CT[lastIndex]) >= 0)
+            {
+                CT = CT.Substring(0, lastIndex);
+            }
+        }
+
+
+        /**
+         * Standard suffix removal.
+         *
+         * The rules are tried in the order listed and the first one that
+         * fires wins. A rule fires when its ending closes both CT and the
+         * region named in the rule; the ending is then deleted or replaced.
+         * "amente" has to stay ahead of "mente", which it contains.
+         *
+         * The "logias" rule used to throw its result away, leaving CT
+         * untouched while still reporting a change; it now rewrites CT like
+         * the singular "logia" rule does.
+         *
+         * @return false if no ending was removed
+         */
+        private bool step1()
+        {
+            if (CT == null) return false;
+
+            return applyStep1Rule("uciones", R2, "u")
+                || applyStep1Rule("imentos", R2, "")
+                || applyStep1Rule("amentos", R2, "")
+                || applyStep1Rule("adores", R2, "")
+                || applyStep1Rule("adoras", R2, "")
+                || applyStep1Rule("logias", R2, "log")
+                || applyStep1Rule("encias", R2, "ente")
+                || applyStep1Rule("amente", R1, "")
+                || applyStep1Rule("idades", R2, "")
+                || applyStep1Rule("acoes", R2, "")
+                || applyStep1Rule("imento", R2, "")
+                || applyStep1Rule("amento", R2, "")
+                || applyStep1Rule("adora", R2, "")
+                || applyStep1Rule("ismos", R2, "")
+                || applyStep1Rule("istas", R2, "")
+                || applyStep1Rule("logia", R2, "log")
+                || applyStep1Rule("ucion", R2, "u")
+                || applyStep1Rule("encia", R2, "ente")
+                || applyStep1Rule("mente", R2, "")
+                || applyStep1Rule("idade", R2, "")
+                || applyStep1Rule("acao", R2, "")
+                || applyStep1Rule("ezas", R2, "")
+                || applyStep1Rule("icos", R2, "")
+                || applyStep1Rule("icas", R2, "")
+                || applyStep1Rule("ismo", R2, "")
+                || applyStep1Rule("avel", R2, "")
+                || applyStep1Rule("ivel", R2, "")
+                || applyStep1Rule("ista", R2, "")
+                || applyStep1Rule("osos", R2, "")
+                || applyStep1Rule("osas", R2, "")
+                || applyStep1Rule("ador", R2, "")
+                || applyStep1Rule("ivas", R2, "")
+                || applyStep1Rule("ivos", R2, "")
+                // "iras"/"ira" are only rewritten when they follow an 'e'
+                || (suffixPreceded(CT, "iras", "e") && applyStep1Rule("iras", RV, "ir"))
+                || applyStep1Rule("eza", R2, "")
+                || applyStep1Rule("ico", R2, "")
+                || applyStep1Rule("ica", R2, "")
+                || applyStep1Rule("oso", R2, "")
+                || applyStep1Rule("osa", R2, "")
+                || applyStep1Rule("iva", R2, "")
+                || applyStep1Rule("ivo", R2, "")
+                || (suffixPreceded(CT, "ira", "e") && applyStep1Rule("ira", RV, "ir"));
+        }
+
+        /**
+         * Applies a single step-1 rule to CT.
+         *
+         * @param ending      the ending to look for
+         * @param region      R1, R2 or RV; the ending must close this region too
+         * @param replacement text appended once the ending is stripped
+         *                    (an empty string simply deletes the ending)
+         * @return true if the rule fired and CT was rewritten
+         */
+        private bool applyStep1Rule(string ending, string region, string replacement)
+        {
+            if (!suffix(CT, ending) || !suffix(region, ending))
+            {
+                return false;
+            }
+
+            CT = removeSuffix(CT, ending) + replacement;
+            return true;
+        }
+
+
+        /**
+         * Verb endings tried by step2, longest first. Endings of equal
+         * length cannot both match, so only the length ordering matters.
+         */
+        private static readonly string[] VERB_SUFFIXES = new string[]
+        {
+            // suffix length = 7
+            "issemos", "essemos", "assemos", "ariamos", "eriamos", "iriamos",
+            // suffix length = 6
+            "iremos", "eremos", "aremos", "avamos", "iramos", "eramos", "aramos",
+            "asseis", "esseis", "isseis", "arieis", "erieis", "irieis",
+            // suffix length = 5
+            "irmos", "iamos", "armos", "ermos", "areis", "ereis", "ireis",
+            "asses", "esses", "isses", "astes", "assem", "essem", "issem",
+            "ardes", "erdes", "irdes", "ariam", "eriam", "iriam", "arias",
+            "erias", "irias", "estes", "istes", "aveis",
+            // suffix length = 4
+            "aria", "eria", "iria", "asse", "esse", "isse", "aste", "este",
+            "iste", "arei", "erei", "irei", "aram", "eram", "iram", "avam",
+            "arem", "erem", "irem", "ando", "endo", "indo", "arao", "erao",
+            "irao", "adas", "idas", "aras", "eras", "iras", "avas", "ares",
+            "eres", "ires", "ados", "idos", "amos", "emos", "imos", "ieis",
+            // suffix length = 3
+            "ada", "ida", "ara", "era", "ava", "iam", "ado", "ido", "ias",
+            "ais", "eis", "ira", "ear",
+            // suffix length = 2
+            "ia", "ei", "am", "em", "ar", "er", "ir", "as", "es", "is",
+            "eu", "iu", "ou"
+        };
+
+        /**
+         * Verb suffixes.
+         *
+         * Search for the longest among the verb suffixes in RV,
+         * and if found, delete it from CT.
+         *
+         * Differences from the earlier hand-unrolled version:
+         * - "ava" is now really removed. The old code tested for "ira" but
+         *   stripped "ava", so "-ava" forms were never stemmed here and
+         *   "-ira" forms were reported as altered without being changed.
+         *   Stems of such words change; existing indexes need rebuilding.
+         * - the repeated "areis", "iras" and "iu" checks (dead code) are gone.
+         *
+         * @return false if no ending was removed
+         */
+        private bool step2()
+        {
+            if (RV == null) return false;
+
+            foreach (string ending in VERB_SUFFIXES)
+            {
+                if (suffix(RV, ending))
+                {
+                    CT = removeSuffix(CT, ending);
+                    return true;
+                }
+            }
+
+            // no ending was removed by step2
+            return false;
+        }
+
+        /**
+         * Delete suffix 'i' from CT if RV ends with 'i' preceded by 'c'.
+         * Does nothing when RV is null.
+         */
+        private void step3()
+        {
+            // suffixPreceded already requires RV to end with "i"
+            if (RV != null && suffixPreceded(RV, "i", "c"))
+            {
+                CT = removeSuffix(CT, "i");
+            }
+        }
+
+        /**
+         * Residual suffix
+         *
+         * If RV ends with one of the suffixes os, a, i, o (tried in that
+         * order), delete that suffix from CT. Only the first match is
+         * applied. Does nothing when RV is null.
+         */
+        private void step4()
+        {
+            if (RV == null)
+            {
+                return;
+            }
+
+            string[] residualEndings = { "os", "a", "i", "o" };
+
+            foreach (string ending in residualEndings)
+            {
+                if (suffix(RV, ending))
+                {
+                    CT = removeSuffix(CT, ending);
+                    return;
+                }
+            }
+        }
+
+        /**
+         * If RV ends with 'e', delete the 'e' from CT, and if in RV it was
+         * preceded by 'gu' (or 'ci'), also delete the 'u' (or 'i') from CT.
+         *
+         * Accents and the cedilla were already folded away when CT was
+         * created, so only the plain 'e' is looked for. Does nothing when
+         * RV is null.
+         */
+        private void step5()
+        {
+            if (RV == null || !suffix(RV, "e"))
+            {
+                return;
+            }
+
+            // RV is not touched below, so both checks can be made up front.
+            bool afterGu = suffixPreceded(RV, "e", "gu");
+            bool afterCi = suffixPreceded(RV, "e", "ci");
+
+            CT = removeSuffix(CT, "e");
+
+            if (afterGu)
+            {
+                CT = removeSuffix(CT, "u");
+            }
+            else if (afterCi)
+            {
+                CT = removeSuffix(CT, "i");
+            }
+        }
+
+        /**
+         * For log and debug purpose: renders the state left behind by the
+         * last Stem() call. Fields that are still null are printed as empty.
+         *
+         * @return TERM, CT, RV, R1 and R2
+         */
+        public string Log()
+        {
+            return string.Format(
+                " (TERM = {0}) (CT = {1}) (RV = {2}) (R1 = {3}) (R2 = {4})",
+                TERM, CT, RV, R1, R2);
+        }
+
+ }
+
+}
\ No newline at end of file
Added: incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Lucene.Net.Analyzers.csproj
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Analyzers/Lucene.Net.Analyzers/Lucene.Net.Analyzers.csproj?rev=1069573&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Lucene.Net.Analyzers.csproj (added)
+++ incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Lucene.Net.Analyzers.csproj Thu Feb 10 21:17:43 2011
@@ -0,0 +1,65 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="3.5" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <PropertyGroup>
+ <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
+ <Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
+ <ProductVersion>9.0.21022</ProductVersion>
+ <SchemaVersion>2.0</SchemaVersion>
+ <ProjectGuid>{4286E961-9143-4821-B46D-3D39D3736386}</ProjectGuid>
+ <OutputType>Library</OutputType>
+ <AppDesignerFolder>Properties</AppDesignerFolder>
+ <RootNamespace>Lucene.Net.Analyzers</RootNamespace>
+ <AssemblyName>Lucene.Net.Analyzers</AssemblyName>
+ <TargetFrameworkVersion>v2.0</TargetFrameworkVersion>
+ <FileAlignment>512</FileAlignment>
+ </PropertyGroup>
+ <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
+ <DebugSymbols>true</DebugSymbols>
+ <DebugType>full</DebugType>
+ <Optimize>false</Optimize>
+ <OutputPath>bin\Debug\</OutputPath>
+ <DefineConstants>DEBUG;TRACE</DefineConstants>
+ <ErrorReport>prompt</ErrorReport>
+ <WarningLevel>4</WarningLevel>
+ </PropertyGroup>
+ <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
+ <DebugType>pdbonly</DebugType>
+ <Optimize>true</Optimize>
+ <OutputPath>bin\Release\</OutputPath>
+ <DefineConstants>TRACE</DefineConstants>
+ <ErrorReport>prompt</ErrorReport>
+ <WarningLevel>4</WarningLevel>
+ </PropertyGroup>
+ <ItemGroup>
+ <Reference Include="Lucene.Net, Version=2.9.2.2, Culture=neutral, processorArchitecture=MSIL">
+ <SpecificVersion>False</SpecificVersion>
+ <HintPath>..\Lucene.Net.dll</HintPath>
+ </Reference>
+ <Reference Include="System" />
+ <Reference Include="System.Data" />
+ <Reference Include="System.Xml" />
+ </ItemGroup>
+ <ItemGroup>
+ <Compile Include="AR\ArabicAnalyzer.cs" />
+ <Compile Include="AR\ArabicLetterTokenizer.cs" />
+ <Compile Include="AR\ArabicNormalizationFilter.cs" />
+ <Compile Include="AR\ArabicNormalizer.cs" />
+ <Compile Include="AR\ArabicStemFilter.cs" />
+ <Compile Include="AR\ArabicStemmer.cs" />
+ <Compile Include="BR\BrazilianAnalyzer.cs" />
+ <Compile Include="BR\BrazilianStemFilter.cs" />
+ <Compile Include="BR\BrazilianStemmer.cs" />
+ <Compile Include="Properties\AssemblyInfo.cs" />
+ </ItemGroup>
+ <ItemGroup>
+ <EmbeddedResource Include="AR\ArabicStopWords.txt" />
+ </ItemGroup>
+ <Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
+ <!-- To modify your build process, add your task inside one of the targets below and uncomment it.
+ Other similar extension points exist, see Microsoft.Common.targets.
+ <Target Name="BeforeBuild">
+ </Target>
+ <Target Name="AfterBuild">
+ </Target>
+ -->
+</Project>
\ No newline at end of file
Added: incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Lucene.Net.Analyzers.csproj.user
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Analyzers/Lucene.Net.Analyzers/Lucene.Net.Analyzers.csproj.user?rev=1069573&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Lucene.Net.Analyzers.csproj.user (added)
+++ incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Lucene.Net.Analyzers.csproj.user Thu Feb 10 21:17:43 2011
@@ -0,0 +1 @@
+<Project xmlns="http://schemas.microsoft.com/developer/msbuild/2003" />
\ No newline at end of file
Added: incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Properties/AssemblyInfo.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Analyzers/Lucene.Net.Analyzers/Properties/AssemblyInfo.cs?rev=1069573&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Properties/AssemblyInfo.cs (added)
+++ incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Properties/AssemblyInfo.cs Thu Feb 10 21:17:43 2011
@@ -0,0 +1,36 @@
+using System.Reflection;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+// General Information about an assembly is controlled through the following
+// set of attributes. Change these attribute values to modify the information
+// associated with an assembly.
+[assembly: AssemblyTitle("Lucene.Net.Analyzers")]
+[assembly: AssemblyDescription("")]
+[assembly: AssemblyConfiguration("")]
+[assembly: AssemblyCompany("The Apache Software Foundation")]
+[assembly: AssemblyProduct("Lucene.Net.Analyzers")]
+[assembly: AssemblyCopyright("Copyright 2006 - 2011 The Apache Software Foundation")]
+[assembly: AssemblyTrademark("Copyright 2006 - 2011 The Apache Software Foundation")]
+[assembly: AssemblyCulture("")]
+
+// Setting ComVisible to false makes the types in this assembly not visible
+// to COM components. If you need to access a type in this assembly from
+// COM, set the ComVisible attribute to true on that type.
+[assembly: ComVisible(false)]
+
+// The following GUID is for the ID of the typelib if this project is exposed to COM
+[assembly: Guid("36a962fb-a8be-4238-88c4-32568216e247")]
+
+// Version information for an assembly consists of the following four values:
+//
+// Major Version
+// Minor Version
+// Build Number
+// Revision
+//
+// You can specify all the values or you can default the Build and Revision Numbers
+// by using the '*' as shown below:
+// [assembly: AssemblyVersion("1.0.*")]
+[assembly: AssemblyVersion("2.9.2.1")]
+[assembly: AssemblyFileVersion("2.9.2.1")]
Added: incubator/lucene.net/trunk/C#/contrib/Analyzers/Test/AR/TestArabicAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Analyzers/Test/AR/TestArabicAnalyzer.cs?rev=1069573&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Analyzers/Test/AR/TestArabicAnalyzer.cs (added)
+++ incubator/lucene.net/trunk/C#/contrib/Analyzers/Test/AR/TestArabicAnalyzer.cs Thu Feb 10 21:17:43 2011
@@ -0,0 +1,101 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.IO;
+using System.Collections;
+
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
+using NUnit.Framework;
+
+namespace Lucene.Net.Analysis.AR
+{
+
+
+    /**
+     * Tests for ArabicAnalyzer: default construction, conflation of regular
+     * Arabic word forms, stopword removal, token stream reuse and non-Arabic
+     * input.
+     *
+     * The Arabic literals below must be real Unicode text. If this file is
+     * read or written with the wrong encoding they degrade into mojibake
+     * (and can even pick up stray line breaks inside the literals), so keep
+     * the file in a Unicode encoding such as UTF-8.
+     */
+    [TestFixture]
+    public class TestArabicAnalyzer : BaseTokenStreamTestCase
+    {
+
+        /**
+         * Smoke test: building an analyzer with its defaults must succeed.
+         * The original note says this fails with an NPE when the stopwords
+         * file is missing from the classpath.
+         * NOTE(review): that wording looks inherited from the Java version;
+         * confirm how this port loads its default stopwords.
+         */
+        [Test]
+        public void TestResourcesAvailable()
+        {
+            new ArabicAnalyzer();
+        }
+
+        /**
+         * Some simple tests showing some features of the analyzer and how
+         * some regular forms conflate to the same term.
+         */
+        [Test]
+        public void TestBasicFeatures()
+        {
+            ArabicAnalyzer a = new ArabicAnalyzer();
+            AssertAnalyzesTo(a, "كبير", new string[] { "كبير" });
+            AssertAnalyzesTo(a, "كبيرة", new string[] { "كبير" }); // feminine marker
+
+            AssertAnalyzesTo(a, "مشروب", new string[] { "مشروب" });
+            AssertAnalyzesTo(a, "مشروبات", new string[] { "مشروب" }); // plural -at
+
+            AssertAnalyzesTo(a, "أمريكيين", new string[] { "امريك" }); // plural -in
+            AssertAnalyzesTo(a, "امريكي", new string[] { "امريك" }); // singular with bare alif
+
+            AssertAnalyzesTo(a, "كتاب", new string[] { "كتاب" });
+            AssertAnalyzesTo(a, "الكتاب", new string[] { "كتاب" }); // definite article
+
+            // In both phrases the first word is expected to be absent from the output.
+            AssertAnalyzesTo(a, "ما ملكت أيمانكم", new string[] { "ملكت", "ايمانكم" });
+            AssertAnalyzesTo(a, "الذين ملكت أيمانكم", new string[] { "ملكت", "ايمانكم" }); // stopwords
+        }
+
+        /**
+         * Simple tests to show things are getting reset correctly when the
+         * same analyzer instance is used for more than one input.
+         */
+        [Test]
+        public void TestReusableTokenStream()
+        {
+            ArabicAnalyzer a = new ArabicAnalyzer();
+            AssertAnalyzesToReuse(a, "كبير", new string[] { "كبير" });
+            AssertAnalyzesToReuse(a, "كبيرة", new string[] { "كبير" }); // feminine marker
+        }
+
+        /**
+         * Non-Arabic text gets treated in a similar way as SimpleAnalyzer:
+         * the expected terms are lower-cased and the trailing period is gone.
+         */
+        [Test]
+        public void TestEnglishInput()
+        {
+            AssertAnalyzesTo(new ArabicAnalyzer(), "English text.", new string[] {
+                "english", "text" });
+        }
+
+        /**
+         * Test that custom stopwords work, and are not case-sensitive
+         * ("The" in the input is removed by the stopword "the").
+         */
+        [Test]
+        public void TestCustomStopwords()
+        {
+            ArabicAnalyzer a = new ArabicAnalyzer(new string[] { "the", "and", "a" });
+            AssertAnalyzesTo(a, "The quick brown fox.", new string[] { "quick", "brown", "fox" });
+        }
+    }
+}
\ No newline at end of file
Added: incubator/lucene.net/trunk/C#/contrib/Analyzers/Test/AR/TestArabicNormalizationFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Analyzers/Test/AR/TestArabicNormalizationFilter.cs?rev=1069573&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Analyzers/Test/AR/TestArabicNormalizationFilter.cs (added)
+++ incubator/lucene.net/trunk/C#/contrib/Analyzers/Test/AR/TestArabicNormalizationFilter.cs Thu Feb 10 21:17:43 2011
@@ -0,0 +1,131 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.IO;
+using System.Collections;
+
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
+using NUnit.Framework;
+
+namespace Lucene.Net.Analysis.AR
+{
+
+
+    /**
+     * Test the Arabic Normalization Filter.
+     *
+     * Each test feeds one word through ArabicLetterTokenizer and
+     * ArabicNormalizationFilter and compares the single resulting term.
+     *
+     * Several inputs differ from the expected output only by a combining
+     * mark that is hard to see in an editor; the comment on each test names
+     * the character involved. The literals must be real Unicode text, so
+     * keep this file in a Unicode encoding such as UTF-8.
+     */
+    [TestFixture]
+    public class TestArabicNormalizationFilter : BaseTokenStreamTestCase
+    {
+
+        // Alef with madda above (U+0622) becomes bare alef (U+0627).
+        [Test]
+        public void TestAlifMadda()
+        {
+            Check("آجن", "اجن");
+        }
+
+        // Alef with hamza above (U+0623) becomes bare alef (U+0627).
+        [Test]
+        public void TestAlifHamzaAbove()
+        {
+            Check("أحمد", "احمد");
+        }
+
+        // Alef with hamza below (U+0625) becomes bare alef (U+0627).
+        [Test]
+        public void TestAlifHamzaBelow()
+        {
+            Check("إعاذ", "اعاذ");
+        }
+
+        // Final alef maksura (U+0649) becomes yeh (U+064A).
+        [Test]
+        public void TestAlifMaksura()
+        {
+            Check("بنى", "بني");
+        }
+
+        // Final teh marbuta (U+0629) becomes heh (U+0647).
+        [Test]
+        public void TestTehMarbuta()
+        {
+            Check("فاطمة", "فاطمه");
+        }
+
+        // The five tatweel characters (U+0640) in the input are removed.
+        [Test]
+        public void TestTatweel()
+        {
+            Check("روبرـــــت", "روبرت");
+        }
+
+        // Fatha (U+064E) after the first letter is removed.
+        [Test]
+        public void TestFatha()
+        {
+            Check("مَبنا", "مبنا");
+        }
+
+        // Kasra (U+0650) after the second letter is removed.
+        [Test]
+        public void TestKasra()
+        {
+            Check("علِي", "علي");
+        }
+
+        // Damma (U+064F) after the first letter is removed.
+        [Test]
+        public void TestDamma()
+        {
+            Check("بُوات", "بوات");
+        }
+
+        // Trailing fathatan (U+064B) is removed.
+        [Test]
+        public void TestFathatan()
+        {
+            Check("ولداً", "ولدا");
+        }
+
+        // Trailing kasratan (U+064D) is removed.
+        [Test]
+        public void TestKasratan()
+        {
+            Check("ولدٍ", "ولد");
+        }
+
+        // Trailing dammatan (U+064C) is removed.
+        [Test]
+        public void TestDammatan()
+        {
+            Check("ولدٌ", "ولد");
+        }
+
+        // Sukun (U+0652) after the second letter is removed.
+        [Test]
+        public void TestSukun()
+        {
+            Check("نلْسون", "نلسون");
+        }
+
+        // Trailing shadda (U+0651) is removed.
+        [Test]
+        public void TestShaddah()
+        {
+            Check("هتميّ", "هتمي");
+        }
+
+        /**
+         * Tokenizes input with ArabicLetterTokenizer, passes the tokens
+         * through ArabicNormalizationFilter and checks the filter output
+         * against the single term given in expected.
+         */
+        private void Check(string input, string expected)
+        {
+            ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(new StringReader(input));
+            ArabicNormalizationFilter filter = new ArabicNormalizationFilter(tokenStream);
+            AssertTokenStreamContents(filter, new string[] { expected });
+        }
+
+    }
+}
\ No newline at end of file
Added: incubator/lucene.net/trunk/C#/contrib/Analyzers/Test/AR/TestArabicStemFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Analyzers/Test/AR/TestArabicStemFilter.cs?rev=1069573&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Analyzers/Test/AR/TestArabicStemFilter.cs (added)
+++ incubator/lucene.net/trunk/C#/contrib/Analyzers/Test/AR/TestArabicStemFilter.cs Thu Feb 10 21:17:43 2011
@@ -0,0 +1,174 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.IO;
+using System.Collections;
+
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
+
+using NUnit.Framework;
+
+
+namespace Lucene.Net.Analysis.AR
+{
+
+
+    /**
+     * Test the Arabic Stem Filter.
+     *
+     * Each test feeds one word through ArabicLetterTokenizer and
+     * ArabicStemFilter and compares the single resulting term. The test
+     * names give the prefix or suffix in transliterated form.
+     *
+     * The literals must be real Unicode text, so keep this file in a
+     * Unicode encoding such as UTF-8.
+     */
+    [TestFixture]
+    public class TestArabicStemFilter : BaseTokenStreamTestCase
+    {
+        // Prefix al- is removed.
+        [Test]
+        public void TestAlPrefix()
+        {
+            Check("الحسن", "حسن");
+        }
+
+        // Prefix wal- is removed.
+        [Test]
+        public void TestWalPrefix()
+        {
+            Check("والحسن", "حسن");
+        }
+
+        // Prefix bal- is removed.
+        [Test]
+        public void TestBalPrefix()
+        {
+            Check("بالحسن", "حسن");
+        }
+
+        // Prefix kal- is removed.
+        [Test]
+        public void TestKalPrefix()
+        {
+            Check("كالحسن", "حسن");
+        }
+
+        // Prefix fal- is removed.
+        [Test]
+        public void TestFalPrefix()
+        {
+            Check("فالحسن", "حسن");
+        }
+
+        // Prefix ll- is removed.
+        [Test]
+        public void TestLlPrefix()
+        {
+            Check("للاخر", "اخر");
+        }
+
+        // Prefix wa- is removed.
+        [Test]
+        public void TestWaPrefix()
+        {
+            Check("وحسن", "حسن");
+        }
+
+        // Suffix -ha is removed.
+        [Test]
+        public void TestAhSuffix()
+        {
+            Check("زوجها", "زوج");
+        }
+
+        // Suffix -an is removed.
+        [Test]
+        public void TestAnSuffix()
+        {
+            Check("ساهدان", "ساهد");
+        }
+
+        // Suffix -at is removed.
+        [Test]
+        public void TestAtSuffix()
+        {
+            Check("ساهدات", "ساهد");
+        }
+
+        // Suffix -wn is removed.
+        [Test]
+        public void TestWnSuffix()
+        {
+            Check("ساهدون", "ساهد");
+        }
+
+        // Suffix -yn is removed.
+        [Test]
+        public void TestYnSuffix()
+        {
+            Check("ساهدين", "ساهد");
+        }
+
+        // Suffix -yh (yeh + heh) is removed.
+        [Test]
+        public void TestYhSuffix()
+        {
+            Check("ساهديه", "ساهد");
+        }
+
+        // Suffix -yp (yeh + teh marbuta) is removed.
+        [Test]
+        public void TestYpSuffix()
+        {
+            Check("ساهدية", "ساهد");
+        }
+
+        // Suffix -h (heh) is removed.
+        [Test]
+        public void TestHSuffix()
+        {
+            Check("ساهده", "ساهد");
+        }
+
+        // Suffix -p (teh marbuta) is removed.
+        [Test]
+        public void TestPSuffix()
+        {
+            Check("ساهدة", "ساهد");
+        }
+
+        // Suffix -y (yeh) is removed.
+        [Test]
+        public void TestYSuffix()
+        {
+            Check("ساهدي", "ساهد");
+        }
+
+        // A prefix (wa-) and a suffix (-wn) are both removed.
+        [Test]
+        public void TestComboPrefSuf()
+        {
+            Check("وساهدون", "ساهد");
+        }
+
+        // Two stacked suffixes (-h followed by -at) are both removed.
+        [Test]
+        public void TestComboSuf()
+        {
+            Check("ساهدهات", "ساهد");
+        }
+
+        // The word starts with al- but is expected to come back unchanged.
+        // NOTE(review): presumably too short for the prefix to be stripped;
+        // confirm against the stemmer's length rules.
+        [Test]
+        public void TestShouldntStem()
+        {
+            Check("الو", "الو");
+        }
+
+        // Non-Arabic input is expected to pass through unchanged, case included.
+        [Test]
+        public void TestNonArabic()
+        {
+            Check("English", "English");
+        }
+
+        /**
+         * Tokenizes input with ArabicLetterTokenizer, passes the tokens
+         * through ArabicStemFilter and checks the filter output against the
+         * single term given in expected.
+         */
+        private void Check(string input, string expected)
+        {
+            ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(new StringReader(input));
+            ArabicStemFilter filter = new ArabicStemFilter(tokenStream);
+            AssertTokenStreamContents(filter, new string[] { expected });
+        }
+
+    }
+}
\ No newline at end of file
Added: incubator/lucene.net/trunk/C#/contrib/Analyzers/Test/Properties/AssemblyInfo.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Analyzers/Test/Properties/AssemblyInfo.cs?rev=1069573&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Analyzers/Test/Properties/AssemblyInfo.cs (added)
+++ incubator/lucene.net/trunk/C#/contrib/Analyzers/Test/Properties/AssemblyInfo.cs Thu Feb 10 21:17:43 2011
@@ -0,0 +1,36 @@
+using System.Reflection;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+// General information about this assembly is controlled through the following
+// set of attributes. Change these attribute values to modify the information
+// associated with the assembly.
+[assembly: AssemblyTitle("Lucene.Net.Analyzers.Test")]
+[assembly: AssemblyDescription("")]
+[assembly: AssemblyConfiguration("")]
+[assembly: AssemblyCompany("The Apache Software Foundation")]
+[assembly: AssemblyProduct("Lucene.Net.Analyzers.Test")]
+[assembly: AssemblyCopyright("Copyright 2006 - 2011 The Apache Software Foundation")]
+[assembly: AssemblyTrademark("Copyright 2006 - 2011 The Apache Software Foundation")]
+[assembly: AssemblyCulture("")]
+
+// Setting ComVisible to false makes the types in this assembly not visible
+// to COM components. If you need to access a type in this assembly from
+// COM, set the ComVisible attribute to true on that type.
+[assembly: ComVisible(false)]
+
+// The following GUID is the ID of the typelib if this project is exposed to COM.
+// It has to be unique to this assembly: the value that used to be here was a
+// copy of the GUID of the Lucene.Net.Analyzers assembly, so it was replaced
+// with a newly chosen one.
+[assembly: Guid("5d2b9c3e-8a41-4f7b-b6e0-1c9f4a7d3e52")]
+
+// Version information for an assembly consists of the following four values:
+//
+// Major Version
+// Minor Version
+// Build Number
+// Revision
+//
+// You can specify all the values or you can default the Build and Revision Numbers
+// by using the '*' as shown below:
+// [assembly: AssemblyVersion("1.0.*")]
+[assembly: AssemblyVersion("2.9.2.1")]
+[assembly: AssemblyFileVersion("2.9.2.1")]
Added: incubator/lucene.net/trunk/C#/contrib/Analyzers/Test/Test.csproj
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Analyzers/Test/Test.csproj?rev=1069573&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Analyzers/Test/Test.csproj (added)
+++ incubator/lucene.net/trunk/C#/contrib/Analyzers/Test/Test.csproj Thu Feb 10 21:17:43 2011
@@ -0,0 +1,67 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="3.5" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> <!-- MSBuild project that builds the NUnit tests for the Arabic analyzer classes -->
+ <PropertyGroup> <!-- Settings shared by every configuration; Configuration and Platform default to Debug and AnyCPU when not supplied -->
+ <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
+ <Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
+ <ProductVersion>9.0.21022</ProductVersion>
+ <SchemaVersion>2.0</SchemaVersion>
+ <ProjectGuid>{67D27628-F1D5-4499-9818-B669731925C8}</ProjectGuid>
+ <OutputType>Library</OutputType> <!-- Built as a class library -->
+ <AppDesignerFolder>Properties</AppDesignerFolder>
+ <RootNamespace>Lucene.Net.Analyzers</RootNamespace> <!-- NOTE(review): the test sources declare namespace Lucene.Net.Analysis.AR explicitly, so this value does not match them; confirm whether that is intended -->
+ <AssemblyName>Lucene.Net.Analyzers.Test</AssemblyName>
+ <TargetFrameworkVersion>v2.0</TargetFrameworkVersion>
+ <FileAlignment>512</FileAlignment>
+ </PropertyGroup>
+ <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' "> <!-- Debug|AnyCPU: full debug symbols, no optimization, DEBUG and TRACE defined, output to bin\Debug\ -->
+ <DebugSymbols>true</DebugSymbols>
+ <DebugType>full</DebugType>
+ <Optimize>false</Optimize>
+ <OutputPath>bin\Debug\</OutputPath>
+ <DefineConstants>DEBUG;TRACE</DefineConstants>
+ <ErrorReport>prompt</ErrorReport>
+ <WarningLevel>4</WarningLevel>
+ </PropertyGroup>
+ <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' "> <!-- Release|AnyCPU: pdb-only symbols, optimized, only TRACE defined, output to bin\Release\ -->
+ <DebugType>pdbonly</DebugType>
+ <Optimize>true</Optimize>
+ <OutputPath>bin\Release\</OutputPath>
+ <DefineConstants>TRACE</DefineConstants>
+ <ErrorReport>prompt</ErrorReport>
+ <WarningLevel>4</WarningLevel>
+ </PropertyGroup>
+ <ItemGroup> <!-- Assembly references; the two Lucene.Net assemblies are looked up as prebuilt DLLs in the parent directory via HintPath -->
+ <Reference Include="Lucene.Net, Version=2.9.2.2, Culture=neutral, processorArchitecture=MSIL"> <!-- NOTE(review): names Version=2.9.2.2 while Lucene.Net.Test below names 2.9.2.1; SpecificVersion is False, so the version is presumably not enforced; confirm -->
+ <SpecificVersion>False</SpecificVersion>
+ <HintPath>..\Lucene.Net.dll</HintPath>
+ </Reference>
+ <Reference Include="Lucene.Net.Test, Version=2.9.2.1, Culture=neutral, processorArchitecture=MSIL"> <!-- presumably supplies BaseTokenStreamTestCase, the base class of the fixtures; verify -->
+ <SpecificVersion>False</SpecificVersion>
+ <HintPath>..\Lucene.Net.Test.dll</HintPath>
+ </Reference>
+ <Reference Include="nunit.framework, Version=2.5.2.9222, Culture=neutral, PublicKeyToken=96d09a1eb7f44a77, processorArchitecture=MSIL" /> <!-- No HintPath is given for NUnit -->
+ <Reference Include="System" />
+ <Reference Include="System.Data" />
+ <Reference Include="System.Xml" />
+ </ItemGroup>
+ <ItemGroup> <!-- Source files compiled into the test assembly -->
+ <Compile Include="AR\TestArabicAnalyzer.cs" />
+ <Compile Include="AR\TestArabicNormalizationFilter.cs" />
+ <Compile Include="AR\TestArabicStemFilter.cs" />
+ <Compile Include="Properties\AssemblyInfo.cs" />
+ </ItemGroup>
+ <ItemGroup> <!-- Project-to-project reference to the analyzers library under test -->
+ <ProjectReference Include="..\Lucene.Net.Analyzers\Lucene.Net.Analyzers.csproj">
+ <Project>{4286E961-9143-4821-B46D-3D39D3736386}</Project>
+ <Name>Lucene.Net.Analyzers</Name>
+ </ProjectReference>
+ </ItemGroup>
+ <Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
+ <!-- To modify your build process, add your task inside one of the targets below and uncomment it.
+ Other similar extension points exist, see Microsoft.Common.targets.
+ <Target Name="BeforeBuild">
+ </Target>
+ <Target Name="AfterBuild">
+ </Target>
+ -->
+</Project>
\ No newline at end of file