You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by rm...@apache.org on 2009/08/05 20:22:22 UTC
svn commit: r801348 - in /lucene/java/trunk/contrib: ./
analyzers/common/src/java/org/apache/lucene/analysis/ar/
analyzers/common/src/resources/org/apache/lucene/analysis/ar/
analyzers/common/src/test/org/apache/lucene/analysis/ar/
Author: rmuir
Date: Wed Aug 5 18:22:22 2009
New Revision: 801348
URL: http://svn.apache.org/viewvc?rev=801348&view=rev
Log:
LUCENE-1758: Update ArabicAnalyzer to light10 stemming, stopwords improvements, lowercase non-arabic text
Modified:
lucene/java/trunk/contrib/CHANGES.txt
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicStemmer.java
lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java
Modified: lucene/java/trunk/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/CHANGES.txt?rev=801348&r1=801347&r2=801348&view=diff
==============================================================================
--- lucene/java/trunk/contrib/CHANGES.txt (original)
+++ lucene/java/trunk/contrib/CHANGES.txt Wed Aug 5 18:22:22 2009
@@ -8,6 +8,12 @@
number conversion. You'll need to fully re-index any previously created indexes.
This isn't a break in back-compatibility because local Lucene has not yet
been released. (Mike McCandless)
+
+ 2. LUCENE-1758: ArabicAnalyzer now uses the light10 algorithm, has a refined
+ default stopword list, and lowercases non-Arabic text.
+ You'll need to fully re-index any previously created indexes. This isn't a
+ break in back-compatibility because ArabicAnalyzer has not yet been
+ released. (Robert Muir)
API Changes
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java?rev=801348&r1=801347&r2=801348&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java Wed Aug 5 18:22:22 2009
@@ -27,6 +27,7 @@
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WordlistLoader;
@@ -36,10 +37,9 @@
* <p>
* This analyzer implements light-stemming as specified by:
* <i>
- * Improving Stemming for Arabic Information Retrieval:
- * Light Stemming and Co-occurrence Analysis
+ * Light Stemming for Arabic Information Retrieval
* </i>
- * http://ciir.cs.umass.edu/pubfiles/ir-249.pdf
+ * http://www.mtholyoke.edu/~lballest/Pubs/arab_stem05.pdf
* <p>
* The analysis package contains three primary components:
* <ul>
@@ -109,12 +109,13 @@
/**
* Creates a TokenStream which tokenizes all the text in the provided Reader.
*
- * @return A TokenStream build from a StandardTokenizer filtered with
- * StandardFilter, StopFilter, ArabicNormalizationFilter and ArabicStemFilter.
+ * @return A TokenStream built from an ArabicLetterTokenizer filtered with
+ * StopFilter, LowerCaseFilter, ArabicNormalizationFilter and ArabicStemFilter.
*/
public final TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream result = new ArabicLetterTokenizer( reader );
result = new StopFilter( result, stoptable );
+ result = new LowerCaseFilter(result);
result = new ArabicNormalizationFilter( result );
result = new ArabicStemFilter( result );
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicStemmer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicStemmer.java?rev=801348&r1=801347&r2=801348&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicStemmer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicStemmer.java Wed Aug 5 18:22:22 2009
@@ -47,6 +47,7 @@
("" + BEH + ALEF + LAM).toCharArray(),
("" + KAF + ALEF + LAM).toCharArray(),
("" + FEH + ALEF + LAM).toCharArray(),
+ ("" + LAM + LAM).toCharArray(),
("" + WAW).toCharArray(),
};
Modified: lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt?rev=801348&r1=801347&r2=801348&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt Wed Aug 5 18:22:22 2009
@@ -5,75 +5,39 @@
ا
Ø£
Ø
-عشر
عبد
-عدد
-عدة
-عشرة
عدÙ
عاÙ
عاÙ
ا
-عرÙات
عÙ
عÙد
-عÙ
اÙ
عÙدÙ
ا
عÙÙ
-عÙÙ
عÙÙÙ
عÙÙÙا
-عÙ
ÙÙØ©
-زÙارة
-سبتÙ
بر
-ساراÙÙÙÙ
-سÙØ©
-سÙرÙا
-سÙÙات
-تشرÙÙ
تÙ
-تÙ
Ùز
ضد
بعد
بعض
اعادة
اعÙÙ
اعÙÙت
-Øزب
-ØزÙراÙ
بسبب
-اسرائÙÙ
-ØسÙÙ
ØتÙ
اتÙاÙ
-صرب
اذا
اØد
اثر
-غزة
-برس
-باسÙ
اجتÙ
اع
-غدا
-شخصا
-صباØ
اطار
اربعة
-بغداد
اخرÙ
-بارÙس
-رابÙÙ
-شرÙ
باÙ
ابÙ
اجÙ
غÙر
-ØرÙØ©
-رئÙس
-جدÙدة
اطÙاÙ
بشÙÙ
-بطÙÙØ©
-صØÙÙØ©
ØاÙÙا
بÙ
بÙ
@@ -84,166 +48,40 @@
اÙ
بÙا
جÙØ©
-صÙر
ØÙØ«
اÙد
اÙا
اÙ
ا
-اÙعسÙرÙØ©
-اÙعراÙ
-اÙعاصÙ
Ø©
-اÙعربÙØ©
-اÙعراÙÙ
-اÙعراÙÙØ©
اÙعاÙ
-اÙعاÙÙ
-اÙعÙاÙات
-اÙعÙ
Ù
-اÙ
س
-اÙسعÙدÙØ©
-اÙساعة
-اÙسبت
اÙسابÙ
-رÙسÙا
-اÙسÙطة
-اÙسÙطات
اÙسÙاÙ
اÙتعاÙÙ
-اÙتØرÙر
اÙتÙ
اÙتÙ
-اÙتÙبر
-دÙرة
اÙثر
-اÙار
اÙضا
-اÙجزائر
-ØÙ
اس
-اÙاسرائÙÙÙ
-اÙاسرائÙÙÙØ©
-اÙاسبÙع
-اÙاسÙØØ©
-اÙاسÙاÙ
ÙØ©
-Ø°Ùرت
-اÙاتØاد
-اÙاتÙاÙ
-Ø«Ùاثة
-اÙØرب
-اÙاØد
اÙذاتÙ
-اÙشرطة
-اÙاربعاء
-اÙغربÙØ©
-اÙخارجÙØ©
-اÙاردÙ
-اÙشرÙ
-اÙراÙ
-اÙØدÙد
-اÙرئÙس
اÙاخÙرة
اÙثاÙÙ
اÙثاÙÙØ©
-اÙاثÙÙÙ
-Ø´Ù
اÙ
-بÙاÙ
-دÙ
Ø´Ù
اÙØ°Ù
اÙØ°Ù
اÙاÙ
-اÙ
اÙ
-اÙاÙ
Ø®ÙاÙ
-اÙØ´ÙØ®
-اÙجÙØ´
-اÙدÙر
-اÙضÙØ©
-اÙجÙ
عة
-بÙرÙز
-اÙاÙسط
-اÙرÙسÙ
-اÙبÙسÙØ©
-اÙرÙسÙØ©
-بÙرÙت
-اÙاÙتخابات
-اÙبÙاد
-اÙدÙاع
-اÙØ«Ùثاء
-اÙاÙباء
-اÙØ«Ùاثاء
-اÙاÙرÙبÙ
ØÙاÙÙ
اÙØ°ÙÙ
-اÙدÙÙ
اÙØÙÙ
-اÙاÙ
Ù
-اÙاÙ
Ù
اÙاÙÙ
-اÙدÙÙØ©
-اÙØ®ÙÙج
-اÙØ®Ù
Ùس
-اÙاÙ
ÙرÙÙ
-اÙاÙ
ÙرÙÙØ©
-اÙدÙÙÙ
اÙاÙÙÙ
-اÙدÙÙÙØ©
-اÙØÙÙÙ
Ø©
بÙÙ
Ø°ÙÙ
-دÙÙ
-دÙÙ
-ØÙÙ
-ØÙÙ
-اÙÙ
اÙÙ
اÙÙ
-اÙÙ
ضÙ
Ù
-جÙÙب
-دÙÙØ©
اÙÙا
جÙ
Ùع
-اÙÙزراء
-اÙÙ
تØدث
-اÙÙ
تØدة
-دÙÙار
-اÙÙار
-اÙÙضع
-اÙÙدس
-اÙÙ
ØتÙØ©
-اÙÙ
صدر
-اÙÙ
باراة
-اÙÙ
صرÙ
اÙÙ
اضÙ
-اÙÙ
صرÙØ©
-اÙÙ
رØÙØ©
-اÙÙدÙ
-اÙÙجÙØ©
-اÙÙ
جÙس
-اÙÙرÙسÙ
-اÙÙرÙسÙØ©
-اÙÙاÙرة
-اÙÙ
دÙÙØ©
-اÙÙ
اÙÙا
-اÙÙØ·ÙÙØ©
-اÙÙ
جÙ
Ùعة
-اÙÙÙ
-اÙÙÙسطÙÙÙ
-اÙÙÙسطÙÙÙØ©
-اÙÙÙسطÙÙÙÙÙ
-اÙÙÙت
-اÙÙ
Ùرر
-اÙÙÙات
-اÙÙÙائÙ
اÙÙ
ÙبÙ
-اÙÙ
ÙØ·ÙØ©
-اÙÙÙاÙات
-اÙÙ
ÙاÙضات
-اÙÙ
ÙÙ
-اÙÙÙ
Ù
-اÙÙÙÙ
-اÙÙÙÙ
-اÙÙÙÙت
-Ù
Ù
Ù
Ù6
@@ -251,48 +89,19 @@
Ùا
Ù
ا
Ù
ع
-Ùزارة
-ÙزÙر
-Ù
ساء
-ÙتÙ
-Ùرة
-Ù
صر
Ùذا
-Ùاز
-Ùأس
-Ùاسر
-Ùرار
-Ù
صدر
-ÙاØد
-Ùطاع
-Ù
صادر
-Ù
باراة
-Ù
بارÙ
ÙاضاÙ
ÙاضاÙت
-ÙراÙس
-ÙاشÙØ·Ù
ÙاÙ
ÙبÙ
-ÙاÙ
ÙاÙ
ÙدÙ
ÙØÙ
ÙØ°Ù
ÙاÙ
-Ù
ØÙ
د
-ÙاÙد
ÙØ°Ùر
-Ù
جÙس
-ÙرÙسا
-ÙرÙستÙÙر
ÙاÙت
ÙاÙضØ
-ÙبÙاÙ
-Ù
اÙÙ
-Ù
دÙÙØ©
-Ù
جÙ
Ùعة
-ÙاÙÙÙ
ÙÙ
ÙÙ
ÙÙ
@@ -302,49 +111,51 @@
Ù
Ù
ÙÙ
ÙÙ
-ÙÙØ©
ÙÙ
ا
ÙÙا
Ù
ÙØ°
-ÙÙد
ÙÙا
-ÙÙسÙ
-Ù
ÙسÙÙ
-Ù
ÙتÙ
-ÙÙاء
-ÙÙرة
-ÙÙطة
-ÙÙات
Ù
ÙابÙ
-ÙÙدÙ
ÙÙاÙ
-ÙÙاÙ
ÙÙاÙ
-Ù
ÙØ·ÙØ©
-Ù
ÙظÙ
Ø©
-ÙÙاÙØ©
-ÙÙاÙØ©
-ÙÙاÙت
ÙÙاÙت
-ÙÙاÙ
Ù
ÙÙÙ
-ÙÙÙ
ÙÙÙ
ÙÙÙ
-ÙÙÙ
ÙÙÙ
ÙÙ
Ù
ÙÙÙ
ÙÙÙ
-ÙÙÙ
ÙÙÙا
Ù
ÙÙا
-Ù
ÙÙار
-ÙÙÙاÙØ©
ÙÙÙÙ
-ÙÙ
ÙÙ
-ÙÙÙÙتÙÙ
-Ù
ÙÙÙÙ
-ÙÙÙÙÙ
-ÙÙÙÙÙ
-ÙÙÙÙÙرÙ
+أخرÙ
+إذا
+أربعة
+إطار
+إعادة
+أعÙÙ
+أعÙÙت
+Ø£Ù
+Ø£Ùثر
+Ø£Ùد
+Ø¥Ùا
+اÙأخÙرة
+اÙØ¢Ù
+اÙØ£ÙÙ
+اÙØ£ÙÙÙ
+Ø¥ÙÙ
+Ø£Ù
ا
+Ø£Ù
+Ø¥Ù
+Ø¥ÙÙ
+Ø£ÙÙ
+Ø£ÙÙا
+Ø¥ÙÙا
+Ø£Ù
+اÙ
+Ø£Ù
+Ø£Ù
+Ø£Ùضا
+بأÙ
+ÙØ¥Ù
Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java?rev=801348&r1=801347&r2=801348&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java Wed Aug 5 18:22:22 2009
@@ -17,6 +17,12 @@
* limitations under the License.
*/
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
import junit.framework.TestCase;
/**
@@ -31,6 +37,48 @@
new ArabicAnalyzer();
}
- /* TODO: more tests */
+ /**
+ * Some simple tests showing some features of the analyzer, how some regular forms will conflate
+ */
+ public void testBasicFeatures() throws Exception {
+ ArabicAnalyzer a = new ArabicAnalyzer();
+ assertAnalyzesTo(a, "كبير", new String[] { "كبير" });
+ assertAnalyzesTo(a, "كبيرة", new String[] { "كبير" }); // feminine marker
+
+ assertAnalyzesTo(a, "مشروب", new String[] { "مشروب" });
+ assertAnalyzesTo(a, "مشروبات", new String[] { "مشروب" }); // plural -at
+
+ assertAnalyzesTo(a, "أمريكيين", new String[] { "امريك" }); // plural -in
+ assertAnalyzesTo(a, "امريكي", new String[] { "امريك" }); // singular with bare alif
+
+ assertAnalyzesTo(a, "كتاب", new String[] { "كتاب" });
+ assertAnalyzesTo(a, "الكتاب", new String[] { "كتاب" }); // definite article
+
+ assertAnalyzesTo(a, "ما ملكت أيمانكم", new String[] { "ملكت", "ايمانكم"});
+ assertAnalyzesTo(a, "الذين ملكت أيمانكم", new String[] { "ملكت", "ايمانكم" }); // stopwords
+ }
+
+ /**
+ * Non-Arabic text gets treated in a similar way as SimpleAnalyzer.
+ */
+ public void testEnglishInput() throws Exception {
+ assertAnalyzesTo(new ArabicAnalyzer(), "English text.", new String[] {
+ "english", "text" });
+ }
+
+ private void assertAnalyzesTo(Analyzer a, String input, String[] output)
+ throws Exception {
+ TokenStream ts = a.tokenStream("dummy", new StringReader(input));
+ TermAttribute termAtt = (TermAttribute) ts
+ .getAttribute(TermAttribute.class);
+
+ for (int i = 0; i < output.length; i++) {
+ assertTrue(ts.incrementToken());
+ assertEquals(output[i], termAtt.term());
+ }
+
+ assertFalse(ts.incrementToken());
+ ts.close();
+ }
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java?rev=801348&r1=801347&r2=801348&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java Wed Aug 5 18:22:22 2009
@@ -50,6 +50,10 @@
check("فالحسن", "حسن");
}
+ public void testLlPrefix() throws IOException {
+ check("للاخر", "اخر");
+ }
+
public void testWaPrefix() throws IOException {
check("وحسن", "حسن");
}