You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2010/12/07 20:51:07 UTC
svn commit: r1043181 - in /lucene/dev/branches/branch_3x: ./ lucene/
lucene/contrib/
lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/
lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/
lucene/contrib/analyzer...
Author: rmuir
Date: Tue Dec 7 19:51:06 2010
New Revision: 1043181
URL: http://svn.apache.org/viewvc?rev=1043181&view=rev
Log:
LUCENE-2747: Deprecate/remove language-specific tokenizers in favor of StandardTokenizer
Added:
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianCharFilter.java
- copied, changed from r1043114, lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianCharFilter.java
lucene/dev/branches/branch_3x/solr/src/java/org/apache/solr/analysis/PersianCharFilterFactory.java
- copied unchanged from r1043114, lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/PersianCharFilterFactory.java
Removed:
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/in/TestIndicTokenizer.java
lucene/dev/branches/branch_3x/solr/src/java/org/apache/solr/analysis/IndicTokenizerFactory.java
Modified:
lucene/dev/branches/branch_3x/ (props changed)
lucene/dev/branches/branch_3x/lucene/ (props changed)
lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicLetterTokenizer.java
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicLetterTokenizer.java
lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/ReusableAnalyzerBase.java
lucene/dev/branches/branch_3x/solr/ (props changed)
lucene/dev/branches/branch_3x/solr/src/java/org/apache/solr/analysis/ArabicLetterTokenizerFactory.java
lucene/dev/branches/branch_3x/solr/src/test/org/apache/solr/analysis/TestArabicFilters.java
lucene/dev/branches/branch_3x/solr/src/test/org/apache/solr/analysis/TestHindiFilters.java
Modified: lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt?rev=1043181&r1=1043180&r2=1043181&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt Tue Dec 7 19:51:06 2010
@@ -124,6 +124,9 @@ API Changes
new SpanMultiTermQueryWrapper<RegexQuery>(new RegexQuery()) instead.
(Robert Muir, Uwe Schindler)
+ * LUCENE-2747: Deprecated ArabicLetterTokenizer. StandardTokenizer now tokenizes
+ most languages correctly including Arabic. (Steven Rowe, Robert Muir)
+
New features
* LUCENE-2306: Add NumericRangeFilter and NumericRangeQuery support to XMLQueryParser.
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java?rev=1043181&r1=1043180&r2=1043181&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java Tue Dec 7 19:51:06 2010
@@ -32,6 +32,7 @@ import org.apache.lucene.analysis.Stopwo
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;
/**
@@ -167,7 +168,7 @@ public final class ArabicAnalyzer extend
* used to tokenize all the text in the provided {@link Reader}.
*
* @return {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
- * built from an {@link ArabicLetterTokenizer} filtered with
+ * built from an {@link StandardTokenizer} filtered with
* {@link LowerCaseFilter}, {@link StopFilter},
* {@link ArabicNormalizationFilter}, {@link KeywordMarkerFilter}
* if a stem exclusion set is provided and {@link ArabicStemFilter}.
@@ -175,7 +176,8 @@ public final class ArabicAnalyzer extend
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
- final Tokenizer source = new ArabicLetterTokenizer(matchVersion, reader);
+ final Tokenizer source = matchVersion.onOrAfter(Version.LUCENE_31) ?
+ new StandardTokenizer(matchVersion, reader) : new ArabicLetterTokenizer(matchVersion, reader);
TokenStream result = new LowerCaseFilter(matchVersion, source);
// the order here is important: the stopword list is not normalized!
result = new StopFilter( matchVersion, result, stopwords);
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicLetterTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicLetterTokenizer.java?rev=1043181&r1=1043180&r2=1043181&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicLetterTokenizer.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicLetterTokenizer.java Tue Dec 7 19:51:06 2010
@@ -20,6 +20,7 @@ import java.io.Reader;
import org.apache.lucene.analysis.CharTokenizer;
import org.apache.lucene.analysis.LetterTokenizer;
+import org.apache.lucene.analysis.standard.StandardTokenizer; // javadoc @link
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.Version;
@@ -38,7 +39,9 @@ import org.apache.lucene.util.Version;
* detect token characters. See {@link #isTokenChar(int)} and
* {@link #normalize(int)} for details.</li>
* </ul>
+ * @deprecated (3.1) Use {@link StandardTokenizer} instead.
*/
+@Deprecated
public class ArabicLetterTokenizer extends LetterTokenizer {
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java?rev=1043181&r1=1043180&r2=1043181&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java Tue Dec 7 19:51:06 2010
@@ -27,11 +27,14 @@ import org.apache.lucene.analysis.Analyz
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.analysis.ar.ArabicLetterTokenizer;
import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+
import org.apache.lucene.util.Version;
/**
@@ -140,14 +143,19 @@ public final class PersianAnalyzer exten
* used to tokenize all the text in the provided {@link Reader}.
*
* @return {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
- * built from a {@link ArabicLetterTokenizer} filtered with
+ * built from a {@link StandardTokenizer} filtered with
* {@link LowerCaseFilter}, {@link ArabicNormalizationFilter},
* {@link PersianNormalizationFilter} and Persian Stop words
*/
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
- final Tokenizer source = new ArabicLetterTokenizer(matchVersion, reader);
+ final Tokenizer source;
+ if (matchVersion.onOrAfter(Version.LUCENE_31)) {
+ source = new StandardTokenizer(matchVersion, reader);
+ } else {
+ source = new ArabicLetterTokenizer(matchVersion, reader);
+ }
TokenStream result = new LowerCaseFilter(matchVersion, source);
result = new ArabicNormalizationFilter(result);
/* additional persian-specific normalization */
@@ -158,4 +166,14 @@ public final class PersianAnalyzer exten
*/
return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords));
}
+
+ /**
+ * Wraps the Reader with {@link PersianCharFilter}
+ */
+ @Override
+ protected Reader initReader(Reader reader) {
+ return matchVersion.onOrAfter(Version.LUCENE_31) ?
+ new PersianCharFilter(CharReader.get(reader)) :
+ reader;
+ }
}
Copied: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianCharFilter.java (from r1043114, lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianCharFilter.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianCharFilter.java?p2=lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianCharFilter.java&p1=lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianCharFilter.java&r1=1043114&r2=1043181&rev=1043181&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianCharFilter.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianCharFilter.java Tue Dec 7 19:51:06 2010
@@ -20,7 +20,7 @@ package org.apache.lucene.analysis.fa;
import java.io.IOException;
import org.apache.lucene.analysis.CharStream;
-import org.apache.lucene.analysis.charfilter.CharFilter;
+import org.apache.lucene.analysis.CharFilter;
/**
* CharFilter that replaces instances of Zero-width non-joiner with an
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicLetterTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicLetterTokenizer.java?rev=1043181&r1=1043180&r2=1043181&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicLetterTokenizer.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicLetterTokenizer.java Tue Dec 7 19:51:06 2010
@@ -25,7 +25,9 @@ import org.apache.lucene.util.Version;
/**
* Testcase for {@link TestArabicLetterTokenizer}
+ * @deprecated (3.1) Remove in Lucene 5.0
*/
+@Deprecated
public class TestArabicLetterTokenizer extends BaseTokenStreamTestCase {
public void testArabicLetterTokenizer() throws IOException {
Modified: lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/ReusableAnalyzerBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/ReusableAnalyzerBase.java?rev=1043181&r1=1043180&r2=1043181&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/ReusableAnalyzerBase.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/ReusableAnalyzerBase.java Tue Dec 7 19:51:06 2010
@@ -68,8 +68,9 @@ public abstract class ReusableAnalyzerBa
final Reader reader) throws IOException {
TokenStreamComponents streamChain = (TokenStreamComponents)
getPreviousTokenStream();
- if (streamChain == null || !streamChain.reset(reader)) {
- streamChain = createComponents(fieldName, reader);
+ final Reader r = initReader(reader);
+ if (streamChain == null || !streamChain.reset(r)) {
+ streamChain = createComponents(fieldName, r);
setPreviousTokenStream(streamChain);
}
return streamChain.getTokenStream();
@@ -88,7 +89,14 @@ public abstract class ReusableAnalyzerBa
@Override
public final TokenStream tokenStream(final String fieldName,
final Reader reader) {
- return createComponents(fieldName, reader).getTokenStream();
+ return createComponents(fieldName, initReader(reader)).getTokenStream();
+ }
+
+ /**
+ * Override this if you want to add a CharFilter chain.
+ */
+ protected Reader initReader(Reader reader) {
+ return reader;
}
/**
Modified: lucene/dev/branches/branch_3x/solr/src/java/org/apache/solr/analysis/ArabicLetterTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/src/java/org/apache/solr/analysis/ArabicLetterTokenizerFactory.java?rev=1043181&r1=1043180&r2=1043181&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/src/java/org/apache/solr/analysis/ArabicLetterTokenizerFactory.java (original)
+++ lucene/dev/branches/branch_3x/solr/src/java/org/apache/solr/analysis/ArabicLetterTokenizerFactory.java Tue Dec 7 19:51:06 2010
@@ -23,7 +23,9 @@ import java.io.Reader;
/**
* Factory for {@link ArabicLetterTokenizer}
+ * @deprecated (3.1) Use StandardTokenizerFactory instead.
**/
+@Deprecated
public class ArabicLetterTokenizerFactory extends BaseTokenizerFactory{
public ArabicLetterTokenizer create(Reader input) {
Modified: lucene/dev/branches/branch_3x/solr/src/test/org/apache/solr/analysis/TestArabicFilters.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/src/test/org/apache/solr/analysis/TestArabicFilters.java?rev=1043181&r1=1043180&r2=1043181&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/src/test/org/apache/solr/analysis/TestArabicFilters.java (original)
+++ lucene/dev/branches/branch_3x/solr/src/test/org/apache/solr/analysis/TestArabicFilters.java Tue Dec 7 19:51:06 2010
@@ -20,6 +20,7 @@ package org.apache.solr.analysis;
import java.io.Reader;
import java.io.StringReader;
+import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
@@ -29,7 +30,9 @@ import org.apache.lucene.analysis.Tokeni
public class TestArabicFilters extends BaseTokenTestCase {
/**
* Test ArabicLetterTokenizerFactory
+ * @deprecated (3.1) Remove in Lucene 5.0
*/
+ @Deprecated
public void testTokenizer() throws Exception {
Reader reader = new StringReader("الذين مَلكت أيمانكم");
ArabicLetterTokenizerFactory factory = new ArabicLetterTokenizerFactory();
@@ -43,7 +46,7 @@ public class TestArabicFilters extends B
*/
public void testNormalizer() throws Exception {
Reader reader = new StringReader("الذين مَلكت أيمانكم");
- ArabicLetterTokenizerFactory factory = new ArabicLetterTokenizerFactory();
+ StandardTokenizerFactory factory = new StandardTokenizerFactory();
ArabicNormalizationFilterFactory filterFactory = new ArabicNormalizationFilterFactory();
factory.init(DEFAULT_VERSION_PARAM);
filterFactory.init(DEFAULT_VERSION_PARAM);
@@ -57,7 +60,7 @@ public class TestArabicFilters extends B
*/
public void testStemmer() throws Exception {
Reader reader = new StringReader("الذين مَلكت أيمانكم");
- ArabicLetterTokenizerFactory factory = new ArabicLetterTokenizerFactory();
+ StandardTokenizerFactory factory = new StandardTokenizerFactory();
ArabicNormalizationFilterFactory normFactory = new ArabicNormalizationFilterFactory();
ArabicStemFilterFactory stemFactory = new ArabicStemFilterFactory();
factory.init(DEFAULT_VERSION_PARAM);
@@ -67,4 +70,16 @@ public class TestArabicFilters extends B
stream = stemFactory.create(stream);
assertTokenStreamContents(stream, new String[] {"ذين", "ملكت", "ايمانكم"});
}
+
+ /**
+ * Test PersianCharFilterFactory
+ */
+ public void testPersianCharFilter() throws Exception {
+ Reader reader = new StringReader("می‌خورد");
+ PersianCharFilterFactory charfilterFactory = new PersianCharFilterFactory();
+ StandardTokenizerFactory tokenizerFactory = new StandardTokenizerFactory();
+ tokenizerFactory.init(DEFAULT_VERSION_PARAM);
+ TokenStream stream = tokenizerFactory.create(charfilterFactory.create(CharReader.get(reader)));
+ assertTokenStreamContents(stream, new String[] { "می", "خورد" });
+ }
}
Modified: lucene/dev/branches/branch_3x/solr/src/test/org/apache/solr/analysis/TestHindiFilters.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/src/test/org/apache/solr/analysis/TestHindiFilters.java?rev=1043181&r1=1043180&r2=1043181&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/src/test/org/apache/solr/analysis/TestHindiFilters.java (original)
+++ lucene/dev/branches/branch_3x/solr/src/test/org/apache/solr/analysis/TestHindiFilters.java Tue Dec 7 19:51:06 2010
@@ -28,23 +28,11 @@ import org.apache.lucene.analysis.Tokeni
*/
public class TestHindiFilters extends BaseTokenTestCase {
/**
- * Test IndicTokenizerFactory
- */
- public void testTokenizer() throws Exception {
- Reader reader = new StringReader("मुझे हिंदी का और अभ्यास करना होगा ।");
- IndicTokenizerFactory factory = new IndicTokenizerFactory();
- factory.init(DEFAULT_VERSION_PARAM);
- Tokenizer stream = factory.create(reader);
- assertTokenStreamContents(stream,
- new String[] { "मुझे", "हिंदी", "का", "और", "अभ्यास", "करना", "होगा" });
- }
-
- /**
* Test IndicNormalizationFilterFactory
*/
public void testIndicNormalizer() throws Exception {
Reader reader = new StringReader("ত্‍ अाैर");
- IndicTokenizerFactory factory = new IndicTokenizerFactory();
+ StandardTokenizerFactory factory = new StandardTokenizerFactory();
IndicNormalizationFilterFactory filterFactory = new IndicNormalizationFilterFactory();
factory.init(DEFAULT_VERSION_PARAM);
filterFactory.init(DEFAULT_VERSION_PARAM);
@@ -58,7 +46,7 @@ public class TestHindiFilters extends Ba
*/
public void testHindiNormalizer() throws Exception {
Reader reader = new StringReader("क़िताब");
- IndicTokenizerFactory factory = new IndicTokenizerFactory();
+ StandardTokenizerFactory factory = new StandardTokenizerFactory();
IndicNormalizationFilterFactory indicFilterFactory = new IndicNormalizationFilterFactory();
HindiNormalizationFilterFactory hindiFilterFactory = new HindiNormalizationFilterFactory();
factory.init(DEFAULT_VERSION_PARAM);
@@ -74,7 +62,7 @@ public class TestHindiFilters extends Ba
*/
public void testStemmer() throws Exception {
Reader reader = new StringReader("किताबें");
- IndicTokenizerFactory factory = new IndicTokenizerFactory();
+ StandardTokenizerFactory factory = new StandardTokenizerFactory();
IndicNormalizationFilterFactory indicFilterFactory = new IndicNormalizationFilterFactory();
HindiNormalizationFilterFactory hindiFilterFactory = new HindiNormalizationFilterFactory();
HindiStemFilterFactory stemFactory = new HindiStemFilterFactory();