You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by us...@apache.org on 2010/01/27 12:19:07 UTC
svn commit: r903608 [1/2] - in /lucene/java/trunk: ./
contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/
contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/
contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/ con...
Author: uschindler
Date: Wed Jan 27 11:19:05 2010
New Revision: 903608
URL: http://svn.apache.org/viewvc?rev=903608&view=rev
Log:
LUCENE-2198: Support protected words in stemming TokenFilters using a new KeywordAttribute
Added:
lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordMarkerTokenFilter.java (with props)
lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/KeywordAttribute.java (with props)
lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/KeywordAttributeImpl.java (with props)
lucene/java/trunk/src/test/org/apache/lucene/analysis/TestKeywordMarkerTokenFilter.java (with props)
Modified:
lucene/java/trunk/CHANGES.txt
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemFilter.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechStemFilter.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianStemmer.java
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechStemmer.java
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/PorterStemFilter.java
lucene/java/trunk/src/test/org/apache/lucene/analysis/TestPorterStemFilter.java
lucene/java/trunk/src/test/org/apache/lucene/analysis/tokenattributes/TestSimpleAttributeImpls.java
Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Wed Jan 27 11:19:05 2010
@@ -125,6 +125,9 @@
stopwords, and implement many analyzers in contrib with it.
(Simon Willnauer via Robert Muir)
+* LUCENE-2198: Support protected words in stemming TokenFilters using a
+ new KeywordAttribute. (Simon Willnauer via Uwe Schindler)
+
Optimizations
* LUCENE-2086: When resolving deleted terms, do so in term sort order
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java Wed Jan 27 11:19:05 2010
@@ -26,6 +26,8 @@
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
@@ -93,6 +95,8 @@
}
}
}
+
+ private final Set<?> stemExclusionSet;
/**
* Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
@@ -110,7 +114,25 @@
* a stopword set
*/
public ArabicAnalyzer(Version matchVersion, Set<?> stopwords){
+ this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+ }
+
+ /**
+ * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
+ * provided this analyzer will add a {@link KeywordMarkerTokenFilter} before
+ * {@link ArabicStemFilter}.
+ *
+ * @param matchVersion
+ * lucene compatibility version
+ * @param stopwords
+ * a stopword set
+ * @param stemExclusionSet
+ * a set of terms not to be stemmed
+ */
+ public ArabicAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet){
super(matchVersion, stopwords);
+ this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
+ matchVersion, stemExclusionSet));
}
/**
@@ -145,7 +167,8 @@
* Creates {@link TokenStreamComponents} used to tokenize all the text in the provided {@link Reader}.
*
* @return {@link TokenStreamComponents} built from an {@link ArabicLetterTokenizer} filtered with
- * {@link LowerCaseFilter}, {@link StopFilter}, {@link ArabicNormalizationFilter}
+ * {@link LowerCaseFilter}, {@link StopFilter}, {@link ArabicNormalizationFilter},
+ * {@link KeywordMarkerTokenFilter} if a stem exclusion set is provided
* and {@link ArabicStemFilter}.
*/
@Override
@@ -155,7 +178,11 @@
TokenStream result = new LowerCaseFilter(matchVersion, source);
// the order here is important: the stopword list is not normalized!
result = new StopFilter( matchVersion, result, stopwords);
+ // TODO maybe we should make ArabicNormalization filter also KeywordAttribute aware?!
result = new ArabicNormalizationFilter(result);
+ if(!stemExclusionSet.isEmpty()) {
+ result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
+ }
return new TokenStreamComponents(source, new ArabicStemFilter(result));
}
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java Wed Jan 27 11:19:05 2010
@@ -19,31 +19,41 @@
import java.io.IOException;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
* A {@link TokenFilter} that applies {@link ArabicStemmer} to stem Arabic words..
- *
- */
+ * <p>
+ * To prevent terms from being stemmed use an instance of
+ * {@link KeywordMarkerTokenFilter} or a custom {@link TokenFilter} that sets
+ * the {@link KeywordAttribute} before this {@link TokenStream}.
+ * </p>
+ * @see KeywordMarkerTokenFilter */
public final class ArabicStemFilter extends TokenFilter {
private final ArabicStemmer stemmer;
private final TermAttribute termAtt;
+ private final KeywordAttribute keywordAttr;
public ArabicStemFilter(TokenStream input) {
super(input);
stemmer = new ArabicStemmer();
termAtt = addAttribute(TermAttribute.class);
+ keywordAttr = addAttribute(KeywordAttribute.class);
}
@Override
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
- int newlen = stemmer.stem(termAtt.termBuffer(), termAtt.termLength());
- termAtt.setTermLength(newlen);
+ if(!keywordAttr.isKeyword()) {
+ final int newlen = stemmer.stem(termAtt.termBuffer(), termAtt.termLength());
+ termAtt.setTermLength(newlen);
+ }
return true;
} else {
return false;
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java Wed Jan 27 11:19:05 2010
@@ -25,6 +25,8 @@
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
@@ -88,6 +90,8 @@
}
}
}
+
+ private final Set<?> stemExclusionSet;
/**
* Builds an analyzer with the default stop words:
@@ -101,16 +105,27 @@
* Builds an analyzer with the given stop words.
*/
public BulgarianAnalyzer(Version matchVersion, Set<?> stopwords) {
- super(matchVersion, stopwords);
+ this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
}
/**
+ * Builds an analyzer with the given stop words and a stem exclusion set.
+ * If a stem exclusion set is provided this analyzer will add a {@link KeywordMarkerTokenFilter}
+ * before {@link BulgarianStemFilter}.
+ */
+ public BulgarianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
+ super(matchVersion, stopwords);
+ this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
+ matchVersion, stemExclusionSet)); }
+
+ /**
* Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
* {@link Reader}.
*
* @return A {@link TokenStreamComponents} built from an {@link StandardTokenizer}
* filtered with {@link StandardFilter}, {@link LowerCaseFilter},
- * {@link StopFilter}, and {@link BulgarianStemFilter}.
+ * {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
+ * exclusion set is provided and {@link BulgarianStemFilter}.
*/
@Override
public TokenStreamComponents createComponents(String fieldName, Reader reader) {
@@ -118,6 +133,8 @@
TokenStream result = new StandardFilter(source);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
+ if(!stemExclusionSet.isEmpty())
+ result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
result = new BulgarianStemFilter(result);
return new TokenStreamComponents(source, result);
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemFilter.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemFilter.java Wed Jan 27 11:19:05 2010
@@ -19,29 +19,40 @@
import java.io.IOException;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter; // for javadoc
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
* A {@link TokenFilter} that applies {@link BulgarianStemmer} to stem Bulgarian
* words.
+ * <p>
+ * To prevent terms from being stemmed use an instance of
+ * {@link KeywordMarkerTokenFilter} or a custom {@link TokenFilter} that sets
+ * the {@link KeywordAttribute} before this {@link TokenStream}.
+ * </p>
*/
public final class BulgarianStemFilter extends TokenFilter {
private final BulgarianStemmer stemmer;
private final TermAttribute termAtt;
+ private final KeywordAttribute keywordAttr;
public BulgarianStemFilter(final TokenStream input) {
super(input);
stemmer = new BulgarianStemmer();
termAtt = addAttribute(TermAttribute.class);
+ keywordAttr = addAttribute(KeywordAttribute.class);
}
@Override
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
- final int newlen = stemmer.stem(termAtt.termBuffer(), termAtt.termLength());
- termAtt.setTermLength(newlen);
+ if(!keywordAttr.isKeyword()) {
+ final int newlen = stemmer.stem(termAtt.termBuffer(), termAtt.termLength());
+ termAtt.setTermLength(newlen);
+ }
return true;
} else {
return false;
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java Wed Jan 27 11:19:05 2010
@@ -30,6 +30,7 @@
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
@@ -204,8 +205,9 @@
TokenStream result = new LowerCaseFilter(matchVersion, source);
result = new StandardFilter(result);
result = new StopFilter(matchVersion, result, stopwords);
- return new TokenStreamComponents(source, new BrazilianStemFilter(result,
- excltable));
+ if(excltable != null && !excltable.isEmpty())
+ result = new KeywordMarkerTokenFilter(result, excltable);
+ return new TokenStreamComponents(source, new BrazilianStemFilter(result));
}
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java Wed Jan 27 11:19:05 2010
@@ -20,13 +20,21 @@
import java.io.IOException;
import java.util.Set;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter; // for javadoc
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
* A {@link TokenFilter} that applies {@link BrazilianStemmer}.
- *
+ * <p>
+ * To prevent terms from being stemmed use an instance of
+ * {@link KeywordMarkerTokenFilter} or a custom {@link TokenFilter} that sets
+ * the {@link KeywordAttribute} before this {@link TokenStream}.
+ * </p>
+ * @see KeywordMarkerTokenFilter
+ *
*/
public final class BrazilianStemFilter extends TokenFilter {
@@ -34,16 +42,31 @@
* {@link BrazilianStemmer} in use by this filter.
*/
private BrazilianStemmer stemmer = null;
- private Set exclusions = null;
- private TermAttribute termAtt;
-
+ private Set<?> exclusions = null;
+ private final TermAttribute termAtt;
+ private final KeywordAttribute keywordAttr;
+
+ /**
+ * Creates a new BrazilianStemFilter
+ *
+ * @param in the source {@link TokenStream}
+ */
public BrazilianStemFilter(TokenStream in) {
super(in);
stemmer = new BrazilianStemmer();
termAtt = addAttribute(TermAttribute.class);
+ keywordAttr = addAttribute(KeywordAttribute.class);
}
-
- public BrazilianStemFilter(TokenStream in, Set exclusiontable) {
+
+ /**
+ * Creates a new BrazilianStemFilter
+ *
+ * @param in the source {@link TokenStream}
+ * @param exclusiontable a set of terms that should be prevented from being stemmed.
+ * @deprecated use {@link KeywordAttribute} with {@link KeywordMarkerTokenFilter} instead.
+ */
+ @Deprecated
+ public BrazilianStemFilter(TokenStream in, Set<?> exclusiontable) {
this(in);
this.exclusions = exclusiontable;
}
@@ -51,10 +74,10 @@
@Override
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
- String term = termAtt.term();
+ final String term = termAtt.term();
// Check the exclusion table.
- if (exclusions == null || !exclusions.contains(term)) {
- String s = stemmer.stem(term);
+ if (!keywordAttr.isKeyword() && (exclusions == null || !exclusions.contains(term))) {
+ final String s = stemmer.stem(term);
// If not stemmed, don't waste the time adjusting the token.
if ((s != null) && !s.equals(term))
termAtt.setTermBuffer(s);
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java Wed Jan 27 11:19:05 2010
@@ -21,6 +21,7 @@
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
@@ -105,6 +106,7 @@
// TODO once loadStopWords is gone those member should be removed too in favor of StopwordAnalyzerBase
private Set<?> stoptable;
private final Version matchVersion;
+ private final Set<?> stemExclusionTable;
/**
* Builds an analyzer with the default stop words ({@link #CZECH_STOP_WORDS}).
@@ -124,8 +126,22 @@
* @param stopwords a stopword set
*/
public CzechAnalyzer(Version matchVersion, Set<?> stopwords) {
+ this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+ }
+
+ /**
+ * Builds an analyzer with the given stop words and a set of words to be
+ * excluded from the {@link CzechStemFilter}.
+ *
+ * @param matchVersion Lucene version to match See
+ * {@link <a href="#version">above</a>}
+ * @param stopwords a stopword set
+ * @param stemExclusionTable a stemming exclusion set
+ */
+ public CzechAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionTable) {
this.matchVersion = matchVersion;
this.stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
+ this.stemExclusionTable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionTable));
}
@@ -207,7 +223,9 @@
* @return {@link TokenStreamComponents} built from a {@link StandardTokenizer}
* filtered with {@link StandardFilter}, {@link LowerCaseFilter},
* {@link StopFilter}, and {@link CzechStemFilter} (only if version is
- * >= LUCENE_31)
+ * >= LUCENE_31). If a version is >= LUCENE_31 and a stem exclusion set
+ * is provided via {@link #CzechAnalyzer(Version, Set, Set)} a
+ * {@link KeywordMarkerTokenFilter} is added before {@link CzechStemFilter}.
*/
@Override
protected TokenStreamComponents createComponents(String fieldName,
@@ -216,8 +234,11 @@
TokenStream result = new StandardFilter(source);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter( matchVersion, result, stoptable);
- if (matchVersion.onOrAfter(Version.LUCENE_31))
+ if (matchVersion.onOrAfter(Version.LUCENE_31)) {
+ if(!this.stemExclusionTable.isEmpty())
+ result = new KeywordMarkerTokenFilter(result, stemExclusionTable);
result = new CzechStemFilter(result);
+ }
return new TokenStreamComponents(source, result);
}
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechStemFilter.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechStemFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechStemFilter.java Wed Jan 27 11:19:05 2010
@@ -2,8 +2,10 @@
import java.io.IOException;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;// for javadoc
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
@@ -25,25 +27,34 @@
/**
* A {@link TokenFilter} that applies {@link CzechStemmer} to stem Czech words.
- *
+ * <p>
+ * To prevent terms from being stemmed use an instance of
+ * {@link KeywordMarkerTokenFilter} or a custom {@link TokenFilter} that sets
+ * the {@link KeywordAttribute} before this {@link TokenStream}.
+ * </p>
* <p><b>NOTE</b>: Input is expected to be in lowercase,
* but with diacritical marks</p>
+ * @see KeywordMarkerTokenFilter
*/
public final class CzechStemFilter extends TokenFilter {
private final CzechStemmer stemmer;
private final TermAttribute termAtt;
+ private final KeywordAttribute keywordAttr;
public CzechStemFilter(TokenStream input) {
super(input);
stemmer = new CzechStemmer();
termAtt = addAttribute(TermAttribute.class);
+ keywordAttr = addAttribute(KeywordAttribute.class);
}
@Override
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
- int newlen = stemmer.stem(termAtt.termBuffer(), termAtt.termLength());
- termAtt.setTermLength(newlen);
+ if(!keywordAttr.isKeyword()) {
+ final int newlen = stemmer.stem(termAtt.termBuffer(), termAtt.termLength());
+ termAtt.setTermLength(newlen);
+ }
return true;
} else {
return false;
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java Wed Jan 27 11:19:05 2010
@@ -30,6 +30,7 @@
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
@@ -208,6 +209,7 @@
TokenStream result = new StandardFilter(source);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter( matchVersion, result, stopwords);
- return new TokenStreamComponents(source, new GermanStemFilter(result, exclusionSet));
+ result = new KeywordMarkerTokenFilter(result, exclusionSet);
+ return new TokenStreamComponents(source, new GermanStemFilter(result));
}
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java Wed Jan 27 11:19:05 2010
@@ -20,8 +20,10 @@
import java.io.IOException;
import java.util.Set;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;// for javadoc
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
@@ -31,6 +33,12 @@
* not be stemmed at all. The stemmer used can be changed at runtime after the
* filter object is created (as long as it is a {@link GermanStemmer}).
* </p>
+ * <p>
+ * To prevent terms from being stemmed use an instance of
+ * {@link KeywordMarkerTokenFilter} or a custom {@link TokenFilter} that sets
+ * the {@link KeywordAttribute} before this {@link TokenStream}.
+ * </p>
+ * @see KeywordMarkerTokenFilter
*/
public final class GermanStemFilter extends TokenFilter
{
@@ -38,21 +46,29 @@
* The actual token in the input stream.
*/
private GermanStemmer stemmer = null;
- private Set exclusionSet = null;
+ private Set<?> exclusionSet = null;
- private TermAttribute termAtt;
+ private final TermAttribute termAtt;
+ private final KeywordAttribute keywordAttr;
+ /**
+ * Creates a {@link GermanStemFilter} instance
+ * @param in the source {@link TokenStream}
+ */
public GermanStemFilter( TokenStream in )
{
super(in);
stemmer = new GermanStemmer();
termAtt = addAttribute(TermAttribute.class);
+ keywordAttr = addAttribute(KeywordAttribute.class);
}
/**
* Builds a GermanStemFilter that uses an exclusion table.
+ * @deprecated use {@link KeywordAttribute} with {@link KeywordMarkerTokenFilter} instead.
*/
- public GermanStemFilter( TokenStream in, Set exclusionSet )
+ @Deprecated
+ public GermanStemFilter( TokenStream in, Set<?> exclusionSet )
{
this( in );
this.exclusionSet = exclusionSet;
@@ -66,7 +82,7 @@
if (input.incrementToken()) {
String term = termAtt.term();
// Check the exclusion table.
- if (exclusionSet == null || !exclusionSet.contains(term)) {
+ if (!keywordAttr.isKeyword() && (exclusionSet == null || !exclusionSet.contains(term))) {
String s = stemmer.stem(term);
// If not stemmed, don't waste the time adjusting the token.
if ((s != null) && !s.equals(term))
@@ -91,8 +107,10 @@
/**
* Set an alternative exclusion list for this filter.
+ * @deprecated use {@link KeywordAttribute} with {@link KeywordMarkerTokenFilter} instead.
*/
- public void setExclusionSet( Set exclusionSet )
+ @Deprecated
+ public void setExclusionSet( Set<?> exclusionSet )
{
this.exclusionSet = exclusionSet;
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java Wed Jan 27 11:19:05 2010
@@ -21,6 +21,7 @@
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
@@ -215,7 +216,9 @@
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new StandardFilter(source);
result = new StopFilter(matchVersion, result, stopwords);
- result = new FrenchStemFilter(result, excltable);
+ if(!excltable.isEmpty())
+ result = new KeywordMarkerTokenFilter(result, excltable);
+ result = new FrenchStemFilter(result);
// Convert to lowercase after stemming!
return new TokenStreamComponents(source, new LowerCaseFilter(matchVersion, result));
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java Wed Jan 27 11:19:05 2010
@@ -17,8 +17,10 @@
* limitations under the License.
*/
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;// for javadoc
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import java.io.IOException;
@@ -29,10 +31,15 @@
/**
* A {@link TokenFilter} that stems french words.
* <p>
- * It supports a table of words that should
- * not be stemmed at all. The used stemmer can be changed at runtime after the
+ * The used stemmer can be changed at runtime after the
* filter object is created (as long as it is a {@link FrenchStemmer}).
* </p>
+ * <p>
+ * To prevent terms from being stemmed use an instance of
+ * {@link KeywordMarkerTokenFilter} or a custom {@link TokenFilter} that sets
+ * the {@link KeywordAttribute} before this {@link TokenStream}.
+ * </p>
+ * @see KeywordMarkerTokenFilter
*/
public final class FrenchStemFilter extends TokenFilter {
@@ -40,18 +47,26 @@
* The actual token in the input stream.
*/
private FrenchStemmer stemmer = null;
- private Set exclusions = null;
+ private Set<?> exclusions = null;
- private TermAttribute termAtt;
+ private final TermAttribute termAtt;
+ private final KeywordAttribute keywordAttr;
public FrenchStemFilter( TokenStream in ) {
super(in);
stemmer = new FrenchStemmer();
termAtt = addAttribute(TermAttribute.class);
+ keywordAttr = addAttribute(KeywordAttribute.class);
}
-
- public FrenchStemFilter( TokenStream in, Set exclusiontable ) {
+ /**
+ * Builds a FrenchStemFilter that uses an exclusion table.
+ * @param in the {@link TokenStream} to filter
+ * @param exclusiontable a set of terms not to be stemmed
+ * @deprecated use {@link KeywordAttribute} with {@link KeywordMarkerTokenFilter} instead.
+ */
+ @Deprecated // TODO remove in 3.2
+ public FrenchStemFilter( TokenStream in, Set<?> exclusiontable ) {
this( in );
exclusions = exclusiontable;
}
@@ -65,7 +80,7 @@
String term = termAtt.term();
// Check the exclusion table
- if ( exclusions == null || !exclusions.contains( term ) ) {
+ if ( !keywordAttr.isKeyword() && (exclusions == null || !exclusions.contains( term )) ) {
String s = stemmer.stem( term );
// If not stemmed, don't waste the time adjusting the token.
if ((s != null) && !s.equals( term ) )
@@ -86,8 +101,10 @@
}
/**
* Set an alternative exclusion list for this filter.
+ * @deprecated use {@link KeywordAttribute} with {@link KeywordMarkerTokenFilter} instead.
*/
- public void setExclusionTable( Map exclusiontable ) {
+ @Deprecated // TODO remove in 3.2
+ public void setExclusionTable( Map<?,?> exclusiontable ) {
exclusions = new HashSet(exclusiontable.keySet());
}
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java Wed Jan 27 11:19:05 2010
@@ -19,6 +19,8 @@
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
+import org.apache.lucene.analysis.ReusableAnalyzerBase;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
@@ -52,7 +54,7 @@
* <p><b>NOTE</b>: This class uses the same {@link Version}
* dependent settings as {@link StandardAnalyzer}.</p>
*/
-public final class DutchAnalyzer extends Analyzer {
+public final class DutchAnalyzer extends ReusableAnalyzerBase {
/**
* List of typical Dutch stopwords.
* @deprecated use {@link #getDefaultStopSet()} instead
@@ -215,28 +217,7 @@
}
}
- /**
- * Creates a {@link TokenStream} which tokenizes all the text in the
- * provided {@link Reader}.
- *
- * @return A {@link TokenStream} built from a {@link StandardTokenizer}
- * filtered with {@link StandardFilter}, {@link StopFilter},
- * and {@link DutchStemFilter}
- */
- @Override
- public TokenStream tokenStream(String fieldName, Reader reader) {
- TokenStream result = new StandardTokenizer(matchVersion, reader);
- result = new StandardFilter(result);
- result = new StopFilter(matchVersion, result, stoptable);
- result = new DutchStemFilter(result, excltable, stemdict);
- return result;
- }
-
- private class SavedStreams {
- Tokenizer source;
- TokenStream result;
- };
-
+
/**
* Returns a (possibly reused) {@link TokenStream} which tokenizes all the
* text in the provided {@link Reader}.
@@ -246,19 +227,14 @@
* and {@link DutchStemFilter}
*/
@Override
- public TokenStream reusableTokenStream(String fieldName, Reader reader)
- throws IOException {
- SavedStreams streams = (SavedStreams) getPreviousTokenStream();
- if (streams == null) {
- streams = new SavedStreams();
- streams.source = new StandardTokenizer(matchVersion, reader);
- streams.result = new StandardFilter(streams.source);
- streams.result = new StopFilter(matchVersion, streams.result, stoptable);
- streams.result = new DutchStemFilter(streams.result, excltable, stemdict);
- setPreviousTokenStream(streams);
- } else {
- streams.source.reset(reader);
- }
- return streams.result;
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader aReader) {
+ final Tokenizer source = new StandardTokenizer(matchVersion, aReader);
+ TokenStream result = new StandardFilter(source);
+ result = new StopFilter(matchVersion, result, stoptable);
+ if (!excltable.isEmpty())
+ result = new KeywordMarkerTokenFilter(result, excltable);
+ result = new DutchStemFilter(result, stemdict);
+ return new TokenStreamComponents(source, result);
}
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java Wed Jan 27 11:19:05 2010
@@ -23,8 +23,10 @@
import java.util.Map;
import java.util.Set;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;// for javadoc
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
@@ -34,34 +36,54 @@
* not be stemmed at all. The stemmer used can be changed at runtime after the
* filter object is created (as long as it is a {@link DutchStemmer}).
* </p>
+ * <p>
+ * To prevent terms from being stemmed use an instance of
+ * {@link KeywordMarkerTokenFilter} or a custom {@link TokenFilter} that sets
+ * the {@link KeywordAttribute} before this {@link TokenStream}.
+ * </p>
+ * @see KeywordMarkerTokenFilter
*/
public final class DutchStemFilter extends TokenFilter {
/**
* The actual token in the input stream.
*/
private DutchStemmer stemmer = null;
- private Set exclusions = null;
+ private Set<?> exclusions = null;
- private TermAttribute termAtt;
+ private final TermAttribute termAtt;
+ private final KeywordAttribute keywordAttr;
public DutchStemFilter(TokenStream _in) {
super(_in);
stemmer = new DutchStemmer();
termAtt = addAttribute(TermAttribute.class);
+ keywordAttr = addAttribute(KeywordAttribute.class);
}
/**
* Builds a DutchStemFilter that uses an exclusion table.
+ * @deprecated use {@link KeywordAttribute} with {@link KeywordMarkerTokenFilter} instead.
*/
- public DutchStemFilter(TokenStream _in, Set exclusiontable) {
+ @Deprecated
+ public DutchStemFilter(TokenStream _in, Set<?> exclusiontable) {
this(_in);
exclusions = exclusiontable;
}
+
+ /**
+ * @param stemdictionary Dictionary of word stem pairs, that overrule the algorithm
+ */
+ public DutchStemFilter(TokenStream _in, Map<?,?> stemdictionary) {
+ this(_in);
+ stemmer.setStemDictionary(stemdictionary);
+ }
/**
* @param stemdictionary Dictionary of word stem pairs, that overrule the algorithm
+ * @deprecated use {@link KeywordAttribute} with {@link KeywordMarkerTokenFilter} instead.
*/
- public DutchStemFilter(TokenStream _in, Set exclusiontable, Map stemdictionary) {
+ @Deprecated
+ public DutchStemFilter(TokenStream _in, Set<?> exclusiontable, Map<?,?> stemdictionary) {
this(_in, exclusiontable);
stemmer.setStemDictionary(stemdictionary);
}
@@ -72,11 +94,11 @@
@Override
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
- String term = termAtt.term();
+ final String term = termAtt.term();
// Check the exclusion table.
- if (exclusions == null || !exclusions.contains(term)) {
- String s = stemmer.stem(term);
+ if (!keywordAttr.isKeyword() && (exclusions == null || !exclusions.contains(term))) {
+ final String s = stemmer.stem(term);
// If not stemmed, don't waste the time adjusting the token.
if ((s != null) && !s.equals(term))
termAtt.setTermBuffer(s);
@@ -98,8 +120,10 @@
/**
* Set an alternative exclusion list for this filter.
+ * @deprecated use {@link KeywordAttribute} with {@link KeywordMarkerTokenFilter} instead.
*/
- public void setExclusionTable(HashSet exclusiontable) {
+ @Deprecated
+ public void setExclusionTable(HashSet<?> exclusiontable) {
exclusions = exclusiontable;
}
@@ -107,7 +131,7 @@
* Set dictionary for stemming, this dictionary overrules the algorithm,
* so you can correct for a particular unwanted word-stem pair.
*/
- public void setStemDictionary(HashMap dict) {
+ public void setStemDictionary(HashMap<?,?> dict) {
if (stemmer != null)
stemmer.setStemDictionary(dict);
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java Wed Jan 27 11:19:05 2010
@@ -26,6 +26,7 @@
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
@@ -63,6 +64,17 @@
.unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT,
Arrays.asList(RUSSIAN_STOP_WORDS), false));
}
+
+ private final Set<?> stemExclusionSet;
+
+ /**
+ * Returns an unmodifiable instance of the default stop-words set.
+ *
+ * @return an unmodifiable instance of the default stop-words set.
+ */
+ public static Set<?> getDefaultStopSet() {
+ return DefaultSetHolder.DEFAULT_STOP_SET;
+ }
public RussianAnalyzer(Version matchVersion) {
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
@@ -86,9 +98,24 @@
* a stopword set
*/
public RussianAnalyzer(Version matchVersion, Set<?> stopwords){
+ this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+ }
+
+ /**
+ * Builds an analyzer with the given stop words and stem exclusion set.
+ *
+ * @param matchVersion
+ * lucene compatibility version
+ * @param stopwords
+ * a stopword set
+ * @param stemExclusionSet a set of words not to be stemmed
+ */
+ public RussianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet){
super(matchVersion, stopwords);
+ this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
}
+
/**
* Builds an analyzer with the given stop words.
* TODO: create a Set version of this ctor
@@ -115,6 +142,8 @@
final Tokenizer source = new RussianLetterTokenizer(reader);
TokenStream result = new LowerCaseFilter(matchVersion, source);
result = new StopFilter(matchVersion, result, stopwords);
+ if(!stemExclusionSet.isEmpty())
+ result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
return new TokenStreamComponents(source, new RussianStemFilter(result));
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java Wed Jan 27 11:19:05 2010
@@ -17,9 +17,11 @@
* limitations under the License.
*/
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;// for javadoc
import org.apache.lucene.analysis.LowerCaseFilter; // for javadoc
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.ru.RussianStemmer;//javadoc @link
@@ -32,6 +34,12 @@
* The input should be filtered by {@link LowerCaseFilter} before passing it to RussianStemFilter ,
* because RussianStemFilter only works with lowercase characters.
* </p>
+ * <p>
+ * To prevent terms from being stemmed use an instance of
+ * {@link KeywordMarkerTokenFilter} or a custom {@link TokenFilter} that sets
+ * the {@link KeywordAttribute} before this {@link TokenStream}.
+ * </p>
+ * @see KeywordMarkerTokenFilter
*/
public final class RussianStemFilter extends TokenFilter
{
@@ -40,13 +48,15 @@
*/
private RussianStemmer stemmer = null;
- private TermAttribute termAtt;
+ private final TermAttribute termAtt;
+ private final KeywordAttribute keywordAttr;
public RussianStemFilter(TokenStream in)
{
super(in);
stemmer = new RussianStemmer();
termAtt = addAttribute(TermAttribute.class);
+ keywordAttr = addAttribute(KeywordAttribute.class);
}
/**
* Returns the next token in the stream, or null at EOS
@@ -55,10 +65,12 @@
public final boolean incrementToken() throws IOException
{
if (input.incrementToken()) {
- String term = termAtt.term();
- String s = stemmer.stem(term);
- if (s != null && !s.equals(term))
- termAtt.setTermBuffer(s);
+ if(!keywordAttr.isKeyword()) {
+ final String term = termAtt.term();
+ final String s = stemmer.stem(term);
+ if (s != null && !s.equals(term))
+ termAtt.setTermBuffer(s);
+ }
return true;
} else {
return false;
Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java Wed Jan 27 11:19:05 2010
@@ -17,11 +17,15 @@
* limitations under the License.
*/
+import java.io.IOException;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
+import javax.print.DocFlavor.CHAR_ARRAY;
+
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.util.Version;
/**
@@ -84,4 +88,17 @@
assertAnalyzesTo(a, "The quick brown fox.", new String[] { "quick",
"brown", "fox" });
}
+
+ public void testWithStemExclusionSet() throws IOException {
+ Set<String> set = new HashSet<String>();
+ set.add("ساهدهات");
+ ArabicAnalyzer a = new ArabicAnalyzer(Version.LUCENE_CURRENT, CharArraySet.EMPTY_SET, set);
+ assertAnalyzesTo(a, "كبيرة the quick ساهدهات", new String[] { "كبير","the", "quick", "ساهدهات" });
+ assertAnalyzesToReuse(a, "كبيرة the quick ساهدهات", new String[] { "كبير","the", "quick", "ساهدهات" });
+
+
+ a = new ArabicAnalyzer(Version.LUCENE_CURRENT, CharArraySet.EMPTY_SET, CharArraySet.EMPTY_SET);
+ assertAnalyzesTo(a, "كبيرة the quick ساهدهات", new String[] { "كبير","the", "quick", "ساهد" });
+ assertAnalyzesToReuse(a, "كبيرة the quick ساهدهات", new String[] { "كبير","the", "quick", "ساهد" });
+ }
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java Wed Jan 27 11:19:05 2010
@@ -21,7 +21,9 @@
import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
+import org.apache.lucene.util.Version;
/**
* Test the Arabic Normalization Filter
@@ -112,11 +114,19 @@
public void testNonArabic() throws IOException {
check("English", "English");
}
+
+ public void testWithKeywordAttribute() throws IOException {
+ CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+ set.add("ساهدهات");
+ ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(new StringReader("ساهدهات"));
+
+ ArabicStemFilter filter = new ArabicStemFilter(new KeywordMarkerTokenFilter(tokenStream, set));
+ assertTokenStreamContents(filter, new String[]{"ساهدهات"});
+ }
private void check(final String input, final String expected) throws IOException {
ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(new StringReader(input));
ArabicStemFilter filter = new ArabicStemFilter(tokenStream);
assertTokenStreamContents(filter, new String[]{expected});
}
-
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianAnalyzer.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianAnalyzer.java Wed Jan 27 11:19:05 2010
@@ -22,6 +22,7 @@
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.util.Version;
/**
@@ -67,4 +68,11 @@
assertAnalyzesTo(a, "градове", new String[] {"град"});
}
+
+ public void testWithStemExclusionSet() throws IOException {
+ CharArraySet set = new CharArraySet(Version.LUCENE_31, 1, true);
+ set.add("строеве");
+ Analyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT, CharArraySet.EMPTY_SET, set);
+ assertAnalyzesTo(a, "строевете строеве", new String[] { "строй", "строеве" });
+ }
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianStemmer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianStemmer.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianStemmer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianStemmer.java Wed Jan 27 11:19:05 2010
@@ -18,8 +18,12 @@
*/
import java.io.IOException;
+import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.util.Version;
/**
@@ -207,4 +211,15 @@
assertAnalyzesTo(a, "ÑÑÑоÑ", new String[] {"ÑÑÑ"});
assertAnalyzesTo(a, "ÑÑÑоÑÑ", new String[] {"ÑÑÑ"});
}
+
+ public void testWithKeywordAttribute() throws IOException {
+ CharArraySet set = new CharArraySet(Version.LUCENE_31, 1, true);
+ set.add("строеве");
+ WhitespaceTokenizer tokenStream = new WhitespaceTokenizer(
+ new StringReader("строевете строеве"));
+
+ BulgarianStemFilter filter = new BulgarianStemFilter(
+ new KeywordMarkerTokenFilter(tokenStream, set));
+ assertTokenStreamContents(filter, new String[] { "строй", "строеве" });
+ }
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java Wed Jan 27 11:19:05 2010
@@ -17,12 +17,14 @@
* limitations under the License.
*/
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashSet;
+import java.io.IOException;
+import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
+import org.apache.lucene.analysis.LowerCaseTokenizer;
import org.apache.lucene.util.Version;
/**
@@ -139,6 +141,34 @@
checkReuse(a, "quintessência", "quintessência"); // excluded words will be completely unchanged.
}
+ public void testStemExclusionTableBWCompat() throws IOException {
+ CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+ set.add("Brasília");
+ BrazilianStemFilter filter = new BrazilianStemFilter(
+ new LowerCaseTokenizer(new StringReader("Brasília Brasilia")), set);
+ assertTokenStreamContents(filter, new String[] { "brasília", "brasil" });
+ }
+
+ public void testWithKeywordAttribute() throws IOException {
+ CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+ set.add("Brasília");
+ BrazilianStemFilter filter = new BrazilianStemFilter(
+ new KeywordMarkerTokenFilter(new LowerCaseTokenizer(new StringReader(
+ "Brasília Brasilia")), set));
+ assertTokenStreamContents(filter, new String[] { "brasília", "brasil" });
+ }
+
+ public void testWithKeywordAttributeAndExclusionTable() throws IOException {
+ CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+ set.add("Brasília");
+ CharArraySet set1 = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+ set1.add("Brasilia");
+ BrazilianStemFilter filter = new BrazilianStemFilter(
+ new KeywordMarkerTokenFilter(new LowerCaseTokenizer(new StringReader(
+ "Brasília Brasilia")), set), set1);
+ assertTokenStreamContents(filter, new String[] { "brasília", "brasilia" });
+ }
+
/*
* Test that changes to the exclusion table are applied immediately
* when using reusable token streams.
Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java Wed Jan 27 11:19:05 2010
@@ -24,6 +24,7 @@
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.util.Version;
/**
@@ -109,5 +110,11 @@
assertAnalyzesToReuse(cz, "Äeská Republika", new String[] { "Äeská" });
}
-
+
+ public void testWithStemExclusionSet() throws IOException{
+ CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+ set.add("hole");
+ CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_CURRENT, CharArraySet.EMPTY_SET, set);
+ assertAnalyzesTo(cz, "hole desek", new String[] {"hole", "desk"});
+ }
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechStemmer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechStemmer.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechStemmer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechStemmer.java Wed Jan 27 11:19:05 2010
@@ -18,8 +18,12 @@
*/
import java.io.IOException;
+import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.util.Version;
/**
@@ -270,4 +274,13 @@
assertAnalyzesTo(cz, "e", new String[] { "e" });
assertAnalyzesTo(cz, "zi", new String[] { "zi" });
}
+
+ public void testWithKeywordAttribute() throws IOException {
+ CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+ set.add("hole");
+ CzechStemFilter filter = new CzechStemFilter(new KeywordMarkerTokenFilter(
+ new WhitespaceTokenizer(new StringReader("hole desek")), set));
+ assertTokenStreamContents(filter, new String[] { "hole", "desk" });
+ }
+
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java Wed Jan 27 11:19:05 2010
@@ -20,10 +20,15 @@
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
+import java.io.IOException;
import java.io.InputStreamReader;
+import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
+import org.apache.lucene.analysis.LowerCaseTokenizer;
import org.apache.lucene.util.Version;
/**
@@ -64,6 +69,37 @@
checkReuse(a, "Tischen", "tisch");
}
+ public void testExclusionTableBWCompat() throws IOException {
+ GermanStemFilter filter = new GermanStemFilter(new LowerCaseTokenizer(
+ new StringReader("Fischen Trinken")));
+ CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+ set.add("fischen");
+ filter.setExclusionSet(set);
+ assertTokenStreamContents(filter, new String[] { "fischen", "trink" });
+ }
+
+ public void testWithKeywordAttribute() throws IOException {
+ CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+ set.add("fischen");
+ GermanStemFilter filter = new GermanStemFilter(
+ new KeywordMarkerTokenFilter(new LowerCaseTokenizer(new StringReader(
+ "Fischen Trinken")), set));
+ assertTokenStreamContents(filter, new String[] { "fischen", "trink" });
+ }
+
+ public void testWithKeywordAttributeAndExclusionTable() throws IOException {
+ CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+ set.add("fischen");
+ CharArraySet set1 = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+ set1.add("trinken");
+ set1.add("fischen");
+ GermanStemFilter filter = new GermanStemFilter(
+ new KeywordMarkerTokenFilter(new LowerCaseTokenizer(new StringReader(
+ "Fischen Trinken")), set));
+ filter.setExclusionSet(set1);
+ assertTokenStreamContents(filter, new String[] { "fischen", "trinken" });
+ }
+
/*
* Test that changes to the exclusion table are applied immediately
* when using reusable token streams.
@@ -75,6 +111,7 @@
checkReuse(a, "tischen", "tischen");
}
+
private void check(final String input, final String expected) throws Exception {
checkOneTerm(new GermanAnalyzer(Version.LUCENE_CURRENT), input, expected);
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java Wed Jan 27 11:19:05 2010
@@ -17,11 +17,8 @@
* limitations under the License.
*/
-import java.io.StringReader;
-
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.util.Version;
/**
@@ -147,4 +144,17 @@
fa.setStemExclusionTable(new String[] { "habitable" });
assertAnalyzesToReuse(fa, "habitable", new String[] { "habitable" });
}
+
+ public void testExclusionTableViaCtor() throws Exception {
+ CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+ set.add("habitable");
+ FrenchAnalyzer fa = new FrenchAnalyzer(Version.LUCENE_CURRENT,
+ CharArraySet.EMPTY_SET, set);
+ assertAnalyzesToReuse(fa, "habitable chiste", new String[] { "habitable",
+ "chist" });
+
+ fa = new FrenchAnalyzer(Version.LUCENE_CURRENT, CharArraySet.EMPTY_SET, set);
+ assertAnalyzesTo(fa, "habitable chiste", new String[] { "habitable",
+ "chist" });
+ }
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java Wed Jan 27 11:19:05 2010
@@ -18,9 +18,11 @@
*/
import java.io.File;
+import java.io.IOException;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.util.Version;
/**
@@ -133,6 +135,19 @@
checkOneTermReuse(a, "lichamelijk", "licham");
a.setStemExclusionTable(new String[] { "lichamelijk" });
checkOneTermReuse(a, "lichamelijk", "lichamelijk");
+
+
+ }
+
+ public void testExclusionTableViaCtor() throws IOException {
+ CharArraySet set = new CharArraySet(Version.LUCENE_30, 1, true);
+ set.add("lichamelijk");
+ DutchAnalyzer a = new DutchAnalyzer(Version.LUCENE_CURRENT, CharArraySet.EMPTY_SET, set);
+ assertAnalyzesToReuse(a, "lichamelijk lichamelijke", new String[] { "lichamelijk", "licham" });
+
+ a = new DutchAnalyzer(Version.LUCENE_CURRENT, CharArraySet.EMPTY_SET, set);
+ assertAnalyzesTo(a, "lichamelijk lichamelijke", new String[] { "lichamelijk", "licham" });
+
}
/*
Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java Wed Jan 27 11:19:05 2010
@@ -26,6 +26,7 @@
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;
@@ -116,4 +117,14 @@
assertAnalyzesToReuse(a, "Но знание это хранилось в тайне",
new String[] { "знан", "хран", "тайн" });
}
+
+
+ public void testWithStemExclusionSet() throws Exception {
+ CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+ set.add("представление");
+ Analyzer a = new RussianAnalyzer(Version.LUCENE_CURRENT, RussianAnalyzer.getDefaultStopSet() , set);
+ assertAnalyzesToReuse(a, "Вместе с тем о силе электромагнитной энергии имели представление еще",
+ new String[] { "вмест", "сил", "электромагнитн", "энерг", "имел", "представление" });
+
+ }
}
Added: lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordMarkerTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordMarkerTokenFilter.java?rev=903608&view=auto
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordMarkerTokenFilter.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordMarkerTokenFilter.java Wed Jan 27 11:19:05 2010
@@ -0,0 +1,82 @@
+package org.apache.lucene.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Set;
+
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.util.Version;
+
+/**
+ * Marks terms as keywords via the {@link KeywordAttribute}. Each token
+ * contained in the provided set is marked as a keyword by setting
+ * {@link KeywordAttribute#setKeyword(boolean)} to <code>true</code>.
+ *
+ * @see KeywordAttribute
+ */
+public final class KeywordMarkerTokenFilter extends TokenFilter {
+
+ private final KeywordAttribute keywordAttr;
+ private final TermAttribute termAtt;
+ private final CharArraySet keywordSet;
+
+ /**
+ * Creates a new KeywordMarkerTokenFilter that marks the current token as a
+ * keyword, via the {@link KeywordAttribute}, if the token's term buffer is
+ * contained in the given set.
+ *
+ * @param in
+ * TokenStream to filter
+ * @param keywordSet
+ * the set of keywords in which the current term buffer is looked up
+ */
+ public KeywordMarkerTokenFilter(final TokenStream in,
+ final CharArraySet keywordSet) {
+ super(in);
+ termAtt = addAttribute(TermAttribute.class);
+ keywordAttr = addAttribute(KeywordAttribute.class);
+ this.keywordSet = keywordSet;
+ }
+
+ /**
+ * Creates a new KeywordMarkerTokenFilter that marks the current token as a
+ * keyword, via the {@link KeywordAttribute}, if the token's term buffer is
+ * contained in the given set.
+ *
+ * @param in
+ * TokenStream to filter
+ * @param keywordSet
+ * the keyword set; used as-is if it is a CharArraySet, otherwise copied into one
+ */
+ public KeywordMarkerTokenFilter(final TokenStream in, final Set<?> keywordSet) {
+ this(in, keywordSet instanceof CharArraySet ? (CharArraySet) keywordSet
+ : CharArraySet.copy(Version.LUCENE_31, keywordSet));
+ }
+
+ @Override
+ public final boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ keywordAttr.setKeyword(keywordSet.contains(termAtt.termBuffer(), 0,
+ termAtt.termLength()));
+ return true;
+ } else
+ return false;
+ }
+}
Propchange: lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordMarkerTokenFilter.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordMarkerTokenFilter.java
------------------------------------------------------------------------------
svn:keywords = Date Author Id Revision HeadURL
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/PorterStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/PorterStemFilter.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/PorterStemFilter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/PorterStemFilter.java Wed Jan 27 11:19:05 2010
@@ -19,6 +19,7 @@
import java.io.IOException;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/** Transforms the token stream as per the Porter stemming algorithm.
@@ -38,15 +39,23 @@
}
}
</PRE>
+ <p>
+ Note: This filter is aware of the {@link KeywordAttribute}. To prevent
+ certain terms from being passed to the stemmer
+ {@link KeywordAttribute#isKeyword()} should be set to <code>true</code>
+ in a previous {@link TokenStream}.
+ </p>
*/
public final class PorterStemFilter extends TokenFilter {
- private PorterStemmer stemmer;
- private TermAttribute termAtt;
+ private final PorterStemmer stemmer;
+ private final TermAttribute termAtt;
+ private final KeywordAttribute keywordAttr;
public PorterStemFilter(TokenStream in) {
super(in);
stemmer = new PorterStemmer();
termAtt = addAttribute(TermAttribute.class);
+ keywordAttr = addAttribute(KeywordAttribute.class);
}
@Override
@@ -54,7 +63,7 @@
if (!input.incrementToken())
return false;
- if (stemmer.stem(termAtt.termBuffer(), 0, termAtt.termLength()))
+ if ((!keywordAttr.isKeyword()) && stemmer.stem(termAtt.termBuffer(), 0, termAtt.termLength()))
termAtt.setTermBuffer(stemmer.getResultBuffer(), 0, stemmer.getResultLength());
return true;
}
Added: lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/KeywordAttribute.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/KeywordAttribute.java?rev=903608&view=auto
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/KeywordAttribute.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/KeywordAttribute.java Wed Jan 27 11:19:05 2010
@@ -0,0 +1,49 @@
+package org.apache.lucene.analysis.tokenattributes;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.util.Attribute;
+
+/**
+ * This attribute can be used to mark a token as a keyword. Keyword-aware
+ * {@link TokenStream}s can decide whether to modify a token based on the
+ * return value of {@link #isKeyword()}. Stemming filters, for instance, can
+ * use this attribute to leave a term unstemmed if {@link #isKeyword()}
+ * returns <code>true</code>.
+ */
+public interface KeywordAttribute extends Attribute {
+
+ /**
+ * Returns <code>true</code> iff the current token is a keyword, otherwise
+ * <code>false</code>.
+ *
+ * @return <code>true</code> iff the current token is a keyword, otherwise
+ * <code>false</code>.
+ */
+ public boolean isKeyword();
+
+ /**
+ * Marks the current token as keyword iff set to <code>true</code>.
+ *
+ * @param isKeyword
+ * <code>true</code> iff the current token is a keyword, otherwise
+ * <code>false</code>.
+ */
+ public void setKeyword(boolean isKeyword);
+}
Propchange: lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/KeywordAttribute.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/KeywordAttribute.java
------------------------------------------------------------------------------
svn:keywords = Date Author Id Revision HeadURL
Added: lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/KeywordAttributeImpl.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/KeywordAttributeImpl.java?rev=903608&view=auto
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/KeywordAttributeImpl.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/KeywordAttributeImpl.java Wed Jan 27 11:19:05 2010
@@ -0,0 +1,82 @@
+package org.apache.lucene.analysis.tokenattributes;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.util.AttributeImpl;
+
+/**
+ * This attribute can be used to mark a token as a keyword. Keyword-aware
+ * {@link TokenStream}s can decide whether to modify a token based on the
+ * return value of {@link #isKeyword()}. Stemming filters, for instance, can
+ * use this attribute to leave a term unstemmed if {@link #isKeyword()}
+ * returns <code>true</code>.
+ */
+public final class KeywordAttributeImpl extends AttributeImpl implements
+ KeywordAttribute {
+ private boolean keyword;
+
+ @Override
+ public void clear() {
+ keyword = false;
+ }
+
+ @Override
+ public void copyTo(AttributeImpl target) {
+ KeywordAttribute attr = (KeywordAttribute) target;
+ attr.setKeyword(keyword);
+ }
+
+ @Override
+ public int hashCode() {
+ return keyword ? 31 : 37;
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj)
+ return true;
+ if (getClass() != obj.getClass())
+ return false;
+ final KeywordAttributeImpl other = (KeywordAttributeImpl) obj;
+ return keyword == other.keyword;
+ }
+
+ /**
+ * Returns <code>true</code> iff the current token is a keyword, otherwise
+ * <code>false</code>.
+ *
+ * @return <code>true</code> iff the current token is a keyword, otherwise
+ * <code>false</code>.
+ */
+ public boolean isKeyword() {
+ return keyword;
+ }
+
+ /**
+ * Marks the current token as keyword iff set to <code>true</code>.
+ *
+ * @param isKeyword
+ * <code>true</code> iff the current token is a keyword, otherwise
+ * <code>false</code>.
+ */
+ public void setKeyword(boolean isKeyword) {
+ keyword = isKeyword;
+ }
+
+}
Propchange: lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/KeywordAttributeImpl.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/KeywordAttributeImpl.java
------------------------------------------------------------------------------
svn:keywords = Date Author Id Revision HeadURL
Added: lucene/java/trunk/src/test/org/apache/lucene/analysis/TestKeywordMarkerTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/analysis/TestKeywordMarkerTokenFilter.java?rev=903608&view=auto
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/analysis/TestKeywordMarkerTokenFilter.java (added)
+++ lucene/java/trunk/src/test/org/apache/lucene/analysis/TestKeywordMarkerTokenFilter.java Wed Jan 27 11:19:05 2010
@@ -0,0 +1,77 @@
+package org.apache.lucene.analysis;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.util.Version;
+import org.junit.Test;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Testcase for {@link KeywordMarkerTokenFilter}
+ */
+public class TestKeywordMarkerTokenFilter extends BaseTokenStreamTestCase {
+
+ @Test
+ public void testIncrementToken() throws IOException {
+ CharArraySet set = new CharArraySet(Version.LUCENE_31, 5, true);
+ set.add("lucenefox");
+ String[] output = new String[] { "the", "quick", "brown", "LuceneFox",
+ "jumps" };
+ assertTokenStreamContents(new LowerCaseFilterMock(
+ new KeywordMarkerTokenFilter(new WhitespaceTokenizer(new StringReader(
+ "The quIck browN LuceneFox Jumps")), set)), output);
+ Set<String> jdkSet = new HashSet<String>();
+ jdkSet.add("LuceneFox");
+ assertTokenStreamContents(new LowerCaseFilterMock(
+ new KeywordMarkerTokenFilter(new WhitespaceTokenizer(new StringReader(
+ "The quIck browN LuceneFox Jumps")), jdkSet)), output);
+ Set<?> set2 = set;
+ assertTokenStreamContents(new LowerCaseFilterMock(
+ new KeywordMarkerTokenFilter(new WhitespaceTokenizer(new StringReader(
+ "The quIck browN LuceneFox Jumps")), set2)), output);
+ }
+
+ public static class LowerCaseFilterMock extends TokenFilter {
+
+ private TermAttribute termAtt;
+ private KeywordAttribute keywordAttr;
+
+ public LowerCaseFilterMock(TokenStream in) {
+ super(in);
+ termAtt = addAttribute(TermAttribute.class);
+ keywordAttr = addAttribute(KeywordAttribute.class);
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ if (!keywordAttr.isKeyword())
+ termAtt.setTermBuffer(termAtt.term().toLowerCase());
+ return true;
+ }
+ return false;
+ }
+
+ }
+}
Propchange: lucene/java/trunk/src/test/org/apache/lucene/analysis/TestKeywordMarkerTokenFilter.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/java/trunk/src/test/org/apache/lucene/analysis/TestKeywordMarkerTokenFilter.java
------------------------------------------------------------------------------
svn:keywords = Date Author Id Revision HeadURL
Modified: lucene/java/trunk/src/test/org/apache/lucene/analysis/TestPorterStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/analysis/TestPorterStemFilter.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/analysis/TestPorterStemFilter.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/analysis/TestPorterStemFilter.java Wed Jan 27 11:19:05 2010
@@ -25,6 +25,8 @@
import java.io.StringReader;
import java.util.zip.ZipFile;
+import org.apache.lucene.util.Version;
+
/**
* Test the PorterStemFilter with Martin Porter's test data.
*/
@@ -56,4 +58,12 @@
outputReader.close();
zipFile.close();
}
+
+ public void testWithKeywordAttribute() throws IOException {
+ CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+ set.add("yourselves");
+ Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader("yourselves yours"));
+ TokenStream filter = new PorterStemFilter(new KeywordMarkerTokenFilter(tokenizer, set));
+ assertTokenStreamContents(filter, new String[] {"yourselves", "your"});
+ }
}