You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by us...@apache.org on 2010/01/27 12:19:07 UTC

svn commit: r903608 [1/2] - in /lucene/java/trunk: ./ contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/ contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/ con...

Author: uschindler
Date: Wed Jan 27 11:19:05 2010
New Revision: 903608

URL: http://svn.apache.org/viewvc?rev=903608&view=rev
Log:
LUCENE-2198: Support protected words in stemming TokenFilters using a new KeywordAttribute

Added:
    lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordMarkerTokenFilter.java   (with props)
    lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/KeywordAttribute.java   (with props)
    lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/KeywordAttributeImpl.java   (with props)
    lucene/java/trunk/src/test/org/apache/lucene/analysis/TestKeywordMarkerTokenFilter.java   (with props)
Modified:
    lucene/java/trunk/CHANGES.txt
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemFilter.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechStemFilter.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianAnalyzer.java
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianStemmer.java
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechStemmer.java
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/PorterStemFilter.java
    lucene/java/trunk/src/test/org/apache/lucene/analysis/TestPorterStemFilter.java
    lucene/java/trunk/src/test/org/apache/lucene/analysis/tokenattributes/TestSimpleAttributeImpls.java

Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Wed Jan 27 11:19:05 2010
@@ -125,6 +125,9 @@
   stopwords, and implement many analyzers in contrib with it.  
   (Simon Willnauer via Robert Muir)
   
+* LUCENE-2198: Support protected words in stemming TokenFilters using a
+  new KeywordAttribute.  (Simon Willnauer via Uwe Schindler)
+
 Optimizations
 
 * LUCENE-2086: When resolving deleted terms, do so in term sort order

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java Wed Jan 27 11:19:05 2010
@@ -26,6 +26,8 @@
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.LowerCaseFilter;
 import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
 import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.TokenStream;
@@ -93,6 +95,8 @@
       }
     }
   }
+  
+  private final Set<?> stemExclusionSet;
 
   /**
    * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
@@ -110,7 +114,25 @@
    *          a stopword set
    */
   public ArabicAnalyzer(Version matchVersion, Set<?> stopwords){
+    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+  }
+
+  /**
+   * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
+   * provided this analyzer will add a {@link KeywordMarkerTokenFilter} before
+   * {@link ArabicStemFilter}.
+   * 
+   * @param matchVersion
+   *          lucene compatibility version
+   * @param stopwords
+   *          a stopword set
+   * @param stemExclusionSet
+   *          a set of terms not to be stemmed
+   */
+  public ArabicAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet){
     super(matchVersion, stopwords);
+    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
+        matchVersion, stemExclusionSet));
   }
 
   /**
@@ -145,7 +167,8 @@
    * Creates {@link TokenStreamComponents} used to tokenize all the text in the provided {@link Reader}.
    *
    * @return {@link TokenStreamComponents} built from an {@link ArabicLetterTokenizer} filtered with
-   * 			{@link LowerCaseFilter}, {@link StopFilter}, {@link ArabicNormalizationFilter}
+   * 			{@link LowerCaseFilter}, {@link StopFilter}, {@link ArabicNormalizationFilter},
+   *      {@link KeywordMarkerTokenFilter} if a stem exclusion set is provided
    *            and {@link ArabicStemFilter}.
    */
   @Override
@@ -155,7 +178,11 @@
     TokenStream result = new LowerCaseFilter(matchVersion, source);
     // the order here is important: the stopword list is not normalized!
     result = new StopFilter( matchVersion, result, stopwords);
+    // TODO maybe we should make ArabicNormalization filter also KeywordAttribute aware?!
     result = new ArabicNormalizationFilter(result);
+    if(!stemExclusionSet.isEmpty()) {
+      result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
+    }
     return new TokenStreamComponents(source, new ArabicStemFilter(result));
   }
 }

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java Wed Jan 27 11:19:05 2010
@@ -19,31 +19,41 @@
 
 import java.io.IOException;
 
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
 /**
  * A {@link TokenFilter} that applies {@link ArabicStemmer} to stem Arabic words..
- * 
- */
+ * <p>
+ * To prevent terms from being stemmed use an instance of
+ * {@link KeywordMarkerTokenFilter} or a custom {@link TokenFilter} that sets
+ * the {@link KeywordAttribute} before this {@link TokenStream}.
+ * </p>
+ * @see KeywordMarkerTokenFilter */
 
 public final class ArabicStemFilter extends TokenFilter {
 
   private final ArabicStemmer stemmer;
   private final TermAttribute termAtt;
+  private final KeywordAttribute keywordAttr;
   
   public ArabicStemFilter(TokenStream input) {
     super(input);
     stemmer = new ArabicStemmer();
     termAtt = addAttribute(TermAttribute.class);
+    keywordAttr = addAttribute(KeywordAttribute.class);
   }
 
   @Override
   public boolean incrementToken() throws IOException {
     if (input.incrementToken()) {
-      int newlen = stemmer.stem(termAtt.termBuffer(), termAtt.termLength());
-      termAtt.setTermLength(newlen);
+      if(!keywordAttr.isKeyword()) {
+        final int newlen = stemmer.stem(termAtt.termBuffer(), termAtt.termLength());
+        termAtt.setTermLength(newlen);
+      }
       return true;
     } else {
       return false;

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java Wed Jan 27 11:19:05 2010
@@ -25,6 +25,8 @@
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.LowerCaseFilter;
 import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
 import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.TokenStream;
@@ -88,6 +90,8 @@
       }
     }
   }
+  
+  private final Set<?> stemExclusionSet;
    
   /**
    * Builds an analyzer with the default stop words:
@@ -101,16 +105,27 @@
    * Builds an analyzer with the given stop words.
    */
   public BulgarianAnalyzer(Version matchVersion, Set<?> stopwords) {
-    super(matchVersion, stopwords);
+    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
   }
   
   /**
+   * Builds an analyzer with the given stop words and a stem exclusion set.
+   * If a stem exclusion set is provided this analyzer will add a {@link KeywordMarkerTokenFilter} 
+   * before {@link BulgarianStemFilter}.
+   */
+  public BulgarianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
+    super(matchVersion, stopwords);
+    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
+        matchVersion, stemExclusionSet));  }
+  
+  /**
    * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
    * {@link Reader}.
    * 
    * @return A {@link TokenStreamComponents} built from an {@link StandardTokenizer}
    *         filtered with {@link StandardFilter}, {@link LowerCaseFilter},
-   *         {@link StopFilter}, and {@link BulgarianStemFilter}.
+   *         {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
+   *         exclusion set is provided and {@link BulgarianStemFilter}.
    */
   @Override
   public TokenStreamComponents createComponents(String fieldName, Reader reader) {
@@ -118,6 +133,8 @@
     TokenStream result = new StandardFilter(source);
     result = new LowerCaseFilter(matchVersion, result);
     result = new StopFilter(matchVersion, result, stopwords);
+    if(!stemExclusionSet.isEmpty())
+      result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
     result = new BulgarianStemFilter(result);
     return new TokenStreamComponents(source, result);
   }

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemFilter.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemFilter.java Wed Jan 27 11:19:05 2010
@@ -19,29 +19,40 @@
 
 import java.io.IOException;
 
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter; // for javadoc
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
 /**
  * A {@link TokenFilter} that applies {@link BulgarianStemmer} to stem Bulgarian
  * words.
+ * <p>
+ * To prevent terms from being stemmed use an instance of
+ * {@link KeywordMarkerTokenFilter} or a custom {@link TokenFilter} that sets
+ * the {@link KeywordAttribute} before this {@link TokenStream}.
+ * </p>
  */
 public final class BulgarianStemFilter extends TokenFilter {
   private final BulgarianStemmer stemmer;
   private final TermAttribute termAtt;
+  private final KeywordAttribute keywordAttr;
   
   public BulgarianStemFilter(final TokenStream input) {
     super(input);
     stemmer = new BulgarianStemmer();
     termAtt = addAttribute(TermAttribute.class);
+    keywordAttr = addAttribute(KeywordAttribute.class);
   }
   
   @Override
   public boolean incrementToken() throws IOException {
     if (input.incrementToken()) {
-      final int newlen = stemmer.stem(termAtt.termBuffer(), termAtt.termLength());
-      termAtt.setTermLength(newlen);
+      if(!keywordAttr.isKeyword()) {
+        final int newlen = stemmer.stem(termAtt.termBuffer(), termAtt.termLength());
+        termAtt.setTermLength(newlen);
+      }
       return true;
     } else {
       return false;

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java Wed Jan 27 11:19:05 2010
@@ -30,6 +30,7 @@
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.LowerCaseFilter;
 import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
 import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.TokenStream;
@@ -204,8 +205,9 @@
     TokenStream result = new LowerCaseFilter(matchVersion, source);
     result = new StandardFilter(result);
     result = new StopFilter(matchVersion, result, stopwords);
-    return new TokenStreamComponents(source, new BrazilianStemFilter(result,
-        excltable));
+    if(excltable != null && !excltable.isEmpty())
+      result = new KeywordMarkerTokenFilter(result, excltable);
+    return new TokenStreamComponents(source, new BrazilianStemFilter(result));
   }
 }
 

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java Wed Jan 27 11:19:05 2010
@@ -20,13 +20,21 @@
 import java.io.IOException;
 import java.util.Set;
 
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter; // for javadoc
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
 /**
  * A {@link TokenFilter} that applies {@link BrazilianStemmer}.
- *
+ * <p>
+ * To prevent terms from being stemmed use an instance of
+ * {@link KeywordMarkerTokenFilter} or a custom {@link TokenFilter} that sets
+ * the {@link KeywordAttribute} before this {@link TokenStream}.
+ * </p>
+ * @see KeywordMarkerTokenFilter
+ * 
  */
 public final class BrazilianStemFilter extends TokenFilter {
 
@@ -34,16 +42,31 @@
    * {@link BrazilianStemmer} in use by this filter.
    */
   private BrazilianStemmer stemmer = null;
-  private Set exclusions = null;
-  private TermAttribute termAtt;
-  
+  private Set<?> exclusions = null;
+  private final TermAttribute termAtt;
+  private final KeywordAttribute keywordAttr;
+
+  /**
+   * Creates a new BrazilianStemFilter 
+   * 
+   * @param in the source {@link TokenStream} 
+   */
   public BrazilianStemFilter(TokenStream in) {
     super(in);
     stemmer = new BrazilianStemmer();
     termAtt = addAttribute(TermAttribute.class);
+    keywordAttr = addAttribute(KeywordAttribute.class);
   }
-
-  public BrazilianStemFilter(TokenStream in, Set exclusiontable) {
+  
+  /**
+   * Creates a new BrazilianStemFilter 
+   * 
+   * @param in the source {@link TokenStream} 
+   * @param exclusiontable a set of terms that should be prevented from being stemmed.
+   * @deprecated use {@link KeywordAttribute} with {@link KeywordMarkerTokenFilter} instead.
+   */
+  @Deprecated
+  public BrazilianStemFilter(TokenStream in, Set<?> exclusiontable) {
     this(in);
     this.exclusions = exclusiontable;
   }
@@ -51,10 +74,10 @@
   @Override
   public boolean incrementToken() throws IOException {
     if (input.incrementToken()) {
-      String term = termAtt.term();
+      final String term = termAtt.term();
       // Check the exclusion table.
-      if (exclusions == null || !exclusions.contains(term)) {
-        String s = stemmer.stem(term);
+      if (!keywordAttr.isKeyword() && (exclusions == null || !exclusions.contains(term))) {
+        final String s = stemmer.stem(term);
         // If not stemmed, don't waste the time adjusting the token.
         if ((s != null) && !s.equals(term))
           termAtt.setTermBuffer(s);

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java Wed Jan 27 11:19:05 2010
@@ -21,6 +21,7 @@
 import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
 import org.apache.lucene.analysis.LowerCaseFilter;
 import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.TokenStream;
@@ -105,6 +106,7 @@
 	// TODO once loadStopWords is gone those member should be removed too in favor of StopwordAnalyzerBase
 	private Set<?> stoptable;
   private final Version matchVersion;
+  private final Set<?> stemExclusionTable;
 
   /**
    * Builds an analyzer with the default stop words ({@link #CZECH_STOP_WORDS}).
@@ -124,8 +126,22 @@
    * @param stopwords a stopword set
    */
   public CzechAnalyzer(Version matchVersion, Set<?> stopwords) {
+    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+  }
+  
+  /**
+   * Builds an analyzer with the given stop words and a set of words to be
+   * excluded from the {@link CzechStemFilter}.
+   * 
+   * @param matchVersion Lucene version to match See
+   *          {@link <a href="#version">above</a>}
+   * @param stopwords a stopword set
+   * @param stemExclusionTable a stemming exclusion set
+   */
+  public CzechAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionTable) {
     this.matchVersion = matchVersion;
     this.stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
+    this.stemExclusionTable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionTable));
   }
 
 
@@ -207,7 +223,9 @@
    * @return {@link TokenStreamComponents} built from a {@link StandardTokenizer}
    *         filtered with {@link StandardFilter}, {@link LowerCaseFilter},
    *         {@link StopFilter}, and {@link CzechStemFilter} (only if version is
-   *         >= LUCENE_31)
+   *         >= LUCENE_31). If the version is >= LUCENE_31 and a stem exclusion set
+   *         is provided via {@link #CzechAnalyzer(Version, Set, Set)} a 
+   *         {@link KeywordMarkerTokenFilter} is added before {@link CzechStemFilter}.
    */
   @Override
   protected TokenStreamComponents createComponents(String fieldName,
@@ -216,8 +234,11 @@
     TokenStream result = new StandardFilter(source);
     result = new LowerCaseFilter(matchVersion, result);
     result = new StopFilter( matchVersion, result, stoptable);
-    if (matchVersion.onOrAfter(Version.LUCENE_31))
+    if (matchVersion.onOrAfter(Version.LUCENE_31)) {
+      if(!this.stemExclusionTable.isEmpty())
+        result = new KeywordMarkerTokenFilter(result, stemExclusionTable);
       result = new CzechStemFilter(result);
+    }
     return new TokenStreamComponents(source, result);
   }
 }

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechStemFilter.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechStemFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechStemFilter.java Wed Jan 27 11:19:05 2010
@@ -2,8 +2,10 @@
 
 import java.io.IOException;
 
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;// for javadoc
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
 /**
@@ -25,25 +27,34 @@
 
 /**
  * A {@link TokenFilter} that applies {@link CzechStemmer} to stem Czech words.
- * 
+ * <p>
+ * To prevent terms from being stemmed use an instance of
+ * {@link KeywordMarkerTokenFilter} or a custom {@link TokenFilter} that sets
+ * the {@link KeywordAttribute} before this {@link TokenStream}.
+ * </p>
  * <p><b>NOTE</b>: Input is expected to be in lowercase, 
  * but with diacritical marks</p>
+ * @see KeywordMarkerTokenFilter
  */
 public final class CzechStemFilter extends TokenFilter {
   private final CzechStemmer stemmer;
   private final TermAttribute termAtt;
+  private final KeywordAttribute keywordAttr;
   
   public CzechStemFilter(TokenStream input) {
     super(input);
     stemmer = new CzechStemmer();
     termAtt = addAttribute(TermAttribute.class);
+    keywordAttr = addAttribute(KeywordAttribute.class);
   }
 
   @Override
   public boolean incrementToken() throws IOException {
     if (input.incrementToken()) {
-      int newlen = stemmer.stem(termAtt.termBuffer(), termAtt.termLength());
-      termAtt.setTermLength(newlen);
+      if(!keywordAttr.isKeyword()) {
+        final int newlen = stemmer.stem(termAtt.termBuffer(), termAtt.termLength());
+        termAtt.setTermLength(newlen);
+      }
       return true;
     } else {
       return false;

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java Wed Jan 27 11:19:05 2010
@@ -30,6 +30,7 @@
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.LowerCaseFilter;
 import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
 import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.TokenStream;
@@ -208,6 +209,7 @@
     TokenStream result = new StandardFilter(source);
     result = new LowerCaseFilter(matchVersion, result);
     result = new StopFilter( matchVersion, result, stopwords);
-    return new TokenStreamComponents(source, new GermanStemFilter(result, exclusionSet));
+    result = new KeywordMarkerTokenFilter(result, exclusionSet);
+    return new TokenStreamComponents(source, new GermanStemFilter(result));
   }
 }

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java Wed Jan 27 11:19:05 2010
@@ -20,8 +20,10 @@
 import java.io.IOException;
 import java.util.Set;
 
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;// for javadoc
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
 /**
@@ -31,6 +33,12 @@
  * not be stemmed at all. The stemmer used can be changed at runtime after the
  * filter object is created (as long as it is a {@link GermanStemmer}).
  * </p>
+ * <p>
+ * To prevent terms from being stemmed use an instance of
+ * {@link KeywordMarkerTokenFilter} or a custom {@link TokenFilter} that sets
+ * the {@link KeywordAttribute} before this {@link TokenStream}.
+ * </p>
+ * @see KeywordMarkerTokenFilter
  */
 public final class GermanStemFilter extends TokenFilter
 {
@@ -38,21 +46,29 @@
      * The actual token in the input stream.
      */
     private GermanStemmer stemmer = null;
-    private Set exclusionSet = null;
+    private Set<?> exclusionSet = null;
 
-    private TermAttribute termAtt;
+    private final TermAttribute termAtt;
+    private final KeywordAttribute keywordAttr;
 
+    /**
+     * Creates a {@link GermanStemFilter} instance
+     * @param in the source {@link TokenStream} 
+     */
     public GermanStemFilter( TokenStream in )
     {
       super(in);
       stemmer = new GermanStemmer();
       termAtt = addAttribute(TermAttribute.class);
+      keywordAttr = addAttribute(KeywordAttribute.class);
     }
 
     /**
      * Builds a GermanStemFilter that uses an exclusion table.
+     * @deprecated use {@link KeywordAttribute} with {@link KeywordMarkerTokenFilter} instead.
      */
-    public GermanStemFilter( TokenStream in, Set exclusionSet )
+    @Deprecated
+    public GermanStemFilter( TokenStream in, Set<?> exclusionSet )
     {
       this( in );
       this.exclusionSet = exclusionSet;
@@ -66,7 +82,7 @@
       if (input.incrementToken()) {
         String term = termAtt.term();
         // Check the exclusion table.
-        if (exclusionSet == null || !exclusionSet.contains(term)) {
+        if (!keywordAttr.isKeyword() && (exclusionSet == null || !exclusionSet.contains(term))) {
           String s = stemmer.stem(term);
           // If not stemmed, don't waste the time adjusting the token.
           if ((s != null) && !s.equals(term))
@@ -91,8 +107,10 @@
 
     /**
      * Set an alternative exclusion list for this filter.
+     * @deprecated use {@link KeywordAttribute} with {@link KeywordMarkerTokenFilter} instead.
      */
-    public void setExclusionSet( Set exclusionSet )
+    @Deprecated
+    public void setExclusionSet( Set<?> exclusionSet )
     {
       this.exclusionSet = exclusionSet;
     }

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java Wed Jan 27 11:19:05 2010
@@ -21,6 +21,7 @@
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.LowerCaseFilter;
 import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
 import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.TokenStream;
@@ -215,7 +216,9 @@
     final Tokenizer source = new StandardTokenizer(matchVersion, reader);
     TokenStream result = new StandardFilter(source);
     result = new StopFilter(matchVersion, result, stopwords);
-    result = new FrenchStemFilter(result, excltable);
+    if(!excltable.isEmpty())
+      result = new KeywordMarkerTokenFilter(result, excltable);
+    result = new FrenchStemFilter(result);
     // Convert to lowercase after stemming!
     return new TokenStreamComponents(source, new LowerCaseFilter(matchVersion, result));
   }

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java Wed Jan 27 11:19:05 2010
@@ -17,8 +17,10 @@
  * limitations under the License.
  */
 
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;// for javadoc
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
 import java.io.IOException;
@@ -29,10 +31,15 @@
 /**
  * A {@link TokenFilter} that stems french words. 
  * <p>
- * It supports a table of words that should
- * not be stemmed at all. The used stemmer can be changed at runtime after the
+ * The used stemmer can be changed at runtime after the
  * filter object is created (as long as it is a {@link FrenchStemmer}).
  * </p>
+ * <p>
+ * To prevent terms from being stemmed use an instance of
+ * {@link KeywordMarkerTokenFilter} or a custom {@link TokenFilter} that sets
+ * the {@link KeywordAttribute} before this {@link TokenStream}.
+ * </p>
+ * @see KeywordMarkerTokenFilter
  */
 public final class FrenchStemFilter extends TokenFilter {
 
@@ -40,18 +47,26 @@
 	 * The actual token in the input stream.
 	 */
 	private FrenchStemmer stemmer = null;
-	private Set exclusions = null;
+	private Set<?> exclusions = null;
 	
-	private TermAttribute termAtt;
+	private final TermAttribute termAtt;
+  private final KeywordAttribute keywordAttr;
 
 	public FrenchStemFilter( TokenStream in ) {
           super(in);
 		stemmer = new FrenchStemmer();
 		termAtt = addAttribute(TermAttribute.class);
+    keywordAttr = addAttribute(KeywordAttribute.class);
 	}
 
-
-	public FrenchStemFilter( TokenStream in, Set exclusiontable ) {
+  /**
+   * Builds a FrenchStemFilter that uses an exclusion table.
+   * @param in the {@link TokenStream} to filter
+   * @param exclusiontable a set of terms not to be stemmed
+   * @deprecated use {@link KeywordAttribute} with {@link KeywordMarkerTokenFilter} instead.
+   */
+	@Deprecated // TODO remove in 3.2
+	public FrenchStemFilter( TokenStream in, Set<?> exclusiontable ) {
 		this( in );
 		exclusions = exclusiontable;
 	}
@@ -65,7 +80,7 @@
 	    String term = termAtt.term();
 
 	    // Check the exclusion table
-	    if ( exclusions == null || !exclusions.contains( term ) ) {
+	    if ( !keywordAttr.isKeyword() && (exclusions == null || !exclusions.contains( term )) ) {
 	      String s = stemmer.stem( term );
 	      // If not stemmed, don't waste the time  adjusting the token.
 	      if ((s != null) && !s.equals( term ) )
@@ -86,8 +101,10 @@
 	}
 	/**
 	 * Set an alternative exclusion list for this filter.
+   * @deprecated use {@link KeywordAttribute} with {@link KeywordMarkerTokenFilter} instead.
 	 */
-	public void setExclusionTable( Map exclusiontable ) {
+	@Deprecated // TODO remove in 3.2
+	public void setExclusionTable( Map<?,?> exclusiontable ) {
 		exclusions = new HashSet(exclusiontable.keySet());
 	}
 }

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java Wed Jan 27 11:19:05 2010
@@ -19,6 +19,8 @@
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
+import org.apache.lucene.analysis.ReusableAnalyzerBase;
 import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
@@ -52,7 +54,7 @@
  * <p><b>NOTE</b>: This class uses the same {@link Version}
  * dependent settings as {@link StandardAnalyzer}.</p>
  */
-public final class DutchAnalyzer extends Analyzer {
+public final class DutchAnalyzer extends ReusableAnalyzerBase {
   /**
    * List of typical Dutch stopwords.
    * @deprecated use {@link #getDefaultStopSet()} instead
@@ -215,28 +217,7 @@
     }
   }
 
-  /**
-   * Creates a {@link TokenStream} which tokenizes all the text in the 
-   * provided {@link Reader}.
-   *
-   * @return A {@link TokenStream} built from a {@link StandardTokenizer}
-   *   filtered with {@link StandardFilter}, {@link StopFilter}, 
-   *   and {@link DutchStemFilter}
-   */
-  @Override
-  public TokenStream tokenStream(String fieldName, Reader reader) {
-    TokenStream result = new StandardTokenizer(matchVersion, reader);
-    result = new StandardFilter(result);
-    result = new StopFilter(matchVersion, result, stoptable);
-    result = new DutchStemFilter(result, excltable, stemdict);
-    return result;
-  }
-  
-  private class SavedStreams {
-    Tokenizer source;
-    TokenStream result;
-  };
-  
+
   /**
    * Returns a (possibly reused) {@link TokenStream} which tokenizes all the 
    * text in the provided {@link Reader}.
@@ -246,19 +227,14 @@
    *   and {@link DutchStemFilter}
    */
   @Override
-  public TokenStream reusableTokenStream(String fieldName, Reader reader)
-      throws IOException {
-    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
-    if (streams == null) {
-      streams = new SavedStreams();
-      streams.source = new StandardTokenizer(matchVersion, reader);
-      streams.result = new StandardFilter(streams.source);
-      streams.result = new StopFilter(matchVersion, streams.result, stoptable);
-      streams.result = new DutchStemFilter(streams.result, excltable, stemdict);
-      setPreviousTokenStream(streams);
-    } else {
-      streams.source.reset(reader);
-    }
-    return streams.result;
+  protected TokenStreamComponents createComponents(String fieldName,
+      Reader aReader) {
+    final Tokenizer source = new StandardTokenizer(matchVersion, aReader);
+    TokenStream result = new StandardFilter(source);
+    result = new StopFilter(matchVersion, result, stoptable);
+    if (!excltable.isEmpty())
+      result = new KeywordMarkerTokenFilter(result, excltable);
+    result = new DutchStemFilter(result, stemdict);
+    return new TokenStreamComponents(source, result);
   }
 }

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java Wed Jan 27 11:19:05 2010
@@ -23,8 +23,10 @@
 import java.util.Map;
 import java.util.Set;
 
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;// for javadoc
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
 /**
@@ -34,34 +36,54 @@
  * not be stemmed at all. The stemmer used can be changed at runtime after the
  * filter object is created (as long as it is a {@link DutchStemmer}).
  * </p>
+ * <p>
+ * To prevent terms from being stemmed use an instance of
+ * {@link KeywordMarkerTokenFilter} or a custom {@link TokenFilter} that sets
+ * the {@link KeywordAttribute} before this {@link TokenStream}.
+ * </p>
+ * @see KeywordMarkerTokenFilter
  */
 public final class DutchStemFilter extends TokenFilter {
   /**
    * The actual token in the input stream.
    */
   private DutchStemmer stemmer = null;
-  private Set exclusions = null;
+  private Set<?> exclusions = null;
   
-  private TermAttribute termAtt;
+  private final TermAttribute termAtt;
+  private final KeywordAttribute keywordAttr;
 
   public DutchStemFilter(TokenStream _in) {
     super(_in);
     stemmer = new DutchStemmer();
     termAtt = addAttribute(TermAttribute.class);
+    keywordAttr = addAttribute(KeywordAttribute.class);
   }
 
   /**
    * Builds a DutchStemFilter that uses an exclusion table.
+   * @deprecated use {@link KeywordAttribute} with {@link KeywordMarkerTokenFilter} instead.
    */
-  public DutchStemFilter(TokenStream _in, Set exclusiontable) {
+  @Deprecated
+  public DutchStemFilter(TokenStream _in, Set<?> exclusiontable) {
     this(_in);
     exclusions = exclusiontable;
   }
+  
+  /**
+   * @param stemdictionary dictionary of word-stem pairs that overrule the stemming algorithm
+   */
+  public DutchStemFilter(TokenStream _in,  Map<?,?> stemdictionary) {
+    this(_in);
+    stemmer.setStemDictionary(stemdictionary);
+  }
 
   /**
    * @param stemdictionary Dictionary of word stem pairs, that overrule the algorithm
+   * @deprecated use {@link KeywordAttribute} with {@link KeywordMarkerTokenFilter} instead.
    */
-  public DutchStemFilter(TokenStream _in, Set exclusiontable, Map stemdictionary) {
+  @Deprecated
+  public DutchStemFilter(TokenStream _in, Set<?> exclusiontable, Map<?,?> stemdictionary) {
     this(_in, exclusiontable);
     stemmer.setStemDictionary(stemdictionary);
   }
@@ -72,11 +94,11 @@
   @Override
   public boolean incrementToken() throws IOException {
     if (input.incrementToken()) {
-      String term = termAtt.term();
+      final String term = termAtt.term();
 
       // Check the exclusion table.
-      if (exclusions == null || !exclusions.contains(term)) {
-        String s = stemmer.stem(term);
+      if (!keywordAttr.isKeyword() && (exclusions == null || !exclusions.contains(term))) {
+        final String s = stemmer.stem(term);
         // If not stemmed, don't waste the time adjusting the token.
         if ((s != null) && !s.equals(term))
           termAtt.setTermBuffer(s);
@@ -98,8 +120,10 @@
 
   /**
    * Set an alternative exclusion list for this filter.
+   * @deprecated use {@link KeywordAttribute} with {@link KeywordMarkerTokenFilter} instead.
    */
-  public void setExclusionTable(HashSet exclusiontable) {
+  @Deprecated
+  public void setExclusionTable(HashSet<?> exclusiontable) {
     exclusions = exclusiontable;
   }
 
@@ -107,7 +131,7 @@
    * Set dictionary for stemming, this dictionary overrules the algorithm,
    * so you can correct for a particular unwanted word-stem pair.
    */
-  public void setStemDictionary(HashMap dict) {
+  public void setStemDictionary(HashMap<?,?> dict) {
     if (stemmer != null)
       stemmer.setStemDictionary(dict);
   }

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java Wed Jan 27 11:19:05 2010
@@ -26,6 +26,7 @@
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.LowerCaseFilter;
 import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
 import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.TokenStream;
@@ -63,6 +64,17 @@
           .unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT, 
               Arrays.asList(RUSSIAN_STOP_WORDS), false));
     }
+    
+    private final Set<?> stemExclusionSet;
+    
+    /**
+     * Returns an unmodifiable instance of the default stop-words set.
+     * 
+     * @return an unmodifiable instance of the default stop-words set.
+     */
+    public static Set<?> getDefaultStopSet() {
+      return DefaultSetHolder.DEFAULT_STOP_SET;
+    }
 
     public RussianAnalyzer(Version matchVersion) {
       this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
@@ -86,9 +98,24 @@
      *          a stopword set
      */
     public RussianAnalyzer(Version matchVersion, Set<?> stopwords){
+      this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+    }
+    
+    /**
+     * Builds an analyzer with the given stop words and stem exclusion set.
+     * 
+     * @param matchVersion
+     *          lucene compatibility version
+     * @param stopwords
+     *          a stopword set
+     * @param stemExclusionSet a set of words not to be stemmed
+     */
+    public RussianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet){
       super(matchVersion, stopwords);
+      this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
     }
    
+   
     /**
      * Builds an analyzer with the given stop words.
      * TODO: create a Set version of this ctor
@@ -115,6 +142,8 @@
       final Tokenizer source = new RussianLetterTokenizer(reader);
       TokenStream result = new LowerCaseFilter(matchVersion, source);
       result = new StopFilter(matchVersion, result, stopwords);
+      if(!stemExclusionSet.isEmpty())
+        result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
       return new TokenStreamComponents(source, new RussianStemFilter(result));
       
     }

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java Wed Jan 27 11:19:05 2010
@@ -17,9 +17,11 @@
  * limitations under the License.
  */
 
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;// for javadoc
 import org.apache.lucene.analysis.LowerCaseFilter; // for javadoc
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.analysis.ru.RussianStemmer;//javadoc @link
 
@@ -32,6 +34,12 @@
  * The input should be filtered by {@link LowerCaseFilter} before passing it to RussianStemFilter ,
  * because RussianStemFilter only works with lowercase characters.
  * </p>
+ * <p>
+ * To prevent terms from being stemmed use an instance of
+ * {@link KeywordMarkerTokenFilter} or a custom {@link TokenFilter} that sets
+ * the {@link KeywordAttribute} before this {@link TokenStream}.
+ * </p>
+ * @see KeywordMarkerTokenFilter
  */
 public final class RussianStemFilter extends TokenFilter
 {
@@ -40,13 +48,15 @@
      */
     private RussianStemmer stemmer = null;
 
-    private TermAttribute termAtt;
+    private final TermAttribute termAtt;
+    private final KeywordAttribute keywordAttr;
 
     public RussianStemFilter(TokenStream in)
     {
         super(in);
         stemmer = new RussianStemmer();
         termAtt = addAttribute(TermAttribute.class);
+        keywordAttr = addAttribute(KeywordAttribute.class);
     }
     /**
      * Returns the next token in the stream, or null at EOS
@@ -55,10 +65,12 @@
     public final boolean incrementToken() throws IOException
     {
       if (input.incrementToken()) {
-        String term = termAtt.term();
-        String s = stemmer.stem(term);
-        if (s != null && !s.equals(term))
-          termAtt.setTermBuffer(s);
+        if(!keywordAttr.isKeyword()) {
+          final String term = termAtt.term();
+          final String s = stemmer.stem(term);
+          if (s != null && !s.equals(term))
+            termAtt.setTermBuffer(s);
+        }
         return true;
       } else {
         return false;

Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java Wed Jan 27 11:19:05 2010
@@ -17,11 +17,15 @@
  * limitations under the License.
  */
 
+import java.io.IOException;
 import java.util.Collections;
 import java.util.HashSet;
 import java.util.Set;
 
+import javax.print.DocFlavor.CHAR_ARRAY;
+
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.util.Version;
 
 /**
@@ -84,4 +88,17 @@
     assertAnalyzesTo(a, "The quick brown fox.", new String[] { "quick",
         "brown", "fox" });
   }
+  
+  public void testWithStemExclusionSet() throws IOException {
+    Set<String> set = new HashSet<String>();
+    set.add("ساهدهات");
+    ArabicAnalyzer a = new ArabicAnalyzer(Version.LUCENE_CURRENT, CharArraySet.EMPTY_SET, set);
+    assertAnalyzesTo(a, "كبيرة the quick ساهدهات", new String[] { "كبير","the", "quick", "ساهدهات" });
+    assertAnalyzesToReuse(a, "كبيرة the quick ساهدهات", new String[] { "كبير","the", "quick", "ساهدهات" });
+
+    
+    a = new ArabicAnalyzer(Version.LUCENE_CURRENT, CharArraySet.EMPTY_SET, CharArraySet.EMPTY_SET);
+    assertAnalyzesTo(a, "كبيرة the quick ساهدهات", new String[] { "كبير","the", "quick", "ساهد" });
+    assertAnalyzesToReuse(a, "كبيرة the quick ساهدهات", new String[] { "كبير","the", "quick", "ساهد" });
+  }
 }

Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java Wed Jan 27 11:19:05 2010
@@ -21,7 +21,9 @@
 import java.io.StringReader;
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
+import org.apache.lucene.util.Version;
 
 /**
  * Test the Arabic Normalization Filter
@@ -112,11 +114,19 @@
   public void testNonArabic() throws IOException {
     check("English", "English");
   }
+  
+  public void testWithKeywordAttribute() throws IOException {
+    CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+    set.add("ساهدهات");
+    ArabicLetterTokenizer tokenStream  = new ArabicLetterTokenizer(new StringReader("ساهدهات"));
+
+    ArabicStemFilter filter = new ArabicStemFilter(new KeywordMarkerTokenFilter(tokenStream, set));
+    assertTokenStreamContents(filter, new String[]{"ساهدهات"});
+  }
 
   private void check(final String input, final String expected) throws IOException {
     ArabicLetterTokenizer tokenStream  = new ArabicLetterTokenizer(new StringReader(input));
     ArabicStemFilter filter = new ArabicStemFilter(tokenStream);
     assertTokenStreamContents(filter, new String[]{expected});
   }
-
 }

Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianAnalyzer.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianAnalyzer.java Wed Jan 27 11:19:05 2010
@@ -22,6 +22,7 @@
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.util.Version;
 
 /**
@@ -67,4 +68,11 @@
     
     assertAnalyzesTo(a, "градове", new String[] {"град"});
   }
+  
+  public void testWithStemExclusionSet() throws IOException {
+    CharArraySet set = new CharArraySet(Version.LUCENE_31, 1, true);
+    set.add("строеве");
+    Analyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT, CharArraySet.EMPTY_SET, set);
+    assertAnalyzesTo(a, "строевете строеве", new String[] { "строй", "строеве" });
+  }
 }

Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianStemmer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianStemmer.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianStemmer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianStemmer.java Wed Jan 27 11:19:05 2010
@@ -18,8 +18,12 @@
  */
 
 import java.io.IOException;
+import java.io.StringReader;
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
 import org.apache.lucene.util.Version;
 
 /**
@@ -207,4 +211,15 @@
     assertAnalyzesTo(a, "строя", new String[] {"стр"});
     assertAnalyzesTo(a, "строят", new String[] {"стр"});
   }
+
+  public void testWithKeywordAttribute() throws IOException {
+    CharArraySet set = new CharArraySet(Version.LUCENE_31, 1, true);
+    set.add("строеве");
+    WhitespaceTokenizer tokenStream = new WhitespaceTokenizer(
+        new StringReader("строевете строеве"));
+
+    BulgarianStemFilter filter = new BulgarianStemFilter(
+        new KeywordMarkerTokenFilter(tokenStream, set));
+    assertTokenStreamContents(filter, new String[] { "строй", "строеве" });
+  }
 }

Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java Wed Jan 27 11:19:05 2010
@@ -17,12 +17,14 @@
  * limitations under the License.
  */
 
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashSet;
+import java.io.IOException;
+import java.io.StringReader;
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
+import org.apache.lucene.analysis.LowerCaseTokenizer;
 import org.apache.lucene.util.Version;
 
 /**
@@ -139,6 +141,34 @@
     checkReuse(a, "quintessência", "quintessência"); // excluded words will be completely unchanged.
   }
   
+  public void testStemExclusionTableBWCompat() throws IOException {
+    CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+    set.add("Brasília");
+    BrazilianStemFilter filter = new BrazilianStemFilter(
+        new LowerCaseTokenizer(new StringReader("Brasília Brasilia")), set);
+    assertTokenStreamContents(filter, new String[] { "brasília", "brasil" });
+  }
+
+  public void testWithKeywordAttribute() throws IOException {
+    CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+    set.add("Brasília");
+    BrazilianStemFilter filter = new BrazilianStemFilter(
+        new KeywordMarkerTokenFilter(new LowerCaseTokenizer(new StringReader(
+            "Brasília Brasilia")), set));
+    assertTokenStreamContents(filter, new String[] { "brasília", "brasil" });
+  }
+
+  public void testWithKeywordAttributeAndExclusionTable() throws IOException {
+    CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+    set.add("Brasília");
+    CharArraySet set1 = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+    set1.add("Brasilia");
+    BrazilianStemFilter filter = new BrazilianStemFilter(
+        new KeywordMarkerTokenFilter(new LowerCaseTokenizer(new StringReader(
+            "Brasília Brasilia")), set), set1);
+    assertTokenStreamContents(filter, new String[] { "brasília", "brasilia" });
+  }
+  
   /* 
    * Test that changes to the exclusion table are applied immediately
    * when using reusable token streams.

Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java Wed Jan 27 11:19:05 2010
@@ -24,6 +24,7 @@
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.util.Version;
 
 /**
@@ -109,5 +110,11 @@
     
     assertAnalyzesToReuse(cz, "Česká Republika", new String[] { "česká" });
   }
-
+  
+  public void testWithStemExclusionSet() throws IOException{
+    CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+    set.add("hole");
+    CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_CURRENT, CharArraySet.EMPTY_SET, set);
+    assertAnalyzesTo(cz, "hole desek", new String[] {"hole", "desk"});
+  }
 }

Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechStemmer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechStemmer.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechStemmer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechStemmer.java Wed Jan 27 11:19:05 2010
@@ -18,8 +18,12 @@
  */
 
 import java.io.IOException;
+import java.io.StringReader;
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
 import org.apache.lucene.util.Version;
 
 /**
@@ -270,4 +274,13 @@
     assertAnalyzesTo(cz, "e", new String[] { "e" });
     assertAnalyzesTo(cz, "zi", new String[] { "zi" });
   }
+  
+  public void testWithKeywordAttribute() throws IOException {
+    CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+    set.add("hole");
+    CzechStemFilter filter = new CzechStemFilter(new KeywordMarkerTokenFilter(
+        new WhitespaceTokenizer(new StringReader("hole desek")), set));
+    assertTokenStreamContents(filter, new String[] { "hole", "desk" });
+  }
+  
 }

Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java Wed Jan 27 11:19:05 2010
@@ -20,10 +20,15 @@
 import java.io.BufferedReader;
 import java.io.File;
 import java.io.FileInputStream;
+import java.io.IOException;
 import java.io.InputStreamReader;
+import java.io.StringReader;
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
+import org.apache.lucene.analysis.LowerCaseTokenizer;
 import org.apache.lucene.util.Version;
 
 /**
@@ -64,6 +69,37 @@
     checkReuse(a, "Tischen", "tisch");
   }
   
+  public void testExclusionTableBWCompat() throws IOException {
+    GermanStemFilter filter = new GermanStemFilter(new LowerCaseTokenizer(
+        new StringReader("Fischen Trinken")));
+    CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+    set.add("fischen");
+    filter.setExclusionSet(set);
+    assertTokenStreamContents(filter, new String[] { "fischen", "trink" });
+  }
+
+  public void testWithKeywordAttribute() throws IOException {
+    CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+    set.add("fischen");
+    GermanStemFilter filter = new GermanStemFilter(
+        new KeywordMarkerTokenFilter(new LowerCaseTokenizer(new StringReader(
+            "Fischen Trinken")), set));
+    assertTokenStreamContents(filter, new String[] { "fischen", "trink" });
+  }
+
+  public void testWithKeywordAttributeAndExclusionTable() throws IOException {
+    CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+    set.add("fischen");
+    CharArraySet set1 = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+    set1.add("trinken");
+    set1.add("fischen");
+    GermanStemFilter filter = new GermanStemFilter(
+        new KeywordMarkerTokenFilter(new LowerCaseTokenizer(new StringReader(
+            "Fischen Trinken")), set));
+    filter.setExclusionSet(set1);
+    assertTokenStreamContents(filter, new String[] { "fischen", "trinken" });
+  }
+  
   /* 
    * Test that changes to the exclusion table are applied immediately
    * when using reusable token streams.
@@ -75,6 +111,7 @@
     checkReuse(a, "tischen", "tischen");
   }
   
+  
   private void check(final String input, final String expected) throws Exception {
     checkOneTerm(new GermanAnalyzer(Version.LUCENE_CURRENT), input, expected);
   }

Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java Wed Jan 27 11:19:05 2010
@@ -17,11 +17,8 @@
  * limitations under the License.
  */
 
-import java.io.StringReader;
-
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.util.Version;
 
 /**
@@ -147,4 +144,17 @@
 	  fa.setStemExclusionTable(new String[] { "habitable" });
 	  assertAnalyzesToReuse(fa, "habitable", new String[] { "habitable" });
 	}
+	
+  public void testExclusionTableViaCtor() throws Exception {
+    CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+    set.add("habitable");
+    FrenchAnalyzer fa = new FrenchAnalyzer(Version.LUCENE_CURRENT,
+        CharArraySet.EMPTY_SET, set);
+    assertAnalyzesToReuse(fa, "habitable chiste", new String[] { "habitable",
+        "chist" });
+
+    fa = new FrenchAnalyzer(Version.LUCENE_CURRENT, CharArraySet.EMPTY_SET, set);
+    assertAnalyzesTo(fa, "habitable chiste", new String[] { "habitable",
+        "chist" });
+  }
 }

Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java Wed Jan 27 11:19:05 2010
@@ -18,9 +18,11 @@
  */
 
 import java.io.File;
+import java.io.IOException;
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.util.Version;
 
 /**
@@ -133,6 +135,19 @@
     checkOneTermReuse(a, "lichamelijk", "licham");
     a.setStemExclusionTable(new String[] { "lichamelijk" });
     checkOneTermReuse(a, "lichamelijk", "lichamelijk");
+
+    
+  }
+  
+  public void testExclusionTableViaCtor() throws IOException {
+    CharArraySet set = new CharArraySet(Version.LUCENE_30, 1, true);
+    set.add("lichamelijk");
+    DutchAnalyzer a = new DutchAnalyzer(Version.LUCENE_CURRENT, CharArraySet.EMPTY_SET, set);
+    assertAnalyzesToReuse(a, "lichamelijk lichamelijke", new String[] { "lichamelijk", "licham" });
+    
+    a = new DutchAnalyzer(Version.LUCENE_CURRENT, CharArraySet.EMPTY_SET, set);
+    assertAnalyzesTo(a, "lichamelijk lichamelijke", new String[] { "lichamelijk", "licham" });
+
   }
   
   /* 

Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java Wed Jan 27 11:19:05 2010
@@ -26,6 +26,7 @@
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.util.Version;
@@ -116,4 +117,14 @@
       assertAnalyzesToReuse(a, "Но знание это хранилось в тайне",
           new String[] { "знан", "хран", "тайн" });
     }
+    
+    
+    public void testWithStemExclusionSet() throws Exception {
+      CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+      set.add("представление");
+      Analyzer a = new RussianAnalyzer(Version.LUCENE_CURRENT, RussianAnalyzer.getDefaultStopSet() , set);
+      assertAnalyzesToReuse(a, "Вместе с тем о силе электромагнитной энергии имели представление еще",
+          new String[] { "вмест", "сил", "электромагнитн", "энерг", "имел", "представление" });
+     
+    }
 }

Added: lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordMarkerTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordMarkerTokenFilter.java?rev=903608&view=auto
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordMarkerTokenFilter.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordMarkerTokenFilter.java Wed Jan 27 11:19:05 2010
@@ -0,0 +1,82 @@
+package org.apache.lucene.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Set;
+
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.util.Version;
+
+/**
+ * Marks terms as keywords via the {@link KeywordAttribute}. Each token whose
+ * term is contained in the provided set is marked as a keyword by setting
+ * {@link KeywordAttribute#setKeyword(boolean)} to <code>true</code>.
+ * 
+ * @see KeywordAttribute
+ */
+public final class KeywordMarkerTokenFilter extends TokenFilter {
+
+  private final KeywordAttribute keywordAttr;
+  private final TermAttribute termAtt;
+  private final CharArraySet keywordSet;
+
+  /**
+   * Creates a new KeywordMarkerTokenFilter that marks the current token as a
+   * keyword, via the {@link KeywordAttribute}, if the token's term buffer is
+   * contained in the given set. The set is used as passed in, not copied.
+   * 
+   * @param in
+   *          TokenStream to filter
+   * @param keywordSet
+   *          the set of keywords the current term buffer is looked up in
+   */
+  public KeywordMarkerTokenFilter(final TokenStream in,
+      final CharArraySet keywordSet) {
+    super(in);
+    termAtt = addAttribute(TermAttribute.class);
+    keywordAttr = addAttribute(KeywordAttribute.class);
+    this.keywordSet = keywordSet;
+  }
+
+  /**
+   * Creates a new KeywordMarkerTokenFilter that marks the current token as a
+   * keyword, via the {@link KeywordAttribute}, if the token's term buffer is
+   * contained in the given set. A non-{@link CharArraySet} set is copied.
+   * 
+   * @param in
+   *          TokenStream to filter
+   * @param keywordSet
+   *          the set of keywords the current term buffer is looked up in
+   */
+  public KeywordMarkerTokenFilter(final TokenStream in, final Set<?> keywordSet) {
+    this(in, keywordSet instanceof CharArraySet ? (CharArraySet) keywordSet
+        : CharArraySet.copy(Version.LUCENE_31, keywordSet));
+  }
+
+  @Override
+  public final boolean incrementToken() throws IOException {
+    if (input.incrementToken()) {
+      keywordAttr.setKeyword(keywordSet.contains(termAtt.termBuffer(), 0,
+          termAtt.termLength()));
+      return true;
+    } else
+      return false;
+  }
+}

Propchange: lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordMarkerTokenFilter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordMarkerTokenFilter.java
------------------------------------------------------------------------------
    svn:keywords = Date Author Id Revision HeadURL

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/PorterStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/PorterStemFilter.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/PorterStemFilter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/PorterStemFilter.java Wed Jan 27 11:19:05 2010
@@ -19,6 +19,7 @@
 
 import java.io.IOException;
 
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
 /** Transforms the token stream as per the Porter stemming algorithm.
@@ -38,15 +39,23 @@
       }
     }
     </PRE>
+    <p>
+    Note: This filter is aware of the {@link KeywordAttribute}. To prevent
+    certain terms from being passed to the stemmer
+    {@link KeywordAttribute#isKeyword()} should be set to <code>true</code>
+    in a previous {@link TokenStream}.
+    </p>
 */
 public final class PorterStemFilter extends TokenFilter {
-  private PorterStemmer stemmer;
-  private TermAttribute termAtt;
+  private final PorterStemmer stemmer;
+  private final TermAttribute termAtt;
+  private final KeywordAttribute keywordAttr;
 
   public PorterStemFilter(TokenStream in) {
     super(in);
     stemmer = new PorterStemmer();
     termAtt = addAttribute(TermAttribute.class);
+    keywordAttr = addAttribute(KeywordAttribute.class);
   }
 
   @Override
@@ -54,7 +63,7 @@
     if (!input.incrementToken())
       return false;
 
-    if (stemmer.stem(termAtt.termBuffer(), 0, termAtt.termLength()))
+    if ((!keywordAttr.isKeyword()) && stemmer.stem(termAtt.termBuffer(), 0, termAtt.termLength()))
       termAtt.setTermBuffer(stemmer.getResultBuffer(), 0, stemmer.getResultLength());
     return true;
   }

Added: lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/KeywordAttribute.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/KeywordAttribute.java?rev=903608&view=auto
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/KeywordAttribute.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/KeywordAttribute.java Wed Jan 27 11:19:05 2010
@@ -0,0 +1,49 @@
+package org.apache.lucene.analysis.tokenattributes;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.util.Attribute;
+
+/**
+ * This attribute can be used to mark a token as a keyword. Keyword aware
+ * {@link TokenStream}s can decide whether to modify a token based on the
+ * return value of {@link #isKeyword()}. Stemming filters, for instance, can
+ * use this attribute to skip a term if {@link #isKeyword()} returns
+ * <code>true</code>.
+ */
+public interface KeywordAttribute extends Attribute {
+
+  /**
+   * Returns <code>true</code> iff the current token is a keyword, otherwise
+   * <code>false</code>.
+   * 
+   * @return <code>true</code> iff the current token is a keyword, otherwise
+   *         <code>false</code>.
+   */
+  public boolean isKeyword();
+
+  /**
+   * Marks the current token as keyword iff set to <code>true</code>.
+   * 
+   * @param isKeyword
+   *          <code>true</code> iff the current token is a keyword, otherwise
+   *          <code>false</code>.
+   */
+  public void setKeyword(boolean isKeyword);
+}

Propchange: lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/KeywordAttribute.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/KeywordAttribute.java
------------------------------------------------------------------------------
    svn:keywords = Date Author Id Revision HeadURL

Added: lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/KeywordAttributeImpl.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/KeywordAttributeImpl.java?rev=903608&view=auto
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/KeywordAttributeImpl.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/KeywordAttributeImpl.java Wed Jan 27 11:19:05 2010
@@ -0,0 +1,82 @@
+package org.apache.lucene.analysis.tokenattributes;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.util.AttributeImpl;
+
+/**
+ * This attribute can be used to mark a token as a keyword. Keyword aware
+ * {@link TokenStream}s can decide whether to modify a token based on the
+ * return value of {@link #isKeyword()}. Stemming filters, for instance, can
+ * use this attribute to skip a term if {@link #isKeyword()} returns
+ * <code>true</code>.
+ */
+public final class KeywordAttributeImpl extends AttributeImpl implements
+    KeywordAttribute {
+  private boolean keyword;
+
+  @Override
+  public void clear() {
+    keyword = false;
+  }
+
+  @Override
+  public void copyTo(AttributeImpl target) {
+    // target must implement KeywordAttribute; the cast fails otherwise
+    ((KeywordAttribute) target).setKeyword(keyword);
+  }
+
+  @Override
+  public int hashCode() {
+    return keyword ? 31 : 37;
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj)
+      return true;
+    // equals(null) must answer false instead of throwing a NullPointerException
+    if (obj == null || getClass() != obj.getClass())
+      return false;
+    return keyword == ((KeywordAttributeImpl) obj).keyword;
+  }
+
+  /**
+   * Returns <code>true</code> iff the current token is a keyword, otherwise
+   * <code>false</code>.
+   * 
+   * @return <code>true</code> iff the current token is a keyword, otherwise
+   *         <code>false</code>.
+   */
+  public boolean isKeyword() {
+    return keyword;
+  }
+
+  /**
+   * Marks the current token as keyword iff set to <code>true</code>.
+   * 
+   * @param isKeyword
+   *          <code>true</code> iff the current token is a keyword, otherwise
+   *          <code>false</code>.
+   */
+  public void setKeyword(boolean isKeyword) {
+    keyword = isKeyword;
+  }
+
+}

Propchange: lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/KeywordAttributeImpl.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/KeywordAttributeImpl.java
------------------------------------------------------------------------------
    svn:keywords = Date Author Id Revision HeadURL

Added: lucene/java/trunk/src/test/org/apache/lucene/analysis/TestKeywordMarkerTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/analysis/TestKeywordMarkerTokenFilter.java?rev=903608&view=auto
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/analysis/TestKeywordMarkerTokenFilter.java (added)
+++ lucene/java/trunk/src/test/org/apache/lucene/analysis/TestKeywordMarkerTokenFilter.java Wed Jan 27 11:19:05 2010
@@ -0,0 +1,77 @@
+package org.apache.lucene.analysis;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.util.Version;
+import org.junit.Test;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Testcase for {@link KeywordMarkerTokenFilter}: marked tokens must keep their case.
+ */
+public class TestKeywordMarkerTokenFilter extends BaseTokenStreamTestCase {
+
+  @Test
+  public void testIncrementToken() throws IOException {
+    CharArraySet set = new CharArraySet(Version.LUCENE_31, 5, true);
+    set.add("lucenefox");
+    String[] output = new String[] { "the", "quick", "brown", "LuceneFox",
+        "jumps" };
+    assertTokenStreamContents(new LowerCaseFilterMock(
+        new KeywordMarkerTokenFilter(new WhitespaceTokenizer(new StringReader(
+            "The quIck browN LuceneFox Jumps")), set)), output);
+    Set<String> jdkSet = new HashSet<String>();
+    jdkSet.add("LuceneFox");
+    assertTokenStreamContents(new LowerCaseFilterMock(
+        new KeywordMarkerTokenFilter(new WhitespaceTokenizer(new StringReader(
+            "The quIck browN LuceneFox Jumps")), jdkSet)), output);
+    Set<?> set2 = set;
+    assertTokenStreamContents(new LowerCaseFilterMock(
+        new KeywordMarkerTokenFilter(new WhitespaceTokenizer(new StringReader(
+            "The quIck browN LuceneFox Jumps")), set2)), output);
+  }
+
+  /** Keyword-aware mock: lower-cases every token not marked as a keyword. */
+  public static class LowerCaseFilterMock extends TokenFilter {
+    private final TermAttribute termAtt;
+    private final KeywordAttribute keywordAttr;
+
+    public LowerCaseFilterMock(TokenStream in) {
+      super(in);
+      termAtt = addAttribute(TermAttribute.class);
+      keywordAttr = addAttribute(KeywordAttribute.class);
+    }
+
+    @Override
+    public boolean incrementToken() throws IOException {
+      if (input.incrementToken()) {
+        // explicit locale: a default locale such as Turkish maps "I" to a dotless i
+        if (!keywordAttr.isKeyword())
+          termAtt.setTermBuffer(termAtt.term().toLowerCase(java.util.Locale.ENGLISH));
+        return true;
+      }
+      return false;
+    }
+  }
+}

Propchange: lucene/java/trunk/src/test/org/apache/lucene/analysis/TestKeywordMarkerTokenFilter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: lucene/java/trunk/src/test/org/apache/lucene/analysis/TestKeywordMarkerTokenFilter.java
------------------------------------------------------------------------------
    svn:keywords = Date Author Id Revision HeadURL

Modified: lucene/java/trunk/src/test/org/apache/lucene/analysis/TestPorterStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/analysis/TestPorterStemFilter.java?rev=903608&r1=903607&r2=903608&view=diff
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/analysis/TestPorterStemFilter.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/analysis/TestPorterStemFilter.java Wed Jan 27 11:19:05 2010
@@ -25,6 +25,8 @@
 import java.io.StringReader;
 import java.util.zip.ZipFile;
 
+import org.apache.lucene.util.Version;
+
 /**
  * Test the PorterStemFilter with Martin Porter's test data.
  */
@@ -56,4 +58,12 @@
     outputReader.close();
     zipFile.close();
   }
+  
+  public void testWithKeywordAttribute() throws IOException {
+    CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+    set.add("yourselves");
+    Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader("yourselves yours"));
+    TokenStream filter = new PorterStemFilter(new KeywordMarkerTokenFilter(tokenizer, set));   
+    assertTokenStreamContents(filter, new String[] {"yourselves", "your"});
+  }
 }