You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by rm...@apache.org on 2010/02/06 00:05:49 UTC

svn commit: r907125 [2/3] - in /lucene/java/trunk: ./ contrib/ contrib/analyzers/common/src/java/org/apache/lucene/analysis/da/ contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/ contrib/analyzers/common/src/java/org/apache/lucene/analysi...

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchStemmer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchStemmer.java?rev=907125&r1=907124&r2=907125&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchStemmer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchStemmer.java Fri Feb  5 23:05:46 2010
@@ -26,8 +26,10 @@
  * the <a href="http://snowball.tartarus.org/algorithms/dutch/stemmer.html">dutch stemming</a>
  * algorithm in Martin Porter's snowball project.
  * </p>
+ * @deprecated Use {@link org.tartarus.snowball.ext.DutchStemmer} instead, 
+ * which has the same functionality. This stemmer will be removed in Lucene 4.0
  */
-
+@Deprecated
 public class DutchStemmer {
   /**
    * Buffer for the terms while stemming them.

Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java?rev=907125&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java Fri Feb  5 23:05:46 2010
@@ -0,0 +1,130 @@
+package org.apache.lucene.analysis.no;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
+import org.apache.lucene.analysis.snowball.SnowballFilter;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.util.Version;
+import org.tartarus.snowball.ext.NorwegianStemmer;
+
+/**
+ * {@link Analyzer} for Norwegian.
+ */
+public final class NorwegianAnalyzer extends StopwordAnalyzerBase {
+  private final Set<?> stemExclusionSet;
+  
+  /** File containing default Norwegian stopwords. */
+  public final static String DEFAULT_STOPWORD_FILE = "norwegian_stop.txt";
+  
+  /**
+   * Returns an unmodifiable instance of the default stop words set.
+   * @return default stop words set.
+   */
+  public static Set<?> getDefaultStopSet(){
+    return DefaultSetHolder.DEFAULT_STOP_SET;
+  }
+  
+  /**
+   * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class 
+   * accesses the static final set the first time.
+   */
+  private static class DefaultSetHolder {
+    static final Set<?> DEFAULT_STOP_SET;
+
+    static {
+      try {
+        DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class, 
+            DEFAULT_STOPWORD_FILE);
+      } catch (IOException ex) {
+        // default set is part of the distribution (JAR); keep the cause if it is missing
+        throw new RuntimeException("Unable to load default stopword set", ex);
+      }
+    }
+  }
+
+  /**
+   * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
+   * @param matchVersion lucene compatibility version
+   */
+  public NorwegianAnalyzer(Version matchVersion) {
+    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+  }
+  
+  /**
+   * Builds an analyzer with the given stop words.
+   * 
+   * @param matchVersion lucene compatibility version
+   * @param stopwords a stopword set
+   */
+  public NorwegianAnalyzer(Version matchVersion, Set<?> stopwords) {
+    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+  }
+
+  /**
+   * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
+   * provided this analyzer will add a {@link KeywordMarkerTokenFilter} before
+   * stemming.
+   * 
+   * @param matchVersion lucene compatibility version
+   * @param stopwords a stopword set
+   * @param stemExclusionSet a set of terms not to be stemmed
+   */
+  public NorwegianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
+    super(matchVersion, stopwords);
+    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
+        matchVersion, stemExclusionSet));
+  }
+
+  /**
+   * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
+   * {@link Reader}.
+   * 
+   * @return A {@link TokenStreamComponents} built from a {@link StandardTokenizer}
+   *         filtered with {@link StandardFilter}, {@link LowerCaseFilter},
+   *         {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
+   *         exclusion set is provided and {@link SnowballFilter}.
+   */
+  @Override
+  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+    TokenStream result = new StandardFilter(source);
+    result = new LowerCaseFilter(matchVersion, result);
+    result = new StopFilter(matchVersion, result, stopwords);
+    if (!stemExclusionSet.isEmpty()) {
+      result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
+    }
+    result = new SnowballFilter(result, new NorwegianStemmer());
+    return new TokenStreamComponents(source, result);
+  }
+}
+

Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/no/package.html
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/no/package.html?rev=907125&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/no/package.html (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/no/package.html Fri Feb  5 23:05:46 2010
@@ -0,0 +1,22 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html><head></head>
+<body>
+Analyzer for Norwegian.
+</body>
+</html>

Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/no/package.html
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilter.java?rev=907125&r1=907124&r2=907125&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilter.java Fri Feb  5 23:05:46 2010
@@ -17,7 +17,6 @@
  */
 
 
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilter.java?rev=907125&r1=907124&r2=907125&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilter.java Fri Feb  5 23:05:46 2010
@@ -19,7 +19,6 @@
 
 import java.io.IOException;
 
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilter.java?rev=907125&r1=907124&r2=907125&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilter.java Fri Feb  5 23:05:46 2010
@@ -19,7 +19,6 @@
 
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.index.Payload;

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/position/PositionFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/position/PositionFilter.java?rev=907125&r1=907124&r2=907125&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/position/PositionFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/position/PositionFilter.java Fri Feb  5 23:05:46 2010
@@ -21,7 +21,6 @@
 
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 
 /** Set the positionIncrement of all tokens to the "positionIncrement",

Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java?rev=907125&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java Fri Feb  5 23:05:46 2010
@@ -0,0 +1,129 @@
+package org.apache.lucene.analysis.pt;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
+import org.apache.lucene.analysis.snowball.SnowballFilter;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.util.Version;
+import org.tartarus.snowball.ext.PortugueseStemmer;
+
+/**
+ * {@link Analyzer} for Portuguese.
+ */
+public final class PortugueseAnalyzer extends StopwordAnalyzerBase {
+  private final Set<?> stemExclusionSet;
+  
+  /** File containing default Portuguese stopwords. */
+  public final static String DEFAULT_STOPWORD_FILE = "portuguese_stop.txt";
+  
+  /**
+   * Returns an unmodifiable instance of the default stop words set.
+   * @return default stop words set.
+   */
+  public static Set<?> getDefaultStopSet(){
+    return DefaultSetHolder.DEFAULT_STOP_SET;
+  }
+  
+  /**
+   * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class 
+   * accesses the static final set the first time.
+   */
+  private static class DefaultSetHolder {
+    static final Set<?> DEFAULT_STOP_SET;
+
+    static {
+      try {
+        DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class, 
+            DEFAULT_STOPWORD_FILE);
+      } catch (IOException ex) {
+        // default set is part of the distribution (JAR); keep the cause if it is missing
+        throw new RuntimeException("Unable to load default stopword set", ex);
+      }
+    }
+  }
+
+  /**
+   * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
+   * @param matchVersion lucene compatibility version
+   */
+  public PortugueseAnalyzer(Version matchVersion) {
+    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+  }
+  
+  /**
+   * Builds an analyzer with the given stop words.
+   * 
+   * @param matchVersion lucene compatibility version
+   * @param stopwords a stopword set
+   */
+  public PortugueseAnalyzer(Version matchVersion, Set<?> stopwords) {
+    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+  }
+
+  /**
+   * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
+   * provided this analyzer will add a {@link KeywordMarkerTokenFilter} before
+   * stemming.
+   * 
+   * @param matchVersion lucene compatibility version
+   * @param stopwords a stopword set
+   * @param stemExclusionSet a set of terms not to be stemmed
+   */
+  public PortugueseAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
+    super(matchVersion, stopwords);
+    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
+        matchVersion, stemExclusionSet));
+  }
+
+  /**
+   * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
+   * {@link Reader}.
+   * 
+   * @return A {@link TokenStreamComponents} built from a {@link StandardTokenizer}
+   *         filtered with {@link StandardFilter}, {@link LowerCaseFilter},
+   *         {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
+   *         exclusion set is provided and {@link SnowballFilter}.
+   */
+  @Override
+  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+    TokenStream result = new StandardFilter(source);
+    result = new LowerCaseFilter(matchVersion, result);
+    result = new StopFilter(matchVersion, result, stopwords);
+    if (!stemExclusionSet.isEmpty()) {
+      result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
+    }
+    result = new SnowballFilter(result, new PortugueseStemmer());
+    return new TokenStreamComponents(source, result);
+  }
+}

Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pt/package.html
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pt/package.html?rev=907125&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pt/package.html (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pt/package.html Fri Feb  5 23:05:46 2010
@@ -0,0 +1,22 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html><head></head>
+<body>
+Analyzer for Portuguese.
+</body>
+</html>

Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pt/package.html
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ro/RomanianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ro/RomanianAnalyzer.java?rev=907125&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ro/RomanianAnalyzer.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ro/RomanianAnalyzer.java Fri Feb  5 23:05:46 2010
@@ -0,0 +1,133 @@
+package org.apache.lucene.analysis.ro;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
+import org.apache.lucene.analysis.snowball.SnowballFilter;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.util.Version;
+import org.tartarus.snowball.ext.RomanianStemmer;
+
+/**
+ * {@link Analyzer} for Romanian.
+ */
+public final class RomanianAnalyzer extends StopwordAnalyzerBase {
+  private final Set<?> stemExclusionSet;
+  
+  /** File containing default Romanian stopwords. */
+  public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
+  /**
+   * The comment character in the stopwords file.  
+   * All lines prefixed with this will be ignored.
+   */
+  private static final String STOPWORDS_COMMENT = "#";
+  
+  /**
+   * Returns an unmodifiable instance of the default stop words set.
+   * @return default stop words set.
+   */
+  public static Set<?> getDefaultStopSet(){
+    return DefaultSetHolder.DEFAULT_STOP_SET;
+  }
+  
+  /**
+   * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class 
+   * accesses the static final set the first time.
+   */
+  private static class DefaultSetHolder {
+    static final Set<?> DEFAULT_STOP_SET;
+
+    static {
+      try {
+        DEFAULT_STOP_SET = loadStopwordSet(false, RomanianAnalyzer.class, 
+            DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
+      } catch (IOException ex) {
+        // default set is part of the distribution (JAR); keep the cause if it is missing
+        throw new RuntimeException("Unable to load default stopword set", ex);
+      }
+    }
+  }
+
+  /**
+   * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
+   * @param matchVersion lucene compatibility version
+   */
+  public RomanianAnalyzer(Version matchVersion) {
+    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+  }
+  
+  /**
+   * Builds an analyzer with the given stop words.
+   * 
+   * @param matchVersion lucene compatibility version
+   * @param stopwords a stopword set
+   */
+  public RomanianAnalyzer(Version matchVersion, Set<?> stopwords) {
+    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+  }
+
+  /**
+   * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
+   * provided this analyzer will add a {@link KeywordMarkerTokenFilter} before
+   * stemming.
+   * 
+   * @param matchVersion lucene compatibility version
+   * @param stopwords a stopword set
+   * @param stemExclusionSet a set of terms not to be stemmed
+   */
+  public RomanianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
+    super(matchVersion, stopwords);
+    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
+        matchVersion, stemExclusionSet));
+  }
+
+  /**
+   * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
+   * {@link Reader}.
+   * 
+   * @return A {@link TokenStreamComponents} built from a {@link StandardTokenizer}
+   *         filtered with {@link StandardFilter}, {@link LowerCaseFilter},
+   *         {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
+   *         exclusion set is provided and {@link SnowballFilter}.
+   */
+  @Override
+  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+    TokenStream result = new StandardFilter(source);
+    result = new LowerCaseFilter(matchVersion, result);
+    result = new StopFilter(matchVersion, result, stopwords);
+    if (!stemExclusionSet.isEmpty()) {
+      result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
+    }
+    result = new SnowballFilter(result, new RomanianStemmer());
+    return new TokenStreamComponents(source, result);
+  }
+}

Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ro/RomanianAnalyzer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ro/package.html
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ro/package.html?rev=907125&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ro/package.html (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ro/package.html Fri Feb  5 23:05:46 2010
@@ -0,0 +1,22 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html><head></head>
+<body>
+Analyzer for Romanian.
+</body>
+</html>

Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ro/package.html
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java?rev=907125&r1=907124&r2=907125&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java Fri Feb  5 23:05:46 2010
@@ -17,6 +17,7 @@
  * limitations under the License.
  */
 
+import java.io.IOException;
 import java.io.Reader;
 import java.util.Arrays;
 import java.util.Map;
@@ -26,11 +27,15 @@
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.LowerCaseFilter;
 import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
+import org.apache.lucene.analysis.snowball.SnowballFilter;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
 import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WordlistLoader;
 import org.apache.lucene.util.Version;
 
 /**
@@ -40,13 +45,22 @@
  * will not be indexed at all).
  * A default set of stopwords is used unless an alternative list is specified.
  * </p>
+ * <a name="version"/>
+ * <p>You must specify the required {@link Version}
+ * compatibility when creating RussianAnalyzer:
+ * <ul>
+ *   <li> As of 3.1, StandardTokenizer is used, Snowball stemming is done with
+ *        SnowballFilter, and Snowball stopwords are used by default.
+ * </ul>
  */
 public final class RussianAnalyzer extends StopwordAnalyzerBase
 {
     /**
-     * List of typical Russian stopwords.
+     * List of typical Russian stopwords. (for backwards compatibility)
+     * @deprecated Remove this for LUCENE 4.0
      */
-    private static final String[] RUSSIAN_STOP_WORDS = {
+    @Deprecated
+    private static final String[] RUSSIAN_STOP_WORDS_30 = {
       "а", "без", "более", "бы", "был", "была", "были", "было", "быть", "в",
       "вам", "вас", "весь", "во", "вот", "все", "всего", "всех", "вы", "где", 
       "да", "даже", "для", "до", "его", "ее", "ей", "ею", "если", "есть", 
@@ -59,10 +73,27 @@
       "чем", "что", "чтобы", "чье", "чья", "эта", "эти", "это", "я"
     };
     
+    /** File containing default Russian stopwords. */
+    public final static String DEFAULT_STOPWORD_FILE = "russian_stop.txt";
+    
     private static class DefaultSetHolder {
-      static final Set<?> DEFAULT_STOP_SET = CharArraySet
+      /** @deprecated remove this for Lucene 4.0 */
+      @Deprecated
+      static final Set<?> DEFAULT_STOP_SET_30 = CharArraySet
           .unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT, 
-              Arrays.asList(RUSSIAN_STOP_WORDS), false));
+              Arrays.asList(RUSSIAN_STOP_WORDS_30), false));
+      static final Set<?> DEFAULT_STOP_SET;
+      
+      static {
+        try {
+          DEFAULT_STOP_SET = 
+            WordlistLoader.getSnowballWordSet(SnowballFilter.class, DEFAULT_STOPWORD_FILE);
+        } catch (IOException ex) {
+          // default set should always be present as it is part of the
+          // distribution (JAR); keep the IOException as the cause if it is not
+          throw new RuntimeException("Unable to load default stopword set", ex);
+        }
+      }
     }
     
     private final Set<?> stemExclusionSet;
@@ -77,7 +108,9 @@
     }
 
     public RussianAnalyzer(Version matchVersion) {
-      this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+      this(matchVersion,
+        matchVersion.onOrAfter(Version.LUCENE_31) ? DefaultSetHolder.DEFAULT_STOP_SET
+            : DefaultSetHolder.DEFAULT_STOP_SET_30);
     }
   
     /**
@@ -132,19 +165,30 @@
      * provided {@link Reader}.
      *
      * @return {@link TokenStreamComponents} built from a 
-     *   {@link RussianLetterTokenizer} filtered with 
+     *   {@link StandardTokenizer} filtered with {@link StandardFilter},
      *   {@link LowerCaseFilter}, {@link StopFilter}, 
-     *   and {@link RussianStemFilter}
+     *   {@link KeywordMarkerTokenFilter} if a stem exclusion set is provided,
+     *   and {@link SnowballFilter}
      */
     @Override
     protected TokenStreamComponents createComponents(String fieldName,
         Reader reader) {
-      final Tokenizer source = new RussianLetterTokenizer(matchVersion, reader);
-      TokenStream result = new LowerCaseFilter(matchVersion, source);
-      result = new StopFilter(matchVersion, result, stopwords);
-      if(!stemExclusionSet.isEmpty())
-        result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
-      return new TokenStreamComponents(source, new RussianStemFilter(result));
-      
+      if (matchVersion.onOrAfter(Version.LUCENE_31)) {
+        final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+        TokenStream result = new StandardFilter(source);
+        result = new LowerCaseFilter(matchVersion, result);
+        result = new StopFilter(matchVersion, result, stopwords);
+        if (!stemExclusionSet.isEmpty()) result = new KeywordMarkerTokenFilter(
+            result, stemExclusionSet);
+        result = new SnowballFilter(result, new org.tartarus.snowball.ext.RussianStemmer());
+        return new TokenStreamComponents(source, result);
+      } else {
+        final Tokenizer source = new RussianLetterTokenizer(matchVersion, reader);
+        TokenStream result = new LowerCaseFilter(matchVersion, source);
+        result = new StopFilter(matchVersion, result, stopwords);
+        if (!stemExclusionSet.isEmpty()) result = new KeywordMarkerTokenFilter(
+          result, stemExclusionSet);
+        return new TokenStreamComponents(source, new RussianStemFilter(result));
+      }
     }
 }

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java?rev=907125&r1=907124&r2=907125&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java Fri Feb  5 23:05:46 2010
@@ -21,6 +21,7 @@
 import org.apache.lucene.analysis.CharTokenizer;
 import org.apache.lucene.analysis.Tokenizer; // for javadocs
 import org.apache.lucene.analysis.LetterTokenizer; // for javadocs
+import org.apache.lucene.analysis.standard.StandardTokenizer; // for javadocs
 import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.Version;
 
@@ -35,8 +36,11 @@
  * <li>As of 3.1, {@link CharTokenizer} uses an int based API to normalize and
  * detect token characters. See {@link CharTokenizer#isTokenChar(int)} and
  * {@link CharTokenizer#normalize(int)} for details.</li>
- * </ul> 
+ * </ul>
+ * @deprecated Use {@link StandardTokenizer} instead, which has the same functionality.
+ * This tokenizer will be removed in Lucene 4.0
  */
+@Deprecated
 public class RussianLetterTokenizer extends CharTokenizer
 {    
     private static final int DIGIT_0 = '0';

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java?rev=907125&r1=907124&r2=907125&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java Fri Feb  5 23:05:46 2010
@@ -24,6 +24,7 @@
 import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.analysis.ru.RussianStemmer;//javadoc @link
+import org.apache.lucene.analysis.snowball.SnowballFilter; // javadoc @link
 
 import java.io.IOException;
 
@@ -40,7 +41,11 @@
  * the {@link KeywordAttribute} before this {@link TokenStream}.
  * </p>
  * @see KeywordMarkerTokenFilter
+ * @deprecated Use {@link SnowballFilter} with 
+ * {@link org.tartarus.snowball.ext.RussianStemmer} instead, which has the
+ * same functionality. This filter will be removed in Lucene 4.0
  */
+@Deprecated
 public final class RussianStemFilter extends TokenFilter
 {
     /**

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemmer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemmer.java?rev=907125&r1=907124&r2=907125&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemmer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemmer.java Fri Feb  5 23:05:46 2010
@@ -19,7 +19,10 @@
 
 /**
  * Russian stemming algorithm implementation (see http://snowball.sourceforge.net for detailed description).
+ * @deprecated Use {@link org.tartarus.snowball.ext.RussianStemmer} instead, 
+ * which has the same functionality. This stemmer will be removed in Lucene 4.0
  */
+@Deprecated
 class RussianStemmer
 {
     // positions of RV, R1 and R2 respectively

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/snowball/SnowballAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/snowball/SnowballAnalyzer.java?rev=907125&r1=907124&r2=907125&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/snowball/SnowballAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/snowball/SnowballAnalyzer.java Fri Feb  5 23:05:46 2010
@@ -39,7 +39,10 @@
  *   <li> As of 3.1, uses {@link TurkishLowerCaseFilter} for Turkish language.
  * </ul>
  * </p>
+ * @deprecated Use the language-specific analyzer in contrib/analyzers instead. 
+ * This analyzer will be removed in Lucene 4.0
  */
+@Deprecated
 public final class SnowballAnalyzer extends Analyzer {
   private String name;
   private Set<?> stopSet;

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java?rev=907125&r1=907124&r2=907125&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java Fri Feb  5 23:05:46 2010
@@ -21,6 +21,7 @@
 
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.analysis.tr.TurkishLowerCaseFilter; // javadoc @link
 import org.apache.lucene.analysis.LowerCaseFilter; // javadoc @link
@@ -39,14 +40,14 @@
  */
 public final class SnowballFilter extends TokenFilter {
 
-  private SnowballProgram stemmer;
+  private final SnowballProgram stemmer;
 
-  private TermAttribute termAtt;
+  private final TermAttribute termAtt = addAttribute(TermAttribute.class);
+  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
   
   public SnowballFilter(TokenStream input, SnowballProgram stemmer) {
     super(input);
     this.stemmer = stemmer;
-    termAtt = addAttribute(TermAttribute.class);
   }
 
   /**
@@ -67,23 +68,24 @@
     } catch (Exception e) {
       throw new RuntimeException(e.toString());
     }
-    termAtt = addAttribute(TermAttribute.class);
   }
 
   /** Returns the next input Token, after being stemmed */
   @Override
   public final boolean incrementToken() throws IOException {
     if (input.incrementToken()) {
-      char termBuffer[] = termAtt.termBuffer();
-      final int length = termAtt.termLength();
-      stemmer.setCurrent(termBuffer, length);
-      stemmer.stem();
-      final char finalTerm[] = stemmer.getCurrentBuffer();
-      final int newLength = stemmer.getCurrentBufferLength();
-      if (finalTerm != termBuffer)
-        termAtt.setTermBuffer(finalTerm, 0, newLength);
-      else
-        termAtt.setTermLength(newLength); 
+      if (!keywordAttr.isKeyword()) {
+        char termBuffer[] = termAtt.termBuffer();
+        final int length = termAtt.termLength();
+        stemmer.setCurrent(termBuffer, length);
+        stemmer.stem();
+        final char finalTerm[] = stemmer.getCurrentBuffer();
+        final int newLength = stemmer.getCurrentBufferLength();
+        if (finalTerm != termBuffer)
+          termAtt.setTermBuffer(finalTerm, 0, newLength);
+        else
+          termAtt.setTermLength(newLength);
+      }
       return true;
     } else {
       return false;

Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java?rev=907125&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java Fri Feb  5 23:05:46 2010
@@ -0,0 +1,129 @@
+package org.apache.lucene.analysis.sv;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
+import org.apache.lucene.analysis.snowball.SnowballFilter;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.util.Version;
+import org.tartarus.snowball.ext.SwedishStemmer;
+
+/**
+ * {@link Analyzer} for Swedish.
+ */
+public final class SwedishAnalyzer extends StopwordAnalyzerBase {
+  private final Set<?> stemExclusionSet;
+  
+  /** File containing default Swedish stopwords. */
+  public final static String DEFAULT_STOPWORD_FILE = "swedish_stop.txt";
+  
+  /**
+   * Returns an unmodifiable instance of the default stop words set.
+   * @return default stop words set.
+   */
+  public static Set<?> getDefaultStopSet(){
+    return DefaultSetHolder.DEFAULT_STOP_SET;
+  }
+  
+  /**
+   * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class 
+   * accesses the static final set the first time.
+   */
+  private static class DefaultSetHolder {
+    static final Set<?> DEFAULT_STOP_SET;
+
+    static {
+      try {
+        DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class, 
+            DEFAULT_STOPWORD_FILE);
+      } catch (IOException ex) {
+        // default set should always be present as it is part of the
+        // distribution (JAR)
+        throw new RuntimeException("Unable to load default stopword set");
+      }
+    }
+  }
+
+  /**
+   * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
+   */
+  public SwedishAnalyzer(Version matchVersion) {
+    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+  }
+  
+  /**
+   * Builds an analyzer with the given stop words.
+   * 
+   * @param matchVersion lucene compatibility version
+   * @param stopwords a stopword set
+   */
+  public SwedishAnalyzer(Version matchVersion, Set<?> stopwords) {
+    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+  }
+
+  /**
+   * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
+   * provided this analyzer will add a {@link KeywordMarkerTokenFilter} before
+   * stemming.
+   * 
+   * @param matchVersion lucene compatibility version
+   * @param stopwords a stopword set
+   * @param stemExclusionSet a set of terms not to be stemmed
+   */
+  public SwedishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
+    super(matchVersion, stopwords);
+    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
+        matchVersion, stemExclusionSet));
+  }
+
+  /**
+   * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
+   * {@link Reader}.
+   * 
+   * @return A {@link TokenStreamComponents} built from a {@link StandardTokenizer}
+   *         filtered with {@link StandardFilter}, {@link LowerCaseFilter},
+   *         {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
+   *         exclusion set is provided and {@link SnowballFilter}.
+   */
+  @Override
+  protected TokenStreamComponents createComponents(String fieldName,
+      Reader reader) {
+    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+    TokenStream result = new StandardFilter(source);
+    result = new LowerCaseFilter(matchVersion, result);
+    result = new StopFilter(matchVersion, result, stopwords);
+    if(!stemExclusionSet.isEmpty())
+      result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
+    result = new SnowballFilter(result, new SwedishStemmer());
+    return new TokenStreamComponents(source, result);
+  }
+}

Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/sv/package.html
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/sv/package.html?rev=907125&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/sv/package.html (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/sv/package.html Fri Feb  5 23:05:46 2010
@@ -0,0 +1,22 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html><head></head>
+<body>
+Analyzer for Swedish.
+</body>
+</html>

Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/sv/package.html
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java?rev=907125&r1=907124&r2=907125&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java Fri Feb  5 23:05:46 2010
@@ -19,7 +19,6 @@
 import java.io.IOException;
 import java.util.Locale;
 import java.lang.Character.UnicodeBlock;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/tr/TurkishAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/tr/TurkishAnalyzer.java?rev=907125&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/tr/TurkishAnalyzer.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/tr/TurkishAnalyzer.java Fri Feb  5 23:05:46 2010
@@ -0,0 +1,132 @@
+package org.apache.lucene.analysis.tr;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
+import org.apache.lucene.analysis.snowball.SnowballFilter;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.util.Version;
+import org.tartarus.snowball.ext.TurkishStemmer;
+
+/**
+ * {@link Analyzer} for Turkish.
+ */
+public final class TurkishAnalyzer extends StopwordAnalyzerBase {
+  private final Set<?> stemExclusionSet;
+  
+  /** File containing default Turkish stopwords. */
+  public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
+  /**
+   * The comment character in the stopwords file.  
+   * All lines prefixed with this will be ignored.
+   */
+  private static final String STOPWORDS_COMMENT = "#";
+  
+  /**
+   * Returns an unmodifiable instance of the default stop words set.
+   * @return default stop words set.
+   */
+  public static Set<?> getDefaultStopSet(){
+    return DefaultSetHolder.DEFAULT_STOP_SET;
+  }
+  
+  /**
+   * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class 
+   * accesses the static final set the first time.
+   */
+  private static class DefaultSetHolder {
+    static final Set<?> DEFAULT_STOP_SET;
+
+    static {
+      try {
+        DEFAULT_STOP_SET = loadStopwordSet(false, TurkishAnalyzer.class, 
+            DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
+      } catch (IOException ex) {
+        // default set should always be present as it is part of the
+        // distribution (JAR)
+        throw new RuntimeException("Unable to load default stopword set");
+      }
+    }
+  }
+
+  /**
+   * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
+   */
+  public TurkishAnalyzer(Version matchVersion) {
+    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+  }
+  
+  /**
+   * Builds an analyzer with the given stop words.
+   * 
+   * @param matchVersion lucene compatibility version
+   * @param stopwords a stopword set
+   */
+  public TurkishAnalyzer(Version matchVersion, Set<?> stopwords) {
+    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+  }
+
+  /**
+   * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
+   * provided this analyzer will add a {@link KeywordMarkerTokenFilter} before
+   * stemming.
+   * 
+   * @param matchVersion lucene compatibility version
+   * @param stopwords a stopword set
+   * @param stemExclusionSet a set of terms not to be stemmed
+   */
+  public TurkishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
+    super(matchVersion, stopwords);
+    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
+        matchVersion, stemExclusionSet));
+  }
+
+  /**
+   * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
+   * {@link Reader}.
+   * 
+   * @return A {@link TokenStreamComponents} built from a {@link StandardTokenizer}
+   *         filtered with {@link StandardFilter}, {@link TurkishLowerCaseFilter},
+   *         {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
+   *         exclusion set is provided and {@link SnowballFilter}.
+   */
+  @Override
+  protected TokenStreamComponents createComponents(String fieldName,
+      Reader reader) {
+    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+    TokenStream result = new StandardFilter(source);
+    result = new TurkishLowerCaseFilter(result);
+    result = new StopFilter(matchVersion, result, stopwords);
+    if(!stemExclusionSet.isEmpty())
+      result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
+    result = new SnowballFilter(result, new TurkishStemmer());
+    return new TokenStreamComponents(source, result);
+  }
+}

Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/tr/TurkishAnalyzer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/tr/package.html
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/tr/package.html?rev=907125&r1=907124&r2=907125&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/tr/package.html (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/tr/package.html Fri Feb  5 23:05:46 2010
@@ -17,15 +17,6 @@
 -->
 <html><head></head>
 <body>
-Support for Turkish.
-<p>
-This package contains just the TokenStream for handling turkish casing,
-for a stemmer please see the snowball package. 
-</p>
-<p>
-WARNING: SnowballAnalyzer uses LowerCaseFilter by default, even when the
-language is set to Turkish, so you will need to construct your own
-analyzer that combines TurkishLowerCaseFilter and SnowballFilter.
-</p>
+Analyzer for Turkish.
 </body>
-</html>
\ No newline at end of file
+</html>

Added: lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt?rev=907125&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt Fri Feb  5 23:05:46 2010
@@ -0,0 +1,233 @@
+# This file was created by Jacques Savoy and is distributed under the BSD license.
+# See http://members.unine.ch/jacques.savoy/clef/index.html.
+# Also see http://www.opensource.org/licenses/bsd-license.html
+acea
+aceasta
+această
+aceea
+acei
+aceia
+acel
+acela
+acele
+acelea
+acest
+acesta
+aceste
+acestea
+aceşti
+aceştia
+acolo
+acum
+ai
+aia
+aibă
+aici
+al
+ăla
+ale
+alea
+ălea
+altceva
+altcineva
+am
+ar
+are
+aş
+aşadar
+asemenea
+asta
+ăsta
+astăzi
+astea
+ăstea
+ăştia
+asupra
+aţi
+au
+avea
+avem
+aveţi
+azi
+bine
+bucur
+bună
+ca
+că
+căci
+când
+care
+cărei
+căror
+cărui
+cât
+câte
+câţi
+către
+câtva
+ce
+cel
+ceva
+chiar
+cînd
+cine
+cineva
+cît
+cîte
+cîţi
+cîtva
+contra
+cu
+cum
+cumva
+curând
+curînd
+da
+dă
+dacă
+dar
+datorită
+de
+deci
+deja
+deoarece
+departe
+deşi
+din
+dinaintea
+dintr
+dintre
+drept
+după
+ea
+ei
+el
+ele
+eram
+este
+eşti
+eu
+face
+fără
+fi
+fie
+fiecare
+fii
+fim
+fiţi
+iar
+ieri
+îi
+îl
+îmi
+împotriva
+în 
+înainte
+înaintea
+încât
+încît
+încotro
+între
+întrucât
+întrucît
+îţi
+la
+lângă
+le
+li
+lîngă
+lor
+lui
+mă
+mâine
+mea
+mei
+mele
+mereu
+meu
+mi
+mine
+mult
+multă
+mulţi
+ne
+nicăieri
+nici
+nimeni
+nişte
+noastră
+noastre
+noi
+noştri
+nostru
+nu
+ori
+oricând
+oricare
+oricât
+orice
+oricînd
+oricine
+oricît
+oricum
+oriunde
+până
+pe
+pentru
+peste
+pînă
+poate
+pot
+prea
+prima
+primul
+prin
+printr
+sa
+să
+săi
+sale
+sau
+său
+se
+şi
+sînt
+sîntem
+sînteţi
+spre
+sub
+sunt
+suntem
+sunteţi
+ta
+tăi
+tale
+tău
+te
+ţi
+ţie
+tine
+toată
+toate
+tot
+toţi
+totuşi
+tu
+un
+una
+unde
+undeva
+unei
+unele
+uneori
+unor
+vă
+vi
+voastră
+voastre
+voi
+voştri
+vostru
+vouă
+vreo
+vreun

Propchange: lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/tr/stopwords.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/tr/stopwords.txt?rev=907125&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/tr/stopwords.txt (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/tr/stopwords.txt Fri Feb  5 23:05:46 2010
@@ -0,0 +1,212 @@
+# Turkish stopwords from LUCENE-559
+# merged with the list from "Information Retrieval on Turkish Texts"
+#   (http://www.users.muohio.edu/canf/papers/JASIST2008offPrint.pdf)
+acaba
+altmış
+altı
+ama
+ancak
+arada
+aslında
+ayrıca
+bana
+bazı
+belki
+ben
+benden
+beni
+benim
+beri
+beş
+bile
+bin
+bir
+birçok
+biri
+birkaç
+birkez
+birşey
+birşeyi
+biz
+bize
+bizden
+bizi
+bizim
+böyle
+böylece
+bu
+buna
+bunda
+bundan
+bunlar
+bunları
+bunların
+bunu
+bunun
+burada
+çok
+çünkü
+da
+daha
+dahi
+de
+defa
+değil
+diğer
+diye
+doksan
+dokuz
+dolayı
+dolayısıyla
+dört
+edecek
+eden
+ederek
+edilecek
+ediliyor
+edilmesi
+ediyor
+eğer
+elli
+en
+etmesi
+etti
+ettiği
+ettiğini
+gibi
+göre
+halen
+hangi
+hatta
+hem
+henüz
+hep
+hepsi
+her
+herhangi
+herkesin
+hiç
+hiçbir
+için
+iki
+ile
+ilgili
+ise
+işte
+itibaren
+itibariyle
+kadar
+karşın
+katrilyon
+kendi
+kendilerine
+kendini
+kendisi
+kendisine
+kendisini
+kez
+ki
+kim
+kimden
+kime
+kimi
+kimse
+kırk
+milyar
+milyon
+mu
+mü
+mı
+nasıl
+ne
+neden
+nedenle
+nerde
+nerede
+nereye
+niye
+niçin
+o
+olan
+olarak
+oldu
+olduğu
+olduğunu
+olduklarını
+olmadı
+olmadığı
+olmak
+olması
+olmayan
+olmaz
+olsa
+olsun
+olup
+olur
+olursa
+oluyor
+on
+ona
+ondan
+onlar
+onlardan
+onları
+onların
+onu
+onun
+otuz
+oysa
+öyle
+pek
+rağmen
+sadece
+sanki
+sekiz
+seksen
+sen
+senden
+seni
+senin
+siz
+sizden
+sizi
+sizin
+şey
+şeyden
+şeyi
+şeyler
+şöyle
+şu
+şuna
+şunda
+şundan
+şunları
+şunu
+tarafından
+trilyon
+tüm
+üç
+üzere
+var
+vardı
+ve
+veya
+ya
+yani
+yapacak
+yapılan
+yapılması
+yapıyor
+yapmak
+yaptı
+yaptığı
+yaptığını
+yaptıkları
+yedi
+yerine
+yetmiş
+yine
+yirmi
+yoksa
+yüz
+zaten

Propchange: lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/tr/stopwords.txt
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java?rev=907125&r1=907124&r2=907125&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java Fri Feb  5 23:05:46 2010
@@ -22,8 +22,6 @@
 import java.util.HashSet;
 import java.util.Set;
 
-import javax.print.DocFlavor.CHAR_ARRAY;
-
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.util.Version;

Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java?rev=907125&r1=907124&r2=907125&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java Fri Feb  5 23:05:46 2010
@@ -21,7 +21,6 @@
 import java.io.StringReader;
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.util.Version;
 
 /**

Added: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/da/TestDanishAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/da/TestDanishAnalyzer.java?rev=907125&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/da/TestDanishAnalyzer.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/da/TestDanishAnalyzer.java Fri Feb  5 23:05:46 2010
@@ -0,0 +1,54 @@
+package org.apache.lucene.analysis.da;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.util.Version;
+
+public class TestDanishAnalyzer extends BaseTokenStreamTestCase {
+  /** This test fails with an NPE when the
+   * stopwords file is missing from the classpath. */
+  public void testResourcesAvailable() {
+    new DanishAnalyzer(Version.LUCENE_CURRENT);
+  }
+  
+  /** test stopwords and stemming */
+  public void testBasics() throws IOException {
+    Analyzer a = new DanishAnalyzer(Version.LUCENE_CURRENT);
+    // stemming
+    checkOneTermReuse(a, "undersøg", "undersøg");
+    checkOneTermReuse(a, "undersøgelse", "undersøg");
+    // stopword
+    assertAnalyzesTo(a, "på", new String[] {});
+  }
+  
+  /** test use of exclusion set */
+  public void testExclude() throws IOException {
+    Set<String> exclusionSet = new HashSet<String>();
+    exclusionSet.add("undersøgelse");
+    Analyzer a = new DanishAnalyzer(Version.LUCENE_CURRENT, 
+        DanishAnalyzer.getDefaultStopSet(), exclusionSet);
+    checkOneTermReuse(a, "undersøgelse", "undersøgelse");
+    checkOneTermReuse(a, "undersøg", "undersøg");
+  }
+}

Propchange: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/da/TestDanishAnalyzer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanAnalyzer.java?rev=907125&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanAnalyzer.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanAnalyzer.java Fri Feb  5 23:05:46 2010
@@ -0,0 +1,93 @@
+package org.apache.lucene.analysis.de;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
+import org.apache.lucene.analysis.LowerCaseTokenizer;
+import org.apache.lucene.util.Version;
+
+public class TestGermanAnalyzer extends BaseTokenStreamTestCase {
+  public void testReusableTokenStream() throws Exception {
+    Analyzer a = new GermanAnalyzer(Version.LUCENE_CURRENT);
+    checkOneTermReuse(a, "Tisch", "tisch");
+    checkOneTermReuse(a, "Tische", "tisch");
+    checkOneTermReuse(a, "Tischen", "tisch");
+  }
+  
+  public void testExclusionTableBWCompat() throws IOException {
+    GermanStemFilter filter = new GermanStemFilter(new LowerCaseTokenizer(Version.LUCENE_CURRENT, 
+        new StringReader("Fischen Trinken")));
+    CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+    set.add("fischen");
+    filter.setExclusionSet(set);
+    assertTokenStreamContents(filter, new String[] { "fischen", "trink" });
+  }
+
+  public void testWithKeywordAttribute() throws IOException {
+    CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+    set.add("fischen");
+    GermanStemFilter filter = new GermanStemFilter(
+        new KeywordMarkerTokenFilter(new LowerCaseTokenizer(Version.LUCENE_CURRENT, new StringReader( 
+            "Fischen Trinken")), set));
+    assertTokenStreamContents(filter, new String[] { "fischen", "trink" });
+  }
+
+  public void testWithKeywordAttributeAndExclusionTable() throws IOException {
+    CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+    set.add("fischen");
+    CharArraySet set1 = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+    set1.add("trinken");
+    set1.add("fischen");
+    GermanStemFilter filter = new GermanStemFilter(
+        new KeywordMarkerTokenFilter(new LowerCaseTokenizer(Version.LUCENE_CURRENT, new StringReader(
+            "Fischen Trinken")), set));
+    filter.setExclusionSet(set1);
+    assertTokenStreamContents(filter, new String[] { "fischen", "trinken" });
+  }
+  
+  /* 
+   * Test that changes to the exclusion table are applied immediately
+   * when using reusable token streams.
+   */
+  public void testExclusionTableReuse() throws Exception {
+    GermanAnalyzer a = new GermanAnalyzer(Version.LUCENE_CURRENT);
+    checkOneTermReuse(a, "tischen", "tisch");
+    a.setStemExclusionTable(new String[] { "tischen" });
+    checkOneTermReuse(a, "tischen", "tischen");
+  }
+  
+  /** Test some features of the new snowball filter.
+   * These only pass with LUCENE_CURRENT, not if you use o.a.l.a.de.GermanStemmer.
+   */
+  public void testGermanSpecials() throws Exception {
+    GermanAnalyzer a = new GermanAnalyzer(Version.LUCENE_CURRENT);
+    // a/o/u + e is equivalent to the umlaut form
+    checkOneTermReuse(a, "Schaltflächen", "schaltflach");
+    checkOneTermReuse(a, "Schaltflaechen", "schaltflach");
+    // with the old stemmer (Version.LUCENE_30) the two spellings stem differently
+    a = new GermanAnalyzer(Version.LUCENE_30);
+    checkOneTermReuse(a, "Schaltflächen", "schaltflach");
+    checkOneTermReuse(a, "Schaltflaechen", "schaltflaech");
+  }
+}

Propchange: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanAnalyzer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java?rev=907125&r1=907124&r2=907125&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java Fri Feb  5 23:05:46 2010
@@ -20,15 +20,14 @@
 import java.io.BufferedReader;
 import java.io.File;
 import java.io.FileInputStream;
-import java.io.IOException;
 import java.io.InputStreamReader;
 import java.io.StringReader;
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.CharArraySet;
-import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
-import org.apache.lucene.analysis.LowerCaseTokenizer;
+import org.apache.lucene.analysis.KeywordTokenizer;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.util.Version;
 
 /**
@@ -40,6 +39,8 @@
 public class TestGermanStemFilter extends BaseTokenStreamTestCase {
 
   public void testStemming() throws Exception {
+    Tokenizer tokenizer = new KeywordTokenizer(new StringReader(""));
+    TokenFilter filter = new GermanStemFilter(new LowerCaseFilter(Version.LUCENE_CURRENT, tokenizer));
     // read test cases from external file:
     File dataDir = new File(System.getProperty("dataDir", "./bin"));
     File testFile = new File(dataDir, "org/apache/lucene/analysis/de/data.txt");
@@ -55,68 +56,12 @@
         continue;    // ignore comments and empty lines
       String[] parts = line.split(";");
       //System.out.println(parts[0] + " -- " + parts[1]);
-      check(parts[0], parts[1]);
+      tokenizer.reset(new StringReader(parts[0]));
+      filter.reset();
+      assertTokenStreamContents(filter, new String[] { parts[1] });
     }
     breader.close();
     isr.close();
     fis.close();
   }
-  
-  public void testReusableTokenStream() throws Exception {
-    Analyzer a = new GermanAnalyzer(Version.LUCENE_CURRENT);
-    checkReuse(a, "Tisch", "tisch");
-    checkReuse(a, "Tische", "tisch");
-    checkReuse(a, "Tischen", "tisch");
-  }
-  
-  public void testExclusionTableBWCompat() throws IOException {
-    GermanStemFilter filter = new GermanStemFilter(new LowerCaseTokenizer(Version.LUCENE_CURRENT, 
-        new StringReader("Fischen Trinken")));
-    CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
-    set.add("fischen");
-    filter.setExclusionSet(set);
-    assertTokenStreamContents(filter, new String[] { "fischen", "trink" });
-  }
-
-  public void testWithKeywordAttribute() throws IOException {
-    CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
-    set.add("fischen");
-    GermanStemFilter filter = new GermanStemFilter(
-        new KeywordMarkerTokenFilter(new LowerCaseTokenizer(Version.LUCENE_CURRENT, new StringReader( 
-            "Fischen Trinken")), set));
-    assertTokenStreamContents(filter, new String[] { "fischen", "trink" });
-  }
-
-  public void testWithKeywordAttributeAndExclusionTable() throws IOException {
-    CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
-    set.add("fischen");
-    CharArraySet set1 = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
-    set1.add("trinken");
-    set1.add("fischen");
-    GermanStemFilter filter = new GermanStemFilter(
-        new KeywordMarkerTokenFilter(new LowerCaseTokenizer(Version.LUCENE_CURRENT, new StringReader(
-            "Fischen Trinken")), set));
-    filter.setExclusionSet(set1);
-    assertTokenStreamContents(filter, new String[] { "fischen", "trinken" });
-  }
-  
-  /* 
-   * Test that changes to the exclusion table are applied immediately
-   * when using reusable token streams.
-   */
-  public void testExclusionTableReuse() throws Exception {
-    GermanAnalyzer a = new GermanAnalyzer(Version.LUCENE_CURRENT);
-    checkReuse(a, "tischen", "tisch");
-    a.setStemExclusionTable(new String[] { "tischen" });
-    checkReuse(a, "tischen", "tischen");
-  }
-  
-  
-  private void check(final String input, final String expected) throws Exception {
-    checkOneTerm(new GermanAnalyzer(Version.LUCENE_CURRENT), input, expected);
-  }
-  
-  private void checkReuse(Analyzer a, String input, String expected) throws Exception {
-    checkOneTermReuse(a, input, expected);
-  }
 }

Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java?rev=907125&r1=907124&r2=907125&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java Fri Feb  5 23:05:46 2010
@@ -18,7 +18,6 @@
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.util.Version;
 
 /**
@@ -63,4 +62,23 @@
 	    assertAnalyzesToReuse(a, "\u03a0\u03a1\u039f\u03ab\u03a0\u039f\u0398\u0395\u03a3\u0395\u0399\u03a3  \u0386\u03c8\u03bf\u03b3\u03bf\u03c2, \u03bf \u03bc\u03b5\u03c3\u03c4\u03cc\u03c2 \u03ba\u03b1\u03b9 \u03bf\u03b9 \u03ac\u03bb\u03bb\u03bf\u03b9",
 	            new String[] { "\u03c0\u03c1\u03bf\u03c5\u03c0\u03bf\u03b8\u03b5\u03c3\u03b5\u03b9\u03c3", "\u03b1\u03c8\u03bf\u03b3\u03bf\u03c3", "\u03bc\u03b5\u03c3\u03c4\u03bf\u03c3", "\u03b1\u03bb\u03bb\u03bf\u03b9" });
 	}
+	
+	/**
+	 * GreekAnalyzer didn't apply StandardFilter, so acronyms were not normalized.
+	 * Check that this behavior is preserved with Version.LUCENE_30.
+	 * @deprecated remove this test in Lucene 4.0
+	 */
+	@Deprecated
+	public void testAcronymBWCompat() throws Exception {
+	  Analyzer a = new GreekAnalyzer(Version.LUCENE_30);
+	  assertAnalyzesTo(a, "Α.Π.Τ.", new String[] { "α.π.τ." });
+	}
+	
+  /**
+   * test that acronym normalization works
+   */
+  public void testAcronym() throws Exception {
+    Analyzer a = new GreekAnalyzer(Version.LUCENE_31);
+    assertAnalyzesTo(a, "Α.Π.Τ.", new String[] { "απτ" });
+  }
 }

Added: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/en/TestEnglishAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/en/TestEnglishAnalyzer.java?rev=907125&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/en/TestEnglishAnalyzer.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/en/TestEnglishAnalyzer.java Fri Feb  5 23:05:46 2010
@@ -0,0 +1,54 @@
+package org.apache.lucene.analysis.en;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.util.Version;
+
+public class TestEnglishAnalyzer extends BaseTokenStreamTestCase {
+  /** This test fails with an NPE when the
+   * stopwords file is missing from the classpath. */
+  public void testResourcesAvailable() {
+    new EnglishAnalyzer(Version.LUCENE_CURRENT);
+  }
+  
+  /** test stopwords and stemming */
+  public void testBasics() throws IOException {
+    Analyzer a = new EnglishAnalyzer(Version.LUCENE_CURRENT);
+    // stemming
+    checkOneTermReuse(a, "books", "book");
+    checkOneTermReuse(a, "book", "book");
+    // stopword
+    assertAnalyzesTo(a, "the", new String[] {});
+  }
+  
+  /** test use of exclusion set */
+  public void testExclude() throws IOException {
+    Set<String> exclusionSet = new HashSet<String>();
+    exclusionSet.add("books");
+    Analyzer a = new EnglishAnalyzer(Version.LUCENE_CURRENT, 
+        EnglishAnalyzer.getDefaultStopSet(), exclusionSet);
+    checkOneTermReuse(a, "books", "books");
+    checkOneTermReuse(a, "book", "book");
+  }
+}

Propchange: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/en/TestEnglishAnalyzer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/es/TestSpanishAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/es/TestSpanishAnalyzer.java?rev=907125&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/es/TestSpanishAnalyzer.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/es/TestSpanishAnalyzer.java Fri Feb  5 23:05:46 2010
@@ -0,0 +1,54 @@
+package org.apache.lucene.analysis.es;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.util.Version;
+
+public class TestSpanishAnalyzer extends BaseTokenStreamTestCase {
+  /** This test fails with an NPE when the
+   * stopwords file is missing from the classpath. */
+  public void testResourcesAvailable() {
+    new SpanishAnalyzer(Version.LUCENE_CURRENT);
+  }
+  
+  /** test stopwords and stemming */
+  public void testBasics() throws IOException {
+    Analyzer a = new SpanishAnalyzer(Version.LUCENE_CURRENT);
+    // stemming
+    checkOneTermReuse(a, "chicana", "chican");
+    checkOneTermReuse(a, "chicano", "chican");
+    // stopword
+    assertAnalyzesTo(a, "los", new String[] {});
+  }
+  
+  /** test use of exclusion set */
+  public void testExclude() throws IOException {
+    Set<String> exclusionSet = new HashSet<String>();
+    exclusionSet.add("chicano");
+    Analyzer a = new SpanishAnalyzer(Version.LUCENE_CURRENT, 
+        SpanishAnalyzer.getDefaultStopSet(), exclusionSet);
+    checkOneTermReuse(a, "chicana", "chican");
+    checkOneTermReuse(a, "chicano", "chicano");
+  }
+}

Propchange: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/es/TestSpanishAnalyzer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fi/TestFinnishAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fi/TestFinnishAnalyzer.java?rev=907125&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fi/TestFinnishAnalyzer.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fi/TestFinnishAnalyzer.java Fri Feb  5 23:05:46 2010
@@ -0,0 +1,54 @@
+package org.apache.lucene.analysis.fi;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.util.Version;
+
+public class TestFinnishAnalyzer extends BaseTokenStreamTestCase {
+  /** This test fails with an NPE when the
+   * stopwords file is missing from the classpath. */
+  public void testResourcesAvailable() {
+    new FinnishAnalyzer(Version.LUCENE_CURRENT);
+  }
+  
+  /** test stopwords and stemming */
+  public void testBasics() throws IOException {
+    Analyzer a = new FinnishAnalyzer(Version.LUCENE_CURRENT);
+    // stemming
+    checkOneTermReuse(a, "edeltäjiinsä", "edeltäj");
+    checkOneTermReuse(a, "edeltäjistään", "edeltäj");
+    // stopword
+    assertAnalyzesTo(a, "olla", new String[] {});
+  }
+  
+  /** test use of exclusion set */
+  public void testExclude() throws IOException {
+    Set<String> exclusionSet = new HashSet<String>();
+    exclusionSet.add("edeltäjistään");
+    Analyzer a = new FinnishAnalyzer(Version.LUCENE_CURRENT, 
+        FinnishAnalyzer.getDefaultStopSet(), exclusionSet);
+    checkOneTermReuse(a, "edeltäjiinsä", "edeltäj");
+    checkOneTermReuse(a, "edeltäjistään", "edeltäjistään");
+  }
+}

Propchange: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fi/TestFinnishAnalyzer.java
------------------------------------------------------------------------------
    svn:eol-style = native