You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2010/07/14 14:10:37 UTC
svn commit: r964019 [2/4] - in /lucene/dev/trunk: lucene/contrib/ modules/analysis/ modules/analysis/common/src/java/org/apache/lucene/analysis/ar/ modules/analysis/common/src/java/org/apache/lucene/analysis/bg/ modules/analysis/common/src/java/org/apa...

Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemmer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemmer.java?rev=964019&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemmer.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemmer.java Wed Jul 14 12:10:34 2010
@@ -0,0 +1,238 @@
+package org.apache.lucene.analysis.hu;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* 
+ * This algorithm is updated based on code located at:
+ * http://members.unine.ch/jacques.savoy/clef/
+ * 
+ * Full copyright for that code follows:
+ */
+
+/*
+ * Copyright (c) 2005, Jacques Savoy
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer. Redistributions in binary 
+ * form must reproduce the above copyright notice, this list of conditions and
+ * the following disclaimer in the documentation and/or other materials 
+ * provided with the distribution. Neither the name of the author nor the names 
+ * of its contributors may be used to endorse or promote products derived from 
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+import static org.apache.lucene.analysis.util.StemmerUtil.*;
+
+/**
+ * Light Stemmer for Hungarian.
+ * <p>
+ * This stemmer implements the "UniNE" algorithm in:
+ * <i>Light Stemming Approaches for the French, Portuguese, German and Hungarian Languages</i>
+ * Jacques Savoy
+ */
+public class HungarianLightStemmer {
+  public int stem(char s[], int len) {
+    for (int i = 0; i < len; i++)
+      switch(s[i]) {
+        case 'á': s[i] = 'a'; break;
+        case 'ë':
+        case 'é': s[i] = 'e'; break;
+        case 'í': s[i] = 'i'; break;
+        case 'ó':
+        case 'ő':
+        case 'õ':
+        case 'ö': s[i] = 'o'; break;
+        case 'ú':
+        case 'ű':
+        case 'ũ':
+        case 'û':
+        case 'ü': s[i] = 'u'; break;
+      }
+    
+    len = removeCase(s, len);
+    len = removePossessive(s, len);
+    len = removePlural(s, len);
+    return normalize(s, len);
+  }
+  
+  private int removeCase(char s[], int len) {
+    if (len > 6 && endsWith(s, len, "kent"))
+      return len - 4;
+    
+    if (len > 5) {
+      if (endsWith(s, len, "nak") ||
+          endsWith(s, len, "nek") ||
+          endsWith(s, len, "val") ||
+          endsWith(s, len, "vel") ||
+          endsWith(s, len, "ert") ||
+          endsWith(s, len, "rol") ||
+          endsWith(s, len, "ban") ||
+          endsWith(s, len, "ben") ||
+          endsWith(s, len, "bol") ||
+          endsWith(s, len, "nal") ||
+          endsWith(s, len, "nel") ||
+          endsWith(s, len, "hoz") ||
+          endsWith(s, len, "hez") ||
+          endsWith(s, len, "tol"))
+        return len - 3;
+      
+      if (endsWith(s, len, "al") || endsWith(s, len, "el")) {
+        if (!isVowel(s[len-3]) && s[len-3] == s[len-4])
+          return len - 3;
+      }
+    }
+    
+    if (len > 4) {
+      if (endsWith(s, len, "at") ||
+          endsWith(s, len, "et") ||
+          endsWith(s, len, "ot") ||
+          endsWith(s, len, "va") ||
+          endsWith(s, len, "ve") ||
+          endsWith(s, len, "ra") ||
+          endsWith(s, len, "re") ||
+          endsWith(s, len, "ba") ||
+          endsWith(s, len, "be") ||
+          endsWith(s, len, "ul") ||
+          endsWith(s, len, "ig"))
+        return len - 2;
+      
+      if ((endsWith(s, len, "on") || endsWith(s, len, "en")) && !isVowel(s[len-3]))
+          return len - 2;
+      
+      switch(s[len-1]) {
+        case 't':
+        case 'n': return len - 1;
+        case 'a':
+        case 'e': if (s[len-2] == s[len-3] && !isVowel(s[len-2])) return len - 2;
+      }
+    }
+    
+    return len;
+  }
+
+  private int removePossessive(char s[], int len) {
+    if (len > 6) {
+      if (!isVowel(s[len-5]) && 
+         (endsWith(s, len, "atok") || 
+          endsWith(s, len, "otok") || 
+          endsWith(s, len, "etek")))
+        return len - 4;
+      
+      if (endsWith(s, len, "itek") || endsWith(s, len, "itok"))
+        return len - 4;
+    }
+    
+    if (len > 5) {
+      if (!isVowel(s[len-4]) &&
+        (endsWith(s, len, "unk") ||
+         endsWith(s, len, "tok") ||
+         endsWith(s, len, "tek")))
+        return len - 3;
+      
+      if (isVowel(s[len-4]) && endsWith(s, len, "juk"))
+        return len - 3;
+      
+      if (endsWith(s, len, "ink"))
+        return len - 3;
+    }
+    
+    if (len > 4) {
+      if (!isVowel(s[len-3]) &&
+         (endsWith(s, len, "am") ||
+          endsWith(s, len, "em") ||
+          endsWith(s, len, "om") ||
+          endsWith(s, len, "ad") ||
+          endsWith(s, len, "ed") ||
+          endsWith(s, len, "od") ||
+          endsWith(s, len, "uk")))
+        return len - 2;
+      
+      if (isVowel(s[len-3]) &&
+         (endsWith(s, len, "nk") ||
+          endsWith(s, len, "ja") ||
+          endsWith(s, len, "je")))
+        return len - 2;
+      
+      if (endsWith(s, len, "im") ||
+          endsWith(s, len, "id") ||
+          endsWith(s, len, "ik"))
+        return len - 2;
+    }
+    
+    if (len > 3)
+      switch(s[len-1]) {
+        case 'a':
+        case 'e': if (!isVowel(s[len-2])) return len - 1; break;
+        case 'm':
+        case 'd': if (isVowel(s[len-2])) return len - 1; break;
+        case 'i': return len - 1;
+      }
+    
+    return len;
+  }
+
+  private int removePlural(char s[], int len) {
+    if (len > 3 && s[len-1] == 'k')
+      switch(s[len-2]) {
+        case 'a':
+        case 'o':
+        case 'e': if (len > 4) return len - 2; /* intentional fallthru */
+        default: return len - 1;
+      }
+    return len;
+  }
+
+  private int normalize(char s[], int len) {
+    if (len > 3)
+      switch(s[len-1]) {
+        case 'a':
+        case 'e':
+        case 'i':
+        case 'o': return len - 1;
+      }
+    return len;
+  }
+
+  private boolean isVowel(char ch) {
+    switch(ch) {
+      case 'a':
+      case 'e':
+      case 'i':
+      case 'o':
+      case 'u':
+      case 'y': return true;
+      default: return false;
+    }
+  }
+}

Propchange: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemmer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianStemmer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianStemmer.java?rev=964019&r1=964018&r2=964019&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianStemmer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianStemmer.java Wed Jul 14 12:10:34 2010
@@ -17,6 +17,8 @@ package org.apache.lucene.analysis.id;
  * limitations under the License.
  */
 
+import static org.apache.lucene.analysis.util.StemmerUtil.*;
+
 /**
  * Stemmer for Indonesian.
  * <p>
@@ -266,39 +268,5 @@ public class IndonesianStemmer {
       return length - 1;
     }
     return length;
-  }
-  
-  private boolean startsWith(char s[], int len, String prefix) {
-    final int prefixLen = prefix.length();
-    if (prefixLen > len)
-      return false;
-    for (int i = 0; i < prefixLen; i++)
-      if (s[i] != prefix.charAt(i)) 
-        return false;
-    return true;
-  }
-  
-  private boolean endsWith(char s[], int len, String suffix) {
-    final int suffixLen = suffix.length();
-    if (suffixLen > len)
-      return false;
-    for (int i = suffixLen - 1; i >= 0; i--)
-      if (s[len -(suffixLen - i)] != suffix.charAt(i))
-        return false;
-    
-    return true;
-  }
-  
-  private int deleteN(char s[], int pos, int len, int nChars) {
-    for (int i = 0; i < nChars; i++)
-      len = delete(s, pos, len);
-    return len;
-  }
-  
-  private int delete(char s[], int pos, int len) {
-    if (pos < len) 
-      System.arraycopy(s, pos + 1, s, pos, len - pos - 1);
-    
-    return len - 1;
-  }
+  }  
 }

Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/in/IndicNormalizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/in/IndicNormalizer.java?rev=964019&r1=964018&r2=964019&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/in/IndicNormalizer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/in/IndicNormalizer.java Wed Jul 14 12:10:34 2010
@@ -20,6 +20,7 @@ package org.apache.lucene.analysis.in;
 import java.util.BitSet;
 import java.util.IdentityHashMap;
 import static java.lang.Character.UnicodeBlock.*;
+import static org.apache.lucene.analysis.util.StemmerUtil.*;
 
 /**
  * Normalizes the Unicode representation of text in Indian languages.
@@ -290,14 +291,4 @@ public class IndicNormalizer {
     
     return len;
   }
-  
-  /**
-   * Delete a character in-place
-   */
-  private int delete(char s[], int pos, int len) {
-    if (pos < len) 
-      System.arraycopy(s, pos + 1, s, pos, len - pos - 1);
-    
-    return len - 1;
-  }
 }

Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemFilter.java?rev=964019&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemFilter.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemFilter.java Wed Jul 14 12:10:34 2010
@@ -0,0 +1,58 @@
+package org.apache.lucene.analysis.it;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+
+/**
+ * A {@link TokenFilter} that applies {@link ItalianLightStemmer} to stem Italian
+ * words.
+ * <p>
+ * To prevent terms from being stemmed use an instance of
+ * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
+ * the {@link KeywordAttribute} before this {@link TokenStream}.
+ * </p>
+ */
+public final class ItalianLightStemFilter extends TokenFilter {
+  private final ItalianLightStemmer stemmer = new ItalianLightStemmer();
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
+
+  public ItalianLightStemFilter(TokenStream input) {
+    super(input);
+  }
+  
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (input.incrementToken()) {
+      if (!keywordAttr.isKeyword()) {
+        final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
+        termAtt.setLength(newlen);
+      }
+      return true;
+    } else {
+      return false;
+    }
+  }
+}

Propchange: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemFilter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemmer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemmer.java?rev=964019&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemmer.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemmer.java Wed Jul 14 12:10:34 2010
@@ -0,0 +1,117 @@
+package org.apache.lucene.analysis.it;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* 
+ * This algorithm is updated based on code located at:
+ * http://members.unine.ch/jacques.savoy/clef/
+ * 
+ * Full copyright for that code follows:
+ */
+
+/*
+ * Copyright (c) 2005, Jacques Savoy
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer. Redistributions in binary 
+ * form must reproduce the above copyright notice, this list of conditions and
+ * the following disclaimer in the documentation and/or other materials 
+ * provided with the distribution. Neither the name of the author nor the names 
+ * of its contributors may be used to endorse or promote products derived from 
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * Light Stemmer for Italian.
+ * <p>
+ * This stemmer implements the algorithm described in:
+ * <i>Report on CLEF-2001 Experiments</i>
+ * Jacques Savoy
+ */
+public class ItalianLightStemmer {
+  
+  public int stem(char s[], int len) {
+    if (len < 6)
+      return len;
+    
+    for (int i = 0; i < len; i++)
+      switch(s[i]) {
+        case 'à': 
+        case 'á':
+        case 'â':
+        case 'ä': s[i] = 'a'; break;
+        case 'ò':
+        case 'ó':
+        case 'ô':
+        case 'ö': s[i] = 'o'; break;
+        case 'è':
+        case 'é':
+        case 'ê':
+        case 'ë': s[i] = 'e'; break;
+        case 'ù':
+        case 'ú':
+        case 'û':
+        case 'ü': s[i] = 'u'; break;
+        case 'ì':
+        case 'í':
+        case 'î':
+        case 'ï': s[i] = 'i'; break;
+      }
+    
+    switch(s[len-1]) {
+      case 'e':
+        if (s[len-2] == 'i' || s[len-2] == 'h')
+          return len - 2;
+        else
+          return len - 1;
+      case 'i':
+        if (s[len-2] == 'h' || s[len-2] == 'i')
+          return len - 2;
+        else
+          return len - 1;
+      case 'a':
+        if (s[len-2] == 'i')
+          return len - 2;
+        else
+          return len - 1;
+      case 'o':
+        if (s[len-2] == 'i')
+          return len - 2;
+        else
+          return len - 1;
+    }
+    
+    return len;
+  }
+}

Propchange: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemmer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemFilter.java?rev=964019&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemFilter.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemFilter.java Wed Jul 14 12:10:34 2010
@@ -0,0 +1,58 @@
+package org.apache.lucene.analysis.pt;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+
+/**
+ * A {@link TokenFilter} that applies {@link PortugueseLightStemmer} to stem 
+ * Portuguese words.
+ * <p>
+ * To prevent terms from being stemmed use an instance of
+ * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
+ * the {@link KeywordAttribute} before this {@link TokenStream}.
+ * </p>
+ */
+public final class PortugueseLightStemFilter extends TokenFilter {
+  private final PortugueseLightStemmer stemmer = new PortugueseLightStemmer();
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
+
+  public PortugueseLightStemFilter(TokenStream input) {
+    super(input);
+  }
+  
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (input.incrementToken()) {
+      if (!keywordAttr.isKeyword()) {
+        final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
+        termAtt.setLength(newlen);
+      }
+      return true;
+    } else {
+      return false;
+    }
+  }
+}

Propchange: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemFilter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemmer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemmer.java?rev=964019&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemmer.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemmer.java Wed Jul 14 12:10:34 2010
@@ -0,0 +1,202 @@
+package org.apache.lucene.analysis.pt;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* 
+ * This algorithm is updated based on code located at:
+ * http://members.unine.ch/jacques.savoy/clef/
+ * 
+ * Full copyright for that code follows:
+ */
+
+/*
+ * Copyright (c) 2005, Jacques Savoy
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer. Redistributions in binary 
+ * form must reproduce the above copyright notice, this list of conditions and
+ * the following disclaimer in the documentation and/or other materials 
+ * provided with the distribution. Neither the name of the author nor the names 
+ * of its contributors may be used to endorse or promote products derived from 
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+import static org.apache.lucene.analysis.util.StemmerUtil.*;
+
+/**
+ * Light Stemmer for Portuguese
+ */
+public class PortugueseLightStemmer {
+  
+  public int stem(char s[], int len) {
+    if (len < 4)
+      return len;
+    
+    len = removeSuffix(s, len);
+    
+    if (len > 3 && s[len-1] == 'a')
+      len = normFeminine(s, len);
+    
+    if (len > 4)
+      switch(s[len-1]) {
+        case 'e':
+        case 'a':
+        case 'o': len--; break;
+      }
+    
+    for (int i = 0; i < len; i++)
+      switch(s[i]) {
+        case 'à': 
+        case 'á':
+        case 'â':
+        case 'ä': 
+        case 'ã': s[i] = 'a'; break;
+        case 'ò':
+        case 'ó':
+        case 'ô':
+        case 'ö': 
+        case 'õ': s[i] = 'o'; break;
+        case 'è':
+        case 'é':
+        case 'ê':
+        case 'ë': s[i] = 'e'; break;
+        case 'ù':
+        case 'ú':
+        case 'û':
+        case 'ü': s[i] = 'u'; break;
+        case 'ì':
+        case 'í':
+        case 'î':
+        case 'ï': s[i] = 'i'; break;
+        case 'ç': s[i] = 'c'; break;
+      }
+
+    return len;
+  }
+  
+  private int removeSuffix(char s[], int len) {
+    if (len > 4 && endsWith(s, len, "es"))
+      switch(s[len-3]) {
+        case 'r':
+        case 's':
+        case 'l':
+        case 'z': return len - 2;
+      }
+    
+    if (len > 3 && endsWith(s, len, "ns")) {
+      s[len - 2] = 'm';
+      return len - 1;
+    }
+    
+    if (len > 4 && (endsWith(s, len, "eis") || endsWith(s, len, "éis"))) {
+      s[len - 3] = 'e';
+      s[len - 2] = 'l';
+      return len - 1;
+    }
+    
+    if (len > 4 && endsWith(s, len, "ais")) {
+      s[len - 2] = 'l';
+      return len - 1;
+    }
+    
+    if (len > 4 && endsWith(s, len, "óis")) {
+      s[len - 3] = 'o';
+      s[len - 2] = 'l';
+      return len - 1;
+    }
+    
+    if (len > 4 && endsWith(s, len, "is")) {
+      s[len - 1] = 'l';
+      return len;
+    }
+    
+    if (len > 3 &&
+        (endsWith(s, len, "ões") ||
+         endsWith(s, len, "ães"))) {
+      len--;
+      s[len - 2] = 'ã';
+      s[len - 1] = 'o';
+      return len;
+    }
+    
+    if (len > 6 && endsWith(s, len, "mente"))
+      return len - 5;
+    
+    if (len > 3 && s[len-1] == 's')
+      return len - 1;
+    return len;
+  }
+
+  private int normFeminine(char s[], int len) {
+    if (len > 7 && 
+        (endsWith(s, len, "inha") ||
+         endsWith(s, len, "iaca") ||
+         endsWith(s, len, "eira"))) {
+      s[len - 1] = 'o';
+      return len;
+    }
+    
+    if (len > 6) {
+      if (endsWith(s, len, "osa") ||
+          endsWith(s, len, "ica") ||
+          endsWith(s, len, "ida") ||
+          endsWith(s, len, "ada") ||
+          endsWith(s, len, "iva") ||
+          endsWith(s, len, "ama")) {
+        s[len - 1] = 'o';
+        return len;
+      }
+      
+      if (endsWith(s, len, "ona")) {
+        s[len - 3] = 'ã';
+        s[len - 2] = 'o';
+        return len - 1;
+      }
+      
+      if (endsWith(s, len, "ora"))
+        return len - 1;
+      
+      if (endsWith(s, len, "esa")) {
+        s[len - 3] = 'ê';
+        return len - 1;
+      }
+      
+      if (endsWith(s, len, "na")) {
+        s[len - 1] = 'o';
+        return len;
+      }
+    }
+    return len;
+  }
+}

Propchange: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemmer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseMinimalStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseMinimalStemFilter.java?rev=964019&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseMinimalStemFilter.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseMinimalStemFilter.java Wed Jul 14 12:10:34 2010
@@ -0,0 +1,58 @@
+package org.apache.lucene.analysis.pt;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+
+/**
+ * {@link TokenFilter} that runs every Portuguese token through
+ * {@link PortugueseMinimalStemmer}.
+ * <p>
+ * Tokens whose {@link KeywordAttribute} is set are passed through unchanged.
+ * To protect terms from stemming, place a {@link KeywordMarkerFilter} (or any
+ * custom {@link TokenFilter} that sets the attribute) before this
+ * {@link TokenStream}.
+ * </p>
+ */
+public final class PortugueseMinimalStemFilter extends TokenFilter {
+  private final PortugueseMinimalStemmer stemmer = new PortugueseMinimalStemmer();
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
+
+  public PortugueseMinimalStemFilter(TokenStream input) {
+    super(input);
+  }
+
+  /**
+   * Advances the wrapped stream and stems the current term unless it is
+   * flagged as a keyword.
+   *
+   * @return <code>false</code> once the wrapped stream is exhausted
+   */
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (!input.incrementToken()) {
+      return false;
+    }
+    if (keywordAttr.isKeyword()) {
+      return true;
+    }
+    // the stemmer edits the term buffer in place and reports the new length
+    final int stemmedLength = stemmer.stem(termAtt.buffer(), termAtt.length());
+    termAtt.setLength(stemmedLength);
+    return true;
+  }
+}

Propchange: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseMinimalStemFilter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseMinimalStemmer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseMinimalStemmer.java?rev=964019&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseMinimalStemmer.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseMinimalStemmer.java Wed Jul 14 12:10:34 2010
@@ -0,0 +1,119 @@
+package org.apache.lucene.analysis.pt;
+
+import java.util.Arrays;
+
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.util.Version;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Minimal Stemmer for Portuguese
+ * <p>
+ * This follows the "RSLP-S" algorithm presented in:
+ * <i>A study on the Use of Stemming for Monolingual Ad-Hoc Portuguese
+ * Information Retrieval</i> (Orengo, et al)
+ * which is just the plural reduction step of the RSLP
+ * algorithm from <i>A Stemming Algorithmm for the Portuguese Language</i>,
+ * Orengo et al.
+ * <p>
+ * All accented characters are written as Unicode escapes, so the behavior of
+ * this class does not depend on the encoding the source file is read with.
+ */
+public class PortugueseMinimalStemmer {
+
+  /** Words ending in "is" that the generic "is" to "il" rule must not rewrite. */
+  private static final CharArraySet excIS = new CharArraySet(Version.LUCENE_31,
+      Arrays.asList("l\u00E1pis", "cais", "mais", "cr\u00FAcis", "biqu\u00EDnis", "pois",
+          "depois", "dois", "leis"),
+      false);
+
+  /** Words ending in 's' that keep their final 's' when no other rule applies. */
+  private static final CharArraySet excS = new CharArraySet(Version.LUCENE_31,
+      Arrays.asList("ali\u00E1s", "pires", "l\u00E1pis", "cais", "mais", "mas", "menos",
+          "f\u00E9rias", "fezes", "p\u00EAsames", "cr\u00FAcis", "g\u00E1s", "atr\u00E1s",
+          "mois\u00E9s", "atrav\u00E9s", "conv\u00E9s", "\u00EAs", "pa\u00EDs", "ap\u00F3s",
+          "ambas", "ambos", "messias", "depois"),
+      false);
+
+  /**
+   * Reduces a plural form in place. Only terms of at least 3 characters that
+   * end in 's' are considered; the rules below are tried in order and the
+   * first one that applies decides the result (for example "bons" becomes
+   * "bom").
+   *
+   * @param s buffer holding the term; may be modified in place
+   * @param len number of valid characters in <code>s</code>
+   * @return length of the term after stemming
+   */
+  public int stem(char s[], int len) {
+    if (len < 3 || s[len-1] != 's')
+      return len;
+
+    // "ns" becomes "m"
+    if (s[len-2] == 'n') {
+      len--;
+      s[len-1] = 'm';
+      return len;
+    }
+
+    // (o-tilde)"es" becomes (a-tilde)"o"
+    if (len >= 6 && s[len-3] == '\u00F5' && s[len-2] == 'e') {
+      len--;
+      s[len-2] = '\u00E3';
+      s[len-1] = 'o';
+      return len;
+    }
+
+    // (a-tilde)"es" becomes (a-tilde)"o", except for a 4-letter word starting with 'm'
+    if (len >= 4 && s[len-3] == '\u00E3' && s[len-2] == 'e') {
+      if (!(len == 4 && s[0] == 'm')) {
+        len--;
+        s[len-1] = 'o';
+        return len;
+      }
+    }
+
+    if (len >= 4 && s[len-2] == 'i') {
+      // "ais" becomes "al", except for 4-letter words starting with 'c' or 'm'
+      if (s[len-3] == 'a') {
+        if (!(len == 4 && (s[0] == 'c' || s[0] == 'm'))) {
+          len--;
+          s[len-1] = 'l';
+          return len;
+        }
+      }
+
+      // (e-acute)"is" becomes "el"
+      if (len >= 5 && s[len-3] == '\u00E9') {
+        len--;
+        s[len-2] = 'e';
+        s[len-1] = 'l';
+        return len;
+      }
+
+      // "eis" becomes "el"
+      if (len >= 5 && s[len-3] == 'e') {
+        len--;
+        s[len-1] = 'l';
+        return len;
+      }
+
+      // (o-acute)"is" becomes "ol"
+      if (len >= 5 && s[len-3] == '\u00F3') {
+        len--;
+        s[len-2] = 'o';
+        s[len-1] = 'l';
+        return len;
+      }
+
+      // any other "is" becomes "il" unless the word is a listed exception
+      if (!excIS.contains(s, 0, len)) {
+        s[len-1] = 'l';
+        return len;
+      }
+    }
+
+    // "les" becomes "l"
+    if (len >= 6 && s[len-3] == 'l' && s[len-2] == 'e')
+      return len - 2;
+
+    // "res" becomes "r", except for the 7-letter word starting with (a-acute)"rvo"
+    if (len >= 6 && s[len-3] == 'r' && s[len-2] == 'e') {
+      if (!(len == 7 && s[0] == '\u00E1' && s[1] == 'r' && s[2] == 'v' && s[3] == 'o'))
+        return len - 2;
+    }
+
+    // fallback: drop the final 's' unless the whole word is a listed exception
+    if (excS.contains(s, 0, len))
+      return len;
+    else
+      return len-1;
+  }
+}

Propchange: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseMinimalStemmer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemFilter.java?rev=964019&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemFilter.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemFilter.java Wed Jul 14 12:10:34 2010
@@ -0,0 +1,58 @@
+package org.apache.lucene.analysis.ru;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+
+/**
+ * {@link TokenFilter} that runs every Russian token through
+ * {@link RussianLightStemmer}.
+ * <p>
+ * Tokens whose {@link KeywordAttribute} is set are passed through unchanged.
+ * To protect terms from stemming, place a {@link KeywordMarkerFilter} (or any
+ * custom {@link TokenFilter} that sets the attribute) before this
+ * {@link TokenStream}.
+ * </p>
+ */
+public final class RussianLightStemFilter extends TokenFilter {
+  private final RussianLightStemmer stemmer = new RussianLightStemmer();
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
+
+  public RussianLightStemFilter(TokenStream input) {
+    super(input);
+  }
+
+  /**
+   * Advances the wrapped stream and stems the current term unless it is
+   * flagged as a keyword.
+   *
+   * @return <code>false</code> once the wrapped stream is exhausted
+   */
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (!input.incrementToken()) {
+      return false;
+    }
+    if (keywordAttr.isKeyword()) {
+      return true;
+    }
+    // the stemmer works on the term buffer directly and reports the new length
+    final int stemmedLength = stemmer.stem(termAtt.buffer(), termAtt.length());
+    termAtt.setLength(stemmedLength);
+    return true;
+  }
+}

Propchange: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemFilter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemmer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemmer.java?rev=964019&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemmer.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemmer.java Wed Jul 14 12:10:34 2010
@@ -0,0 +1,153 @@
+package org.apache.lucene.analysis.ru;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* 
+ * This algorithm is updated based on code located at:
+ * http://members.unine.ch/jacques.savoy/clef/
+ * 
+ * Full copyright for that code follows:
+ */
+
+/*
+ * Copyright (c) 2005, Jacques Savoy
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer. Redistributions in binary 
+ * form must reproduce the above copyright notice, this list of conditions and
+ * the following disclaimer in the documentation and/or other materials 
+ * provided with the distribution. Neither the name of the author nor the names 
+ * of its contributors may be used to endorse or promote products derived from 
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+import static org.apache.lucene.analysis.util.StemmerUtil.*;
+
+/**
+ * Light Stemmer for Russian.
+ * <p>
+ * This stemmer implements the following algorithm:
+ * <i>Indexing and Searching Strategies for the Russian Language.</i>
+ * Ljiljana Dolamic and Jacques Savoy.
+ */
+public class RussianLightStemmer {
+  // NOTE(review): the non-ASCII character and string literals in this copy look
+  // mis-decoded (multi-byte text rendered one byte at a time); several case
+  // labels appear identical as shown. Confirm against the committed file
+  // before relying on the literal values below.
+
+  /**
+   * Stems a term: first strips a case ending ({@link #removeCase}), then
+   * applies a final normalization ({@link #normalize}).
+   *
+   * @param s buffer holding the term
+   * @param len number of valid characters in <code>s</code>
+   * @return length of the stemmed term
+   */
+  public int stem(char s[], int len) {
+    len = removeCase(s, len);
+    return normalize(s, len);
+  }
+  
+  /**
+   * For terms longer than 3 characters, drops the last character when it
+   * matches one of the first two case labels, or when it matches the third
+   * label and the character before it is that same character.
+   * The buffer itself is not modified; only the returned length changes.
+   */
+  private int normalize(char s[], int len) {
+    if (len > 3)
+      switch(s[len-1]) { 
+        case 'Ñ':
+        case 'Ð¸': return len - 1;
+        case 'Ð½': if (s[len-2] == 'Ð½') return len - 1;
+      }
+    return len;
+  }
+
+  /**
+   * Strips at most one ending. The groups are tried from the longest removal
+   * to the shortest, each guarded by a minimum term length: 4 characters are
+   * removed only when <code>len &gt; 6</code>, 3 when <code>len &gt; 5</code>,
+   * 2 when <code>len &gt; 4</code> and a single final character when
+   * <code>len &gt; 3</code>. The buffer itself is not modified; only the
+   * returned length changes.
+   */
+  private int removeCase(char s[], int len) {
+    // endings that remove 4 characters
+    if (len > 6 && 
+        (endsWith(s, len, "Ð¸ÑÐ¼Ð¸") ||
+         endsWith(s, len, "Ð¾ÑÐ¼Ð¸")))
+      return len - 4;
+    
+    // endings that remove 3 characters
+    if (len > 5 && 
+        (endsWith(s, len, "Ð¸ÑÐ¼") ||
+         endsWith(s, len, "Ð¸ÑÑ") ||
+         endsWith(s, len, "Ð¾ÑÑ") ||
+         endsWith(s, len, "ÑÐ¼Ð¸") ||
+         endsWith(s, len, "Ð¾ÑÐ¼") ||
+         endsWith(s, len, "Ð¾ÑÐ²") ||
+         endsWith(s, len, "Ð°Ð¼Ð¸") ||
+         endsWith(s, len, "ÐµÐ³Ð¾") ||
+         endsWith(s, len, "ÐµÐ¼Ñ") ||
+         endsWith(s, len, "ÐµÑÐ¸") ||
+         endsWith(s, len, "Ð¸Ð¼Ð¸") ||
+         endsWith(s, len, "Ð¾Ð³Ð¾") ||
+         endsWith(s, len, "Ð¾Ð¼Ñ") ||
+         endsWith(s, len, "ÑÐ¼Ð¸") ||
+         endsWith(s, len, "Ð¾ÐµÐ²")))
+      return len - 3;
+    
+    // endings that remove 2 characters
+    if (len > 4 &&
+        (endsWith(s, len, "Ð°Ñ") ||
+         endsWith(s, len, "ÑÑ") ||
+         endsWith(s, len, "ÑÑ") ||
+         endsWith(s, len, "ÑÑ") ||
+         endsWith(s, len, "Ð°Ñ") ||
+         endsWith(s, len, "ÐµÑ") ||
+         endsWith(s, len, "Ð¸Ñ") ||
+         endsWith(s, len, "Ð¸Ñ") ||
+         endsWith(s, len, "Ð¸Ñ") ||
+         endsWith(s, len, "ÑÐ²") ||
+         endsWith(s, len, "Ð¾Ñ") ||
+         endsWith(s, len, "ÑÑ") ||
+         endsWith(s, len, "ÑÐ¼") ||
+         endsWith(s, len, "ÑÑ") ||
+         endsWith(s, len, "ÐµÑ") ||
+         endsWith(s, len, "Ð°Ð¼") ||
+         endsWith(s, len, "ÐµÐ¼") ||
+         endsWith(s, len, "ÐµÐ¹") ||
+         endsWith(s, len, "ÑÐ¼") ||
+         endsWith(s, len, "ÐµÐ²") ||
+         endsWith(s, len, "Ð¸Ð¹") ||
+         endsWith(s, len, "Ð¸Ð¼") ||
+         endsWith(s, len, "Ð¾Ðµ") ||
+         endsWith(s, len, "Ð¾Ð¹") ||
+         endsWith(s, len, "Ð¾Ð¼") ||
+         endsWith(s, len, "Ð¾Ð²") ||
+         endsWith(s, len, "ÑÐµ") ||
+         endsWith(s, len, "ÑÐ¹") ||
+         endsWith(s, len, "ÑÐ¼") ||
+         endsWith(s, len, "Ð¼Ð¸")))
+      return len - 2;
+    
+    // single final characters that are removed
+    if (len > 3)
+      switch(s[len-1]) {
+        case 'Ð°':
+        case 'Ðµ':
+        case 'Ð¸':
+        case 'Ð¾':
+        case 'Ñ':
+        case 'Ð¹':
+        case 'Ñ':
+        case 'Ñ':
+        case 'Ñ': return len - 1;
+      }
+    
+    return len;
+  }
+}

Propchange: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemmer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemFilter.java?rev=964019&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemFilter.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemFilter.java Wed Jul 14 12:10:34 2010
@@ -0,0 +1,58 @@
+package org.apache.lucene.analysis.sv;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+
+/**
+ * {@link TokenFilter} that runs every Swedish token through
+ * {@link SwedishLightStemmer}.
+ * <p>
+ * Tokens whose {@link KeywordAttribute} is set are passed through unchanged.
+ * To protect terms from stemming, place a {@link KeywordMarkerFilter} (or any
+ * custom {@link TokenFilter} that sets the attribute) before this
+ * {@link TokenStream}.
+ * </p>
+ */
+public final class SwedishLightStemFilter extends TokenFilter {
+  private final SwedishLightStemmer stemmer = new SwedishLightStemmer();
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
+
+  public SwedishLightStemFilter(TokenStream input) {
+    super(input);
+  }
+
+  /**
+   * Advances the wrapped stream and stems the current term unless it is
+   * flagged as a keyword.
+   *
+   * @return <code>false</code> once the wrapped stream is exhausted
+   */
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (!input.incrementToken()) {
+      return false;
+    }
+    if (keywordAttr.isKeyword()) {
+      return true;
+    }
+    // the stemmer works on the term buffer directly and reports the new length
+    final int stemmedLength = stemmer.stem(termAtt.buffer(), termAtt.length());
+    termAtt.setLength(stemmedLength);
+    return true;
+  }
+}

Propchange: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemFilter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemmer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemmer.java?rev=964019&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemmer.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemmer.java Wed Jul 14 12:10:34 2010
@@ -0,0 +1,111 @@
+package org.apache.lucene.analysis.sv;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* 
+ * This algorithm is updated based on code located at:
+ * http://members.unine.ch/jacques.savoy/clef/
+ * 
+ * Full copyright for that code follows:
+ */
+
+/*
+ * Copyright (c) 2005, Jacques Savoy
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer. Redistributions in binary 
+ * form must reproduce the above copyright notice, this list of conditions and
+ * the following disclaimer in the documentation and/or other materials 
+ * provided with the distribution. Neither the name of the author nor the names 
+ * of its contributors may be used to endorse or promote products derived from 
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+import static org.apache.lucene.analysis.util.StemmerUtil.*;
+
+/**
+ * Light Stemmer for Swedish.
+ * <p>
+ * This stemmer implements the algorithm described in:
+ * <i>Report on CLEF-2003 Monolingual Tracks</i>
+ * Jacques Savoy
+ */
+public class SwedishLightStemmer {
+  
+  public int stem(char s[], int len) {   
+    if (len > 4 && s[len-1] == 's')
+      len--;
+    
+    if (len > 7 && 
+        (endsWith(s, len, "elser") || 
+         endsWith(s, len, "heten")))
+      return len - 5;
+    
+    if (len > 6 &&
+        (endsWith(s, len, "arne") ||
+         endsWith(s, len, "erna") ||
+         endsWith(s, len, "ande") ||
+         endsWith(s, len, "else") ||
+         endsWith(s, len, "aste") ||
+         endsWith(s, len, "orna") ||
+         endsWith(s, len, "aren")))
+      return len - 4;
+    
+    if (len > 5 &&
+        (endsWith(s, len, "are") ||
+         endsWith(s, len, "ast") ||
+         endsWith(s, len, "het")))
+      return len - 3;
+    
+    if (len > 4 &&
+        (endsWith(s, len, "ar") ||
+         endsWith(s, len, "er") ||
+         endsWith(s, len, "or") ||
+         endsWith(s, len, "en") ||
+         endsWith(s, len, "at") ||
+         endsWith(s, len, "te") ||
+         endsWith(s, len, "et")))
+      return len - 2;
+    
+    if (len > 3)
+      switch(s[len-1]) {
+        case 't':
+        case 'a':
+        case 'e':
+        case 'n': return len - 1;
+      }
+    
+    return len;
+  }
+}

Propchange: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemmer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/util/StemmerUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/util/StemmerUtil.java?rev=964019&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/util/StemmerUtil.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/util/StemmerUtil.java Wed Jul 14 12:10:34 2010
@@ -0,0 +1,89 @@
+package org.apache.lucene.analysis.util;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/** Some commonly-used stemming functions */
+public class StemmerUtil {
+  /**
+   * Returns true if the character array starts with the prefix.
+   * 
+   * @param s Input Buffer
+   * @param len length of input buffer
+   * @param prefix Prefix string to test
+   * @return true if <code>s</code> starts with <code>prefix</code>
+   */
+  public static boolean startsWith(char s[], int len, String prefix) {
+    final int prefixLen = prefix.length();
+    if (prefixLen > len)
+      return false;
+    for (int i = 0; i < prefixLen; i++)
+      if (s[i] != prefix.charAt(i)) 
+        return false;
+    return true;
+  }
+  
+  /**
+   * Returns true if the character array ends with the suffix.
+   * 
+   * @param s Input Buffer
+   * @param len length of input buffer
+   * @param suffix Suffix string to test
+   * @return true if <code>s</code> ends with <code>suffix</code>
+   */
+  public static boolean endsWith(char s[], int len, String suffix) {
+    final int suffixLen = suffix.length();
+    if (suffixLen > len)
+      return false;
+    // compare from the last character backwards
+    for (int i = suffixLen - 1; i >= 0; i--)
+      if (s[len -(suffixLen - i)] != suffix.charAt(i))
+        return false;
+    
+    return true;
+  }
+  
+  /**
+   * Delete a character in-place
+   * 
+   * @param s Input Buffer
+   * @param pos Position of character to delete
+   * @param len length of input buffer
+   * @return length of input buffer after deletion; this is always
+   *         <code>len - 1</code>, even when <code>pos</code> is not below
+   *         <code>len</code> and nothing is moved
+   */
+  public static int delete(char s[], int pos, int len) {
+    if (pos < len) 
+      System.arraycopy(s, pos + 1, s, pos, len - pos - 1);
+    
+    return len - 1;
+  }
+  
+  /**
+   * Delete n characters in-place
+   * <p>
+   * The characters following the deleted range are moved down with a single
+   * copy. Buffer content at or beyond the returned length is unspecified.
+   * 
+   * @param s Input Buffer
+   * @param pos Position of the first character to delete
+   * @param len Length of input buffer
+   * @param nChars number of characters to delete; values of zero or less
+   *        leave the buffer and its length unchanged
+   * @return length of input buffer after deletion
+   */
+  public static int deleteN(char s[], int pos, int len, int nChars) {
+    if (nChars <= 0)
+      return len;
+    
+    // number of characters that survive after the deleted range
+    final int tail = len - pos - nChars;
+    if (tail > 0)
+      System.arraycopy(s, pos + nChars, s, pos, tail);
+    
+    return len - nChars;
+  }
+}

Propchange: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/util/StemmerUtil.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanLightStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanLightStemFilter.java?rev=964019&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanLightStemFilter.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanLightStemFilter.java Wed Jul 14 12:10:34 2010
@@ -0,0 +1,48 @@
+package org.apache.lucene.analysis.de;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
+
+import static org.apache.lucene.analysis.util.VocabularyAssert.*;
+
+/**
+ * Vocabulary-based checks for {@link GermanLightStemFilter}
+ */
+public class TestGermanLightStemFilter extends BaseTokenStreamTestCase {
+  /** Whitespace tokenization followed by the filter under test. */
+  private Analyzer analyzer = new ReusableAnalyzerBase() {
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName,
+        Reader reader) {
+      final Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
+      final GermanLightStemFilter stemFilter = new GermanLightStemFilter(tokenizer);
+      return new TokenStreamComponents(tokenizer, stemFilter);
+    }
+  };
+
+  /** Runs the analyzer over the vocabulary shipped in the reference-implementation data file */
+  public void testVocabulary() throws IOException {
+    assertVocabulary(analyzer, getDataFile("delighttestdata.zip"), "delight.txt");
+  }
+}

Propchange: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanLightStemFilter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanMinimalStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanMinimalStemFilter.java?rev=964019&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanMinimalStemFilter.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanMinimalStemFilter.java Wed Jul 14 12:10:34 2010
@@ -0,0 +1,60 @@
+package org.apache.lucene.analysis.de;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
+
+import static org.apache.lucene.analysis.util.VocabularyAssert.*;
+
+/**
+ * Simple tests for {@link GermanMinimalStemFilter}
+ */
+public class TestGermanMinimalStemFilter extends BaseTokenStreamTestCase {
+  private Analyzer analyzer = new ReusableAnalyzerBase() {
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName,
+        Reader reader) {
+      Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
+      return new TokenStreamComponents(source, new GermanMinimalStemFilter(source));
+    }
+  };
+  
+  /** Test some examples from the paper */
+  public void testExamples() throws IOException {
+    checkOneTerm(analyzer, "sängerinnen", "sangerin");
+    checkOneTerm(analyzer, "frauen", "frau");
+    checkOneTerm(analyzer, "kenntnisse", "kenntnis");
+    checkOneTerm(analyzer, "staates", "staat");
+    checkOneTerm(analyzer, "bilder", "bild");
+    checkOneTerm(analyzer, "boote", "boot");
+    checkOneTerm(analyzer, "götter", "gott");
+    checkOneTerm(analyzer, "äpfel", "apfel");
+  }
+  
+  /** Test against a vocabulary from the reference impl */
+  public void testVocabulary() throws IOException {
+    assertVocabulary(analyzer, getDataFile("deminimaltestdata.zip"), "deminimal.txt");
+  }
+}

Propchange: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanMinimalStemFilter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java?rev=964019&r1=964018&r2=964019&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java Wed Jul 14 12:10:34 2010
@@ -17,17 +17,17 @@ package org.apache.lucene.analysis.de;
  * limitations under the License.
  */
 
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.InputStreamReader;
-import java.io.StringReader;
+import java.io.InputStream;
+import java.io.Reader;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
+
+import static org.apache.lucene.analysis.util.VocabularyAssert.*;
 
 /**
  * Test the German stemmer. The stemming algorithm is known to work less 
@@ -38,25 +38,18 @@ import org.apache.lucene.analysis.core.L
 public class TestGermanStemFilter extends BaseTokenStreamTestCase {
 
   public void testStemming() throws Exception {
-    Tokenizer tokenizer = new KeywordTokenizer(new StringReader(""));
-    TokenFilter filter = new GermanStemFilter(new LowerCaseFilter(TEST_VERSION_CURRENT, tokenizer));
-    // read test cases from external file:
-    InputStreamReader isr = new InputStreamReader(getClass().getResourceAsStream("data.txt"), "iso-8859-1");
-    BufferedReader breader = new BufferedReader(isr);
-    while(true) {
-      String line = breader.readLine();
-      if (line == null)
-        break;
-      line = line.trim();
-      if (line.startsWith("#") || line.equals(""))
-        continue;    // ignore comments and empty lines
-      String[] parts = line.split(";");
-      //System.out.println(parts[0] + " -- " + parts[1]);
-      tokenizer.reset(new StringReader(parts[0]));
-      filter.reset();
-      assertTokenStreamContents(filter, new String[] { parts[1] });
-    }
-    breader.close();
-    isr.close();
+    Analyzer analyzer = new ReusableAnalyzerBase() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName,
+          Reader reader) {
+        Tokenizer t = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(t,
+            new GermanStemFilter(new LowerCaseFilter(TEST_VERSION_CURRENT, t)));
+      }
+    };
+    
+    InputStream vocOut = getClass().getResourceAsStream("data.txt");
+    assertVocabulary(analyzer, vocOut);
+    vocOut.close();
   }
 }

Modified: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/data.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/data.txt?rev=964019&r1=964018&r2=964019&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/data.txt (original)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/data.txt Wed Jul 14 12:10:34 2010
@@ -1,48 +1,48 @@
 # German special characters are replaced:
-häufig;haufig
+häufig	haufig
 
 # here the stemmer works okay, it maps related words to the same stem:
-abschließen;abschliess
-abschließender;abschliess
-abschließendes;abschliess
-abschließenden;abschliess
-
-Tisch;tisch
-Tische;tisch
-Tischen;tisch
-
-Haus;hau
-Hauses;hau
-Häuser;hau
-Häusern;hau
+abschließen	abschliess
+abschließender	abschliess
+abschließendes	abschliess
+abschließenden	abschliess
+
+Tisch	tisch
+Tische	tisch
+Tischen	tisch
+
+Haus	hau
+Hauses	hau
+Häuser	hau
+Häusern	hau
 # here's a case where overstemming occurs, i.e. a word is 
 # mapped to the same stem as unrelated words:
-hauen;hau
+hauen	hau
 
 # here's a case where understemming occurs, i.e. two related words
 # are not mapped to the same stem. This is the case with basically
 # all irregular forms:
-Drama;drama
-Dramen;dram
+Drama	drama
+Dramen	dram
 
-# replace "ß" with 'ss':
-Ausmaß;ausmass
+# replace "ß" with 'ss':
+Ausmaß	ausmass
 
 # fake words to test if suffixes are cut off:
-xxxxxe;xxxxx
-xxxxxs;xxxxx
-xxxxxn;xxxxx
-xxxxxt;xxxxx
-xxxxxem;xxxxx
-xxxxxer;xxxxx
-xxxxxnd;xxxxx
+xxxxxe	xxxxx
+xxxxxs	xxxxx
+xxxxxn	xxxxx
+xxxxxt	xxxxx
+xxxxxem	xxxxx
+xxxxxer	xxxxx
+xxxxxnd	xxxxx
 # the suffixes are also removed when combined:
-xxxxxetende;xxxxx
+xxxxxetende	xxxxx
 
 # words that are shorter than four charcters are not changed:
-xxe;xxe
+xxe	xxe
 # -em and -er are not removed from words shorter than five characters:
-xxem;xxem
-xxer;xxer
+xxem	xxem
+xxer	xxer
 # -nd is not removed from words shorter than six characters:
-xxxnd;xxxnd
+xxxnd	xxxnd

Added: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/delighttestdata.zip
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/delighttestdata.zip?rev=964019&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/delighttestdata.zip
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/deminimaltestdata.zip
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/deminimaltestdata.zip?rev=964019&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/deminimaltestdata.zip
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/en/TestEnglishMinimalStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/en/TestEnglishMinimalStemFilter.java?rev=964019&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/en/TestEnglishMinimalStemFilter.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/en/TestEnglishMinimalStemFilter.java Wed Jul 14 12:10:34 2010
@@ -0,0 +1,54 @@
+package org.apache.lucene.analysis.en;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
+
+/**
+ * Simple tests for {@link EnglishMinimalStemFilter}
+ */
+public class TestEnglishMinimalStemFilter extends BaseTokenStreamTestCase {
+  private Analyzer analyzer = new ReusableAnalyzerBase() {
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName,
+        Reader reader) {
+      Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
+      return new TokenStreamComponents(source, new EnglishMinimalStemFilter(source));
+    }
+  };
+  
+  /** Test some examples from various papers about this technique */
+  public void testExamples() throws IOException {
+    checkOneTerm(analyzer, "queries", "query");
+    checkOneTerm(analyzer, "phrases", "phrase");
+    checkOneTerm(analyzer, "corpus", "corpus");
+    checkOneTerm(analyzer, "stress", "stress");
+    checkOneTerm(analyzer, "kings", "king");
+    checkOneTerm(analyzer, "panels", "panel");
+    checkOneTerm(analyzer, "aerodynamics", "aerodynamic");
+    checkOneTerm(analyzer, "congress", "congress");
+    checkOneTerm(analyzer, "serious", "serious");
+  }
+}

Propchange: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/en/TestEnglishMinimalStemFilter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/en/TestPorterStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/en/TestPorterStemFilter.java?rev=964019&r1=964018&r2=964019&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/en/TestPorterStemFilter.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/en/TestPorterStemFilter.java Wed Jul 14 12:10:34 2010
@@ -17,21 +17,22 @@ package org.apache.lucene.analysis.en;
  * limitations under the License.
  */
 
-import java.io.BufferedReader;
 import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
+import java.io.Reader;
 import java.io.StringReader;
-import java.util.zip.ZipFile;
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
 import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 
+import static org.apache.lucene.analysis.util.VocabularyAssert.*;
+
 /**
  * Test the PorterStemFilter with Martin Porter's test data.
  */
@@ -41,26 +42,16 @@ public class TestPorterStemFilter extend
    * The output should be the same as the string in output.txt
    */
   public void testPorterStemFilter() throws Exception {
-    Tokenizer tokenizer = new KeywordTokenizer(new StringReader(""));
-    TokenStream filter = new PorterStemFilter(tokenizer);   
-    ZipFile zipFile = new ZipFile(getDataFile("porterTestData.zip"));
-    InputStream voc = zipFile.getInputStream(zipFile.getEntry("voc.txt"));
-    InputStream out = zipFile.getInputStream(zipFile.getEntry("output.txt"));
-    BufferedReader vocReader = new BufferedReader(new InputStreamReader(
-        voc, "UTF-8"));
-    BufferedReader outputReader = new BufferedReader(new InputStreamReader(
-        out, "UTF-8"));
-    String inputWord = null;
-    while ((inputWord = vocReader.readLine()) != null) {
-      String expectedWord = outputReader.readLine();
-      assertNotNull(expectedWord);
-      tokenizer.reset(new StringReader(inputWord));
-      filter.reset();
-      assertTokenStreamContents(filter, new String[] { expectedWord });
-    }
-    vocReader.close();
-    outputReader.close();
-    zipFile.close();
+    Analyzer a = new ReusableAnalyzerBase() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName,
+          Reader reader) {
+        Tokenizer t = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(t, new PorterStemFilter(t));
+      }
+    };
+
+    assertVocabulary(a, getDataFile("porterTestData.zip"), "voc.txt", "output.txt");
   }
   
   public void testWithKeywordAttribute() throws IOException {

Added: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/es/TestSpanishLightStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/es/TestSpanishLightStemFilter.java?rev=964019&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/es/TestSpanishLightStemFilter.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/es/TestSpanishLightStemFilter.java Wed Jul 14 12:10:34 2010
@@ -0,0 +1,48 @@
+package org.apache.lucene.analysis.es;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
+
+import static org.apache.lucene.analysis.util.VocabularyAssert.*;
+
+/**
+ * Simple tests for {@link SpanishLightStemFilter}
+ */
+public class TestSpanishLightStemFilter extends BaseTokenStreamTestCase {
+  private Analyzer analyzer = new ReusableAnalyzerBase() {
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName,
+        Reader reader) {
+      Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
+      return new TokenStreamComponents(source, new SpanishLightStemFilter(source));
+    }
+  };
+  
+  /** Test against a vocabulary from the reference impl */
+  public void testVocabulary() throws IOException {
+    assertVocabulary(analyzer, getDataFile("eslighttestdata.zip"), "eslight.txt");
+  }
+}

Propchange: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/es/TestSpanishLightStemFilter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/es/eslighttestdata.zip
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/es/eslighttestdata.zip?rev=964019&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/es/eslighttestdata.zip
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/fi/TestFinnishLightStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/fi/TestFinnishLightStemFilter.java?rev=964019&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/fi/TestFinnishLightStemFilter.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/fi/TestFinnishLightStemFilter.java Wed Jul 14 12:10:34 2010
@@ -0,0 +1,48 @@
+package org.apache.lucene.analysis.fi;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
+
+import static org.apache.lucene.analysis.util.VocabularyAssert.*;
+
+/**
+ * Simple tests for {@link FinnishLightStemFilter}
+ */
+public class TestFinnishLightStemFilter extends BaseTokenStreamTestCase {
+  private Analyzer analyzer = new ReusableAnalyzerBase() {
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName,
+        Reader reader) {
+      Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
+      return new TokenStreamComponents(source, new FinnishLightStemFilter(source));
+    }
+  };
+  
+  /** Test against a vocabulary from the reference impl */
+  public void testVocabulary() throws IOException {
+    assertVocabulary(analyzer, getDataFile("filighttestdata.zip"), "filight.txt");
+  }
+}

Propchange: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/fi/TestFinnishLightStemFilter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/fi/filighttestdata.zip
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/fi/filighttestdata.zip?rev=964019&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/fi/filighttestdata.zip
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream