You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/02/08 13:07:54 UTC
svn commit: r1241878 [1/3] - in /lucene/dev/trunk: lucene/contrib/
modules/analysis/common/src/java/org/apache/lucene/analysis/ca/
modules/analysis/common/src/java/org/apache/lucene/analysis/de/
modules/analysis/common/src/java/org/apache/lucene/analys...
Author: rmuir
Date: Wed Feb 8 12:07:52 2012
New Revision: 1241878
URL: http://svn.apache.org/viewvc?rev=1241878&view=rev
Log:
SOLR-3097, SOLR-3105: add fieldtypes for different languages to the example
Added:
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanNormalizationFilter.java (with props)
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianMinimalStemFilter.java (with props)
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianMinimalStemmer.java (with props)
lucene/dev/trunk/modules/analysis/common/src/resources/org/apache/lucene/analysis/th/
lucene/dev/trunk/modules/analysis/common/src/resources/org/apache/lucene/analysis/th/stopwords.txt (with props)
lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanNormalizationFilter.java (with props)
lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianMinimalStemFilter.java (with props)
lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/GalicianMinimalStemFilterFactory.java (with props)
lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/GermanNormalizationFilterFactory.java (with props)
lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestGalicianMinimalStemFilterFactory.java (with props)
lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestGermanNormalizationFilterFactory.java (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/
lucene/dev/trunk/solr/example/solr/conf/lang/contractions_ca.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/contractions_fr.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/contractions_it.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stemdict_nl.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stoptags_ja.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_ar.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_bg.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_ca.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_cz.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_da.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_de.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_el.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_en.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_es.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_eu.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_fa.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_fi.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_fr.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_gl.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_hi.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_hu.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_hy.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_id.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_it.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_ja.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_lv.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_nl.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_no.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_pt.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_ro.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_ru.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_sv.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_th.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_tr.txt (with props)
Removed:
lucene/dev/trunk/solr/example/solr/conf/stopwords_en.txt
Modified:
lucene/dev/trunk/lucene/contrib/CHANGES.txt
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ca/CatalanAnalyzer.java
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/in/IndicTokenizer.java
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseMinimalStemmer.java
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java
lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/ca/TestCatalanAnalyzer.java
lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java
lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianAnalyzer.java
lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseAnalyzer.java
lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java
lucene/dev/trunk/solr/CHANGES.txt
lucene/dev/trunk/solr/build.xml
lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/ElisionFilterFactory.java
lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestElisionFilterFactory.java
lucene/dev/trunk/solr/example/solr/conf/schema.xml
Modified: lucene/dev/trunk/lucene/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/CHANGES.txt?rev=1241878&r1=1241877&r2=1241878&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/contrib/CHANGES.txt Wed Feb 8 12:07:52 2012
@@ -169,6 +169,14 @@ Changes in runtime behavior
* LUCENE-3626: PKIndexSplitter and MultiPassIndexSplitter now work
per segment. (Uwe Schindler)
+
+ * SOLR-3105: When passed LUCENE_36 or greater as version, GermanAnalyzer,
+ SpanishAnalyzer, FrenchAnalyzer, ItalianAnalyzer, and PortugueseAnalyzer
+ use a lighter stemming approach, CatalanAnalyzer uses ElisionFilter
+ with a set of contractions, HindiAnalyzer uses StandardTokenizer, and
+ ThaiAnalyzer uses thai stopwords. Add GermanNormalizationFilter which applies
+ the Snowball German2 algorithm to ae/oe/ue and case-folds ß. Add
+ GalicianMinimalStemFilter for plural removal only. (Robert Muir)
Optimizations
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ca/CatalanAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ca/CatalanAnalyzer.java?rev=1241878&r1=1241877&r2=1241878&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ca/CatalanAnalyzer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ca/CatalanAnalyzer.java Wed Feb 8 12:07:52 2012
@@ -19,11 +19,13 @@ package org.apache.lucene.analysis.ca;
import java.io.IOException;
import java.io.Reader;
+import java.util.Arrays;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
+import org.apache.lucene.analysis.fr.ElisionFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
@@ -37,6 +39,14 @@ import org.tartarus.snowball.ext.Catalan
/**
* {@link Analyzer} for Catalan.
+ * <p>
+ * <a name="version"/>
+ * <p>You must specify the required {@link Version}
+ * compatibility when creating CatalanAnalyzer:
+ * <ul>
+ * <li> As of 3.6, ElisionFilter with a set of Catalan
+ * contractions is used by default.
+ * </ul>
*/
public final class CatalanAnalyzer extends StopwordAnalyzerBase {
private final Set<?> stemExclusionSet;
@@ -44,6 +54,12 @@ public final class CatalanAnalyzer exten
/** File containing default Catalan stopwords. */
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
+ private static final CharArraySet DEFAULT_ARTICLES = CharArraySet.unmodifiableSet(
+ new CharArraySet(Version.LUCENE_CURRENT,
+ Arrays.asList(
+ "d", "l", "m", "n", "s", "t"
+ ), true));
+
/**
* Returns an unmodifiable instance of the default stop words set.
* @return default stop words set.
@@ -120,6 +136,9 @@ public final class CatalanAnalyzer exten
Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new StandardFilter(matchVersion, source);
+ if (matchVersion.onOrAfter(Version.LUCENE_36)) {
+ result = new ElisionFilter(matchVersion, result, DEFAULT_ARTICLES);
+ }
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java?rev=1241878&r1=1241877&r2=1241878&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java Wed Feb 8 12:07:52 2012
@@ -54,6 +54,7 @@ import org.tartarus.snowball.ext.German2
* <p>You must specify the required {@link Version}
* compatibility when creating GermanAnalyzer:
* <ul>
+ * <li> As of 3.6, GermanLightStemFilter is used for less aggressive stemming.
* <li> As of 3.1, Snowball stemming is done with SnowballFilter, and
* Snowball stopwords are used by default.
* <li> As of 2.9, StopFilter preserves position
@@ -166,7 +167,7 @@ public final class GermanAnalyzer extend
* built from a {@link StandardTokenizer} filtered with
* {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
* , {@link KeywordMarkerFilter} if a stem exclusion set is
- * provided, and {@link SnowballFilter}
+ * provided, {@link GermanNormalizationFilter} and {@link GermanLightStemFilter}
*/
@Override
protected TokenStreamComponents createComponents(String fieldName,
@@ -176,10 +177,14 @@ public final class GermanAnalyzer extend
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter( matchVersion, result, stopwords);
result = new KeywordMarkerFilter(result, exclusionSet);
- if (matchVersion.onOrAfter(Version.LUCENE_31))
+ if (matchVersion.onOrAfter(Version.LUCENE_36)) {
+ result = new GermanNormalizationFilter(result);
+ result = new GermanLightStemFilter(result);
+ } else if (matchVersion.onOrAfter(Version.LUCENE_31)) {
result = new SnowballFilter(result, new German2Stemmer());
- else
+ } else {
result = new GermanStemFilter(result);
+ }
return new TokenStreamComponents(source, result);
}
}
Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanNormalizationFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanNormalizationFilter.java?rev=1241878&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanNormalizationFilter.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanNormalizationFilter.java Wed Feb 8 12:07:52 2012
@@ -0,0 +1,112 @@
+package org.apache.lucene.analysis.de;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.util.StemmerUtil;
+
+/**
+ * Normalizes German characters according to the heuristics
+ * of the <a href="http://snowball.tartarus.org/algorithms/german2/stemmer.html">
+ * German2 snowball algorithm</a>.
+ * It allows for the fact that ä, ö and ü are sometimes written as ae, oe and ue.
+ * <p>
+ * <ul>
+ * <li> 'ß' is replaced by 'ss'
+ * <li> 'ä', 'ö', 'ü' are replaced by 'a', 'o', 'u', respectively.
+ * <li> 'ae' and 'oe' are replaced by 'a', and 'o', respectively.
+ * <li> 'ue' is replaced by 'u', when not following a vowel or q.
+ * </ul>
+ * <p>
+ * This is useful if you want this normalization without using
+ * the German2 stemmer, or perhaps no stemming at all.
+ */
+public final class GermanNormalizationFilter extends TokenFilter {
+ // FSM with 3 states:
+ private static final int N = 0; /* ordinary state */
+ private static final int V = 1; /* stops 'u' from entering umlaut state */
+ private static final int U = 2; /* umlaut state, allows e-deletion */
+
+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+
+ public GermanNormalizationFilter(TokenStream input) {
+ super(input);
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ int state = N;
+ char buffer[] = termAtt.buffer();
+ int length = termAtt.length();
+ for (int i = 0; i < length; i++) {
+ final char c = buffer[i];
+ switch(c) {
+ case 'a':
+ case 'o':
+ state = U;
+ break;
+ case 'u':
+ state = (state == N) ? U : V;
+ break;
+ case 'e':
+ if (state == U)
+ length = StemmerUtil.delete(buffer, i--, length);
+ state = V;
+ break;
+ case 'i':
+ case 'q':
+ case 'y':
+ state = V;
+ break;
+ case 'ä':
+ buffer[i] = 'a';
+ state = V;
+ break;
+ case 'ö':
+ buffer[i] = 'o';
+ state = V;
+ break;
+ case 'ü':
+ buffer[i] = 'u';
+ state = V;
+ break;
+ case 'ß':
+ buffer[i++] = 's';
+ buffer = termAtt.resizeBuffer(1+length);
+ if (i < length)
+ System.arraycopy(buffer, i, buffer, i+1, (length-i));
+ buffer[i] = 's';
+ length++;
+ state = N;
+ break;
+ default:
+ state = N;
+ }
+ }
+ termAtt.setLength(length);
+ return true;
+ } else {
+ return false;
+ }
+ }
+}
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java?rev=1241878&r1=1241877&r2=1241878&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java Wed Feb 8 12:07:52 2012
@@ -39,6 +39,13 @@ import org.tartarus.snowball.ext.Spanish
/**
* {@link Analyzer} for Spanish.
+ * <p>
+ * <a name="version"/>
+ * <p>You must specify the required {@link Version}
+ * compatibility when creating SpanishAnalyzer:
+ * <ul>
+ * <li> As of 3.6, SpanishLightStemFilter is used for less aggressive stemming.
+ * </ul>
*/
public final class SpanishAnalyzer extends StopwordAnalyzerBase {
private final Set<?> stemExclusionSet;
@@ -115,7 +122,7 @@ public final class SpanishAnalyzer exten
* built from an {@link StandardTokenizer} filtered with
* {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
* , {@link KeywordMarkerFilter} if a stem exclusion set is
- * provided and {@link SnowballFilter}.
+ * provided and {@link SpanishLightStemFilter}.
*/
@Override
protected TokenStreamComponents createComponents(String fieldName,
@@ -126,7 +133,11 @@ public final class SpanishAnalyzer exten
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())
result = new KeywordMarkerFilter(result, stemExclusionSet);
- result = new SnowballFilter(result, new SpanishStemmer());
+ if (matchVersion.onOrAfter(Version.LUCENE_36)) {
+ result = new SpanishLightStemFilter(result);
+ } else {
+ result = new SnowballFilter(result, new SpanishStemmer());
+ }
return new TokenStreamComponents(source, result);
}
}
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java?rev=1241878&r1=1241877&r2=1241878&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java Wed Feb 8 12:07:52 2012
@@ -52,6 +52,7 @@ import java.util.Set;
* <p>You must specify the required {@link Version}
* compatibility when creating FrenchAnalyzer:
* <ul>
+ * <li> As of 3.6, FrenchLightStemFilter is used for less aggressive stemming.
* <li> As of 3.1, Snowball stemming is done with SnowballFilter,
* LowerCaseFilter is used prior to StopFilter, and ElisionFilter and
* Snowball stopwords are used by default.
@@ -177,7 +178,7 @@ public final class FrenchAnalyzer extend
* {@link StandardFilter}, {@link ElisionFilter},
* {@link LowerCaseFilter}, {@link StopFilter},
* {@link KeywordMarkerFilter} if a stem exclusion set is
- * provided, and {@link SnowballFilter}
+ * provided, and {@link FrenchLightStemFilter}
*/
@Override
protected TokenStreamComponents createComponents(String fieldName,
@@ -190,7 +191,11 @@ public final class FrenchAnalyzer extend
result = new StopFilter(matchVersion, result, stopwords);
if(!excltable.isEmpty())
result = new KeywordMarkerFilter(result, excltable);
- result = new SnowballFilter(result, new org.tartarus.snowball.ext.FrenchStemmer());
+ if (matchVersion.onOrAfter(Version.LUCENE_36)) {
+ result = new FrenchLightStemFilter(result);
+ } else {
+ result = new SnowballFilter(result, new org.tartarus.snowball.ext.FrenchStemmer());
+ }
return new TokenStreamComponents(source, result);
} else {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianMinimalStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianMinimalStemFilter.java?rev=1241878&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianMinimalStemFilter.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianMinimalStemFilter.java Wed Feb 8 12:07:52 2012
@@ -0,0 +1,58 @@
+package org.apache.lucene.analysis.gl;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+
+/**
+ * A {@link TokenFilter} that applies {@link GalicianMinimalStemmer} to stem
+ * Galician words.
+ * <p>
+ * To prevent terms from being stemmed use an instance of
+ * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
+ * the {@link KeywordAttribute} before this {@link TokenStream}.
+ * </p>
+ */
+public final class GalicianMinimalStemFilter extends TokenFilter {
+ private final GalicianMinimalStemmer stemmer = new GalicianMinimalStemmer();
+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
+
+ public GalicianMinimalStemFilter(TokenStream input) {
+ super(input);
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ if (!keywordAttr.isKeyword()) {
+ final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
+ termAtt.setLength(newlen);
+ }
+ return true;
+ } else {
+ return false;
+ }
+ }
+}
Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianMinimalStemmer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianMinimalStemmer.java?rev=1241878&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianMinimalStemmer.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianMinimalStemmer.java Wed Feb 8 12:07:52 2012
@@ -0,0 +1,38 @@
+package org.apache.lucene.analysis.gl;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.pt.RSLPStemmerBase;
+
+/**
+ * Minimal Stemmer for Galician
+ * <p>
+ * This follows the "RSLP-S" algorithm, but modified for Galician.
+ * Hence this stemmer only applies the plural reduction step of:
+ * "Regras do lematizador para o galego"
+ * @see RSLPStemmerBase
+ */
+public class GalicianMinimalStemmer extends RSLPStemmerBase {
+
+ private static final Step pluralStep =
+ parse(GalicianMinimalStemmer.class, "galician.rslp").get("Plural");
+
+ public int stem(char s[], int len) {
+ return pluralStep.apply(s, len);
+ }
+}
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java?rev=1241878&r1=1241877&r2=1241878&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java Wed Feb 8 12:07:52 2012
@@ -22,6 +22,7 @@ import java.io.Reader;
import java.util.Set;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
@@ -34,6 +35,13 @@ import org.apache.lucene.util.Version;
/**
* Analyzer for Hindi.
+ * <p>
+ * <a name="version"/>
+ * <p>You must specify the required {@link Version}
+ * compatibility when creating HindiAnalyzer:
+ * <ul>
+ * <li> As of 3.6, StandardTokenizer is used for tokenization
+ * </ul>
*/
public final class HindiAnalyzer extends StopwordAnalyzerBase {
private final Set<?> stemExclusionSet;
@@ -110,7 +118,7 @@ public final class HindiAnalyzer extends
* used to tokenize all the text in the provided {@link Reader}.
*
* @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
- * built from a {@link IndicTokenizer} filtered with
+ * built from a {@link StandardTokenizer} filtered with
* {@link LowerCaseFilter}, {@link IndicNormalizationFilter},
* {@link HindiNormalizationFilter}, {@link KeywordMarkerFilter}
* if a stem exclusion set is provided, {@link HindiStemFilter}, and
@@ -119,7 +127,12 @@ public final class HindiAnalyzer extends
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
- final Tokenizer source = new IndicTokenizer(matchVersion, reader);
+ final Tokenizer source;
+ if (matchVersion.onOrAfter(Version.LUCENE_36)) {
+ source = new StandardTokenizer(matchVersion, reader);
+ } else {
+ source = new IndicTokenizer(matchVersion, reader);
+ }
TokenStream result = new LowerCaseFilter(matchVersion, source);
if (!stemExclusionSet.isEmpty())
result = new KeywordMarkerFilter(result, stemExclusionSet);
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/in/IndicTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/in/IndicTokenizer.java?rev=1241878&r1=1241877&r2=1241878&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/in/IndicTokenizer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/in/IndicTokenizer.java Wed Feb 8 12:07:52 2012
@@ -20,12 +20,15 @@ package org.apache.lucene.analysis.in;
import java.io.Reader;
import org.apache.lucene.analysis.util.CharTokenizer;
+import org.apache.lucene.analysis.standard.StandardTokenizer; // javadocs
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.Version;
/**
* Simple Tokenizer for text in Indian Languages.
+ * @deprecated (3.6) Use {@link StandardTokenizer} instead.
*/
+@Deprecated
public final class IndicTokenizer extends CharTokenizer {
public IndicTokenizer(Version matchVersion, AttributeFactory factory, Reader input) {
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java?rev=1241878&r1=1241877&r2=1241878&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java Wed Feb 8 12:07:52 2012
@@ -46,6 +46,7 @@ import org.tartarus.snowball.ext.Italian
* <p>You must specify the required {@link Version}
* compatibility when creating ItalianAnalyzer:
* <ul>
+ * <li> As of 3.6, ItalianLightStemFilter is used for less aggressive stemming.
* <li> As of 3.2, ElisionFilter with a set of Italian
* contractions is used by default.
* </ul>
@@ -132,7 +133,7 @@ public final class ItalianAnalyzer exten
* built from an {@link StandardTokenizer} filtered with
* {@link StandardFilter}, {@link ElisionFilter}, {@link LowerCaseFilter}, {@link StopFilter}
* , {@link KeywordMarkerFilter} if a stem exclusion set is
- * provided and {@link SnowballFilter}.
+ * provided and {@link ItalianLightStemFilter}.
*/
@Override
protected TokenStreamComponents createComponents(String fieldName,
@@ -146,7 +147,11 @@ public final class ItalianAnalyzer exten
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())
result = new KeywordMarkerFilter(result, stemExclusionSet);
- result = new SnowballFilter(result, new ItalianStemmer());
+ if (matchVersion.onOrAfter(Version.LUCENE_36)) {
+ result = new ItalianLightStemFilter(result);
+ } else {
+ result = new SnowballFilter(result, new ItalianStemmer());
+ }
return new TokenStreamComponents(source, result);
}
}
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java?rev=1241878&r1=1241877&r2=1241878&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java Wed Feb 8 12:07:52 2012
@@ -39,6 +39,13 @@ import org.tartarus.snowball.ext.Portugu
/**
* {@link Analyzer} for Portuguese.
+ * <p>
+ * <a name="version"/>
+ * <p>You must specify the required {@link Version}
+ * compatibility when creating PortugueseAnalyzer:
+ * <ul>
+ * <li> As of 3.6, PortugueseLightStemFilter is used for less aggressive stemming.
+ * </ul>
*/
public final class PortugueseAnalyzer extends StopwordAnalyzerBase {
private final Set<?> stemExclusionSet;
@@ -115,7 +122,7 @@ public final class PortugueseAnalyzer ex
* built from an {@link StandardTokenizer} filtered with
* {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
* , {@link KeywordMarkerFilter} if a stem exclusion set is
- * provided and {@link SnowballFilter}.
+ * provided and {@link PortugueseLightStemFilter}.
*/
@Override
protected TokenStreamComponents createComponents(String fieldName,
@@ -126,7 +133,11 @@ public final class PortugueseAnalyzer ex
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())
result = new KeywordMarkerFilter(result, stemExclusionSet);
- result = new SnowballFilter(result, new PortugueseStemmer());
+ if (matchVersion.onOrAfter(Version.LUCENE_36)) {
+ result = new PortugueseLightStemFilter(result);
+ } else {
+ result = new SnowballFilter(result, new PortugueseStemmer());
+ }
return new TokenStreamComponents(source, result);
}
}
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseMinimalStemmer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseMinimalStemmer.java?rev=1241878&r1=1241877&r2=1241878&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseMinimalStemmer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseMinimalStemmer.java Wed Feb 8 12:07:52 2012
@@ -24,7 +24,7 @@ package org.apache.lucene.analysis.pt;
* <i>A study on the Use of Stemming for Monolingual Ad-Hoc Portuguese
* Information Retrieval</i> (Orengo, et al)
* which is just the plural reduction step of the RSLP
- * algorithm from <i>A Stemming Algorithmm for the Portuguese Language</i>,
+ * algorithm from <i>A Stemming Algorithm for the Portuguese Language</i>,
* Orengo et al.
* @see RSLPStemmerBase
*/
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java?rev=1241878&r1=1241877&r2=1241878&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java Wed Feb 8 12:07:52 2012
@@ -16,7 +16,9 @@ package org.apache.lucene.analysis.th;
* limitations under the License.
*/
+import java.io.IOException;
import java.io.Reader;
+import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
@@ -24,22 +26,75 @@ import org.apache.lucene.analysis.Tokeni
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.util.Version;
/**
* {@link Analyzer} for Thai language. It uses {@link java.text.BreakIterator} to break words.
- *
- * <p><b>NOTE</b>: This class uses the same {@link Version}
- * dependent settings as {@link StandardAnalyzer}.</p>
+ * <p>
+ * <a name="version"/>
+ * <p>You must specify the required {@link Version}
+ * compatibility when creating ThaiAnalyzer:
+ * <ul>
+ * <li> As of 3.6, a set of Thai stopwords is used by default
+ * </ul>
*/
-public final class ThaiAnalyzer extends Analyzer {
- private final Version matchVersion;
+public final class ThaiAnalyzer extends StopwordAnalyzerBase {
+
+ /** File containing default Thai stopwords. */
+ public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
+ /**
+ * The comment character in the stopwords file.
+ * All lines prefixed with this will be ignored.
+ */
+ private static final String STOPWORDS_COMMENT = "#";
+
+ /**
+ * Returns an unmodifiable instance of the default stop words set.
+ * @return default stop words set.
+ */
+ public static Set<?> getDefaultStopSet(){
+ return DefaultSetHolder.DEFAULT_STOP_SET;
+ }
+
+ /**
+ * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
+ * accesses the static final set the first time.
+ */
+ private static class DefaultSetHolder {
+ static final Set<?> DEFAULT_STOP_SET;
+
+ static {
+ try {
+ DEFAULT_STOP_SET = loadStopwordSet(false, ThaiAnalyzer.class,
+ DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
+ } catch (IOException ex) {
+ // default set should always be present as it is part of the
+ // distribution (JAR)
+ throw new RuntimeException("Unable to load default stopword set");
+ }
+ }
+ }
+ /**
+ * Builds an analyzer with the default stop words.
+ *
+ * @param matchVersion lucene compatibility version
+ */
public ThaiAnalyzer(Version matchVersion) {
- this.matchVersion = matchVersion;
+ this(matchVersion, matchVersion.onOrAfter(Version.LUCENE_36) ? DefaultSetHolder.DEFAULT_STOP_SET : StopAnalyzer.ENGLISH_STOP_WORDS_SET);
+ }
+
+ /**
+ * Builds an analyzer with the given stop words.
+ *
+ * @param matchVersion lucene compatibility version
+ * @param stopwords a stopword set
+ */
+ public ThaiAnalyzer(Version matchVersion, Set<?> stopwords) {
+ super(matchVersion, stopwords);
}
/**
@@ -61,6 +116,6 @@ public final class ThaiAnalyzer extends
result = new LowerCaseFilter(matchVersion, result);
result = new ThaiWordFilter(matchVersion, result);
return new TokenStreamComponents(source, new StopFilter(matchVersion,
- result, StopAnalyzer.ENGLISH_STOP_WORDS_SET));
+ result, stopwords));
}
}
Added: lucene/dev/trunk/modules/analysis/common/src/resources/org/apache/lucene/analysis/th/stopwords.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/resources/org/apache/lucene/analysis/th/stopwords.txt?rev=1241878&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/resources/org/apache/lucene/analysis/th/stopwords.txt (added)
+++ lucene/dev/trunk/modules/analysis/common/src/resources/org/apache/lucene/analysis/th/stopwords.txt Wed Feb 8 12:07:52 2012
@@ -0,0 +1,119 @@
+# Thai stopwords from:
+# "Opinion Detection in Thai Political News Columns
+# Based on Subjectivity Analysis"
+# Khampol Sukhum, Supot Nitsuwat, and Choochart Haruechaiyasak
+à¹à¸§à¹
+à¹à¸¡à¹
+à¹à¸
+à¹à¸à¹
+à¹à¸«à¹
+à¹à¸
+à¹à¸à¸¢
+à¹à¸«à¹à¸
+à¹à¸¥à¹à¸§
+à¹à¸¥à¸°
+à¹à¸£à¸
+à¹à¸à¸
+à¹à¸à¹
+à¹à¸à¸
+à¹à¸«à¹à¸
+à¹à¸¥à¸¢
+à¹à¸£à¸´à¹à¸¡
+à¹à¸£à¸²
+à¹à¸¡à¸·à¹à¸
+à¹à¸à¸·à¹à¸
+à¹à¸à¸£à¸²à¸°
+à¹à¸à¹à¸à¸à¸²à¸£
+à¹à¸à¹à¸
+à¹à¸à¸´à¸à¹à¸à¸¢
+à¹à¸à¸´à¸
+à¹à¸à¸·à¹à¸à¸à¸à¸²à¸
+à¹à¸à¸µà¸¢à¸§à¸à¸±à¸
+à¹à¸à¸µà¸¢à¸§
+à¹à¸à¹à¸
+à¹à¸à¸à¸²à¸°
+à¹à¸à¸¢
+à¹à¸à¹à¸²
+à¹à¸à¸²
+à¸à¸µà¸
+à¸à¸²à¸
+à¸à¸°à¹à¸£
+à¸à¸à¸
+à¸à¸¢à¹à¸²à¸
+à¸à¸¢à¸¹à¹
+à¸à¸¢à¸²à¸
+หาà¸
+หลาย
+หลัà¸à¸à¸²à¸
+หลัà¸
+หรืà¸
+หà¸à¸¶à¹à¸
+สà¹à¸§à¸
+สà¹à¸
+สุà¸
+สà¹à¸²à¸«à¸£à¸±à¸
+วà¹à¸²
+วัà¸
+ลà¸
+รà¹à¸§à¸¡
+ราย
+รัà¸
+ระหวà¹à¸²à¸
+รวม
+ยัà¸
+มี
+มาà¸
+มา
+à¸à¸£à¹à¸à¸¡
+à¸à¸
+à¸à¹à¸²à¸
+à¸à¸¥
+à¸à¸²à¸
+à¸à¹à¸²
+à¸à¸µà¹
+à¸à¹à¸²
+à¸à¸±à¹à¸
+à¸à¸±à¸
+à¸à¸à¸à¸à¸²à¸
+à¸à¸¸à¸
+à¸à¸µà¹à¸ªà¸¸à¸
+à¸à¸µà¹
+à¸à¹à¸²à¹à¸«à¹
+à¸à¹à¸²
+à¸à¸²à¸
+à¸à¸±à¹à¸à¸à¸µà¹
+à¸à¸±à¹à¸
+à¸à¹à¸²
+à¸à¸¹à¸
+à¸à¸¶à¸
+à¸à¹à¸à¸
+à¸à¹à¸²à¸à¹
+à¸à¹à¸²à¸
+à¸à¹à¸
+à¸à¸²à¸¡
+à¸à¸±à¹à¸à¹à¸à¹
+à¸à¸±à¹à¸
+à¸à¹à¸²à¸
+à¸à¹à¸§à¸¢
+à¸à¸±à¸
+à¸à¸¶à¹à¸
+à¸à¹à¸§à¸
+à¸à¸¶à¸
+à¸à¸²à¸
+à¸à¸±à¸
+à¸à¸°
+à¸à¸·à¸
+à¸à¸§à¸²à¸¡
+à¸à¸£à¸±à¹à¸
+à¸à¸
+à¸à¸¶à¹à¸
+à¸à¸à¸
+à¸à¸
+à¸à¸à¸°
+à¸à¹à¸à¸
+à¸à¹
+à¸à¸²à¸£
+à¸à¸±à¸
+à¸à¸±à¸
+à¸à¸§à¹à¸²
+à¸à¸¥à¹à¸²à¸§
Modified: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/ca/TestCatalanAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/ca/TestCatalanAnalyzer.java?rev=1241878&r1=1241877&r2=1241878&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/ca/TestCatalanAnalyzer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/ca/TestCatalanAnalyzer.java Wed Feb 8 12:07:52 2012
@@ -41,6 +41,13 @@ public class TestCatalanAnalyzer extends
assertAnalyzesTo(a, "un", new String[] { });
}
+ /** test use of elisionfilter */
+ public void testContractions() throws IOException {
+ Analyzer a = new CatalanAnalyzer(TEST_VERSION_CURRENT);
+ assertAnalyzesTo(a, "Diccionari de l'Institut d'Estudis Catalans",
+ new String[] { "diccion", "inst", "estud", "catalan" });
+ }
+
/** test use of exclusion set */
public void testExclude() throws IOException {
Set<String> exclusionSet = new HashSet<String>();
Added: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanNormalizationFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanNormalizationFilter.java?rev=1241878&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanNormalizationFilter.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanNormalizationFilter.java Wed Feb 8 12:07:52 2012
@@ -0,0 +1,68 @@
+package org.apache.lucene.analysis.de;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+
+/**
+ * Tests {@link GermanNormalizationFilter}
+ */
+public class TestGermanNormalizationFilter extends BaseTokenStreamTestCase {
+ private Analyzer analyzer = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String field, Reader reader) {
+ final Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ final TokenStream stream = new GermanNormalizationFilter(tokenizer);
+ return new TokenStreamComponents(tokenizer, stream);
+ }
+ };
+
+ /**
+ * Tests that a/o/u + e is equivalent to the umlaut form
+ */
+ public void testBasicExamples() throws IOException {
+ checkOneTerm(analyzer, "Schaltflächen", "Schaltflachen");
+ checkOneTerm(analyzer, "Schaltflaechen", "Schaltflachen");
+ }
+
+ /**
+ * Tests the specific heuristic that ue is not folded after a vowel or q.
+ */
+ public void testUHeuristic() throws IOException {
+ checkOneTerm(analyzer, "dauer", "dauer");
+ }
+
+ /**
+ * Tests german specific folding of sharp-s
+ */
+ public void testSpecialFolding() throws IOException {
+ checkOneTerm(analyzer, "weißbier", "weissbier");
+ }
+
+ /** blast some random strings through the analyzer */
+ public void testRandomStrings() throws Exception {
+ checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
+ }
+}
Modified: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java?rev=1241878&r1=1241877&r2=1241878&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java Wed Feb 8 12:07:52 2012
@@ -56,7 +56,7 @@ public class TestFrenchAnalyzer extends
assertAnalyzesTo(
fa,
"mot \"entreguillemet\"",
- new String[] { "mot", "entreguillemet" });
+ new String[] { "mot", "entreguilemet" });
// let's do some french specific tests now
@@ -66,7 +66,7 @@ public class TestFrenchAnalyzer extends
assertAnalyzesTo(
fa,
"Jean-François",
- new String[] { "jean", "françois" });
+ new String[] { "jean", "francoi" });
// 2. stopwords
assertAnalyzesTo(
@@ -81,16 +81,16 @@ public class TestFrenchAnalyzer extends
new String[] {
"lanc",
"chism",
- "habit",
+ "habitabl",
"chist",
- "élément",
+ "element",
"captif" });
// some verbs
assertAnalyzesTo(
fa,
"finissions souffrirent rugissante",
- new String[] { "fin", "souffr", "rug" });
+ new String[] { "finision", "soufrirent", "rugisant" });
// some everything else
// aujourd'hui stays one term which is OK
@@ -101,16 +101,16 @@ public class TestFrenchAnalyzer extends
"c3po",
"aujourd'hui",
"oeuf",
- "ïâöûà ä",
- "anticonstitutionnel",
- "jav" });
+ "ïaöuaä",
+ "anticonstitutionel",
+ "java" });
// some more everything else
// here 1940-1945 stays as one term, 1940:1945 not ?
assertAnalyzesTo(
fa,
"33Bis 1940-1945 1940:1945 (---i+++)*",
- new String[] { "33bis", "1940", "1945", "1940", "1945", "i" });
+ new String[] { "33bi", "1940", "1945", "1940", "1945", "i" });
}
@@ -217,9 +217,9 @@ public class TestFrenchAnalyzer extends
new String[] {
"lanc",
"chism",
- "habit",
+ "habitabl",
"chist",
- "élément",
+ "element",
"captif" });
}
@@ -238,7 +238,7 @@ public class TestFrenchAnalyzer extends
public void testElision() throws Exception {
FrenchAnalyzer fa = new FrenchAnalyzer(TEST_VERSION_CURRENT);
- assertAnalyzesTo(fa, "voir l'embrouille", new String[] { "voir", "embrouill" });
+ assertAnalyzesTo(fa, "voir l'embrouille", new String[] { "voir", "embrouil" });
}
/**
Added: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianMinimalStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianMinimalStemFilter.java?rev=1241878&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianMinimalStemFilter.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianMinimalStemFilter.java Wed Feb 8 12:07:52 2012
@@ -0,0 +1,55 @@
+package org.apache.lucene.analysis.gl;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Tokenizer;
+
+/**
+ * Simple tests for {@link GalicianMinimalStemmer}
+ */
+public class TestGalicianMinimalStemFilter extends BaseTokenStreamTestCase {
+ Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ return new TokenStreamComponents(tokenizer, new GalicianMinimalStemFilter(tokenizer));
+ }
+ };
+
+ public void testPlural() throws Exception {
+ checkOneTerm(a, "elefantes", "elefante");
+ checkOneTerm(a, "elefante", "elefante");
+ checkOneTerm(a, "kalóres", "kalór");
+ checkOneTerm(a, "kalór", "kalór");
+ }
+
+ public void testExceptions() throws Exception {
+ checkOneTerm(a, "mas", "mas");
+ checkOneTerm(a, "barcelonês", "barcelonês");
+ }
+
+ /** blast some random strings through the analyzer */
+ public void testRandomStrings() throws Exception {
+ checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
+ }
+}
Modified: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianAnalyzer.java?rev=1241878&r1=1241877&r2=1241878&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianAnalyzer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianAnalyzer.java Wed Feb 8 12:07:52 2012
@@ -36,8 +36,8 @@ public class TestItalianAnalyzer extends
public void testBasics() throws IOException {
Analyzer a = new ItalianAnalyzer(TEST_VERSION_CURRENT);
// stemming
- checkOneTermReuse(a, "abbandonata", "abbandon");
- checkOneTermReuse(a, "abbandonati", "abbandon");
+ checkOneTermReuse(a, "abbandonata", "abbandonat");
+ checkOneTermReuse(a, "abbandonati", "abbandonat");
// stopword
assertAnalyzesTo(a, "dallo", new String[] {});
}
@@ -49,7 +49,7 @@ public class TestItalianAnalyzer extends
Analyzer a = new ItalianAnalyzer(TEST_VERSION_CURRENT,
ItalianAnalyzer.getDefaultStopSet(), exclusionSet);
checkOneTermReuse(a, "abbandonata", "abbandonata");
- checkOneTermReuse(a, "abbandonati", "abbandon");
+ checkOneTermReuse(a, "abbandonati", "abbandonat");
}
/** blast some random strings through the analyzer */
@@ -61,7 +61,7 @@ public class TestItalianAnalyzer extends
public void testContractions() throws IOException {
Analyzer a = new ItalianAnalyzer(TEST_VERSION_CURRENT);
assertAnalyzesTo(a, "dell'Italia", new String[] { "ital" });
- assertAnalyzesTo(a, "l'Italiano", new String[] { "ital" });
+ assertAnalyzesTo(a, "l'Italiano", new String[] { "italian" });
}
/** test that we don't enable this before 3.2*/
Modified: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseAnalyzer.java?rev=1241878&r1=1241877&r2=1241878&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseAnalyzer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseAnalyzer.java Wed Feb 8 12:07:52 2012
@@ -35,8 +35,8 @@ public class TestPortugueseAnalyzer exte
public void testBasics() throws IOException {
Analyzer a = new PortugueseAnalyzer(TEST_VERSION_CURRENT);
// stemming
- checkOneTermReuse(a, "quilométricas", "quilométr");
- checkOneTermReuse(a, "quilométricos", "quilométr");
+ checkOneTermReuse(a, "quilométricas", "quilometric");
+ checkOneTermReuse(a, "quilométricos", "quilometric");
// stopword
assertAnalyzesTo(a, "não", new String[] {});
}
@@ -48,7 +48,7 @@ public class TestPortugueseAnalyzer exte
Analyzer a = new PortugueseAnalyzer(TEST_VERSION_CURRENT,
PortugueseAnalyzer.getDefaultStopSet(), exclusionSet);
checkOneTermReuse(a, "quilométricas", "quilométricas");
- checkOneTermReuse(a, "quilométricos", "quilométr");
+ checkOneTermReuse(a, "quilométricos", "quilometric");
}
/** blast some random strings through the analyzer */
Modified: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java?rev=1241878&r1=1241877&r2=1241878&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java Wed Feb 8 12:07:52 2012
@@ -21,7 +21,9 @@ import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;
/**
@@ -40,14 +42,29 @@ public class TestThaiAnalyzer extends Ba
* testcase for offsets
*/
public void testOffsets() throws Exception {
- assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT), "à¸à¸²à¸£à¸à¸µà¹à¹à¸à¹à¸à¹à¸à¸à¹à¸ªà¸à¸à¸§à¹à¸²à¸à¸²à¸à¸à¸µ",
+ assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET), "à¸à¸²à¸£à¸à¸µà¹à¹à¸à¹à¸à¹à¸à¸à¹à¸ªà¸à¸à¸§à¹à¸²à¸à¸²à¸à¸à¸µ",
new String[] { "à¸à¸²à¸£", "à¸à¸µà¹", "à¹à¸à¹", "à¸à¹à¸à¸", "à¹à¸ªà¸à¸", "วà¹à¸²", "à¸à¸²à¸", "à¸à¸µ" },
new int[] { 0, 3, 6, 9, 13, 17, 20, 23 },
new int[] { 3, 6, 9, 13, 17, 20, 23, 25 });
}
+ public void testStopWords() throws Exception {
+ assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT), "à¸à¸²à¸£à¸à¸µà¹à¹à¸à¹à¸à¹à¸à¸à¹à¸ªà¸à¸à¸§à¹à¸²à¸à¸²à¸à¸à¸µ",
+ new String[] { "à¹à¸ªà¸à¸", "à¸à¸²à¸", "à¸à¸µ" },
+ new int[] { 13, 20, 23 },
+ new int[] { 17, 23, 25 },
+ new int[] { 5, 2, 1 });
+ }
+
+ public void testBackwardsStopWords() throws Exception {
+ assertAnalyzesTo(new ThaiAnalyzer(Version.LUCENE_35), "à¸à¸²à¸£à¸à¸µà¹à¹à¸à¹à¸à¹à¸à¸à¹à¸ªà¸à¸à¸§à¹à¸²à¸à¸²à¸à¸à¸µ",
+ new String[] { "à¸à¸²à¸£", "à¸à¸µà¹", "à¹à¸à¹", "à¸à¹à¸à¸", "à¹à¸ªà¸à¸", "วà¹à¸²", "à¸à¸²à¸", "à¸à¸µ" },
+ new int[] { 0, 3, 6, 9, 13, 17, 20, 23 },
+ new int[] { 3, 6, 9, 13, 17, 20, 23, 25 });
+ }
+
public void testTokenType() throws Exception {
- assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT), "à¸à¸²à¸£à¸à¸µà¹à¹à¸à¹à¸à¹à¸à¸à¹à¸ªà¸à¸à¸§à¹à¸²à¸à¸²à¸à¸à¸µ à¹à¹à¹",
+ assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET), "à¸à¸²à¸£à¸à¸µà¹à¹à¸à¹à¸à¹à¸à¸à¹à¸ªà¸à¸à¸§à¹à¸²à¸à¸²à¸à¸à¸µ à¹à¹à¹",
new String[] { "à¸à¸²à¸£", "à¸à¸µà¹", "à¹à¸à¹", "à¸à¹à¸à¸", "à¹à¸ªà¸à¸", "วà¹à¸²", "à¸à¸²à¸", "à¸à¸µ", "à¹à¹à¹" },
new String[] { "<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
"<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
@@ -96,8 +113,9 @@ public class TestThaiAnalyzer extends Ba
/*
* Test that position increments are adjusted correctly for stopwords.
*/
+ // note this test uses stopfilter's stopset
public void testPositionIncrements() throws Exception {
- final ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT);
+ final ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
assertAnalyzesTo(analyzer, "à¸à¸²à¸£à¸à¸µà¹à¹à¸à¹à¸à¹à¸à¸ the à¹à¸ªà¸à¸à¸§à¹à¸²à¸à¸²à¸à¸à¸µ",
new String[] { "à¸à¸²à¸£", "à¸à¸µà¹", "à¹à¸à¹", "à¸à¹à¸à¸", "à¹à¸ªà¸à¸", "วà¹à¸²", "à¸à¸²à¸", "à¸à¸µ" },
new int[] { 0, 3, 6, 9, 18, 22, 25, 28 },
@@ -113,7 +131,7 @@ public class TestThaiAnalyzer extends Ba
}
public void testReusableTokenStream() throws Exception {
- ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT);
+ ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET);
assertAnalyzesToReuse(analyzer, "", new String[] {});
assertAnalyzesToReuse(
Modified: lucene/dev/trunk/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/CHANGES.txt?rev=1241878&r1=1241877&r2=1241878&view=diff
==============================================================================
--- lucene/dev/trunk/solr/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/CHANGES.txt Wed Feb 8 12:07:52 2012
@@ -479,6 +479,9 @@ New Features
CommonGramsQueryFilterFactory can optionally read stopwords in Snowball
format (specify format="snowball"). (Robert Muir)
+* SOLR-3105: ElisionFilterFactory optionally allows the parameter
+ ignoreCase (default=false). (Robert Muir)
+
Optimizations
----------------------
* SOLR-1931: Speedup for LukeRequestHandler and admin/schema browser. New parameter
@@ -592,6 +595,9 @@ Other Changes
* SOLR-3059: Example XSL stylesheet for indexing query result XML (janhoy)
+* SOLR-3097, SOLR-3105: Add analysis configurations for different languages to
+ the example. (Christian Moen, Robert Muir)
+
Build
----------------------
* SOLR-2487: Add build target to package war without slf4j jars (janhoy)
Modified: lucene/dev/trunk/solr/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/build.xml?rev=1241878&r1=1241877&r2=1241878&view=diff
==============================================================================
--- lucene/dev/trunk/solr/build.xml (original)
+++ lucene/dev/trunk/solr/build.xml Wed Feb 8 12:07:52 2012
@@ -625,4 +625,98 @@
<arg value="update"/>
</exec>
</target>
+
+ <property name="analysis-common.res.dir" value="../modules/analysis/common/src/resources/org/apache/lucene/analysis"/>
+ <property name="analysis-kuromoji.res.dir" value="../modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis"/>
+ <property name="analysis.conf.dest" value="${example}/solr/conf/lang"/>
+
+ <target name="sync-analyzers"
+ description="Committers' Helper: synchronizes analysis resources (e.g. stoplists) to the example">
+ <!-- arabic -->
+ <copy verbose="true" file="${analysis-common.res.dir}/ar/stopwords.txt"
+ tofile="${analysis.conf.dest}/stopwords_ar.txt"/>
+ <!-- bulgarian -->
+ <copy verbose="true" file="${analysis-common.res.dir}/bg/stopwords.txt"
+ tofile="${analysis.conf.dest}/stopwords_bg.txt"/>
+ <!-- catalan -->
+ <copy verbose="true" file="${analysis-common.res.dir}/ca/stopwords.txt"
+ tofile="${analysis.conf.dest}/stopwords_ca.txt"/>
+ <!-- czech -->
+ <copy verbose="true" file="${analysis-common.res.dir}/cz/stopwords.txt"
+ tofile="${analysis.conf.dest}/stopwords_cz.txt"/>
+ <!-- danish -->
+ <copy verbose="true" file="${analysis-common.res.dir}/snowball/danish_stop.txt"
+ tofile="${analysis.conf.dest}/stopwords_da.txt"/>
+ <!-- german -->
+ <copy verbose="true" file="${analysis-common.res.dir}/snowball/german_stop.txt"
+ tofile="${analysis.conf.dest}/stopwords_de.txt"/>
+ <!-- greek -->
+ <copy verbose="true" file="${analysis-common.res.dir}/el/stopwords.txt"
+ tofile="${analysis.conf.dest}/stopwords_el.txt"/>
+ <!-- spanish -->
+ <copy verbose="true" file="${analysis-common.res.dir}/snowball/spanish_stop.txt"
+ tofile="${analysis.conf.dest}/stopwords_es.txt"/>
+ <!-- basque -->
+ <copy verbose="true" file="${analysis-common.res.dir}/eu/stopwords.txt"
+ tofile="${analysis.conf.dest}/stopwords_eu.txt"/>
+ <!-- persian -->
+ <copy verbose="true" file="${analysis-common.res.dir}/fa/stopwords.txt"
+ tofile="${analysis.conf.dest}/stopwords_fa.txt"/>
+ <!-- finnish -->
+ <copy verbose="true" file="${analysis-common.res.dir}/snowball/finnish_stop.txt"
+ tofile="${analysis.conf.dest}/stopwords_fi.txt"/>
+ <!-- french -->
+ <copy verbose="true" file="${analysis-common.res.dir}/snowball/french_stop.txt"
+ tofile="${analysis.conf.dest}/stopwords_fr.txt"/>
+ <!-- galician -->
+ <copy verbose="true" file="${analysis-common.res.dir}/gl/stopwords.txt"
+ tofile="${analysis.conf.dest}/stopwords_gl.txt"/>
+ <!-- hindi -->
+ <copy verbose="true" file="${analysis-common.res.dir}/hi/stopwords.txt"
+ tofile="${analysis.conf.dest}/stopwords_hi.txt"/>
+ <!-- hungarian -->
+ <copy verbose="true" file="${analysis-common.res.dir}/snowball/hungarian_stop.txt"
+ tofile="${analysis.conf.dest}/stopwords_hu.txt"/>
+ <!-- armenian -->
+ <copy verbose="true" file="${analysis-common.res.dir}/hy/stopwords.txt"
+ tofile="${analysis.conf.dest}/stopwords_hy.txt"/>
+ <!-- indonesian -->
+ <copy verbose="true" file="${analysis-common.res.dir}/id/stopwords.txt"
+ tofile="${analysis.conf.dest}/stopwords_id.txt"/>
+ <!-- italian -->
+ <copy verbose="true" file="${analysis-common.res.dir}/snowball/italian_stop.txt"
+ tofile="${analysis.conf.dest}/stopwords_it.txt"/>
+ <!-- japanese -->
+ <copy verbose="true" file="${analysis-kuromoji.res.dir}/kuromoji/stopwords.txt"
+ tofile="${analysis.conf.dest}/stopwords_ja.txt"/>
+ <copy verbose="true" file="${analysis-kuromoji.res.dir}/kuromoji/stoptags.txt"
+ tofile="${analysis.conf.dest}/stoptags_ja.txt"/>
+ <!-- latvian -->
+ <copy verbose="true" file="${analysis-common.res.dir}/lv/stopwords.txt"
+ tofile="${analysis.conf.dest}/stopwords_lv.txt"/>
+ <!-- dutch -->
+ <copy verbose="true" file="${analysis-common.res.dir}/snowball/dutch_stop.txt"
+ tofile="${analysis.conf.dest}/stopwords_nl.txt"/>
+ <!-- norwegian -->
+ <copy verbose="true" file="${analysis-common.res.dir}/snowball/norwegian_stop.txt"
+ tofile="${analysis.conf.dest}/stopwords_no.txt"/>
+ <!-- portuguese -->
+ <copy verbose="true" file="${analysis-common.res.dir}/snowball/portuguese_stop.txt"
+ tofile="${analysis.conf.dest}/stopwords_pt.txt"/>
+ <!-- romanian -->
+ <copy verbose="true" file="${analysis-common.res.dir}/ro/stopwords.txt"
+ tofile="${analysis.conf.dest}/stopwords_ro.txt"/>
+ <!-- russian -->
+ <copy verbose="true" file="${analysis-common.res.dir}/snowball/russian_stop.txt"
+ tofile="${analysis.conf.dest}/stopwords_ru.txt"/>
+ <!-- swedish -->
+ <copy verbose="true" file="${analysis-common.res.dir}/snowball/swedish_stop.txt"
+ tofile="${analysis.conf.dest}/stopwords_sv.txt"/>
+ <!-- thai -->
+ <copy verbose="true" file="${analysis-common.res.dir}/th/stopwords.txt"
+ tofile="${analysis.conf.dest}/stopwords_th.txt"/>
+ <!-- turkish -->
+ <copy verbose="true" file="${analysis-common.res.dir}/tr/stopwords.txt"
+ tofile="${analysis.conf.dest}/stopwords_tr.txt"/>
+ </target>
</project>
Modified: lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/ElisionFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/ElisionFilterFactory.java?rev=1241878&r1=1241877&r2=1241878&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/ElisionFilterFactory.java (original)
+++ lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/ElisionFilterFactory.java Wed Feb 8 12:07:52 2012
@@ -34,7 +34,8 @@ import org.apache.lucene.analysis.TokenS
* <analyzer>
* <tokenizer class="solr.StandardTokenizerFactory"/>
* <filter class="solr.LowerCaseFilterFactory"/>
- * <filter class="solr.ElisionFilterFactory" articles="stopwordarticles.txt"/>
+ * <filter class="solr.ElisionFilterFactory"
+ * articles="stopwordarticles.txt" ignoreCase="true"/>
* </analyzer>
* </fieldType></pre>
*
@@ -45,10 +46,11 @@ public class ElisionFilterFactory extend
public void inform(ResourceLoader loader) {
String articlesFile = args.get("articles");
+ boolean ignoreCase = getBoolean("ignoreCase", false);
if (articlesFile != null) {
try {
- articles = getWordSet(loader, articlesFile, false);
+ articles = getWordSet(loader, articlesFile, ignoreCase);
} catch (IOException e) {
throw new RuntimeException(e);
}
Added: lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/GalicianMinimalStemFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/GalicianMinimalStemFilterFactory.java?rev=1241878&view=auto
==============================================================================
--- lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/GalicianMinimalStemFilterFactory.java (added)
+++ lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/GalicianMinimalStemFilterFactory.java Wed Feb 8 12:07:52 2012
@@ -0,0 +1,39 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.gl.GalicianMinimalStemFilter;
+
+/**
+ * Factory for {@link GalicianMinimalStemFilter}.
+ * <pre class="prettyprint" >
+ * <fieldType name="text_glplural" class="solr.TextField" positionIncrementGap="100">
+ * <analyzer>
+ * <tokenizer class="solr.StandardTokenizerFactory"/>
+ * <filter class="solr.LowerCaseFilterFactory"/>
+ * <filter class="solr.GalicianMinimalStemFilterFactory"/>
+ * </analyzer>
+ * </fieldType></pre>
+ *
+ */
+public class GalicianMinimalStemFilterFactory extends BaseTokenFilterFactory {
+ public TokenStream create(TokenStream input) {
+ return new GalicianMinimalStemFilter(input);
+ }
+}
Added: lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/GermanNormalizationFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/GermanNormalizationFilterFactory.java?rev=1241878&view=auto
==============================================================================
--- lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/GermanNormalizationFilterFactory.java (added)
+++ lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/GermanNormalizationFilterFactory.java Wed Feb 8 12:07:52 2012
@@ -0,0 +1,39 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.de.GermanNormalizationFilter;
+
+/**
+ * Factory for {@link GermanNormalizationFilter}.
+ * <pre class="prettyprint" >
+ * <fieldType name="text_denorm" class="solr.TextField" positionIncrementGap="100">
+ * <analyzer>
+ * <tokenizer class="solr.StandardTokenizerFactory"/>
+ * <filter class="solr.LowerCaseFilterFactory"/>
+ * <filter class="solr.GermanNormalizationFilterFactory"/>
+ * </analyzer>
+ * </fieldType></pre>
+ */
+public class GermanNormalizationFilterFactory extends BaseTokenFilterFactory {
+
+ public TokenStream create(TokenStream input) {
+ return new GermanNormalizationFilter(input);
+ }
+}
Modified: lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestElisionFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestElisionFilterFactory.java?rev=1241878&r1=1241877&r2=1241878&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestElisionFilterFactory.java (original)
+++ lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestElisionFilterFactory.java Wed Feb 8 12:07:52 2012
@@ -64,4 +64,22 @@ public class TestElisionFilterFactory ex
assertTokenStreamContents(stream, new String[] { "avion" });
}
+ /**
+ * Test setting ignoreCase=true
+ */
+ public void testCaseInsensitive() throws Exception {
+ Reader reader = new StringReader("L'avion");
+ Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ ElisionFilterFactory factory = new ElisionFilterFactory();
+ factory.init(DEFAULT_VERSION_PARAM);
+ ResourceLoader loader = new SolrResourceLoader(null, null);
+ Map<String,String> args = new HashMap<String,String>();
+ args.put("articles", "frenchArticles.txt");
+ args.put("ignoreCase", "true");
+ factory.init(args);
+ factory.inform(loader);
+ TokenStream stream = factory.create(tokenizer);
+ assertTokenStreamContents(stream, new String[] { "avion" });
+ }
+
}
Added: lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestGalicianMinimalStemFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestGalicianMinimalStemFilterFactory.java?rev=1241878&view=auto
==============================================================================
--- lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestGalicianMinimalStemFilterFactory.java (added)
+++ lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestGalicianMinimalStemFilterFactory.java Wed Feb 8 12:07:52 2012
@@ -0,0 +1,36 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * Simple tests to ensure the Galician plural stem factory is working.
+ */
+public class TestGalicianMinimalStemFilterFactory extends BaseTokenTestCase {
+ public void testStemming() throws Exception {
+ Reader reader = new StringReader("elefantes");
+ GalicianMinimalStemFilterFactory factory = new GalicianMinimalStemFilterFactory();
+ TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false));
+ assertTokenStreamContents(stream, new String[] { "elefante" });
+ }
+}
Added: lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestGermanNormalizationFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestGermanNormalizationFilterFactory.java?rev=1241878&view=auto
==============================================================================
--- lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestGermanNormalizationFilterFactory.java (added)
+++ lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestGermanNormalizationFilterFactory.java Wed Feb 8 12:07:52 2012
@@ -0,0 +1,36 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * Simple tests to ensure the German normalization factory is working.
+ */
+public class TestGermanNormalizationFilterFactory extends BaseTokenTestCase {
+ public void testStemming() throws Exception {
+ Reader reader = new StringReader("weißbier");
+ GermanNormalizationFilterFactory factory = new GermanNormalizationFilterFactory();
+ TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false));
+ assertTokenStreamContents(stream, new String[] { "weissbier" });
+ }
+}
Added: lucene/dev/trunk/solr/example/solr/conf/lang/contractions_ca.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/example/solr/conf/lang/contractions_ca.txt?rev=1241878&view=auto
==============================================================================
--- lucene/dev/trunk/solr/example/solr/conf/lang/contractions_ca.txt (added)
+++ lucene/dev/trunk/solr/example/solr/conf/lang/contractions_ca.txt Wed Feb 8 12:07:52 2012
@@ -0,0 +1,8 @@
+# Set of Catalan contractions for ElisionFilter
+# TODO: load this as a resource from the analyzer and sync it in build.xml
+d
+l
+m
+n
+s
+t
Added: lucene/dev/trunk/solr/example/solr/conf/lang/contractions_fr.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/example/solr/conf/lang/contractions_fr.txt?rev=1241878&view=auto
==============================================================================
--- lucene/dev/trunk/solr/example/solr/conf/lang/contractions_fr.txt (added)
+++ lucene/dev/trunk/solr/example/solr/conf/lang/contractions_fr.txt Wed Feb 8 12:07:52 2012
@@ -0,0 +1,9 @@
+# Set of French contractions for ElisionFilter
+# TODO: load this as a resource from the analyzer and sync it in build.xml
+l
+m
+t
+qu
+n
+s
+j
Added: lucene/dev/trunk/solr/example/solr/conf/lang/contractions_it.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/example/solr/conf/lang/contractions_it.txt?rev=1241878&view=auto
==============================================================================
--- lucene/dev/trunk/solr/example/solr/conf/lang/contractions_it.txt (added)
+++ lucene/dev/trunk/solr/example/solr/conf/lang/contractions_it.txt Wed Feb 8 12:07:52 2012
@@ -0,0 +1,23 @@
+# Set of Italian contractions for ElisionFilter
+# TODO: load this as a resource from the analyzer and sync it in build.xml
+c
+l
+all
+dall
+dell
+nell
+sull
+coll
+pell
+gl
+agl
+dagl
+degl
+negl
+sugl
+un
+m
+t
+s
+v
+d
Added: lucene/dev/trunk/solr/example/solr/conf/lang/stemdict_nl.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/example/solr/conf/lang/stemdict_nl.txt?rev=1241878&view=auto
==============================================================================
--- lucene/dev/trunk/solr/example/solr/conf/lang/stemdict_nl.txt (added)
+++ lucene/dev/trunk/solr/example/solr/conf/lang/stemdict_nl.txt Wed Feb 8 12:07:52 2012
@@ -0,0 +1,6 @@
+# Set of overrides for the dutch stemmer
+# TODO: load this as a resource from the analyzer and sync it in build.xml
+fiets fiets
+bromfiets bromfiets
+ei eier
+kind kinder