You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/02/08 13:07:54 UTC
svn commit: r1241878 [1/3] - in /lucene/dev/trunk: lucene/contrib/
modules/analysis/common/src/java/org/apache/lucene/analysis/ca/
modules/analysis/common/src/java/org/apache/lucene/analysis/de/
modules/analysis/common/src/java/org/apache/lucene/analys...
Author: rmuir
Date: Wed Feb 8 12:07:52 2012
New Revision: 1241878
URL: http://svn.apache.org/viewvc?rev=1241878&view=rev
Log:
SOLR-3097, SOLR-3105: add fieldtypes for different languages to the example
Added:
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanNormalizationFilter.java (with props)
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianMinimalStemFilter.java (with props)
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianMinimalStemmer.java (with props)
lucene/dev/trunk/modules/analysis/common/src/resources/org/apache/lucene/analysis/th/
lucene/dev/trunk/modules/analysis/common/src/resources/org/apache/lucene/analysis/th/stopwords.txt (with props)
lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanNormalizationFilter.java (with props)
lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianMinimalStemFilter.java (with props)
lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/GalicianMinimalStemFilterFactory.java (with props)
lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/GermanNormalizationFilterFactory.java (with props)
lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestGalicianMinimalStemFilterFactory.java (with props)
lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestGermanNormalizationFilterFactory.java (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/
lucene/dev/trunk/solr/example/solr/conf/lang/contractions_ca.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/contractions_fr.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/contractions_it.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stemdict_nl.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stoptags_ja.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_ar.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_bg.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_ca.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_cz.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_da.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_de.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_el.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_en.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_es.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_eu.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_fa.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_fi.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_fr.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_gl.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_hi.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_hu.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_hy.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_id.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_it.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_ja.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_lv.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_nl.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_no.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_pt.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_ro.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_ru.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_sv.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_th.txt (with props)
lucene/dev/trunk/solr/example/solr/conf/lang/stopwords_tr.txt (with props)
Removed:
lucene/dev/trunk/solr/example/solr/conf/stopwords_en.txt
Modified:
lucene/dev/trunk/lucene/contrib/CHANGES.txt
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ca/CatalanAnalyzer.java
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/in/IndicTokenizer.java
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseMinimalStemmer.java
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java
lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/ca/TestCatalanAnalyzer.java
lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java
lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianAnalyzer.java
lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseAnalyzer.java
lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java
lucene/dev/trunk/solr/CHANGES.txt
lucene/dev/trunk/solr/build.xml
lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/ElisionFilterFactory.java
lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestElisionFilterFactory.java
lucene/dev/trunk/solr/example/solr/conf/schema.xml
Modified: lucene/dev/trunk/lucene/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/CHANGES.txt?rev=1241878&r1=1241877&r2=1241878&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/contrib/CHANGES.txt Wed Feb 8 12:07:52 2012
@@ -169,6 +169,14 @@ Changes in runtime behavior
* LUCENE-3626: PKIndexSplitter and MultiPassIndexSplitter now work
per segment. (Uwe Schindler)
+
+ * SOLR-3105: When passed LUCENE_36 or greater as version, GermanAnalyzer,
+ SpanishAnalyzer, FrenchAnalyzer, ItalianAnalyzer, and PortugueseAnalyzer
+ use a lighter stemming approach, CatalanAnalyzer uses ElisionFilter
+ with a set of contractions, HindiAnalyzer uses StandardTokenizer, and
+ ThaiAnalyzer uses thai stopwords. Add GermanNormalizationFilter which applies
+ the Snowball German2 algorithm to ae/oe/ue and case-folds ß. Add
+ GalicianMinimalStemFilter for plural removal only. (Robert Muir)
Optimizations
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ca/CatalanAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ca/CatalanAnalyzer.java?rev=1241878&r1=1241877&r2=1241878&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ca/CatalanAnalyzer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ca/CatalanAnalyzer.java Wed Feb 8 12:07:52 2012
@@ -19,11 +19,13 @@ package org.apache.lucene.analysis.ca;
import java.io.IOException;
import java.io.Reader;
+import java.util.Arrays;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
+import org.apache.lucene.analysis.fr.ElisionFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
@@ -37,6 +39,14 @@ import org.tartarus.snowball.ext.Catalan
/**
* {@link Analyzer} for Catalan.
+ * <p>
+ * <a name="version"/>
+ * <p>You must specify the required {@link Version}
+ * compatibility when creating CatalanAnalyzer:
+ * <ul>
+ * <li> As of 3.6, ElisionFilter with a set of Catalan
+ * contractions is used by default.
+ * </ul>
*/
public final class CatalanAnalyzer extends StopwordAnalyzerBase {
private final Set<?> stemExclusionSet;
@@ -44,6 +54,12 @@ public final class CatalanAnalyzer exten
/** File containing default Catalan stopwords. */
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
+ private static final CharArraySet DEFAULT_ARTICLES = CharArraySet.unmodifiableSet(
+ new CharArraySet(Version.LUCENE_CURRENT,
+ Arrays.asList(
+ "d", "l", "m", "n", "s", "t"
+ ), true));
+
/**
* Returns an unmodifiable instance of the default stop words set.
* @return default stop words set.
@@ -120,6 +136,9 @@ public final class CatalanAnalyzer exten
Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new StandardFilter(matchVersion, source);
+ if (matchVersion.onOrAfter(Version.LUCENE_36)) {
+ result = new ElisionFilter(matchVersion, result, DEFAULT_ARTICLES);
+ }
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java?rev=1241878&r1=1241877&r2=1241878&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java Wed Feb 8 12:07:52 2012
@@ -54,6 +54,7 @@ import org.tartarus.snowball.ext.German2
* <p>You must specify the required {@link Version}
* compatibility when creating GermanAnalyzer:
* <ul>
+ * <li> As of 3.6, GermanLightStemFilter is used for less aggressive stemming.
* <li> As of 3.1, Snowball stemming is done with SnowballFilter, and
* Snowball stopwords are used by default.
* <li> As of 2.9, StopFilter preserves position
@@ -166,7 +167,7 @@ public final class GermanAnalyzer extend
* built from a {@link StandardTokenizer} filtered with
* {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
* , {@link KeywordMarkerFilter} if a stem exclusion set is
- * provided, and {@link SnowballFilter}
+ * provided, {@link GermanNormalizationFilter} and {@link GermanLightStemFilter}
*/
@Override
protected TokenStreamComponents createComponents(String fieldName,
@@ -176,10 +177,14 @@ public final class GermanAnalyzer extend
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter( matchVersion, result, stopwords);
result = new KeywordMarkerFilter(result, exclusionSet);
- if (matchVersion.onOrAfter(Version.LUCENE_31))
+ if (matchVersion.onOrAfter(Version.LUCENE_36)) {
+ result = new GermanNormalizationFilter(result);
+ result = new GermanLightStemFilter(result);
+ } else if (matchVersion.onOrAfter(Version.LUCENE_31)) {
result = new SnowballFilter(result, new German2Stemmer());
- else
+ } else {
result = new GermanStemFilter(result);
+ }
return new TokenStreamComponents(source, result);
}
}
Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanNormalizationFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanNormalizationFilter.java?rev=1241878&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanNormalizationFilter.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanNormalizationFilter.java Wed Feb 8 12:07:52 2012
@@ -0,0 +1,112 @@
+package org.apache.lucene.analysis.de;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.util.StemmerUtil;
+
+/**
+ * Normalizes German characters according to the heuristics
+ * of the <a href="http://snowball.tartarus.org/algorithms/german2/stemmer.html">
+ * German2 snowball algorithm</a>.
+ * It allows for the fact that ä, ö and ü are sometimes written as ae, oe and ue.
+ * <p>
+ * <ul>
+ * <li> 'ß' is replaced by 'ss'
+ * <li> 'ä', 'ö', 'ü' are replaced by 'a', 'o', 'u', respectively.
+ * <li> 'ae' and 'oe' are replaced by 'a', and 'o', respectively.
+ * <li> 'ue' is replaced by 'u', when not following a vowel or q.
+ * </ul>
+ * <p>
+ * This is useful if you want this normalization without using
+ * the German2 stemmer, or perhaps no stemming at all.
+ */
+public final class GermanNormalizationFilter extends TokenFilter {
+ // FSM with 3 states:
+ private static final int N = 0; /* ordinary state */
+ private static final int V = 1; /* stops 'u' from entering umlaut state */
+ private static final int U = 2; /* umlaut state, allows e-deletion */
+
+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+
+ public GermanNormalizationFilter(TokenStream input) {
+ super(input);
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ int state = N;
+ char buffer[] = termAtt.buffer();
+ int length = termAtt.length();
+ for (int i = 0; i < length; i++) {
+ final char c = buffer[i];
+ switch(c) {
+ case 'a':
+ case 'o':
+ state = U;
+ break;
+ case 'u':
+ state = (state == N) ? U : V;
+ break;
+ case 'e':
+ if (state == U)
+ length = StemmerUtil.delete(buffer, i--, length);
+ state = V;
+ break;
+ case 'i':
+ case 'q':
+ case 'y':
+ state = V;
+ break;
+ case 'ä':
+ buffer[i] = 'a';
+ state = V;
+ break;
+ case 'ö':
+ buffer[i] = 'o';
+ state = V;
+ break;
+ case 'ü':
+ buffer[i] = 'u';
+ state = V;
+ break;
+ case 'ß':
+ buffer[i++] = 's';
+ buffer = termAtt.resizeBuffer(1+length);
+ if (i < length)
+ System.arraycopy(buffer, i, buffer, i+1, (length-i));
+ buffer[i] = 's';
+ length++;
+ state = N;
+ break;
+ default:
+ state = N;
+ }
+ }
+ termAtt.setLength(length);
+ return true;
+ } else {
+ return false;
+ }
+ }
+}
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java?rev=1241878&r1=1241877&r2=1241878&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java Wed Feb 8 12:07:52 2012
@@ -39,6 +39,13 @@ import org.tartarus.snowball.ext.Spanish
/**
* {@link Analyzer} for Spanish.
+ * <p>
+ * <a name="version"/>
+ * <p>You must specify the required {@link Version}
+ * compatibility when creating SpanishAnalyzer:
+ * <ul>
+ * <li> As of 3.6, SpanishLightStemFilter is used for less aggressive stemming.
+ * </ul>
*/
public final class SpanishAnalyzer extends StopwordAnalyzerBase {
private final Set<?> stemExclusionSet;
@@ -115,7 +122,7 @@ public final class SpanishAnalyzer exten
* built from an {@link StandardTokenizer} filtered with
* {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
* , {@link KeywordMarkerFilter} if a stem exclusion set is
- * provided and {@link SnowballFilter}.
+ * provided and {@link SpanishLightStemFilter}.
*/
@Override
protected TokenStreamComponents createComponents(String fieldName,
@@ -126,7 +133,11 @@ public final class SpanishAnalyzer exten
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())
result = new KeywordMarkerFilter(result, stemExclusionSet);
- result = new SnowballFilter(result, new SpanishStemmer());
+ if (matchVersion.onOrAfter(Version.LUCENE_36)) {
+ result = new SpanishLightStemFilter(result);
+ } else {
+ result = new SnowballFilter(result, new SpanishStemmer());
+ }
return new TokenStreamComponents(source, result);
}
}
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java?rev=1241878&r1=1241877&r2=1241878&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java Wed Feb 8 12:07:52 2012
@@ -52,6 +52,7 @@ import java.util.Set;
* <p>You must specify the required {@link Version}
* compatibility when creating FrenchAnalyzer:
* <ul>
+ * <li> As of 3.6, FrenchLightStemFilter is used for less aggressive stemming.
* <li> As of 3.1, Snowball stemming is done with SnowballFilter,
* LowerCaseFilter is used prior to StopFilter, and ElisionFilter and
* Snowball stopwords are used by default.
@@ -177,7 +178,7 @@ public final class FrenchAnalyzer extend
* {@link StandardFilter}, {@link ElisionFilter},
* {@link LowerCaseFilter}, {@link StopFilter},
* {@link KeywordMarkerFilter} if a stem exclusion set is
- * provided, and {@link SnowballFilter}
+ * provided, and {@link FrenchLightStemFilter}
*/
@Override
protected TokenStreamComponents createComponents(String fieldName,
@@ -190,7 +191,11 @@ public final class FrenchAnalyzer extend
result = new StopFilter(matchVersion, result, stopwords);
if(!excltable.isEmpty())
result = new KeywordMarkerFilter(result, excltable);
- result = new SnowballFilter(result, new org.tartarus.snowball.ext.FrenchStemmer());
+ if (matchVersion.onOrAfter(Version.LUCENE_36)) {
+ result = new FrenchLightStemFilter(result);
+ } else {
+ result = new SnowballFilter(result, new org.tartarus.snowball.ext.FrenchStemmer());
+ }
return new TokenStreamComponents(source, result);
} else {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianMinimalStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianMinimalStemFilter.java?rev=1241878&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianMinimalStemFilter.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianMinimalStemFilter.java Wed Feb 8 12:07:52 2012
@@ -0,0 +1,58 @@
+package org.apache.lucene.analysis.gl;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+
+/**
+ * A {@link TokenFilter} that applies {@link GalicianMinimalStemmer} to stem
+ * Galician words.
+ * <p>
+ * To prevent terms from being stemmed use an instance of
+ * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
+ * the {@link KeywordAttribute} before this {@link TokenStream}.
+ * </p>
+ */
+public final class GalicianMinimalStemFilter extends TokenFilter {
+ private final GalicianMinimalStemmer stemmer = new GalicianMinimalStemmer();
+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
+
+ public GalicianMinimalStemFilter(TokenStream input) {
+ super(input);
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ if (!keywordAttr.isKeyword()) {
+ final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
+ termAtt.setLength(newlen);
+ }
+ return true;
+ } else {
+ return false;
+ }
+ }
+}
Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianMinimalStemmer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianMinimalStemmer.java?rev=1241878&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianMinimalStemmer.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianMinimalStemmer.java Wed Feb 8 12:07:52 2012
@@ -0,0 +1,38 @@
+package org.apache.lucene.analysis.gl;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.pt.RSLPStemmerBase;
+
+/**
+ * Minimal Stemmer for Galician
+ * <p>
+ * This follows the "RSLP-S" algorithm, but modified for Galician.
+ * Hence this stemmer only applies the plural reduction step of:
+ * "Regras do lematizador para o galego"
+ * @see RSLPStemmerBase
+ */
+public class GalicianMinimalStemmer extends RSLPStemmerBase {
+
+ private static final Step pluralStep =
+ parse(GalicianMinimalStemmer.class, "galician.rslp").get("Plural");
+
+ public int stem(char s[], int len) {
+ return pluralStep.apply(s, len);
+ }
+}
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java?rev=1241878&r1=1241877&r2=1241878&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java Wed Feb 8 12:07:52 2012
@@ -22,6 +22,7 @@ import java.io.Reader;
import java.util.Set;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
@@ -34,6 +35,13 @@ import org.apache.lucene.util.Version;
/**
* Analyzer for Hindi.
+ * <p>
+ * <a name="version"/>
+ * <p>You must specify the required {@link Version}
+ * compatibility when creating HindiAnalyzer:
+ * <ul>
+ * <li> As of 3.6, StandardTokenizer is used for tokenization
+ * </ul>
*/
public final class HindiAnalyzer extends StopwordAnalyzerBase {
private final Set<?> stemExclusionSet;
@@ -110,7 +118,7 @@ public final class HindiAnalyzer extends
* used to tokenize all the text in the provided {@link Reader}.
*
* @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
- * built from a {@link IndicTokenizer} filtered with
+ * built from a {@link StandardTokenizer} filtered with
* {@link LowerCaseFilter}, {@link IndicNormalizationFilter},
* {@link HindiNormalizationFilter}, {@link KeywordMarkerFilter}
* if a stem exclusion set is provided, {@link HindiStemFilter}, and
@@ -119,7 +127,12 @@ public final class HindiAnalyzer extends
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
- final Tokenizer source = new IndicTokenizer(matchVersion, reader);
+ final Tokenizer source;
+ if (matchVersion.onOrAfter(Version.LUCENE_36)) {
+ source = new StandardTokenizer(matchVersion, reader);
+ } else {
+ source = new IndicTokenizer(matchVersion, reader);
+ }
TokenStream result = new LowerCaseFilter(matchVersion, source);
if (!stemExclusionSet.isEmpty())
result = new KeywordMarkerFilter(result, stemExclusionSet);
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/in/IndicTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/in/IndicTokenizer.java?rev=1241878&r1=1241877&r2=1241878&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/in/IndicTokenizer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/in/IndicTokenizer.java Wed Feb 8 12:07:52 2012
@@ -20,12 +20,15 @@ package org.apache.lucene.analysis.in;
import java.io.Reader;
import org.apache.lucene.analysis.util.CharTokenizer;
+import org.apache.lucene.analysis.standard.StandardTokenizer; // javadocs
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.Version;
/**
* Simple Tokenizer for text in Indian Languages.
+ * @deprecated (3.6) Use {@link StandardTokenizer} instead.
*/
+@Deprecated
public final class IndicTokenizer extends CharTokenizer {
public IndicTokenizer(Version matchVersion, AttributeFactory factory, Reader input) {
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java?rev=1241878&r1=1241877&r2=1241878&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java Wed Feb 8 12:07:52 2012
@@ -46,6 +46,7 @@ import org.tartarus.snowball.ext.Italian
* <p>You must specify the required {@link Version}
* compatibility when creating ItalianAnalyzer:
* <ul>
+ * <li> As of 3.6, ItalianLightStemFilter is used for less aggressive stemming.
* <li> As of 3.2, ElisionFilter with a set of Italian
* contractions is used by default.
* </ul>
@@ -132,7 +133,7 @@ public final class ItalianAnalyzer exten
* built from an {@link StandardTokenizer} filtered with
* {@link StandardFilter}, {@link ElisionFilter}, {@link LowerCaseFilter}, {@link StopFilter}
* , {@link KeywordMarkerFilter} if a stem exclusion set is
- * provided and {@link SnowballFilter}.
+ * provided and {@link ItalianLightStemFilter}.
*/
@Override
protected TokenStreamComponents createComponents(String fieldName,
@@ -146,7 +147,11 @@ public final class ItalianAnalyzer exten
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())
result = new KeywordMarkerFilter(result, stemExclusionSet);
- result = new SnowballFilter(result, new ItalianStemmer());
+ if (matchVersion.onOrAfter(Version.LUCENE_36)) {
+ result = new ItalianLightStemFilter(result);
+ } else {
+ result = new SnowballFilter(result, new ItalianStemmer());
+ }
return new TokenStreamComponents(source, result);
}
}
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java?rev=1241878&r1=1241877&r2=1241878&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java Wed Feb 8 12:07:52 2012
@@ -39,6 +39,13 @@ import org.tartarus.snowball.ext.Portugu
/**
* {@link Analyzer} for Portuguese.
+ * <p>
+ * <a name="version"/>
+ * <p>You must specify the required {@link Version}
+ * compatibility when creating PortugueseAnalyzer:
+ * <ul>
+ * <li> As of 3.6, PortugueseLightStemFilter is used for less aggressive stemming.
+ * </ul>
*/
public final class PortugueseAnalyzer extends StopwordAnalyzerBase {
private final Set<?> stemExclusionSet;
@@ -115,7 +122,7 @@ public final class PortugueseAnalyzer ex
* built from an {@link StandardTokenizer} filtered with
* {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
* , {@link KeywordMarkerFilter} if a stem exclusion set is
- * provided and {@link SnowballFilter}.
+ * provided and {@link PortugueseLightStemFilter}.
*/
@Override
protected TokenStreamComponents createComponents(String fieldName,
@@ -126,7 +133,11 @@ public final class PortugueseAnalyzer ex
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())
result = new KeywordMarkerFilter(result, stemExclusionSet);
- result = new SnowballFilter(result, new PortugueseStemmer());
+ if (matchVersion.onOrAfter(Version.LUCENE_36)) {
+ result = new PortugueseLightStemFilter(result);
+ } else {
+ result = new SnowballFilter(result, new PortugueseStemmer());
+ }
return new TokenStreamComponents(source, result);
}
}
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseMinimalStemmer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseMinimalStemmer.java?rev=1241878&r1=1241877&r2=1241878&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseMinimalStemmer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseMinimalStemmer.java Wed Feb 8 12:07:52 2012
@@ -24,7 +24,7 @@ package org.apache.lucene.analysis.pt;
* <i>A study on the Use of Stemming for Monolingual Ad-Hoc Portuguese
* Information Retrieval</i> (Orengo, et al)
* which is just the plural reduction step of the RSLP
- * algorithm from <i>A Stemming Algorithmm for the Portuguese Language</i>,
+ * algorithm from <i>A Stemming Algorithm for the Portuguese Language</i>,
* Orengo et al.
* @see RSLPStemmerBase
*/
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java?rev=1241878&r1=1241877&r2=1241878&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java Wed Feb 8 12:07:52 2012
@@ -16,7 +16,9 @@ package org.apache.lucene.analysis.th;
* limitations under the License.
*/
+import java.io.IOException;
import java.io.Reader;
+import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
@@ -24,22 +26,75 @@ import org.apache.lucene.analysis.Tokeni
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.util.Version;
/**
* {@link Analyzer} for Thai language. It uses {@link java.text.BreakIterator} to break words.
- *
- * <p><b>NOTE</b>: This class uses the same {@link Version}
- * dependent settings as {@link StandardAnalyzer}.</p>
+ * <p>
+ * <a name="version"/>
+ * <p>You must specify the required {@link Version}
+ * compatibility when creating ThaiAnalyzer:
+ * <ul>
+ * <li> As of 3.6, a set of Thai stopwords is used by default
+ * </ul>
*/
-public final class ThaiAnalyzer extends Analyzer {
- private final Version matchVersion;
+public final class ThaiAnalyzer extends StopwordAnalyzerBase {
+
+ /** File containing default Thai stopwords. */
+ public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
+ /**
+ * The comment character in the stopwords file.
+ * All lines prefixed with this will be ignored.
+ */
+ private static final String STOPWORDS_COMMENT = "#";
+
+ /**
+ * Returns an unmodifiable instance of the default stop words set.
+ * @return default stop words set.
+ */
+ public static Set<?> getDefaultStopSet(){
+ return DefaultSetHolder.DEFAULT_STOP_SET;
+ }
+
+ /**
+ * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
+ * accesses the static final set the first time.
+ */
+ private static class DefaultSetHolder {
+ static final Set<?> DEFAULT_STOP_SET;
+
+ static {
+ try {
+ DEFAULT_STOP_SET = loadStopwordSet(false, ThaiAnalyzer.class,
+ DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
+ } catch (IOException ex) {
+ // default set should always be present as it is part of the
+ // distribution (JAR)
+ throw new RuntimeException("Unable to load default stopword set");
+ }
+ }
+ }
+ /**
+ * Builds an analyzer with the default stop words.
+ *
+ * @param matchVersion lucene compatibility version
+ */
public ThaiAnalyzer(Version matchVersion) {
- this.matchVersion = matchVersion;
+ this(matchVersion, matchVersion.onOrAfter(Version.LUCENE_36) ? DefaultSetHolder.DEFAULT_STOP_SET : StopAnalyzer.ENGLISH_STOP_WORDS_SET);
+ }
+
+ /**
+ * Builds an analyzer with the given stop words.
+ *
+ * @param matchVersion lucene compatibility version
+ * @param stopwords a stopword set
+ */
+ public ThaiAnalyzer(Version matchVersion, Set<?> stopwords) {
+ super(matchVersion, stopwords);
}
/**
@@ -61,6 +116,6 @@ public final class ThaiAnalyzer extends
result = new LowerCaseFilter(matchVersion, result);
result = new ThaiWordFilter(matchVersion, result);
return new TokenStreamComponents(source, new StopFilter(matchVersion,
- result, StopAnalyzer.ENGLISH_STOP_WORDS_SET));
+ result, stopwords));
}
}
Added: lucene/dev/trunk/modules/analysis/common/src/resources/org/apache/lucene/analysis/th/stopwords.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/resources/org/apache/lucene/analysis/th/stopwords.txt?rev=1241878&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/resources/org/apache/lucene/analysis/th/stopwords.txt (added)
+++ lucene/dev/trunk/modules/analysis/common/src/resources/org/apache/lucene/analysis/th/stopwords.txt Wed Feb 8 12:07:52 2012
@@ -0,0 +1,119 @@
+# Thai stopwords from:
+# "Opinion Detection in Thai Political News Columns
+# Based on Subjectivity Analysis"
+# Khampol Sukhum, Supot Nitsuwat, and Choochart Haruechaiyasak
+à¹à¸§à¹
+à¹à¸¡à¹
+à¹à¸
+à¹à¸à¹
+à¹à¸«à¹
+à¹à¸
+à¹à¸à¸¢
+à¹à¸«à¹à¸
+à¹à¸¥à¹à¸§
+à¹à¸¥à¸°
+à¹à¸£à¸
+à¹à¸à¸
+à¹à¸à¹
+à¹à¸à¸
+à¹à¸«à¹à¸
+à¹à¸¥à¸¢
+à¹à¸£à¸´à¹à¸¡
+à¹à¸£à¸²
+à¹à¸¡à¸·à¹à¸
+à¹à¸à¸·à¹à¸
+à¹à¸à¸£à¸²à¸°
+à¹à¸à¹à¸à¸à¸²à¸£
+à¹à¸à¹à¸
+à¹à¸à¸´à¸à¹à¸à¸¢
+à¹à¸à¸´à¸
+à¹à¸à¸·à¹à¸à¸à¸à¸²à¸
+à¹à¸à¸µà¸¢à¸§à¸à¸±à¸
+à¹à¸à¸µà¸¢à¸§
+à¹à¸à¹à¸
+à¹à¸à¸à¸²à¸°
+à¹à¸à¸¢
+à¹à¸à¹à¸²
+à¹à¸à¸²
+à¸à¸µà¸
+à¸à¸²à¸
+à¸à¸°à¹à¸£
+à¸à¸à¸
+à¸à¸¢à¹à¸²à¸
+à¸à¸¢à¸¹à¹
+à¸à¸¢à¸²à¸
+หาà¸
+หลาย
+หลัà¸à¸à¸²à¸
+หลัà¸
+หรืà¸
+หà¸à¸¶à¹à¸
+สà¹à¸§à¸
+สà¹à¸
+สุà¸
+สà¹à¸²à¸«à¸£à¸±à¸
+วà¹à¸²
+วัà¸
+ลà¸
+รà¹à¸§à¸¡
+ราย
+รัà¸
+ระหวà¹à¸²à¸
+รวม
+ยัà¸
+มี
+มาà¸
+มา
+à¸à¸£à¹à¸à¸¡
+à¸à¸
+à¸à¹à¸²à¸
+à¸à¸¥
+à¸à¸²à¸
+à¸à¹à¸²
+à¸à¸µà¹
+à¸à¹à¸²
+à¸à¸±à¹à¸
+à¸à¸±à¸
+à¸à¸à¸à¸à¸²à¸
+à¸à¸¸à¸
+à¸à¸µà¹à¸ªà¸¸à¸
+à¸à¸µà¹
+à¸à¹à¸²à¹à¸«à¹
+à¸à¹à¸²
+à¸à¸²à¸
+à¸à¸±à¹à¸à¸à¸µà¹
+à¸à¸±à¹à¸
+à¸à¹à¸²
+à¸à¸¹à¸
+à¸à¸¶à¸
+à¸à¹à¸à¸
+à¸à¹à¸²à¸à¹
+à¸à¹à¸²à¸
+à¸à¹à¸
+à¸à¸²à¸¡
+à¸à¸±à¹à¸à¹à¸à¹
+à¸à¸±à¹à¸
+à¸à¹à¸²à¸
+à¸à¹à¸§à¸¢
+à¸à¸±à¸
+à¸à¸¶à¹à¸
+à¸à¹à¸§à¸
+à¸à¸¶à¸
+à¸à¸²à¸
+à¸à¸±à¸
+à¸à¸°
+à¸à¸·à¸
+à¸à¸§à¸²à¸¡
+à¸à¸£à¸±à¹à¸
+à¸à¸
+à¸à¸¶à¹à¸
+à¸à¸à¸
+à¸à¸
+à¸à¸à¸°
+à¸à¹à¸à¸
+à¸à¹
+à¸à¸²à¸£
+à¸à¸±à¸
+à¸à¸±à¸
+à¸à¸§à¹à¸²
+à¸à¸¥à¹à¸²à¸§
Modified: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/ca/TestCatalanAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/ca/TestCatalanAnalyzer.java?rev=1241878&r1=1241877&r2=1241878&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/ca/TestCatalanAnalyzer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/ca/TestCatalanAnalyzer.java Wed Feb 8 12:07:52 2012
@@ -41,6 +41,13 @@ public class TestCatalanAnalyzer extends
assertAnalyzesTo(a, "un", new String[] { });
}
+ /** test use of elisionfilter */
+ public void testContractions() throws IOException {
+ Analyzer a = new CatalanAnalyzer(TEST_VERSION_CURRENT);
+ assertAnalyzesTo(a, "Diccionari de l'Institut d'Estudis Catalans",
+ new String[] { "diccion", "inst", "estud", "catalan" });
+ }
+
/** test use of exclusion set */
public void testExclude() throws IOException {
Set<String> exclusionSet = new HashSet<String>();
Added: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanNormalizationFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanNormalizationFilter.java?rev=1241878&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanNormalizationFilter.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanNormalizationFilter.java Wed Feb 8 12:07:52 2012
@@ -0,0 +1,68 @@
+package org.apache.lucene.analysis.de;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+
+/**
+ * Tests {@link GermanNormalizationFilter}
+ */
+public class TestGermanNormalizationFilter extends BaseTokenStreamTestCase {
+ private Analyzer analyzer = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String field, Reader reader) {
+ final Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ final TokenStream stream = new GermanNormalizationFilter(tokenizer);
+ return new TokenStreamComponents(tokenizer, stream);
+ }
+ };
+
+ /**
+ * Tests that a/o/u + e is equivalent to the umlaut form
+ */
+ public void testBasicExamples() throws IOException {
+ checkOneTerm(analyzer, "Schaltflächen", "Schaltflachen");
+ checkOneTerm(analyzer, "Schaltflaechen", "Schaltflachen");
+ }
+
+ /**
+ * Tests the specific heuristic that ue is not folded after a vowel or q.
+ */
+ public void testUHeuristic() throws IOException {
+ checkOneTerm(analyzer, "dauer", "dauer");
+ }
+
+ /**
+ * Tests german specific folding of sharp-s
+ */
+ public void testSpecialFolding() throws IOException {
+ checkOneTerm(analyzer, "weißbier", "weissbier");
+ }
+
+ /** blast some random strings through the analyzer */
+ public void testRandomStrings() throws Exception {
+ checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
+ }
+}
Modified: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java?rev=1241878&r1=1241877&r2=1241878&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java Wed Feb 8 12:07:52 2012
@@ -56,7 +56,7 @@ public class TestFrenchAnalyzer extends
assertAnalyzesTo(
fa,
"mot \"entreguillemet\"",
- new String[] { "mot", "entreguillemet" });
+ new String[] { "mot", "entreguilemet" });
// let's do some french specific tests now
@@ -66,7 +66,7 @@ public class TestFrenchAnalyzer extends
assertAnalyzesTo(
fa,
"Jean-François",
- new String[] { "jean", "françois" });
+ new String[] { "jean", "francoi" });
// 2. stopwords
assertAnalyzesTo(
@@ -81,16 +81,16 @@ public class TestFrenchAnalyzer extends
new String[] {
"lanc",
"chism",
- "habit",
+ "habitabl",
"chist",
- "élément",
+ "element",
"captif" });
// some verbs
assertAnalyzesTo(
fa,
"finissions souffrirent rugissante",
- new String[] { "fin", "souffr", "rug" });
+ new String[] { "finision", "soufrirent", "rugisant" });
// some everything else
// aujourd'hui stays one term which is OK
@@ -101,16 +101,16 @@ public class TestFrenchAnalyzer extends
"c3po",
"aujourd'hui",
"oeuf",
- "ïâöûà ä",
- "anticonstitutionnel",
- "jav" });
+ "ïaöuaä",
+ "anticonstitutionel",
+ "java" });
// some more everything else
// here 1940-1945 stays as one term, 1940:1945 not ?
assertAnalyzesTo(
fa,
"33Bis 1940-1945 1940:1945 (---i+++)*",
- new String[] { "33bis", "1940", "1945", "1940", "1945", "i" });
+ new String[] { "33bi", "1940", "1945", "1940", "1945", "i" });
}
@@ -217,9 +217,9 @@ public class TestFrenchAnalyzer extends
new String[] {
"lanc",
"chism",
- "habit",
+ "habitabl",
"chist",
- "élément",
+ "element",
"captif" });
}
@@ -238,7 +238,7 @@ public class TestFrenchAnalyzer extends
public void testElision() throws Exception {
FrenchAnalyzer fa = new FrenchAnalyzer(TEST_VERSION_CURRENT);
- assertAnalyzesTo(fa, "voir l'embrouille", new String[] { "voir", "embrouill" });
+ assertAnalyzesTo(fa, "voir l'embrouille", new String[] { "voir", "embrouil" });
}
/**
Added: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianMinimalStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianMinimalStemFilter.java?rev=1241878&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianMinimalStemFilter.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianMinimalStemFilter.java Wed Feb 8 12:07:52 2012
@@ -0,0 +1,55 @@
+package org.apache.lucene.analysis.gl;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Tokenizer;
+
+/**
+ * Simple tests for {@link GalicianMinimalStemmer}
+ */
+public class TestGalicianMinimalStemFilter extends BaseTokenStreamTestCase {
+ Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ return new TokenStreamComponents(tokenizer, new GalicianMinimalStemFilter(tokenizer));
+ }
+ };
+
+ public void testPlural() throws Exception {
+ checkOneTerm(a, "elefantes", "elefante");
+ checkOneTerm(a, "elefante", "elefante");
+ checkOneTerm(a, "kalóres", "kalór");
+ checkOneTerm(a, "kalór", "kalór");
+ }
+
+ public void testExceptions() throws Exception {
+ checkOneTerm(a, "mas", "mas");
+ checkOneTerm(a, "barcelonês", "barcelonês");
+ }
+
+ /** blast some random strings through the analyzer */
+ public void testRandomStrings() throws Exception {
+ checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
+ }
+}
Modified: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianAnalyzer.java?rev=1241878&r1=1241877&r2=1241878&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianAnalyzer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianAnalyzer.java Wed Feb 8 12:07:52 2012
@@ -36,8 +36,8 @@ public class TestItalianAnalyzer extends
public void testBasics() throws IOException {
Analyzer a = new ItalianAnalyzer(TEST_VERSION_CURRENT);
// stemming
- checkOneTermReuse(a, "abbandonata", "abbandon");
- checkOneTermReuse(a, "abbandonati", "abbandon");
+ checkOneTermReuse(a, "abbandonata", "abbandonat");
+ checkOneTermReuse(a, "abbandonati", "abbandonat");
// stopword
assertAnalyzesTo(a, "dallo", new String[] {});
}
@@ -49,7 +49,7 @@ public class TestItalianAnalyzer extends
Analyzer a = new ItalianAnalyzer(TEST_VERSION_CURRENT,
ItalianAnalyzer.getDefaultStopSet(), exclusionSet);
checkOneTermReuse(a, "abbandonata", "abbandonata");
- checkOneTermReuse(a, "abbandonati", "abbandon");
+ checkOneTermReuse(a, "abbandonati", "abbandonat");
}
/** blast some random strings through the analyzer */
@@ -61,7 +61,7 @@ public class TestItalianAnalyzer extends
public void testContractions() throws IOException {
Analyzer a = new ItalianAnalyzer(TEST_VERSION_CURRENT);
assertAnalyzesTo(a, "dell'Italia", new String[] { "ital" });
- assertAnalyzesTo(a, "l'Italiano", new String[] { "ital" });
+ assertAnalyzesTo(a, "l'Italiano", new String[] { "italian" });
}
/** test that we don't enable this before 3.2*/
Modified: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseAnalyzer.java?rev=1241878&r1=1241877&r2=1241878&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseAnalyzer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseAnalyzer.java Wed Feb 8 12:07:52 2012
@@ -35,8 +35,8 @@ public class TestPortugueseAnalyzer exte
public void testBasics() throws IOException {
Analyzer a = new PortugueseAnalyzer(TEST_VERSION_CURRENT);
// stemming
- checkOneTermReuse(a, "quilométricas", "quilométr");
- checkOneTermReuse(a, "quilométricos", "quilométr");
+ checkOneTermReuse(a, "quilométricas", "quilometric");
+ checkOneTermReuse(a, "quilométricos", "quilometric");
// stopword
assertAnalyzesTo(a, "não", new String[] {});
}
@@ -48,7 +48,7 @@ public class TestPortugueseAnalyzer exte
Analyzer a = new PortugueseAnalyzer(TEST_VERSION_CURRENT,
PortugueseAnalyzer.getDefaultStopSet(), exclusionSet);
checkOneTermReuse(a, "quilométricas", "quilométricas");
- checkOneTermReuse(a, "quilométricos", "quilométr");
+ checkOneTermReuse(a, "quilométricos", "quilometric");
}
/** blast some random strings through the analyzer */
Modified: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java?rev=1241878&r1=1241877&r2=1241878&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java Wed Feb 8 12:07:52 2012
@@ -21,7 +21,9 @@ import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;
/**
@@ -40,14 +42,29 @@ public class TestThaiAnalyzer extends Ba
* testcase for offsets
*/
public void testOffsets() throws Exception {
- assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT), "à¸à¸²à¸£à¸à¸µà¹à¹à¸à¹à¸à¹à¸à¸à¹à¸ªà¸à¸à¸§à¹à¸²à¸à¸²à¸à¸à¸µ",
+ assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET), "à¸à¸²à¸£à¸à¸µà¹à¹à¸à¹à¸à¹à¸à¸à¹à¸ªà¸à¸à¸§à¹à¸²à¸à¸²à¸à¸à¸µ",
new String[] { "à¸à¸²à¸£", "à¸à¸µà¹", "à¹à¸à¹", "à¸à¹à¸à¸", "à¹à¸ªà¸à¸", "วà¹à¸²", "à¸à¸²à¸", "à¸à¸µ" },
new int[] { 0, 3, 6, 9, 13, 17, 20, 23 },
new int[] { 3, 6, 9, 13, 17, 20, 23, 25 });
}
+ public void testStopWords() throws Exception {
+ assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT), "à¸à¸²à¸£à¸à¸µà¹à¹à¸à¹à¸à¹à¸à¸à¹à¸ªà¸à¸à¸§à¹à¸²à¸à¸²à¸à¸à¸µ",
+ new String[] { "à¹à¸ªà¸à¸", "à¸à¸²à¸", "à¸à¸µ" },
+ new int[] { 13, 20, 23 },
+ new int[] { 17, 23, 25 },
+ new int[] { 5, 2, 1 });
+ }
+
+ public void testBackwardsStopWords() throws Exception {
+ assertAnalyzesTo(new ThaiAnalyzer(Version.LUCENE_35), "à¸à¸²à¸£à¸à¸µà¹à¹à¸à¹à¸à¹à¸à¸à¹à¸ªà¸à¸à¸§à¹à¸²à¸à¸²à¸à¸à¸µ",
+ new String[] { "à¸à¸²à¸£", "à¸à¸µà¹", "à¹à¸à¹", "à¸à¹à¸à¸", "à¹à¸ªà¸à¸", "วà¹à¸²", "à¸à¸²à¸", "à¸à¸µ" },
+ new int[] { 0, 3, 6, 9, 13, 17, 20, 23 },
+ new int[] { 3, 6, 9, 13, 17, 20, 23, 25 });
+ }
+
public void testTokenType() throws Exception {
- assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT), "à¸à¸²à¸£à¸à¸µà¹à¹à¸à¹à¸à¹à¸à¸à¹à¸ªà¸à¸à¸§à¹à¸²à¸à¸²à¸à¸à¸µ à¹à¹à¹",
+ assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET), "à¸à¸²à¸£à¸à¸µà¹à¹à¸à¹à¸à¹à¸à¸à¹à¸ªà¸à¸à¸§à¹à¸²à¸à¸²à¸à¸à¸µ à¹à¹à¹",
new String[] { "à¸à¸²à¸£", "à¸à¸µà¹", "à¹à¸à¹", "à¸à¹à¸à¸", "à¹à¸ªà¸à¸", "วà¹à¸²", "à¸à¸²à¸", "à¸à¸µ", "à¹à¹à¹" },
new String[] { "<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
"<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
@@ -96,8 +113,9 @@ public class TestThaiAnalyzer extends Ba
/*
* Test that position increments are adjusted correctly for stopwords.
*/
+ // note this test uses stopfilter's stopset
public void testPositionIncrements() throws Exception {
- final ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT);
+ final ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
assertAnalyzesTo(analyzer, "à¸à¸²à¸£à¸à¸µà¹à¹à¸à¹à¸à¹à¸à¸ the à¹à¸ªà¸à¸à¸§à¹à¸²à¸à¸²à¸à¸à¸µ",
new String[] { "à¸à¸²à¸£", "à¸à¸µà¹", "à¹à¸à¹", "à¸à¹à¸à¸", "à¹à¸ªà¸à¸", "วà¹à¸²", "à¸à¸²à¸", "à¸à¸µ" },
new int[] { 0, 3, 6, 9, 18, 22, 25, 28 },
@@ -113,7 +131,7 @@ public class TestThaiAnalyzer extends Ba
}
public void testReusableTokenStream() throws Exception {
- ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT);
+ ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET);
assertAnalyzesToReuse(analyzer, "", new String[] {});
assertAnalyzesToReuse(
Modified: lucene/dev/trunk/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/CHANGES.txt?rev=1241878&r1=1241877&r2=1241878&view=diff
==============================================================================
--- lucene/dev/trunk/solr/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/CHANGES.txt Wed Feb 8 12:07:52 2012
@@ -479,6 +479,9 @@ New Features
CommonGramsQueryFilterFactory can optionally read stopwords in Snowball
format (specify format="snowball"). (Robert Muir)
+* SOLR-3105: ElisionFilterFactory optionally allows the parameter
+ ignoreCase (default=false). (Robert Muir)
+
Optimizations
----------------------
* SOLR-1931: Speedup for LukeRequestHandler and admin/schema browser. New parameter
@@ -592,6 +595,9 @@ Other Changes
* SOLR-3059: Example XSL stylesheet for indexing query result XML (janhoy)
+* SOLR-3097, SOLR-3105: Add analysis configurations for different languages to
+ the example. (Christian Moen, Robert Muir)
+
Build
----------------------
* SOLR-2487: Add build target to package war without slf4j jars (janhoy)
Modified: lucene/dev/trunk/solr/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/build.xml?rev=1241878&r1=1241877&r2=1241878&view=diff
==============================================================================
--- lucene/dev/trunk/solr/build.xml (original)
+++ lucene/dev/trunk/solr/build.xml Wed Feb 8 12:07:52 2012
@@ -625,4 +625,98 @@
<arg value="update"/>
</exec>
</target>
+
+ <property name="analysis-common.res.dir" value="../modules/analysis/common/src/resources/org/apache/lucene/analysis"/>
+ <property name="analysis-kuromoji.res.dir" value="../modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis"/>
+ <property name="analysis.conf.dest" value="${example}/solr/conf/lang"/>
+
+ <target name="sync-analyzers"
+ description="Committers' Helper: synchronizes analysis resources (e.g. stoplists) to the example">
+ <!-- arabic -->
+ <copy verbose="true" file="${analysis-common.res.dir}/ar/stopwords.txt"
+ tofile="${analysis.conf.dest}/stopwords_ar.txt"/>
+ <!-- bulgarian -->
+ <copy verbose="true" file="${analysis-common.res.dir}/bg/stopwords.txt"
+ tofile="${analysis.conf.dest}/stopwords_bg.txt"/>
+ <!-- catalan -->
+ <copy verbose="true" file="${analysis-common.res.dir}/ca/stopwords.txt"
+ tofile="${analysis.conf.dest}/stopwords_ca.txt"/>
+ <!-- czech -->
+ <copy verbose="true" file="${analysis-common.res.dir}/cz/stopwords.txt"
+ tofile="${analysis.conf.dest}/stopwords_cz.txt"/>
+ <!-- danish -->
+ <copy verbose="true" file="${analysis-common.res.dir}/snowball/danish_stop.txt"
+ tofile="${analysis.conf.dest}/stopwords_da.txt"/>
+ <!-- german -->
+ <copy verbose="true" file="${analysis-common.res.dir}/snowball/german_stop.txt"
+ tofile="${analysis.conf.dest}/stopwords_de.txt"/>
+ <!-- greek -->
+ <copy verbose="true" file="${analysis-common.res.dir}/el/stopwords.txt"
+ tofile="${analysis.conf.dest}/stopwords_el.txt"/>
+ <!-- spanish -->
+ <copy verbose="true" file="${analysis-common.res.dir}/snowball/spanish_stop.txt"
+ tofile="${analysis.conf.dest}/stopwords_es.txt"/>
+ <!-- basque -->
+ <copy verbose="true" file="${analysis-common.res.dir}/eu/stopwords.txt"
+ tofile="${analysis.conf.dest}/stopwords_eu.txt"/>
+ <!-- persian -->
+ <copy verbose="true" file="${analysis-common.res.dir}/fa/stopwords.txt"
+ tofile="${analysis.conf.dest}/stopwords_fa.txt"/>
+ <!-- finnish -->
+ <copy verbose="true" file="${analysis-common.res.dir}/snowball/finnish_stop.txt"
+ tofile="${analysis.conf.dest}/stopwords_fi.txt"/>
+ <!-- french -->
+ <copy verbose="true" file="${analysis-common.res.dir}/snowball/french_stop.txt"
+ tofile="${analysis.conf.dest}/stopwords_fr.txt"/>
+ <!-- galician -->
+ <copy verbose="true" file="${analysis-common.res.dir}/gl/stopwords.txt"
+ tofile="${analysis.conf.dest}/stopwords_gl.txt"/>
+ <!-- hindi -->
+ <copy verbose="true" file="${analysis-common.res.dir}/hi/stopwords.txt"
+ tofile="${analysis.conf.dest}/stopwords_hi.txt"/>
+ <!-- hungarian -->
+ <copy verbose="true" file="${analysis-common.res.dir}/snowball/hungarian_stop.txt"
+ tofile="${analysis.conf.dest}/stopwords_hu.txt"/>
+ <!-- armenian -->
+ <copy verbose="true" file="${analysis-common.res.dir}/hy/stopwords.txt"
+ tofile="${analysis.conf.dest}/stopwords_hy.txt"/>
+ <!-- indonesian -->
+ <copy verbose="true" file="${analysis-common.res.dir}/id/stopwords.txt"
+ tofile="${analysis.conf.dest}/stopwords_id.txt"/>
+ <!-- italian -->
+ <copy verbose="true" file="${analysis-common.res.dir}/snowball/italian_stop.txt"
+ tofile="${analysis.conf.dest}/stopwords_it.txt"/>
+ <!-- japanese -->
+ <copy verbose="true" file="${analysis-kuromoji.res.dir}/kuromoji/stopwords.txt"
+ tofile="${analysis.conf.dest}/stopwords_ja.txt"/>
+ <copy verbose="true" file="${analysis-kuromoji.res.dir}/kuromoji/stoptags.txt"
+ tofile="${analysis.conf.dest}/stoptags_ja.txt"/>
+ <!-- latvian -->
+ <copy verbose="true" file="${analysis-common.res.dir}/lv/stopwords.txt"
+ tofile="${analysis.conf.dest}/stopwords_lv.txt"/>
+ <!-- dutch -->
+ <copy verbose="true" file="${analysis-common.res.dir}/snowball/dutch_stop.txt"
+ tofile="${analysis.conf.dest}/stopwords_nl.txt"/>
+ <!-- norwegian -->
+ <copy verbose="true" file="${analysis-common.res.dir}/snowball/norwegian_stop.txt"
+ tofile="${analysis.conf.dest}/stopwords_no.txt"/>
+ <!-- portuguese -->
+ <copy verbose="true" file="${analysis-common.res.dir}/snowball/portuguese_stop.txt"
+ tofile="${analysis.conf.dest}/stopwords_pt.txt"/>
+ <!-- romanian -->
+ <copy verbose="true" file="${analysis-common.res.dir}/ro/stopwords.txt"
+ tofile="${analysis.conf.dest}/stopwords_ro.txt"/>
+ <!-- russian -->
+ <copy verbose="true" file="${analysis-common.res.dir}/snowball/russian_stop.txt"
+ tofile="${analysis.conf.dest}/stopwords_ru.txt"/>
+ <!-- swedish -->
+ <copy verbose="true" file="${analysis-common.res.dir}/snowball/swedish_stop.txt"
+ tofile="${analysis.conf.dest}/stopwords_sv.txt"/>
+ <!-- thai -->
+ <copy verbose="true" file="${analysis-common.res.dir}/th/stopwords.txt"
+ tofile="${analysis.conf.dest}/stopwords_th.txt"/>
+ <!-- turkish -->
+ <copy verbose="true" file="${analysis-common.res.dir}/tr/stopwords.txt"
+ tofile="${analysis.conf.dest}/stopwords_tr.txt"/>
+ </target>
</project>
Modified: lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/ElisionFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/ElisionFilterFactory.java?rev=1241878&r1=1241877&r2=1241878&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/ElisionFilterFactory.java (original)
+++ lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/ElisionFilterFactory.java Wed Feb 8 12:07:52 2012
@@ -34,7 +34,8 @@ import org.apache.lucene.analysis.TokenS
* <analyzer>
* <tokenizer class="solr.StandardTokenizerFactory"/>
* <filter class="solr.LowerCaseFilterFactory"/>
- * <filter class="solr.ElisionFilterFactory" articles="stopwordarticles.txt"/>
+ * <filter class="solr.ElisionFilterFactory"
+ * articles="stopwordarticles.txt" ignoreCase="true"/>
* </analyzer>
* </fieldType></pre>
*
@@ -45,10 +46,11 @@ public class ElisionFilterFactory extend
public void inform(ResourceLoader loader) {
String articlesFile = args.get("articles");
+ boolean ignoreCase = getBoolean("ignoreCase", false);
if (articlesFile != null) {
try {
- articles = getWordSet(loader, articlesFile, false);
+ articles = getWordSet(loader, articlesFile, ignoreCase);
} catch (IOException e) {
throw new RuntimeException(e);
}
Added: lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/GalicianMinimalStemFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/GalicianMinimalStemFilterFactory.java?rev=1241878&view=auto
==============================================================================
--- lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/GalicianMinimalStemFilterFactory.java (added)
+++ lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/GalicianMinimalStemFilterFactory.java Wed Feb 8 12:07:52 2012
@@ -0,0 +1,39 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.gl.GalicianMinimalStemFilter;
+
+/**
+ * Factory for {@link GalicianMinimalStemFilter}.
+ * <pre class="prettyprint" >
+ * <fieldType name="text_glplural" class="solr.TextField" positionIncrementGap="100">
+ * <analyzer>
+ * <tokenizer class="solr.StandardTokenizerFactory"/>
+ * <filter class="solr.LowerCaseFilterFactory"/>
+ * <filter class="solr.GalicianMinimalStemFilterFactory"/>
+ * </analyzer>
+ * </fieldType></pre>
+ *
+ */
+public class GalicianMinimalStemFilterFactory extends BaseTokenFilterFactory {
+ public TokenStream create(TokenStream input) {
+ return new GalicianMinimalStemFilter(input);
+ }
+}
Added: lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/GermanNormalizationFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/GermanNormalizationFilterFactory.java?rev=1241878&view=auto
==============================================================================
--- lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/GermanNormalizationFilterFactory.java (added)
+++ lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/GermanNormalizationFilterFactory.java Wed Feb 8 12:07:52 2012
@@ -0,0 +1,39 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.de.GermanNormalizationFilter;
+
+/**
+ * Factory for {@link GermanNormalizationFilter}.
+ * <pre class="prettyprint" >
+ * <fieldType name="text_denorm" class="solr.TextField" positionIncrementGap="100">
+ * <analyzer>
+ * <tokenizer class="solr.StandardTokenizerFactory"/>
+ * <filter class="solr.LowerCaseFilterFactory"/>
+ * <filter class="solr.GermanNormalizationFilterFactory"/>
+ * </analyzer>
+ * </fieldType></pre>
+ */
+public class GermanNormalizationFilterFactory extends BaseTokenFilterFactory {
+
+ public TokenStream create(TokenStream input) {
+ return new GermanNormalizationFilter(input);
+ }
+}
Modified: lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestElisionFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestElisionFilterFactory.java?rev=1241878&r1=1241877&r2=1241878&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestElisionFilterFactory.java (original)
+++ lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestElisionFilterFactory.java Wed Feb 8 12:07:52 2012
@@ -64,4 +64,22 @@ public class TestElisionFilterFactory ex
assertTokenStreamContents(stream, new String[] { "avion" });
}
+ /**
+ * Test setting ignoreCase=true
+ */
+ public void testCaseInsensitive() throws Exception {
+ Reader reader = new StringReader("L'avion");
+ Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ ElisionFilterFactory factory = new ElisionFilterFactory();
+ factory.init(DEFAULT_VERSION_PARAM);
+ ResourceLoader loader = new SolrResourceLoader(null, null);
+ Map<String,String> args = new HashMap<String,String>();
+ args.put("articles", "frenchArticles.txt");
+ args.put("ignoreCase", "true");
+ factory.init(args);
+ factory.inform(loader);
+ TokenStream stream = factory.create(tokenizer);
+ assertTokenStreamContents(stream, new String[] { "avion" });
+ }
+
}
Added: lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestGalicianMinimalStemFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestGalicianMinimalStemFilterFactory.java?rev=1241878&view=auto
==============================================================================
--- lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestGalicianMinimalStemFilterFactory.java (added)
+++ lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestGalicianMinimalStemFilterFactory.java Wed Feb 8 12:07:52 2012
@@ -0,0 +1,36 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * Simple tests to ensure the Galician plural stem factory is working.
+ */
+public class TestGalicianMinimalStemFilterFactory extends BaseTokenTestCase {
+ public void testStemming() throws Exception {
+ Reader reader = new StringReader("elefantes");
+ GalicianMinimalStemFilterFactory factory = new GalicianMinimalStemFilterFactory();
+ TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false));
+ assertTokenStreamContents(stream, new String[] { "elefante" });
+ }
+}
Added: lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestGermanNormalizationFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestGermanNormalizationFilterFactory.java?rev=1241878&view=auto
==============================================================================
--- lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestGermanNormalizationFilterFactory.java (added)
+++ lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestGermanNormalizationFilterFactory.java Wed Feb 8 12:07:52 2012
@@ -0,0 +1,36 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * Simple tests to ensure the German normalization factory is working.
+ */
+public class TestGermanNormalizationFilterFactory extends BaseTokenTestCase {
+ public void testStemming() throws Exception {
+ Reader reader = new StringReader("weißbier");
+ GermanNormalizationFilterFactory factory = new GermanNormalizationFilterFactory();
+ TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false));
+ assertTokenStreamContents(stream, new String[] { "weissbier" });
+ }
+}
Added: lucene/dev/trunk/solr/example/solr/conf/lang/contractions_ca.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/example/solr/conf/lang/contractions_ca.txt?rev=1241878&view=auto
==============================================================================
--- lucene/dev/trunk/solr/example/solr/conf/lang/contractions_ca.txt (added)
+++ lucene/dev/trunk/solr/example/solr/conf/lang/contractions_ca.txt Wed Feb 8 12:07:52 2012
@@ -0,0 +1,8 @@
+# Set of Catalan contractions for ElisionFilter
+# TODO: load this as a resource from the analyzer and sync it in build.xml
+d
+l
+m
+n
+s
+t
Added: lucene/dev/trunk/solr/example/solr/conf/lang/contractions_fr.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/example/solr/conf/lang/contractions_fr.txt?rev=1241878&view=auto
==============================================================================
--- lucene/dev/trunk/solr/example/solr/conf/lang/contractions_fr.txt (added)
+++ lucene/dev/trunk/solr/example/solr/conf/lang/contractions_fr.txt Wed Feb 8 12:07:52 2012
@@ -0,0 +1,9 @@
+# Set of French contractions for ElisionFilter
+# TODO: load this as a resource from the analyzer and sync it in build.xml
+l
+m
+t
+qu
+n
+s
+j
Added: lucene/dev/trunk/solr/example/solr/conf/lang/contractions_it.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/example/solr/conf/lang/contractions_it.txt?rev=1241878&view=auto
==============================================================================
--- lucene/dev/trunk/solr/example/solr/conf/lang/contractions_it.txt (added)
+++ lucene/dev/trunk/solr/example/solr/conf/lang/contractions_it.txt Wed Feb 8 12:07:52 2012
@@ -0,0 +1,23 @@
+# Set of Italian contractions for ElisionFilter
+# TODO: load this as a resource from the analyzer and sync it in build.xml
+c
+l
+all
+dall
+dell
+nell
+sull
+coll
+pell
+gl
+agl
+dagl
+degl
+negl
+sugl
+un
+m
+t
+s
+v
+d
Added: lucene/dev/trunk/solr/example/solr/conf/lang/stemdict_nl.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/example/solr/conf/lang/stemdict_nl.txt?rev=1241878&view=auto
==============================================================================
--- lucene/dev/trunk/solr/example/solr/conf/lang/stemdict_nl.txt (added)
+++ lucene/dev/trunk/solr/example/solr/conf/lang/stemdict_nl.txt Wed Feb 8 12:07:52 2012
@@ -0,0 +1,6 @@
+# Set of overrides for the dutch stemmer
+# TODO: load this as a resource from the analyzer and sync it in build.xml
+fiets fiets
+bromfiets bromfiets
+ei eier
+kind kinder