You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by rm...@apache.org on 2010/02/06 00:05:49 UTC
svn commit: r907125 [1/3] - in /lucene/java/trunk: ./ contrib/
contrib/analyzers/common/src/java/org/apache/lucene/analysis/da/
contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/
contrib/analyzers/common/src/java/org/apache/lucene/analysi...
Author: rmuir
Date: Fri Feb 5 23:05:46 2010
New Revision: 907125
URL: http://svn.apache.org/viewvc?rev=907125&view=rev
Log:
LUCENE-2055: better snowball integration, deprecate buggy handcoded snowball impls, restructure lang support
Added:
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/da/
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/da/DanishAnalyzer.java (with props)
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/da/package.html (with props)
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/en/
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/en/EnglishAnalyzer.java (with props)
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/en/package.html (with props)
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/es/
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java (with props)
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/es/package.html (with props)
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fi/
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fi/FinnishAnalyzer.java (with props)
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fi/package.html (with props)
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hu/
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hu/HungarianAnalyzer.java (with props)
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hu/package.html (with props)
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/it/
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java (with props)
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/it/package.html (with props)
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilter.java (with props)
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/no/
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java (with props)
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/no/package.html (with props)
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pt/
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java (with props)
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pt/package.html (with props)
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ro/
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ro/RomanianAnalyzer.java (with props)
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ro/package.html (with props)
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/sv/
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java (with props)
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/sv/package.html (with props)
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/tr/TurkishAnalyzer.java (with props)
lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/ro/
lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt (with props)
lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/tr/
lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/tr/stopwords.txt (with props)
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/da/
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/da/TestDanishAnalyzer.java (with props)
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanAnalyzer.java (with props)
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/en/
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/en/TestEnglishAnalyzer.java (with props)
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/es/
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/es/TestSpanishAnalyzer.java (with props)
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fi/
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fi/TestFinnishAnalyzer.java (with props)
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hu/
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hu/TestHungarianAnalyzer.java (with props)
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/it/
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/it/TestItalianAnalyzer.java (with props)
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestStemmerOverrideFilter.java (with props)
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/no/
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/no/TestNorwegianAnalyzer.java (with props)
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/pt/
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseAnalyzer.java (with props)
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ro/
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ro/TestRomanianAnalyzer.java (with props)
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/sv/
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/sv/TestSwedishAnalyzer.java (with props)
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/tr/TestTurkishAnalyzer.java (with props)
Modified:
lucene/java/trunk/NOTICE.txt
lucene/java/trunk/contrib/CHANGES.txt
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchStemmer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/EmptyTokenStream.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchStemmer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilter.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilter.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilter.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/position/PositionFilter.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemmer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/snowball/SnowballAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/tr/package.html
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianLetterTokenizer.java
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianStem.java
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/TestShingleMatrixFilter.java
Modified: lucene/java/trunk/NOTICE.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/NOTICE.txt?rev=907125&r1=907124&r2=907125&view=diff
==============================================================================
--- lucene/java/trunk/NOTICE.txt (original)
+++ lucene/java/trunk/NOTICE.txt Fri Feb 5 23:05:46 2010
@@ -23,6 +23,11 @@
contrib/analyzers/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt.
See http://members.unine.ch/jacques.savoy/clef/index.html.
+The Romanian analyzer (contrib/analyzers) comes with a default
+stopword list that is BSD-licensed created by Jacques Savoy. The file resides in
+contrib/analyzers/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt.
+See http://members.unine.ch/jacques.savoy/clef/index.html.
+
The Bulgarian analyzer (contrib/analyzers) comes with a default
stopword list that is BSD-licensed created by Jacques Savoy. The file resides in
contrib/analyzers/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt.
Modified: lucene/java/trunk/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/CHANGES.txt?rev=907125&r1=907124&r2=907125&view=diff
==============================================================================
--- lucene/java/trunk/contrib/CHANGES.txt (original)
+++ lucene/java/trunk/contrib/CHANGES.txt Fri Feb 5 23:05:46 2010
@@ -27,6 +27,10 @@
used with Version > 3.0 and the TurkishStemmer.
(Robert Muir via Simon Willnauer)
+ * LUCENE-2055: GermanAnalyzer now uses the Snowball German2 algorithm and
+ stopwords list by default for Version > 3.0.
+ (Robert Muir, Uwe Schindler, Simon Willnauer)
+
Bug fixes
* LUCENE-2199: ShingleFilter skipped over tri-gram shingles if outputUnigram
@@ -53,6 +57,13 @@
* LUCENE-2207, LUCENE-2219: Fix incorrect offset calculations in end() for
CJKTokenizer, ChineseTokenizer, SmartChinese SentenceTokenizer,
and WikipediaTokenizer. (Koji Sekiguchi, Robert Muir)
+
+ * LUCENE-2055: Deprecated RussianTokenizer, RussianStemmer, RussianStemFilter,
+ FrenchStemmer, FrenchStemFilter, DutchStemmer, and DutchStemFilter. For
+ these Analyzers, SnowballFilter is used instead (for Version > 3.0), as
+ the previous code did not always implement the Snowball algorithm correctly.
+ Additionally, for Version > 3.0, the Snowball stopword lists are used by
+ default. (Robert Muir, Uwe Schindler, Simon Willnauer)
API Changes
@@ -68,6 +79,12 @@
* LUCENE-2204: Change some package private classes/members to publicly accessible to implement
custom FragmentsBuilders. (Koji Sekiguchi)
+
+ * LUCENE-2055: Integrate snowball into contrib/analyzers. SnowballAnalyzer is
+ now deprecated in favor of language-specific analyzers which contain things
+ such as stopword lists and any language-specific processing in addition to
+ stemming. Add Turkish and Romanian stopwords lists to support this.
+ (Robert Muir, Uwe Schindler, Simon Willnauer)
New features
@@ -105,6 +122,10 @@
* LUCENE-2234: Add a Hindi analyzer. (Robert Muir)
+ * LUCENE-2055: Add analyzers/misc/StemmerOverrideFilter. This filter provides
+ the ability to override any stemmer with a custom dictionary map.
+ (Robert Muir, Uwe Schindler, Simon Willnauer)
+
Build
* LUCENE-2124: Moved the JDK-based collation support from contrib/collation
Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/da/DanishAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/da/DanishAnalyzer.java?rev=907125&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/da/DanishAnalyzer.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/da/DanishAnalyzer.java Fri Feb 5 23:05:46 2010
@@ -0,0 +1,129 @@
+package org.apache.lucene.analysis.da;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
+import org.apache.lucene.analysis.snowball.SnowballFilter;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.util.Version;
+import org.tartarus.snowball.ext.DanishStemmer;
+
+/**
+ * {@link Analyzer} for Danish.
+ */
+public final class DanishAnalyzer extends StopwordAnalyzerBase {
+ private final Set<?> stemExclusionSet;
+
+ /** File containing default Danish stopwords. */
+ public final static String DEFAULT_STOPWORD_FILE = "danish_stop.txt";
+
+ /**
+ * Returns an unmodifiable instance of the default stop words set.
+ * @return default stop words set.
+ */
+ public static Set<?> getDefaultStopSet(){
+ return DefaultSetHolder.DEFAULT_STOP_SET;
+ }
+
+ /**
+ * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
+ * accesses the static final set the first time.
+ */
+ private static class DefaultSetHolder {
+ static final Set<?> DEFAULT_STOP_SET;
+
+ static {
+ try {
+ DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
+ DEFAULT_STOPWORD_FILE);
+ } catch (IOException ex) {
+ // default set should always be present as it is part of the
+ // distribution (JAR)
+ throw new RuntimeException("Unable to load default stopword set");
+ }
+ }
+ }
+
+ /**
+ * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
+ */
+ public DanishAnalyzer(Version matchVersion) {
+ this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+ }
+
+ /**
+ * Builds an analyzer with the given stop words.
+ *
+ * @param matchVersion lucene compatibility version
+ * @param stopwords a stopword set
+ */
+ public DanishAnalyzer(Version matchVersion, Set<?> stopwords) {
+ this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+ }
+
+ /**
+ * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
+ * provided this analyzer will add a {@link KeywordMarkerTokenFilter} before
+ * stemming.
+ *
+ * @param matchVersion lucene compatibility version
+ * @param stopwords a stopword set
+ * @param stemExclusionSet a set of terms not to be stemmed
+ */
+ public DanishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
+ super(matchVersion, stopwords);
+ this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
+ matchVersion, stemExclusionSet));
+ }
+
+ /**
+ * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
+ * {@link Reader}.
+ *
+ * @return A {@link TokenStreamComponents} built from an {@link StandardTokenizer}
+ * filtered with {@link StandardFilter}, {@link LowerCaseFilter},
+ * {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
+ * exclusion set is provided and {@link SnowballFilter}.
+ */
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+ TokenStream result = new StandardFilter(source);
+ result = new LowerCaseFilter(matchVersion, result);
+ result = new StopFilter(matchVersion, result, stopwords);
+ if(!stemExclusionSet.isEmpty())
+ result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
+ result = new SnowballFilter(result, new DanishStemmer());
+ return new TokenStreamComponents(source, result);
+ }
+}
Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/da/DanishAnalyzer.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/da/package.html
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/da/package.html?rev=907125&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/da/package.html (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/da/package.html Fri Feb 5 23:05:46 2010
@@ -0,0 +1,22 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html><head></head>
+<body>
+Analyzer for Danish.
+</body>
+</html>
Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/da/package.html
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java?rev=907125&r1=907124&r2=907125&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java Fri Feb 5 23:05:46 2010
@@ -36,10 +36,12 @@
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;
+import org.tartarus.snowball.ext.German2Stemmer;
/**
* {@link Analyzer} for German language.
@@ -51,6 +53,16 @@
* exclusion list is empty by default.
* </p>
*
+ * <a name="version"/>
+ * <p>You must specify the required {@link Version}
+ * compatibility when creating GermanAnalyzer:
+ * <ul>
+ * <li> As of 3.1, Snowball stemming is done with SnowballFilter, and
+ * Snowball stopwords are used by default.
+ * <li> As of 2.9, StopFilter preserves position
+ * increments
+ * </ul>
+ *
* <p><b>NOTE</b>: This class uses the same {@link Version}
* dependent settings as {@link StandardAnalyzer}.</p>
*/
@@ -60,7 +72,7 @@
* List of typical german stopwords.
* @deprecated use {@link #getDefaultStopSet()} instead
*/
- //TODO make this private in 3.1
+ //TODO make this private in 3.1, remove in 4.0
@Deprecated
public final static String[] GERMAN_STOP_WORDS = {
"einer", "eine", "eines", "einem", "einen",
@@ -77,6 +89,9 @@
"durch", "wegen", "wird"
};
+ /** File containing default German stopwords. */
+ public final static String DEFAULT_STOPWORD_FILE = "german_stop.txt";
+
/**
* Returns a set of default German-stopwords
* @return a set of default German-stopwords
@@ -86,8 +101,21 @@
}
private static class DefaultSetHolder {
- private static final Set<?> DEFAULT_SET = CharArraySet.unmodifiableSet(new CharArraySet(
+ /** @deprecated remove in Lucene 4.0 */
+ @Deprecated
+ private static final Set<?> DEFAULT_SET_30 = CharArraySet.unmodifiableSet(new CharArraySet(
Version.LUCENE_CURRENT, Arrays.asList(GERMAN_STOP_WORDS), false));
+ private static final Set<?> DEFAULT_SET;
+ static {
+ try {
+ DEFAULT_SET =
+ WordlistLoader.getSnowballWordSet(SnowballFilter.class, DEFAULT_STOPWORD_FILE);
+ } catch (IOException ex) {
+ // default set should always be present as it is part of the
+ // distribution (JAR)
+ throw new RuntimeException("Unable to load default stopword set");
+ }
+ }
}
/**
@@ -105,7 +133,9 @@
* {@link #getDefaultStopSet()}.
*/
public GermanAnalyzer(Version matchVersion) {
- this(matchVersion, DefaultSetHolder.DEFAULT_SET);
+ this(matchVersion,
+ matchVersion.onOrAfter(Version.LUCENE_31) ? DefaultSetHolder.DEFAULT_SET
+ : DefaultSetHolder.DEFAULT_SET_30);
}
/**
@@ -199,8 +229,9 @@
*
* @return {@link TokenStreamComponents} built from a
* {@link StandardTokenizer} filtered with {@link StandardFilter},
- * {@link LowerCaseFilter}, {@link StopFilter}, and
- * {@link GermanStemFilter}
+ * {@link LowerCaseFilter}, {@link StopFilter},
+ * {@link KeywordMarkerTokenFilter} if a stem exclusion set is provided, and
+ * {@link SnowballFilter}
*/
@Override
protected TokenStreamComponents createComponents(String fieldName,
@@ -210,6 +241,10 @@
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter( matchVersion, result, stopwords);
result = new KeywordMarkerTokenFilter(result, exclusionSet);
- return new TokenStreamComponents(source, new GermanStemFilter(result));
+ if (matchVersion.onOrAfter(Version.LUCENE_31))
+ result = new SnowballFilter(result, new German2Stemmer());
+ else
+ result = new GermanStemFilter(result);
+ return new TokenStreamComponents(source, result);
}
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java?rev=907125&r1=907124&r2=907125&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java Fri Feb 5 23:05:46 2010
@@ -24,6 +24,7 @@
import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
import org.apache.lucene.util.Version;
@@ -41,6 +42,15 @@
* A default set of stopwords is used unless an alternative list is specified.
* </p>
*
+ * <a name="version"/>
+ * <p>You must specify the required {@link Version}
+ * compatibility when creating GreekAnalyzer:
+ * <ul>
+ * <li> As of 3.1, StandardFilter is used by default.
+ * <li> As of 2.9, StopFilter preserves position
+ * increments
+ * </ul>
+ *
* <p><b>NOTE</b>: This class uses the same {@link Version}
* dependent settings as {@link StandardAnalyzer}.</p>
*/
@@ -117,13 +127,15 @@
*
* @return {@link TokenStreamComponents} built from a
* {@link StandardTokenizer} filtered with
- * {@link GreekLowerCaseFilter} and {@link StopFilter}
+ * {@link GreekLowerCaseFilter}, {@link StandardFilter} and {@link StopFilter}
*/
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
- final TokenStream result = new GreekLowerCaseFilter(source);
+ TokenStream result = new GreekLowerCaseFilter(source);
+ if (matchVersion.onOrAfter(Version.LUCENE_31))
+ result = new StandardFilter(result);
return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords));
}
}
Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/en/EnglishAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/en/EnglishAnalyzer.java?rev=907125&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/en/EnglishAnalyzer.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/en/EnglishAnalyzer.java Fri Feb 5 23:05:46 2010
@@ -0,0 +1,113 @@
+package org.apache.lucene.analysis.en;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.PorterStemFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.util.Version;
+
+/**
+ * {@link Analyzer} for English.
+ */
+public final class EnglishAnalyzer extends StopwordAnalyzerBase {
+ private final Set<?> stemExclusionSet;
+
+ /**
+ * Returns an unmodifiable instance of the default stop words set.
+ * @return default stop words set.
+ */
+ public static Set<?> getDefaultStopSet(){
+ return DefaultSetHolder.DEFAULT_STOP_SET;
+ }
+
+ /**
+ * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
+ * accesses the static final set the first time.
+ */
+ private static class DefaultSetHolder {
+ static final Set<?> DEFAULT_STOP_SET = StandardAnalyzer.STOP_WORDS_SET;
+ }
+
+ /**
+ * Builds an analyzer with the default stop words: {@link #getDefaultStopSet}.
+ */
+ public EnglishAnalyzer(Version matchVersion) {
+ this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+ }
+
+ /**
+ * Builds an analyzer with the given stop words.
+ *
+ * @param matchVersion lucene compatibility version
+ * @param stopwords a stopword set
+ */
+ public EnglishAnalyzer(Version matchVersion, Set<?> stopwords) {
+ this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+ }
+
+ /**
+ * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
+ * provided this analyzer will add a {@link KeywordMarkerTokenFilter} before
+ * stemming.
+ *
+ * @param matchVersion lucene compatibility version
+ * @param stopwords a stopword set
+ * @param stemExclusionSet a set of terms not to be stemmed
+ */
+ public EnglishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
+ super(matchVersion, stopwords);
+ this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
+ matchVersion, stemExclusionSet));
+ }
+
+ /**
+ * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
+ * {@link Reader}.
+ *
+ * @return A {@link TokenStreamComponents} built from an {@link StandardTokenizer}
+ * filtered with {@link StandardFilter}, {@link LowerCaseFilter},
+ * {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
+ * exclusion set is provided and {@link PorterStemFilter}.
+ */
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+ TokenStream result = new StandardFilter(source);
+ result = new LowerCaseFilter(matchVersion, result);
+ result = new StopFilter(matchVersion, result, stopwords);
+ if(!stemExclusionSet.isEmpty())
+ result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
+ result = new PorterStemFilter(result);
+ return new TokenStreamComponents(source, result);
+ }
+}
Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/en/EnglishAnalyzer.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/en/package.html
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/en/package.html?rev=907125&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/en/package.html (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/en/package.html Fri Feb 5 23:05:46 2010
@@ -0,0 +1,22 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html><head></head>
+<body>
+Analyzer for English.
+</body>
+</html>
Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/en/package.html
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java?rev=907125&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java Fri Feb 5 23:05:46 2010
@@ -0,0 +1,129 @@
+package org.apache.lucene.analysis.es;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
+import org.apache.lucene.analysis.snowball.SnowballFilter;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.util.Version;
+import org.tartarus.snowball.ext.SpanishStemmer;
+
+/**
+ * {@link Analyzer} for Spanish.
+ */
+public final class SpanishAnalyzer extends StopwordAnalyzerBase {
+ private final Set<?> stemExclusionSet;
+
+ /** File containing default Spanish stopwords. */
+ public final static String DEFAULT_STOPWORD_FILE = "spanish_stop.txt";
+
+ /**
+ * Returns an unmodifiable instance of the default stop words set.
+ * @return default stop words set.
+ */
+ public static Set<?> getDefaultStopSet(){
+ return DefaultSetHolder.DEFAULT_STOP_SET;
+ }
+
+ /**
+ * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
+ * accesses the static final set the first time.
+ */
+ private static class DefaultSetHolder {
+ static final Set<?> DEFAULT_STOP_SET;
+
+ static {
+ try {
+ DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
+ DEFAULT_STOPWORD_FILE);
+ } catch (IOException ex) {
+ // default set should always be present as it is part of the
+ // distribution (JAR); chain the cause so a broken packaging is diagnosable
+ throw new RuntimeException("Unable to load default stopword set", ex);
+ }
+ }
+ }
+
+ /**
+ * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
+ */
+ public SpanishAnalyzer(Version matchVersion) {
+ this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+ }
+
+ /**
+ * Builds an analyzer with the given stop words.
+ *
+ * @param matchVersion lucene compatibility version
+ * @param stopwords a stopword set
+ */
+ public SpanishAnalyzer(Version matchVersion, Set<?> stopwords) {
+ this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+ }
+
+ /**
+ * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
+ * provided this analyzer will add a {@link KeywordMarkerTokenFilter} before
+ * stemming.
+ *
+ * @param matchVersion lucene compatibility version
+ * @param stopwords a stopword set
+ * @param stemExclusionSet a set of terms not to be stemmed
+ */
+ public SpanishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
+ super(matchVersion, stopwords);
+ this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
+ matchVersion, stemExclusionSet));
+ }
+
+ /**
+ * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
+ * {@link Reader}.
+ *
+ * @return A {@link TokenStreamComponents} built from a {@link StandardTokenizer}
+ * filtered with {@link StandardFilter}, {@link LowerCaseFilter},
+ * {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
+ * exclusion set is provided and {@link SnowballFilter}.
+ */
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+ TokenStream result = new StandardFilter(source);
+ result = new LowerCaseFilter(matchVersion, result);
+ result = new StopFilter(matchVersion, result, stopwords);
+ if(!stemExclusionSet.isEmpty())
+ result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
+ result = new SnowballFilter(result, new SpanishStemmer());
+ return new TokenStreamComponents(source, result);
+ }
+}
Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/es/package.html
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/es/package.html?rev=907125&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/es/package.html (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/es/package.html Fri Feb 5 23:05:46 2010
@@ -0,0 +1,22 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html><head></head>
+<body>
+Analyzer for Spanish.
+</body>
+</html>
Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/es/package.html
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fi/FinnishAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fi/FinnishAnalyzer.java?rev=907125&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fi/FinnishAnalyzer.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fi/FinnishAnalyzer.java Fri Feb 5 23:05:46 2010
@@ -0,0 +1,129 @@
+package org.apache.lucene.analysis.fi;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
+import org.apache.lucene.analysis.snowball.SnowballFilter;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.util.Version;
+import org.tartarus.snowball.ext.FinnishStemmer;
+
+/**
+ * {@link Analyzer} for Finnish.
+ */
+public final class FinnishAnalyzer extends StopwordAnalyzerBase {
+ private final Set<?> stemExclusionSet;
+
+ /** File containing default Finnish stopwords. */
+ public final static String DEFAULT_STOPWORD_FILE = "finnish_stop.txt";
+
+ /**
+ * Returns an unmodifiable instance of the default stop words set.
+ * @return default stop words set.
+ */
+ public static Set<?> getDefaultStopSet(){
+ return DefaultSetHolder.DEFAULT_STOP_SET;
+ }
+
+ /**
+ * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
+ * accesses the static final set the first time.
+ */
+ private static class DefaultSetHolder {
+ static final Set<?> DEFAULT_STOP_SET;
+
+ static {
+ try {
+ DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
+ DEFAULT_STOPWORD_FILE);
+ } catch (IOException ex) {
+ // default set should always be present as it is part of the
+ // distribution (JAR); chain the cause so a broken packaging is diagnosable
+ throw new RuntimeException("Unable to load default stopword set", ex);
+ }
+ }
+ }
+
+ /**
+ * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
+ */
+ public FinnishAnalyzer(Version matchVersion) {
+ this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+ }
+
+ /**
+ * Builds an analyzer with the given stop words.
+ *
+ * @param matchVersion lucene compatibility version
+ * @param stopwords a stopword set
+ */
+ public FinnishAnalyzer(Version matchVersion, Set<?> stopwords) {
+ this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+ }
+
+ /**
+ * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
+ * provided this analyzer will add a {@link KeywordMarkerTokenFilter} before
+ * stemming.
+ *
+ * @param matchVersion lucene compatibility version
+ * @param stopwords a stopword set
+ * @param stemExclusionSet a set of terms not to be stemmed
+ */
+ public FinnishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
+ super(matchVersion, stopwords);
+ this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
+ matchVersion, stemExclusionSet));
+ }
+
+ /**
+ * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
+ * {@link Reader}.
+ *
+ * @return A {@link TokenStreamComponents} built from a {@link StandardTokenizer}
+ * filtered with {@link StandardFilter}, {@link LowerCaseFilter},
+ * {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
+ * exclusion set is provided and {@link SnowballFilter}.
+ */
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+ TokenStream result = new StandardFilter(source);
+ result = new LowerCaseFilter(matchVersion, result);
+ result = new StopFilter(matchVersion, result, stopwords);
+ if(!stemExclusionSet.isEmpty())
+ result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
+ result = new SnowballFilter(result, new FinnishStemmer());
+ return new TokenStreamComponents(source, result);
+ }
+}
Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fi/FinnishAnalyzer.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fi/package.html
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fi/package.html?rev=907125&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fi/package.html (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fi/package.html Fri Feb 5 23:05:46 2010
@@ -0,0 +1,22 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html><head></head>
+<body>
+Analyzer for Finnish.
+</body>
+</html>
Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fi/package.html
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java?rev=907125&r1=907124&r2=907125&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java Fri Feb 5 23:05:46 2010
@@ -68,7 +68,7 @@
/**
* Constructs an elision filter with standard stop words
*/
- protected ElisionFilter(Version matchVersion, TokenStream input) {
+ public ElisionFilter(Version matchVersion, TokenStream input) {
this(matchVersion, input, DEFAULT_ARTICLES);
}
@@ -77,7 +77,7 @@
* @deprecated use {@link #ElisionFilter(Version, TokenStream)} instead
*/
@Deprecated
- protected ElisionFilter(TokenStream input) {
+ public ElisionFilter(TokenStream input) {
this(Version.LUCENE_30, input);
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java?rev=907125&r1=907124&r2=907125&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java Fri Feb 5 23:05:46 2010
@@ -27,6 +27,7 @@
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
@@ -55,6 +56,9 @@
* <p>You must specify the required {@link Version}
* compatibility when creating FrenchAnalyzer:
* <ul>
+ * <li> As of 3.1, Snowball stemming is done with SnowballFilter,
+ * LowerCaseFilter is used prior to StopFilter, and ElisionFilter and
+ * Snowball stopwords are used by default.
* <li> As of 2.9, StopFilter preserves position
* increments
* </ul>
@@ -68,7 +72,7 @@
* Extended list of typical French stopwords.
* @deprecated use {@link #getDefaultStopSet()} instead
*/
- // TODO make this private in 3.1
+ // TODO make this private in 3.1, remove in 4.0
@Deprecated
public final static String[] FRENCH_STOP_WORDS = {
"a", "afin", "ai", "ainsi", "après", "attendu", "au", "aujourd", "auquel", "aussi",
@@ -95,6 +99,9 @@
"été", "être", "ô"
};
+ /** File containing default French stopwords. */
+ public final static String DEFAULT_STOPWORD_FILE = "french_stop.txt";
+
/**
* Contains words that should be indexed but not stemmed.
*/
@@ -110,16 +117,31 @@
}
private static class DefaultSetHolder {
- static final Set<?> DEFAULT_STOP_SET = CharArraySet
+ /** @deprecated remove this in Lucene 4.0 */
+ @Deprecated
+ static final Set<?> DEFAULT_STOP_SET_30 = CharArraySet
.unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList(FRENCH_STOP_WORDS),
false));
+ static final Set<?> DEFAULT_STOP_SET;
+ static {
+ try {
+ DEFAULT_STOP_SET =
+ WordlistLoader.getSnowballWordSet(SnowballFilter.class, DEFAULT_STOPWORD_FILE);
+ } catch (IOException ex) {
+ // default set should always be present as it is part of the
+ // distribution (JAR)
+ throw new RuntimeException("Unable to load default stopword set");
+ }
+ }
}
/**
- * Builds an analyzer with the default stop words ({@link #FRENCH_STOP_WORDS}).
+ * Builds an analyzer with the default stop words ({@link #getDefaultStopSet}).
*/
public FrenchAnalyzer(Version matchVersion) {
- this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+ this(matchVersion,
+ matchVersion.onOrAfter(Version.LUCENE_31) ? DefaultSetHolder.DEFAULT_STOP_SET
+ : DefaultSetHolder.DEFAULT_STOP_SET_30);
}
/**
@@ -207,20 +229,34 @@
* {@link Reader}.
*
* @return {@link TokenStreamComponents} built from a {@link StandardTokenizer}
- * filtered with {@link StandardFilter}, {@link StopFilter},
- * {@link FrenchStemFilter} and {@link LowerCaseFilter}
+ * filtered with {@link StandardFilter}, {@link ElisionFilter},
+ * {@link LowerCaseFilter}, {@link StopFilter},
+ * {@link KeywordMarkerTokenFilter} if a stem exclusion set is provided,
+ * and {@link SnowballFilter}
*/
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
- final Tokenizer source = new StandardTokenizer(matchVersion, reader);
- TokenStream result = new StandardFilter(source);
- result = new StopFilter(matchVersion, result, stopwords);
- if(!excltable.isEmpty())
- result = new KeywordMarkerTokenFilter(result, excltable);
- result = new FrenchStemFilter(result);
- // Convert to lowercase after stemming!
- return new TokenStreamComponents(source, new LowerCaseFilter(matchVersion, result));
+ if (matchVersion.onOrAfter(Version.LUCENE_31)) {
+ final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+ TokenStream result = new StandardFilter(source);
+ result = new ElisionFilter(matchVersion, result);
+ result = new LowerCaseFilter(matchVersion, result);
+ result = new StopFilter(matchVersion, result, stopwords);
+ if(!excltable.isEmpty())
+ result = new KeywordMarkerTokenFilter(result, excltable);
+ result = new SnowballFilter(result, new org.tartarus.snowball.ext.FrenchStemmer());
+ return new TokenStreamComponents(source, result);
+ } else {
+ final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+ TokenStream result = new StandardFilter(source);
+ result = new StopFilter(matchVersion, result, stopwords);
+ if(!excltable.isEmpty())
+ result = new KeywordMarkerTokenFilter(result, excltable);
+ result = new FrenchStemFilter(result);
+ // Convert to lowercase after stemming!
+ return new TokenStreamComponents(source, new LowerCaseFilter(matchVersion, result));
+ }
}
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java?rev=907125&r1=907124&r2=907125&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java Fri Feb 5 23:05:46 2010
@@ -20,6 +20,7 @@
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;// for javadoc
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
@@ -40,7 +41,11 @@
* the {@link KeywordAttribute} before this {@link TokenStream}.
* </p>
* @see KeywordMarkerTokenFilter
+ * @deprecated Use {@link SnowballFilter} with
+ * {@link org.tartarus.snowball.ext.FrenchStemmer} instead, which has the
+ * same functionality. This filter will be removed in Lucene 4.0
*/
+@Deprecated
public final class FrenchStemFilter extends TokenFilter {
/**
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchStemmer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchStemmer.java?rev=907125&r1=907124&r2=907125&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchStemmer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchStemmer.java Fri Feb 5 23:05:46 2010
@@ -25,8 +25,10 @@
* refer to http://snowball.sourceforge.net/french/stemmer.html<br>
* (French stemming algorithm) for details
* </p>
+ * @deprecated Use {@link org.tartarus.snowball.ext.FrenchStemmer} instead,
+ * which has the same functionality. This filter will be removed in Lucene 4.0
*/
-
+@Deprecated
public class FrenchStemmer {
/**
Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hu/HungarianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hu/HungarianAnalyzer.java?rev=907125&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hu/HungarianAnalyzer.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hu/HungarianAnalyzer.java Fri Feb 5 23:05:46 2010
@@ -0,0 +1,129 @@
+package org.apache.lucene.analysis.hu;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
+import org.apache.lucene.analysis.snowball.SnowballFilter;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.util.Version;
+import org.tartarus.snowball.ext.HungarianStemmer;
+
+/**
+ * {@link Analyzer} for Hungarian.
+ */
+public final class HungarianAnalyzer extends StopwordAnalyzerBase {
+ private final Set<?> stemExclusionSet;
+
+ /** File containing default Hungarian stopwords. */
+ public final static String DEFAULT_STOPWORD_FILE = "hungarian_stop.txt";
+
+ /**
+ * Returns an unmodifiable instance of the default stop words set.
+ * @return default stop words set.
+ */
+ public static Set<?> getDefaultStopSet(){
+ return DefaultSetHolder.DEFAULT_STOP_SET;
+ }
+
+ /**
+ * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
+ * accesses the static final set the first time.
+ */
+ private static class DefaultSetHolder {
+ static final Set<?> DEFAULT_STOP_SET;
+
+ static {
+ try {
+ DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
+ DEFAULT_STOPWORD_FILE);
+ } catch (IOException ex) {
+ // default set should always be present as it is part of the
+ // distribution (JAR); chain the cause so a broken packaging is diagnosable
+ throw new RuntimeException("Unable to load default stopword set", ex);
+ }
+ }
+ }
+
+ /**
+ * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
+ */
+ public HungarianAnalyzer(Version matchVersion) {
+ this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+ }
+
+ /**
+ * Builds an analyzer with the given stop words.
+ *
+ * @param matchVersion lucene compatibility version
+ * @param stopwords a stopword set
+ */
+ public HungarianAnalyzer(Version matchVersion, Set<?> stopwords) {
+ this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+ }
+
+ /**
+ * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
+ * provided this analyzer will add a {@link KeywordMarkerTokenFilter} before
+ * stemming.
+ *
+ * @param matchVersion lucene compatibility version
+ * @param stopwords a stopword set
+ * @param stemExclusionSet a set of terms not to be stemmed
+ */
+ public HungarianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
+ super(matchVersion, stopwords);
+ this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
+ matchVersion, stemExclusionSet));
+ }
+
+ /**
+ * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
+ * {@link Reader}.
+ *
+ * @return A {@link TokenStreamComponents} built from a {@link StandardTokenizer}
+ * filtered with {@link StandardFilter}, {@link LowerCaseFilter},
+ * {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
+ * exclusion set is provided and {@link SnowballFilter}.
+ */
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+ TokenStream result = new StandardFilter(source);
+ result = new LowerCaseFilter(matchVersion, result);
+ result = new StopFilter(matchVersion, result, stopwords);
+ if(!stemExclusionSet.isEmpty())
+ result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
+ result = new SnowballFilter(result, new HungarianStemmer());
+ return new TokenStreamComponents(source, result);
+ }
+}
Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hu/HungarianAnalyzer.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hu/package.html
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hu/package.html?rev=907125&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hu/package.html (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hu/package.html Fri Feb 5 23:05:46 2010
@@ -0,0 +1,22 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html><head></head>
+<body>
+Analyzer for Hungarian.
+</body>
+</html>
Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hu/package.html
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java?rev=907125&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java Fri Feb 5 23:05:46 2010
@@ -0,0 +1,129 @@
+package org.apache.lucene.analysis.it;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
+import org.apache.lucene.analysis.snowball.SnowballFilter;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.util.Version;
+import org.tartarus.snowball.ext.ItalianStemmer;
+
+/**
+ * {@link Analyzer} for Italian.
+ */
+public final class ItalianAnalyzer extends StopwordAnalyzerBase {
+ private final Set<?> stemExclusionSet;
+
+ /** File containing default Italian stopwords. */
+ public final static String DEFAULT_STOPWORD_FILE = "italian_stop.txt";
+
+ /**
+ * Returns an unmodifiable instance of the default stop words set.
+ * @return default stop words set.
+ */
+ public static Set<?> getDefaultStopSet(){
+ return DefaultSetHolder.DEFAULT_STOP_SET;
+ }
+
+ /**
+ * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
+ * accesses the static final set the first time.
+ */
+ private static class DefaultSetHolder {
+ static final Set<?> DEFAULT_STOP_SET;
+
+ static {
+ try {
+ DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
+ DEFAULT_STOPWORD_FILE);
+ } catch (IOException ex) {
+ // default set should always be present as it is part of the
+ // distribution (JAR)
+ throw new RuntimeException("Unable to load default stopword set");
+ }
+ }
+ }
+
+ /**
+ * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
+ */
+ public ItalianAnalyzer(Version matchVersion) {
+ this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+ }
+
+ /**
+ * Builds an analyzer with the given stop words.
+ *
+ * @param matchVersion lucene compatibility version
+ * @param stopwords a stopword set
+ */
+ public ItalianAnalyzer(Version matchVersion, Set<?> stopwords) {
+ this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+ }
+
+ /**
+ * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
+ * provided this analyzer will add a {@link KeywordMarkerTokenFilter} before
+ * stemming.
+ *
+ * @param matchVersion lucene compatibility version
+ * @param stopwords a stopword set
+ * @param stemExclusionSet a set of terms not to be stemmed
+ */
+ public ItalianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
+ super(matchVersion, stopwords);
+ this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
+ matchVersion, stemExclusionSet));
+ }
+
+ /**
+ * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
+ * {@link Reader}.
+ *
+ * @return A {@link TokenStreamComponents} built from an {@link StandardTokenizer}
+ * filtered with {@link StandardFilter}, {@link LowerCaseFilter},
+ * {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
+ * exclusion set is provided and {@link SnowballFilter}.
+ */
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+ TokenStream result = new StandardFilter(source);
+ result = new LowerCaseFilter(matchVersion, result);
+ result = new StopFilter(matchVersion, result, stopwords);
+ if(!stemExclusionSet.isEmpty())
+ result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
+ result = new SnowballFilter(result, new ItalianStemmer());
+ return new TokenStreamComponents(source, result);
+ }
+}
Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/it/package.html
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/it/package.html?rev=907125&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/it/package.html (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/it/package.html Fri Feb 5 23:05:46 2010
@@ -0,0 +1,22 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html><head></head>
+<body>
+Analyzer for Italian.
+</body>
+</html>
Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/it/package.html
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/EmptyTokenStream.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/EmptyTokenStream.java?rev=907125&r1=907124&r2=907125&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/EmptyTokenStream.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/EmptyTokenStream.java Fri Feb 5 23:05:46 2010
@@ -18,7 +18,6 @@
*/
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Token;
import java.io.IOException;
Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilter.java?rev=907125&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilter.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilter.java Fri Feb 5 23:05:46 2010
@@ -0,0 +1,70 @@
+package org.apache.lucene.analysis.miscellaneous;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Map;
+
+import org.apache.lucene.analysis.CharArrayMap;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.util.Version;
+
+/**
+ * Provides the ability to override any {@link KeywordAttribute} aware stemmer
+ * with custom dictionary-based stemming.
+ */
+public final class StemmerOverrideFilter extends TokenFilter {
+ private final CharArrayMap<String> dictionary;
+
+ private final TermAttribute termAtt = addAttribute(TermAttribute.class);
+ private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
+
+ /**
+ * Create a new StemmerOverrideFilter, performing dictionary-based stemming
+ * with the provided <code>dictionary</code>.
+ * <p>
+ * Any dictionary-stemmed terms will be marked with {@link KeywordAttribute}
+ * so that they will not be stemmed with stemmers down the chain.
+ * </p>
+ */
+ public StemmerOverrideFilter(Version matchVersion, TokenStream input,
+ Map<?,String> dictionary) {
+ super(input);
+ this.dictionary = dictionary instanceof CharArrayMap ?
+ (CharArrayMap<String>) dictionary : CharArrayMap.copy(matchVersion, dictionary);
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ if (!keywordAtt.isKeyword()) { // don't muck with already-keyworded terms
+ String stem = dictionary.get(termAtt.termBuffer(), 0, termAtt.termLength());
+ if (stem != null) {
+ termAtt.setTermBuffer(stem);
+ keywordAtt.setKeyword(true);
+ }
+ }
+ return true;
+ } else {
+ return false;
+ }
+ }
+}
Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilter.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java?rev=907125&r1=907124&r2=907125&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java Fri Feb 5 23:05:46 2010
@@ -19,7 +19,6 @@
import java.io.IOException;
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java?rev=907125&r1=907124&r2=907125&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java Fri Feb 5 23:05:46 2010
@@ -17,7 +17,6 @@
* limitations under the License.
*/
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java?rev=907125&r1=907124&r2=907125&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java Fri Feb 5 23:05:46 2010
@@ -20,11 +20,14 @@
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
+import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.ReusableAnalyzerBase;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter;
+import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
@@ -33,7 +36,6 @@
import java.io.File;
import java.io.IOException;
import java.io.Reader;
-import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
@@ -51,6 +53,17 @@
* exclusion list is empty by default.
* </p>
*
+ * <a name="version"/>
+ * <p>You must specify the required {@link Version}
+ * compatibility when creating DutchAnalyzer:
+ * <ul>
+ * <li> As of 3.1, Snowball stemming is done with SnowballFilter,
+ * LowerCaseFilter is used prior to StopFilter, and Snowball
+ * stopwords are used by default.
+ * <li> As of 2.9, StopFilter preserves position
+ * increments
+ * </ul>
+ *
* <p><b>NOTE</b>: This class uses the same {@link Version}
* dependent settings as {@link StandardAnalyzer}.</p>
*/
@@ -60,19 +73,11 @@
* @deprecated use {@link #getDefaultStopSet()} instead
*/
@Deprecated
- public final static String[] DUTCH_STOP_WORDS =
- {
- "de", "en", "van", "ik", "te", "dat", "die", "in", "een",
- "hij", "het", "niet", "zijn", "is", "was", "op", "aan", "met", "als", "voor", "had",
- "er", "maar", "om", "hem", "dan", "zou", "of", "wat", "mijn", "men", "dit", "zo",
- "door", "over", "ze", "zich", "bij", "ook", "tot", "je", "mij", "uit", "der", "daar",
- "haar", "naar", "heb", "hoe", "heeft", "hebben", "deze", "u", "want", "nog", "zal",
- "me", "zij", "nu", "ge", "geen", "omdat", "iets", "worden", "toch", "al", "waren",
- "veel", "meer", "doen", "toen", "moet", "ben", "zonder", "kan", "hun", "dus",
- "alles", "onder", "ja", "eens", "hier", "wie", "werd", "altijd", "doch", "wordt",
- "wezen", "kunnen", "ons", "zelf", "tegen", "na", "reeds", "wil", "kon", "niets",
- "uw", "iemand", "geweest", "andere"
- };
+ public final static String[] DUTCH_STOP_WORDS = getDefaultStopSet().toArray(new String[0]);
+
+ /** File containing default Dutch stopwords. */
+ public final static String DEFAULT_STOPWORD_FILE = "dutch_stop.txt";
+
/**
* Returns an unmodifiable instance of the default stop-words set.
* @return an unmodifiable instance of the default stop-words set.
@@ -82,9 +87,18 @@
}
private static class DefaultSetHolder {
- static final Set<?> DEFAULT_STOP_SET = CharArraySet
- .unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT,
- Arrays.asList(DUTCH_STOP_WORDS), false));
+ static final Set<?> DEFAULT_STOP_SET;
+
+ static {
+ try {
+ DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
+ DEFAULT_STOPWORD_FILE);
+ } catch (IOException ex) {
+ // default set should always be present as it is part of the
+ // distribution (JAR)
+ throw new RuntimeException("Unable to load default stopword set");
+ }
+ }
}
@@ -223,18 +237,32 @@
* text in the provided {@link Reader}.
*
* @return A {@link TokenStream} built from a {@link StandardTokenizer}
- * filtered with {@link StandardFilter}, {@link StopFilter},
- * and {@link DutchStemFilter}
+ * filtered with {@link StandardFilter}, {@link LowerCaseFilter},
+ * {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem exclusion set is provided,
+ * {@link StemmerOverrideFilter}, and {@link SnowballFilter}
*/
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader aReader) {
- final Tokenizer source = new StandardTokenizer(matchVersion, aReader);
- TokenStream result = new StandardFilter(source);
- result = new StopFilter(matchVersion, result, stoptable);
- if (!excltable.isEmpty())
- result = new KeywordMarkerTokenFilter(result, excltable);
- result = new DutchStemFilter(result, stemdict);
- return new TokenStreamComponents(source, result);
+ if (matchVersion.onOrAfter(Version.LUCENE_31)) {
+ final Tokenizer source = new StandardTokenizer(matchVersion, aReader);
+ TokenStream result = new StandardFilter(source);
+ result = new LowerCaseFilter(matchVersion, result);
+ result = new StopFilter(matchVersion, result, stoptable);
+ if (!excltable.isEmpty())
+ result = new KeywordMarkerTokenFilter(result, excltable);
+ if (!stemdict.isEmpty())
+ result = new StemmerOverrideFilter(matchVersion, result, stemdict);
+ result = new SnowballFilter(result, new org.tartarus.snowball.ext.DutchStemmer());
+ return new TokenStreamComponents(source, result);
+ } else {
+ final Tokenizer source = new StandardTokenizer(matchVersion, aReader);
+ TokenStream result = new StandardFilter(source);
+ result = new StopFilter(matchVersion, result, stoptable);
+ if (!excltable.isEmpty())
+ result = new KeywordMarkerTokenFilter(result, excltable);
+ result = new DutchStemFilter(result, stemdict);
+ return new TokenStreamComponents(source, result);
+ }
}
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java?rev=907125&r1=907124&r2=907125&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java Fri Feb 5 23:05:46 2010
@@ -26,6 +26,7 @@
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;// for javadoc
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
@@ -42,7 +43,11 @@
* the {@link KeywordAttribute} before this {@link TokenStream}.
* </p>
* @see KeywordMarkerTokenFilter
+ * @deprecated Use {@link SnowballFilter} with
+ * {@link org.tartarus.snowball.ext.DutchStemmer} instead, which has the
+ * same functionality. This filter will be removed in Lucene 4.0
*/
+@Deprecated
public final class DutchStemFilter extends TokenFilter {
/**
* The actual token in the input stream.