You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by rm...@apache.org on 2010/02/06 00:05:49 UTC
svn commit: r907125 [2/3] - in /lucene/java/trunk: ./ contrib/
contrib/analyzers/common/src/java/org/apache/lucene/analysis/da/
contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/
contrib/analyzers/common/src/java/org/apache/lucene/analysi...
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchStemmer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchStemmer.java?rev=907125&r1=907124&r2=907125&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchStemmer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchStemmer.java Fri Feb 5 23:05:46 2010
@@ -26,8 +26,10 @@
* the <a href="http://snowball.tartarus.org/algorithms/dutch/stemmer.html">dutch stemming</a>
* algorithm in Martin Porter's snowball project.
* </p>
+ * @deprecated Use {@link org.tartarus.snowball.ext.DutchStemmer} instead,
+ * which has the same functionality. This filter will be removed in Lucene 4.0
*/
-
+@Deprecated
public class DutchStemmer {
/**
* Buffer for the terms while stemming them.
Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java?rev=907125&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java Fri Feb 5 23:05:46 2010
@@ -0,0 +1,130 @@
+package org.apache.lucene.analysis.no;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
+import org.apache.lucene.analysis.snowball.SnowballFilter;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.util.Version;
+import org.tartarus.snowball.ext.NorwegianStemmer;
+
+/**
+ * {@link Analyzer} for Norwegian.
+ */
+public final class NorwegianAnalyzer extends StopwordAnalyzerBase {
+ private final Set<?> stemExclusionSet;
+
+ /** File containing default Norwegian stopwords. */
+ public final static String DEFAULT_STOPWORD_FILE = "norwegian_stop.txt";
+
+ /**
+ * Returns an unmodifiable instance of the default stop words set.
+ * @return default stop words set.
+ */
+ public static Set<?> getDefaultStopSet(){
+ return DefaultSetHolder.DEFAULT_STOP_SET;
+ }
+
+ /**
+ * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
+ * accesses the static final set the first time.
+ */
+ private static class DefaultSetHolder {
+ static final Set<?> DEFAULT_STOP_SET;
+
+ static {
+ try {
+ DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
+ DEFAULT_STOPWORD_FILE);
+ } catch (IOException ex) {
+ // default set should always be present as it is part of the
+ // distribution (JAR)
+ throw new RuntimeException("Unable to load default stopword set");
+ }
+ }
+ }
+
+ /**
+ * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
+ */
+ public NorwegianAnalyzer(Version matchVersion) {
+ this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+ }
+
+ /**
+ * Builds an analyzer with the given stop words.
+ *
+ * @param matchVersion lucene compatibility version
+ * @param stopwords a stopword set
+ */
+ public NorwegianAnalyzer(Version matchVersion, Set<?> stopwords) {
+ this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+ }
+
+ /**
+ * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
+ * provided this analyzer will add a {@link KeywordMarkerTokenFilter} before
+ * stemming.
+ *
+ * @param matchVersion lucene compatibility version
+ * @param stopwords a stopword set
+ * @param stemExclusionSet a set of terms not to be stemmed
+ */
+ public NorwegianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
+ super(matchVersion, stopwords);
+ this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
+ matchVersion, stemExclusionSet));
+ }
+
+ /**
+ * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
+ * {@link Reader}.
+ *
+ * @return A {@link TokenStreamComponents} built from an {@link StandardTokenizer}
+ * filtered with {@link StandardFilter}, {@link LowerCaseFilter},
+ * {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
+ * exclusion set is provided and {@link SnowballFilter}.
+ */
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+ TokenStream result = new StandardFilter(source);
+ result = new LowerCaseFilter(matchVersion, result);
+ result = new StopFilter(matchVersion, result, stopwords);
+ if(!stemExclusionSet.isEmpty())
+ result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
+ result = new SnowballFilter(result, new NorwegianStemmer());
+ return new TokenStreamComponents(source, result);
+ }
+}
+
Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/no/package.html
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/no/package.html?rev=907125&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/no/package.html (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/no/package.html Fri Feb 5 23:05:46 2010
@@ -0,0 +1,22 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html><head></head>
+<body>
+Analyzer for Norwegian.
+</body>
+</html>
Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/no/package.html
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilter.java?rev=907125&r1=907124&r2=907125&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilter.java Fri Feb 5 23:05:46 2010
@@ -17,7 +17,6 @@
*/
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilter.java?rev=907125&r1=907124&r2=907125&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilter.java Fri Feb 5 23:05:46 2010
@@ -19,7 +19,6 @@
import java.io.IOException;
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilter.java?rev=907125&r1=907124&r2=907125&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilter.java Fri Feb 5 23:05:46 2010
@@ -19,7 +19,6 @@
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.index.Payload;
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/position/PositionFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/position/PositionFilter.java?rev=907125&r1=907124&r2=907125&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/position/PositionFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/position/PositionFilter.java Fri Feb 5 23:05:46 2010
@@ -21,7 +21,6 @@
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
/** Set the positionIncrement of all tokens to the "positionIncrement",
Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java?rev=907125&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java Fri Feb 5 23:05:46 2010
@@ -0,0 +1,129 @@
+package org.apache.lucene.analysis.pt;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
+import org.apache.lucene.analysis.snowball.SnowballFilter;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.util.Version;
+import org.tartarus.snowball.ext.PortugueseStemmer;
+
+/**
+ * {@link Analyzer} for Portuguese.
+ */
+public final class PortugueseAnalyzer extends StopwordAnalyzerBase {
+ private final Set<?> stemExclusionSet;
+
+ /** File containing default Portuguese stopwords. */
+ public final static String DEFAULT_STOPWORD_FILE = "portuguese_stop.txt";
+
+ /**
+ * Returns an unmodifiable instance of the default stop words set.
+ * @return default stop words set.
+ */
+ public static Set<?> getDefaultStopSet(){
+ return DefaultSetHolder.DEFAULT_STOP_SET;
+ }
+
+ /**
+ * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
+ * accesses the static final set the first time.
+ */
+ private static class DefaultSetHolder {
+ static final Set<?> DEFAULT_STOP_SET;
+
+ static {
+ try {
+ DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
+ DEFAULT_STOPWORD_FILE);
+ } catch (IOException ex) {
+ // default set should always be present as it is part of the
+ // distribution (JAR)
+ throw new RuntimeException("Unable to load default stopword set");
+ }
+ }
+ }
+
+ /**
+ * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
+ */
+ public PortugueseAnalyzer(Version matchVersion) {
+ this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+ }
+
+ /**
+ * Builds an analyzer with the given stop words.
+ *
+ * @param matchVersion lucene compatibility version
+ * @param stopwords a stopword set
+ */
+ public PortugueseAnalyzer(Version matchVersion, Set<?> stopwords) {
+ this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+ }
+
+ /**
+ * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
+ * provided this analyzer will add a {@link KeywordMarkerTokenFilter} before
+ * stemming.
+ *
+ * @param matchVersion lucene compatibility version
+ * @param stopwords a stopword set
+ * @param stemExclusionSet a set of terms not to be stemmed
+ */
+ public PortugueseAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
+ super(matchVersion, stopwords);
+ this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
+ matchVersion, stemExclusionSet));
+ }
+
+ /**
+ * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
+ * {@link Reader}.
+ *
+ * @return A {@link TokenStreamComponents} built from an {@link StandardTokenizer}
+ * filtered with {@link StandardFilter}, {@link LowerCaseFilter},
+ * {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
+ * exclusion set is provided and {@link SnowballFilter}.
+ */
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+ TokenStream result = new StandardFilter(source);
+ result = new LowerCaseFilter(matchVersion, result);
+ result = new StopFilter(matchVersion, result, stopwords);
+ if(!stemExclusionSet.isEmpty())
+ result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
+ result = new SnowballFilter(result, new PortugueseStemmer());
+ return new TokenStreamComponents(source, result);
+ }
+}
Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pt/package.html
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pt/package.html?rev=907125&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pt/package.html (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pt/package.html Fri Feb 5 23:05:46 2010
@@ -0,0 +1,22 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html><head></head>
+<body>
+Analyzer for Portuguese.
+</body>
+</html>
Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pt/package.html
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ro/RomanianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ro/RomanianAnalyzer.java?rev=907125&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ro/RomanianAnalyzer.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ro/RomanianAnalyzer.java Fri Feb 5 23:05:46 2010
@@ -0,0 +1,133 @@
+package org.apache.lucene.analysis.ro;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
+import org.apache.lucene.analysis.snowball.SnowballFilter;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.util.Version;
+import org.tartarus.snowball.ext.RomanianStemmer;
+
+/**
+ * {@link Analyzer} for Romanian.
+ */
+public final class RomanianAnalyzer extends StopwordAnalyzerBase {
+ private final Set<?> stemExclusionSet;
+
+ /** File containing default Romanian stopwords. */
+ public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
+ /**
+ * The comment character in the stopwords file.
+ * All lines prefixed with this will be ignored.
+ */
+ private static final String STOPWORDS_COMMENT = "#";
+
+ /**
+ * Returns an unmodifiable instance of the default stop words set.
+ * @return default stop words set.
+ */
+ public static Set<?> getDefaultStopSet(){
+ return DefaultSetHolder.DEFAULT_STOP_SET;
+ }
+
+ /**
+ * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
+ * accesses the static final set the first time.
+ */
+ private static class DefaultSetHolder {
+ static final Set<?> DEFAULT_STOP_SET;
+
+ static {
+ try {
+ DEFAULT_STOP_SET = loadStopwordSet(false, RomanianAnalyzer.class,
+ DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
+ } catch (IOException ex) {
+ // default set should always be present as it is part of the
+ // distribution (JAR)
+ throw new RuntimeException("Unable to load default stopword set");
+ }
+ }
+ }
+
+ /**
+ * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
+ */
+ public RomanianAnalyzer(Version matchVersion) {
+ this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+ }
+
+ /**
+ * Builds an analyzer with the given stop words.
+ *
+ * @param matchVersion lucene compatibility version
+ * @param stopwords a stopword set
+ */
+ public RomanianAnalyzer(Version matchVersion, Set<?> stopwords) {
+ this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+ }
+
+ /**
+ * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
+ * provided this analyzer will add a {@link KeywordMarkerTokenFilter} before
+ * stemming.
+ *
+ * @param matchVersion lucene compatibility version
+ * @param stopwords a stopword set
+ * @param stemExclusionSet a set of terms not to be stemmed
+ */
+ public RomanianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
+ super(matchVersion, stopwords);
+ this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
+ matchVersion, stemExclusionSet));
+ }
+
+ /**
+ * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
+ * {@link Reader}.
+ *
+ * @return A {@link TokenStreamComponents} built from an {@link StandardTokenizer}
+ * filtered with {@link StandardFilter}, {@link LowerCaseFilter},
+ * {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
+ * exclusion set is provided and {@link SnowballFilter}.
+ */
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+ TokenStream result = new StandardFilter(source);
+ result = new LowerCaseFilter(matchVersion, result);
+ result = new StopFilter(matchVersion, result, stopwords);
+ if(!stemExclusionSet.isEmpty())
+ result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
+ result = new SnowballFilter(result, new RomanianStemmer());
+ return new TokenStreamComponents(source, result);
+ }
+}
Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ro/RomanianAnalyzer.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ro/package.html
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ro/package.html?rev=907125&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ro/package.html (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ro/package.html Fri Feb 5 23:05:46 2010
@@ -0,0 +1,22 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html><head></head>
+<body>
+Analyzer for Romanian.
+</body>
+</html>
Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ro/package.html
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java?rev=907125&r1=907124&r2=907125&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java Fri Feb 5 23:05:46 2010
@@ -17,6 +17,7 @@
* limitations under the License.
*/
+import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
import java.util.Map;
@@ -26,11 +27,15 @@
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
+import org.apache.lucene.analysis.snowball.SnowballFilter;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.util.Version;
/**
@@ -40,13 +45,22 @@
* will not be indexed at all).
* A default set of stopwords is used unless an alternative list is specified.
* </p>
+ * <a name="version"/>
+ * <p>You must specify the required {@link Version}
+ * compatibility when creating RussianAnalyzer:
+ * <ul>
+ * <li> As of 3.1, StandardTokenizer is used, Snowball stemming is done with
+ * SnowballFilter, and Snowball stopwords are used by default.
+ * </ul>
*/
public final class RussianAnalyzer extends StopwordAnalyzerBase
{
/**
- * List of typical Russian stopwords.
+ * List of typical Russian stopwords. (for backwards compatibility)
+ * @deprecated Remove this for LUCENE 4.0
*/
- private static final String[] RUSSIAN_STOP_WORDS = {
+ @Deprecated
+ private static final String[] RUSSIAN_STOP_WORDS_30 = {
"а", "без", "более", "бÑ", "бÑл", "бÑла", "бÑли", "бÑло", "бÑÑÑ", "в",
"вам", "ваÑ", "веÑÑ", "во", "воÑ", "вÑе", "вÑего", "вÑеÑ
", "вÑ", "где",
"да", "даже", "длÑ", "до", "его", "ее", "ей", "еÑ", "еÑли", "еÑÑÑ",
@@ -59,10 +73,27 @@
"Ñем", "ÑÑо", "ÑÑобÑ", "ÑÑе", "ÑÑÑ", "ÑÑа", "ÑÑи", "ÑÑо", "Ñ"
};
+ /** File containing default Russian stopwords. */
+ public final static String DEFAULT_STOPWORD_FILE = "russian_stop.txt";
+
private static class DefaultSetHolder {
- static final Set<?> DEFAULT_STOP_SET = CharArraySet
+ /** @deprecated remove this for Lucene 4.0 */
+ @Deprecated
+ static final Set<?> DEFAULT_STOP_SET_30 = CharArraySet
.unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT,
- Arrays.asList(RUSSIAN_STOP_WORDS), false));
+ Arrays.asList(RUSSIAN_STOP_WORDS_30), false));
+ static final Set<?> DEFAULT_STOP_SET;
+
+ static {
+ try {
+ DEFAULT_STOP_SET =
+ WordlistLoader.getSnowballWordSet(SnowballFilter.class, DEFAULT_STOPWORD_FILE);
+ } catch (IOException ex) {
+ // default set should always be present as it is part of the
+ // distribution (JAR)
+ throw new RuntimeException("Unable to load default stopword set");
+ }
+ }
}
private final Set<?> stemExclusionSet;
@@ -77,7 +108,9 @@
}
public RussianAnalyzer(Version matchVersion) {
- this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+ this(matchVersion,
+ matchVersion.onOrAfter(Version.LUCENE_31) ? DefaultSetHolder.DEFAULT_STOP_SET
+ : DefaultSetHolder.DEFAULT_STOP_SET_30);
}
/**
@@ -132,19 +165,30 @@
* provided {@link Reader}.
*
* @return {@link TokenStreamComponents} built from a
- * {@link RussianLetterTokenizer} filtered with
+ * {@link StandardTokenizer} filtered with {@link StandardFilter},
* {@link LowerCaseFilter}, {@link StopFilter},
- * and {@link RussianStemFilter}
+ * {@link KeywordMarkerTokenFilter} if a stem exclusion set is provided,
+ * and {@link SnowballFilter}
*/
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
- final Tokenizer source = new RussianLetterTokenizer(matchVersion, reader);
- TokenStream result = new LowerCaseFilter(matchVersion, source);
- result = new StopFilter(matchVersion, result, stopwords);
- if(!stemExclusionSet.isEmpty())
- result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
- return new TokenStreamComponents(source, new RussianStemFilter(result));
-
+ if (matchVersion.onOrAfter(Version.LUCENE_31)) {
+ final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+ TokenStream result = new StandardFilter(source);
+ result = new LowerCaseFilter(matchVersion, result);
+ result = new StopFilter(matchVersion, result, stopwords);
+ if (!stemExclusionSet.isEmpty()) result = new KeywordMarkerTokenFilter(
+ result, stemExclusionSet);
+ result = new SnowballFilter(result, new org.tartarus.snowball.ext.RussianStemmer());
+ return new TokenStreamComponents(source, result);
+ } else {
+ final Tokenizer source = new RussianLetterTokenizer(matchVersion, reader);
+ TokenStream result = new LowerCaseFilter(matchVersion, source);
+ result = new StopFilter(matchVersion, result, stopwords);
+ if (!stemExclusionSet.isEmpty()) result = new KeywordMarkerTokenFilter(
+ result, stemExclusionSet);
+ return new TokenStreamComponents(source, new RussianStemFilter(result));
+ }
}
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java?rev=907125&r1=907124&r2=907125&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java Fri Feb 5 23:05:46 2010
@@ -21,6 +21,7 @@
import org.apache.lucene.analysis.CharTokenizer;
import org.apache.lucene.analysis.Tokenizer; // for javadocs
import org.apache.lucene.analysis.LetterTokenizer; // for javadocs
+import org.apache.lucene.analysis.standard.StandardTokenizer; // for javadocs
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.Version;
@@ -35,8 +36,11 @@
* <li>As of 3.1, {@link CharTokenizer} uses an int based API to normalize and
* detect token characters. See {@link CharTokenizer#isTokenChar(int)} and
* {@link CharTokenizer#normalize(int)} for details.</li>
- * </ul>
+ * </ul>
+ * @deprecated Use {@link StandardTokenizer} instead, which has the same functionality.
+ * This filter will be removed in Lucene 4.0
*/
+@Deprecated
public class RussianLetterTokenizer extends CharTokenizer
{
private static final int DIGIT_0 = '0';
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java?rev=907125&r1=907124&r2=907125&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java Fri Feb 5 23:05:46 2010
@@ -24,6 +24,7 @@
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.ru.RussianStemmer;//javadoc @link
+import org.apache.lucene.analysis.snowball.SnowballFilter; // javadoc @link
import java.io.IOException;
@@ -40,7 +41,11 @@
* the {@link KeywordAttribute} before this {@link TokenStream}.
* </p>
* @see KeywordMarkerTokenFilter
+ * @deprecated Use {@link SnowballFilter} with
+ * {@link org.tartarus.snowball.ext.RussianStemmer} instead, which has the
+ * same functionality. This filter will be removed in Lucene 4.0
*/
+@Deprecated
public final class RussianStemFilter extends TokenFilter
{
/**
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemmer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemmer.java?rev=907125&r1=907124&r2=907125&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemmer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemmer.java Fri Feb 5 23:05:46 2010
@@ -19,7 +19,10 @@
/**
* Russian stemming algorithm implementation (see http://snowball.sourceforge.net for detailed description).
+ * @deprecated Use {@link org.tartarus.snowball.ext.RussianStemmer} instead,
+ * which has the same functionality. This filter will be removed in Lucene 4.0
*/
+@Deprecated
class RussianStemmer
{
// positions of RV, R1 and R2 respectively
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/snowball/SnowballAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/snowball/SnowballAnalyzer.java?rev=907125&r1=907124&r2=907125&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/snowball/SnowballAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/snowball/SnowballAnalyzer.java Fri Feb 5 23:05:46 2010
@@ -39,7 +39,10 @@
* <li> As of 3.1, uses {@link TurkishLowerCaseFilter} for Turkish language.
* </ul>
* </p>
+ * @deprecated Use the language-specific analyzer in contrib/analyzers instead.
+ * This analyzer will be removed in Lucene 4.0
*/
+@Deprecated
public final class SnowballAnalyzer extends Analyzer {
private String name;
private Set<?> stopSet;
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java?rev=907125&r1=907124&r2=907125&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java Fri Feb 5 23:05:46 2010
@@ -21,6 +21,7 @@
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tr.TurkishLowerCaseFilter; // javadoc @link
import org.apache.lucene.analysis.LowerCaseFilter; // javadoc @link
@@ -39,14 +40,14 @@
*/
public final class SnowballFilter extends TokenFilter {
- private SnowballProgram stemmer;
+ private final SnowballProgram stemmer;
- private TermAttribute termAtt;
+ private final TermAttribute termAtt = addAttribute(TermAttribute.class);
+ private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
public SnowballFilter(TokenStream input, SnowballProgram stemmer) {
super(input);
this.stemmer = stemmer;
- termAtt = addAttribute(TermAttribute.class);
}
/**
@@ -67,23 +68,24 @@
} catch (Exception e) {
throw new RuntimeException(e.toString());
}
- termAtt = addAttribute(TermAttribute.class);
}
/** Returns the next input Token, after being stemmed */
@Override
public final boolean incrementToken() throws IOException {
if (input.incrementToken()) {
- char termBuffer[] = termAtt.termBuffer();
- final int length = termAtt.termLength();
- stemmer.setCurrent(termBuffer, length);
- stemmer.stem();
- final char finalTerm[] = stemmer.getCurrentBuffer();
- final int newLength = stemmer.getCurrentBufferLength();
- if (finalTerm != termBuffer)
- termAtt.setTermBuffer(finalTerm, 0, newLength);
- else
- termAtt.setTermLength(newLength);
+ if (!keywordAttr.isKeyword()) {
+ char termBuffer[] = termAtt.termBuffer();
+ final int length = termAtt.termLength();
+ stemmer.setCurrent(termBuffer, length);
+ stemmer.stem();
+ final char finalTerm[] = stemmer.getCurrentBuffer();
+ final int newLength = stemmer.getCurrentBufferLength();
+ if (finalTerm != termBuffer)
+ termAtt.setTermBuffer(finalTerm, 0, newLength);
+ else
+ termAtt.setTermLength(newLength);
+ }
return true;
} else {
return false;
Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java?rev=907125&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java Fri Feb 5 23:05:46 2010
@@ -0,0 +1,129 @@
+package org.apache.lucene.analysis.sv;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
+import org.apache.lucene.analysis.snowball.SnowballFilter;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.util.Version;
+import org.tartarus.snowball.ext.SwedishStemmer;
+
+/**
+ * {@link Analyzer} for Swedish.
+ */
+public final class SwedishAnalyzer extends StopwordAnalyzerBase {
+ private final Set<?> stemExclusionSet;
+
+ /** File containing default Swedish stopwords. */
+ public final static String DEFAULT_STOPWORD_FILE = "swedish_stop.txt";
+
+ /**
+ * Returns an unmodifiable instance of the default stop words set.
+ * @return default stop words set.
+ */
+ public static Set<?> getDefaultStopSet(){
+ return DefaultSetHolder.DEFAULT_STOP_SET;
+ }
+
+ /**
+ * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
+ * accesses the static final set the first time.;
+ */
+ private static class DefaultSetHolder {
+ static final Set<?> DEFAULT_STOP_SET;
+
+ static {
+ try {
+ DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
+ DEFAULT_STOPWORD_FILE);
+ } catch (IOException ex) {
+ // default set should always be present as it is part of the
+ // distribution (JAR)
+ throw new RuntimeException("Unable to load default stopword set");
+ }
+ }
+ }
+
+ /**
+ * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
+ */
+ public SwedishAnalyzer(Version matchVersion) {
+ this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+ }
+
+ /**
+ * Builds an analyzer with the given stop words.
+ *
+ * @param matchVersion lucene compatibility version
+ * @param stopwords a stopword set
+ */
+ public SwedishAnalyzer(Version matchVersion, Set<?> stopwords) {
+ this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+ }
+
+ /**
+ * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
+ * provided this analyzer will add a {@link KeywordMarkerTokenFilter} before
+ * stemming.
+ *
+ * @param matchVersion lucene compatibility version
+ * @param stopwords a stopword set
+ * @param stemExclusionSet a set of terms not to be stemmed
+ */
+ public SwedishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
+ super(matchVersion, stopwords);
+ this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
+ matchVersion, stemExclusionSet));
+ }
+
+ /**
+ * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
+ * {@link Reader}.
+ *
+ * @return A {@link TokenStreamComponents} built from an {@link StandardTokenizer}
+ * filtered with {@link StandardFilter}, {@link LowerCaseFilter},
+ * {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
+ * exclusion set is provided and {@link SnowballFilter}.
+ */
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+ TokenStream result = new StandardFilter(source);
+ result = new LowerCaseFilter(matchVersion, result);
+ result = new StopFilter(matchVersion, result, stopwords);
+ if(!stemExclusionSet.isEmpty())
+ result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
+ result = new SnowballFilter(result, new SwedishStemmer());
+ return new TokenStreamComponents(source, result);
+ }
+}
Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/sv/package.html
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/sv/package.html?rev=907125&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/sv/package.html (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/sv/package.html Fri Feb 5 23:05:46 2010
@@ -0,0 +1,22 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html><head></head>
+<body>
+Analyzer for Swedish.
+</body>
+</html>
Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/sv/package.html
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java?rev=907125&r1=907124&r2=907125&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java Fri Feb 5 23:05:46 2010
@@ -19,7 +19,6 @@
import java.io.IOException;
import java.util.Locale;
import java.lang.Character.UnicodeBlock;
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
Added: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/tr/TurkishAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/tr/TurkishAnalyzer.java?rev=907125&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/tr/TurkishAnalyzer.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/tr/TurkishAnalyzer.java Fri Feb 5 23:05:46 2010
@@ -0,0 +1,132 @@
+package org.apache.lucene.analysis.tr;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
+import org.apache.lucene.analysis.snowball.SnowballFilter;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.util.Version;
+import org.tartarus.snowball.ext.TurkishStemmer;
+
+/**
+ * {@link Analyzer} for Turkish.
+ */
+public final class TurkishAnalyzer extends StopwordAnalyzerBase {
+ private final Set<?> stemExclusionSet;
+
+ /** File containing default Turkish stopwords. */
+ public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
+ /**
+ * The comment character in the stopwords file.
+ * All lines prefixed with this will be ignored.
+ */
+ private static final String STOPWORDS_COMMENT = "#";
+
+ /**
+ * Returns an unmodifiable instance of the default stop words set.
+ * @return default stop words set.
+ */
+ public static Set<?> getDefaultStopSet(){
+ return DefaultSetHolder.DEFAULT_STOP_SET;
+ }
+
+ /**
+ * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
+ * accesses the static final set the first time.
+ */
+ private static class DefaultSetHolder {
+ static final Set<?> DEFAULT_STOP_SET;
+
+ static {
+ try {
+ DEFAULT_STOP_SET = loadStopwordSet(false, TurkishAnalyzer.class,
+ DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
+ } catch (IOException ex) {
+ // default set should always be present as it is part of the
+ // distribution (JAR)
+ throw new RuntimeException("Unable to load default stopword set");
+ }
+ }
+ }
+
+ /**
+ * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
+ */
+ public TurkishAnalyzer(Version matchVersion) {
+ this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+ }
+
+ /**
+ * Builds an analyzer with the given stop words.
+ *
+ * @param matchVersion lucene compatibility version
+ * @param stopwords a stopword set
+ */
+ public TurkishAnalyzer(Version matchVersion, Set<?> stopwords) {
+ this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+ }
+
+ /**
+ * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
+ * provided this analyzer will add a {@link KeywordMarkerTokenFilter} before
+ * stemming.
+ *
+ * @param matchVersion lucene compatibility version
+ * @param stopwords a stopword set
+ * @param stemExclusionSet a set of terms not to be stemmed
+ */
+ public TurkishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
+ super(matchVersion, stopwords);
+ this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
+ matchVersion, stemExclusionSet));
+ }
+
+ /**
+ * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
+ * {@link Reader}.
+ *
+ * @return A {@link TokenStreamComponents} built from an {@link StandardTokenizer}
+ * filtered with {@link StandardFilter}, {@link TurkishLowerCaseFilter},
+ * {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
+ * exclusion set is provided and {@link SnowballFilter}.
+ */
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+ TokenStream result = new StandardFilter(source);
+ result = new TurkishLowerCaseFilter(result);
+ result = new StopFilter(matchVersion, result, stopwords);
+ if(!stemExclusionSet.isEmpty())
+ result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
+ result = new SnowballFilter(result, new TurkishStemmer());
+ return new TokenStreamComponents(source, result);
+ }
+}
Propchange: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/tr/TurkishAnalyzer.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/tr/package.html
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/tr/package.html?rev=907125&r1=907124&r2=907125&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/tr/package.html (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/tr/package.html Fri Feb 5 23:05:46 2010
@@ -17,15 +17,6 @@
-->
<html><head></head>
<body>
-Support for Turkish.
-<p>
-This package contains just the TokenStream for handling turkish casing,
-for a stemmer please see the snowball package.
-</p>
-<p>
-WARNING: SnowballAnalyzer uses LowerCaseFilter by default, even when the
-language is set to Turkish, so you will need to construct your own
-analyzer that combines TurkishLowerCaseFilter and SnowballFilter.
-</p>
+Analyzer for Turkish.
</body>
-</html>
\ No newline at end of file
+</html>
Added: lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt?rev=907125&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt Fri Feb 5 23:05:46 2010
@@ -0,0 +1,233 @@
+# This file was created by Jacques Savoy and is distributed under the BSD license.
+# See http://members.unine.ch/jacques.savoy/clef/index.html.
+# Also see http://www.opensource.org/licenses/bsd-license.html
+acea
+aceasta
+această
+aceea
+acei
+aceia
+acel
+acela
+acele
+acelea
+acest
+acesta
+aceste
+acestea
+aceşti
+aceştia
+acolo
+acum
+ai
+aia
+aibă
+aici
+al
+ăla
+ale
+alea
+ălea
+altceva
+altcineva
+am
+ar
+are
+aş
+aşadar
+asemenea
+asta
+ăsta
+astăzi
+astea
+ăstea
+ăştia
+asupra
+aţi
+au
+avea
+avem
+aveţi
+azi
+bine
+bucur
+bună
+ca
+că
+căci
+când
+care
+cărei
+căror
+cărui
+cât
+câte
+câţi
+către
+câtva
+ce
+cel
+ceva
+chiar
+cînd
+cine
+cineva
+cît
+cîte
+cîţi
+cîtva
+contra
+cu
+cum
+cumva
+curând
+curînd
+da
+dă
+dacă
+dar
+datorită
+de
+deci
+deja
+deoarece
+departe
+deşi
+din
+dinaintea
+dintr
+dintre
+drept
+după
+ea
+ei
+el
+ele
+eram
+este
+eşti
+eu
+face
+fără
+fi
+fie
+fiecare
+fii
+fim
+fiţi
+iar
+ieri
+îi
+îl
+îmi
+împotriva
+în
+înainte
+înaintea
+încât
+încît
+încotro
+între
+întrucât
+întrucît
+îţi
+la
+lângă
+le
+li
+lîngă
+lor
+lui
+mă
+mâine
+mea
+mei
+mele
+mereu
+meu
+mi
+mine
+mult
+multă
+mulţi
+ne
+nicăieri
+nici
+nimeni
+nişte
+noastră
+noastre
+noi
+noştri
+nostru
+nu
+ori
+oricând
+oricare
+oricât
+orice
+oricînd
+oricine
+oricît
+oricum
+oriunde
+până
+pe
+pentru
+peste
+pînă
+poate
+pot
+prea
+prima
+primul
+prin
+printr
+sa
+să
+săi
+sale
+sau
+său
+se
+şi
+sînt
+sîntem
+sînteţi
+spre
+sub
+sunt
+suntem
+sunteţi
+ta
+tăi
+tale
+tău
+te
+ţi
+ţie
+tine
+toată
+toate
+tot
+toţi
+totuşi
+tu
+un
+una
+unde
+undeva
+unei
+unele
+uneori
+unor
+vă
+vi
+voastră
+voastre
+voi
+voştri
+vostru
+vouă
+vreo
+vreun
Propchange: lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/tr/stopwords.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/tr/stopwords.txt?rev=907125&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/tr/stopwords.txt (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/tr/stopwords.txt Fri Feb 5 23:05:46 2010
@@ -0,0 +1,212 @@
+# Turkish stopwords from LUCENE-559
+# merged with the list from "Information Retrieval on Turkish Texts"
+# (http://www.users.muohio.edu/canf/papers/JASIST2008offPrint.pdf)
+acaba
+altmış
+altı
+ama
+ancak
+arada
+aslında
+ayrıca
+bana
+bazı
+belki
+ben
+benden
+beni
+benim
+beri
+beş
+bile
+bin
+bir
+birçok
+biri
+birkaç
+birkez
+birşey
+birşeyi
+biz
+bize
+bizden
+bizi
+bizim
+böyle
+böylece
+bu
+buna
+bunda
+bundan
+bunlar
+bunları
+bunların
+bunu
+bunun
+burada
+çok
+çünkü
+da
+daha
+dahi
+de
+defa
+değil
+diğer
+diye
+doksan
+dokuz
+dolayı
+dolayısıyla
+dört
+edecek
+eden
+ederek
+edilecek
+ediliyor
+edilmesi
+ediyor
+eğer
+elli
+en
+etmesi
+etti
+ettiği
+ettiğini
+gibi
+göre
+halen
+hangi
+hatta
+hem
+henüz
+hep
+hepsi
+her
+herhangi
+herkesin
+hiç
+hiçbir
+için
+iki
+ile
+ilgili
+ise
+işte
+itibaren
+itibariyle
+kadar
+karşın
+katrilyon
+kendi
+kendilerine
+kendini
+kendisi
+kendisine
+kendisini
+kez
+ki
+kim
+kimden
+kime
+kimi
+kimse
+kırk
+milyar
+milyon
+mu
+mü
+mı
+nasıl
+ne
+neden
+nedenle
+nerde
+nerede
+nereye
+niye
+niçin
+o
+olan
+olarak
+oldu
+olduğu
+olduğunu
+olduklarını
+olmadı
+olmadığı
+olmak
+olması
+olmayan
+olmaz
+olsa
+olsun
+olup
+olur
+olursa
+oluyor
+on
+ona
+ondan
+onlar
+onlardan
+onları
+onların
+onu
+onun
+otuz
+oysa
+öyle
+pek
+rağmen
+sadece
+sanki
+sekiz
+seksen
+sen
+senden
+seni
+senin
+siz
+sizden
+sizi
+sizin
+şey
+şeyden
+şeyi
+şeyler
+şöyle
+şu
+şuna
+şunda
+şundan
+şunları
+şunu
+tarafından
+trilyon
+tüm
+üç
+üzere
+var
+vardı
+ve
+veya
+ya
+yani
+yapacak
+yapılan
+yapılması
+yapıyor
+yapmak
+yaptı
+yaptığı
+yaptığını
+yaptıkları
+yedi
+yerine
+yetmiş
+yine
+yirmi
+yoksa
+yüz
+zaten
Propchange: lucene/java/trunk/contrib/analyzers/common/src/resources/org/apache/lucene/analysis/tr/stopwords.txt
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java?rev=907125&r1=907124&r2=907125&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java Fri Feb 5 23:05:46 2010
@@ -22,8 +22,6 @@
import java.util.HashSet;
import java.util.Set;
-import javax.print.DocFlavor.CHAR_ARRAY;
-
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.util.Version;
Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java?rev=907125&r1=907124&r2=907125&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java Fri Feb 5 23:05:46 2010
@@ -21,7 +21,6 @@
import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;
/**
Added: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/da/TestDanishAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/da/TestDanishAnalyzer.java?rev=907125&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/da/TestDanishAnalyzer.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/da/TestDanishAnalyzer.java Fri Feb 5 23:05:46 2010
@@ -0,0 +1,54 @@
+package org.apache.lucene.analysis.da;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.util.Version;
+
+public class TestDanishAnalyzer extends BaseTokenStreamTestCase {
+ /** This test fails with NPE when the
+ * stopwords file is missing in classpath */
+ public void testResourcesAvailable() {
+ new DanishAnalyzer(Version.LUCENE_CURRENT);
+ }
+
+ /** test stopwords and stemming */
+ public void testBasics() throws IOException {
+ Analyzer a = new DanishAnalyzer(Version.LUCENE_CURRENT);
+ // stemming
+ checkOneTermReuse(a, "undersøg", "undersøg");
+ checkOneTermReuse(a, "undersøgelse", "undersøg");
+ // stopword
+ assertAnalyzesTo(a, "på", new String[] {});
+ }
+
+ /** test use of exclusion set */
+ public void testExclude() throws IOException {
+ Set<String> exclusionSet = new HashSet<String>();
+ exclusionSet.add("undersøgelse");
+ Analyzer a = new DanishAnalyzer(Version.LUCENE_CURRENT,
+ DanishAnalyzer.getDefaultStopSet(), exclusionSet);
+ checkOneTermReuse(a, "undersøgelse", "undersøgelse");
+ checkOneTermReuse(a, "undersøg", "undersøg");
+ }
+}
Propchange: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/da/TestDanishAnalyzer.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanAnalyzer.java?rev=907125&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanAnalyzer.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanAnalyzer.java Fri Feb 5 23:05:46 2010
@@ -0,0 +1,93 @@
+package org.apache.lucene.analysis.de;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
+import org.apache.lucene.analysis.LowerCaseTokenizer;
+import org.apache.lucene.util.Version;
+
+public class TestGermanAnalyzer extends BaseTokenStreamTestCase {
+ public void testReusableTokenStream() throws Exception {
+ Analyzer a = new GermanAnalyzer(Version.LUCENE_CURRENT);
+ checkOneTermReuse(a, "Tisch", "tisch");
+ checkOneTermReuse(a, "Tische", "tisch");
+ checkOneTermReuse(a, "Tischen", "tisch");
+ }
+
+ public void testExclusionTableBWCompat() throws IOException {
+ GermanStemFilter filter = new GermanStemFilter(new LowerCaseTokenizer(Version.LUCENE_CURRENT,
+ new StringReader("Fischen Trinken")));
+ CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+ set.add("fischen");
+ filter.setExclusionSet(set);
+ assertTokenStreamContents(filter, new String[] { "fischen", "trink" });
+ }
+
+ public void testWithKeywordAttribute() throws IOException {
+ CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+ set.add("fischen");
+ GermanStemFilter filter = new GermanStemFilter(
+ new KeywordMarkerTokenFilter(new LowerCaseTokenizer(Version.LUCENE_CURRENT, new StringReader(
+ "Fischen Trinken")), set));
+ assertTokenStreamContents(filter, new String[] { "fischen", "trink" });
+ }
+
+ public void testWithKeywordAttributeAndExclusionTable() throws IOException {
+ CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+ set.add("fischen");
+ CharArraySet set1 = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+ set1.add("trinken");
+ set1.add("fischen");
+ GermanStemFilter filter = new GermanStemFilter(
+ new KeywordMarkerTokenFilter(new LowerCaseTokenizer(Version.LUCENE_CURRENT, new StringReader(
+ "Fischen Trinken")), set));
+ filter.setExclusionSet(set1);
+ assertTokenStreamContents(filter, new String[] { "fischen", "trinken" });
+ }
+
+ /*
+ * Test that changes to the exclusion table are applied immediately
+ * when using reusable token streams.
+ */
+ public void testExclusionTableReuse() throws Exception {
+ GermanAnalyzer a = new GermanAnalyzer(Version.LUCENE_CURRENT);
+ checkOneTermReuse(a, "tischen", "tisch");
+ a.setStemExclusionTable(new String[] { "tischen" });
+ checkOneTermReuse(a, "tischen", "tischen");
+ }
+
+ /** test some features of the new snowball filter
+ * these only pass with LUCENE_CURRENT, not if you use o.a.l.a.de.GermanStemmer
+ */
+ public void testGermanSpecials() throws Exception {
+ GermanAnalyzer a = new GermanAnalyzer(Version.LUCENE_CURRENT);
+ // a/o/u + e is equivalent to the umlaut form
+ checkOneTermReuse(a, "Schaltflächen", "schaltflach");
+ checkOneTermReuse(a, "Schaltflaechen", "schaltflach");
+ // here they are with the old stemmer
+ a = new GermanAnalyzer(Version.LUCENE_30);
+ checkOneTermReuse(a, "Schaltflächen", "schaltflach");
+ checkOneTermReuse(a, "Schaltflaechen", "schaltflaech");
+ }
+}
Propchange: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanAnalyzer.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java?rev=907125&r1=907124&r2=907125&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java Fri Feb 5 23:05:46 2010
@@ -20,15 +20,14 @@
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
-import java.io.IOException;
import java.io.InputStreamReader;
import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.CharArraySet;
-import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
-import org.apache.lucene.analysis.LowerCaseTokenizer;
+import org.apache.lucene.analysis.KeywordTokenizer;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.util.Version;
/**
@@ -40,6 +39,8 @@
public class TestGermanStemFilter extends BaseTokenStreamTestCase {
public void testStemming() throws Exception {
+ Tokenizer tokenizer = new KeywordTokenizer(new StringReader(""));
+ TokenFilter filter = new GermanStemFilter(new LowerCaseFilter(Version.LUCENE_CURRENT, tokenizer));
// read test cases from external file:
File dataDir = new File(System.getProperty("dataDir", "./bin"));
File testFile = new File(dataDir, "org/apache/lucene/analysis/de/data.txt");
@@ -55,68 +56,12 @@
continue; // ignore comments and empty lines
String[] parts = line.split(";");
//System.out.println(parts[0] + " -- " + parts[1]);
- check(parts[0], parts[1]);
+ tokenizer.reset(new StringReader(parts[0]));
+ filter.reset();
+ assertTokenStreamContents(filter, new String[] { parts[1] });
}
breader.close();
isr.close();
fis.close();
}
-
- public void testReusableTokenStream() throws Exception {
- Analyzer a = new GermanAnalyzer(Version.LUCENE_CURRENT);
- checkReuse(a, "Tisch", "tisch");
- checkReuse(a, "Tische", "tisch");
- checkReuse(a, "Tischen", "tisch");
- }
-
- public void testExclusionTableBWCompat() throws IOException {
- GermanStemFilter filter = new GermanStemFilter(new LowerCaseTokenizer(Version.LUCENE_CURRENT,
- new StringReader("Fischen Trinken")));
- CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
- set.add("fischen");
- filter.setExclusionSet(set);
- assertTokenStreamContents(filter, new String[] { "fischen", "trink" });
- }
-
- public void testWithKeywordAttribute() throws IOException {
- CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
- set.add("fischen");
- GermanStemFilter filter = new GermanStemFilter(
- new KeywordMarkerTokenFilter(new LowerCaseTokenizer(Version.LUCENE_CURRENT, new StringReader(
- "Fischen Trinken")), set));
- assertTokenStreamContents(filter, new String[] { "fischen", "trink" });
- }
-
- public void testWithKeywordAttributeAndExclusionTable() throws IOException {
- CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
- set.add("fischen");
- CharArraySet set1 = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
- set1.add("trinken");
- set1.add("fischen");
- GermanStemFilter filter = new GermanStemFilter(
- new KeywordMarkerTokenFilter(new LowerCaseTokenizer(Version.LUCENE_CURRENT, new StringReader(
- "Fischen Trinken")), set));
- filter.setExclusionSet(set1);
- assertTokenStreamContents(filter, new String[] { "fischen", "trinken" });
- }
-
- /*
- * Test that changes to the exclusion table are applied immediately
- * when using reusable token streams.
- */
- public void testExclusionTableReuse() throws Exception {
- GermanAnalyzer a = new GermanAnalyzer(Version.LUCENE_CURRENT);
- checkReuse(a, "tischen", "tisch");
- a.setStemExclusionTable(new String[] { "tischen" });
- checkReuse(a, "tischen", "tischen");
- }
-
-
- private void check(final String input, final String expected) throws Exception {
- checkOneTerm(new GermanAnalyzer(Version.LUCENE_CURRENT), input, expected);
- }
-
- private void checkReuse(Analyzer a, String input, String expected) throws Exception {
- checkOneTermReuse(a, input, expected);
- }
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java?rev=907125&r1=907124&r2=907125&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java Fri Feb 5 23:05:46 2010
@@ -18,7 +18,6 @@
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.Version;
/**
@@ -63,4 +62,23 @@
assertAnalyzesToReuse(a, "\u03a0\u03a1\u039f\u03ab\u03a0\u039f\u0398\u0395\u03a3\u0395\u0399\u03a3 \u0386\u03c8\u03bf\u03b3\u03bf\u03c2, \u03bf \u03bc\u03b5\u03c3\u03c4\u03cc\u03c2 \u03ba\u03b1\u03b9 \u03bf\u03b9 \u03ac\u03bb\u03bb\u03bf\u03b9",
new String[] { "\u03c0\u03c1\u03bf\u03c5\u03c0\u03bf\u03b8\u03b5\u03c3\u03b5\u03b9\u03c3", "\u03b1\u03c8\u03bf\u03b3\u03bf\u03c3", "\u03bc\u03b5\u03c3\u03c4\u03bf\u03c3", "\u03b1\u03bb\u03bb\u03bf\u03b9" });
}
+
+ /**
+ * Greek Analyzer didn't call standardFilter, so no normalization of acronyms.
+ * check that this is preserved.
+ * @deprecated remove this test in Lucene 4.0
+ */
+ @Deprecated
+ public void testAcronymBWCompat() throws Exception {
+ Analyzer a = new GreekAnalyzer(Version.LUCENE_30);
+ assertAnalyzesTo(a, "Α.Π.Τ.", new String[] { "α.π.τ." });
+ }
+
+ /**
+ * test that acronym normalization works
+ */
+ public void testAcronym() throws Exception {
+ Analyzer a = new GreekAnalyzer(Version.LUCENE_31);
+ assertAnalyzesTo(a, "Α.Π.Τ.", new String[] { "απτ" });
+ }
}
Added: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/en/TestEnglishAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/en/TestEnglishAnalyzer.java?rev=907125&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/en/TestEnglishAnalyzer.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/en/TestEnglishAnalyzer.java Fri Feb 5 23:05:46 2010
@@ -0,0 +1,54 @@
+package org.apache.lucene.analysis.en;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.util.Version;
+
+public class TestEnglishAnalyzer extends BaseTokenStreamTestCase {
+ /** This test fails with NPE when the
+ * stopwords file is missing in classpath */
+ public void testResourcesAvailable() {
+ new EnglishAnalyzer(Version.LUCENE_CURRENT);
+ }
+
+ /** test stopwords and stemming */
+ public void testBasics() throws IOException {
+ Analyzer a = new EnglishAnalyzer(Version.LUCENE_CURRENT);
+ // stemming
+ checkOneTermReuse(a, "books", "book");
+ checkOneTermReuse(a, "book", "book");
+ // stopword
+ assertAnalyzesTo(a, "the", new String[] {});
+ }
+
+ /** test use of exclusion set */
+ public void testExclude() throws IOException {
+ Set<String> exclusionSet = new HashSet<String>();
+ exclusionSet.add("books");
+ Analyzer a = new EnglishAnalyzer(Version.LUCENE_CURRENT,
+ EnglishAnalyzer.getDefaultStopSet(), exclusionSet);
+ checkOneTermReuse(a, "books", "books");
+ checkOneTermReuse(a, "book", "book");
+ }
+}
Propchange: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/en/TestEnglishAnalyzer.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/es/TestSpanishAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/es/TestSpanishAnalyzer.java?rev=907125&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/es/TestSpanishAnalyzer.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/es/TestSpanishAnalyzer.java Fri Feb 5 23:05:46 2010
@@ -0,0 +1,54 @@
+package org.apache.lucene.analysis.es;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.util.Version;
+
+public class TestSpanishAnalyzer extends BaseTokenStreamTestCase {
+ /** This test fails with NPE when the
+ * stopwords file is missing in classpath */
+ public void testResourcesAvailable() {
+ new SpanishAnalyzer(Version.LUCENE_CURRENT);
+ }
+
+ /** test stopwords and stemming */
+ public void testBasics() throws IOException {
+ Analyzer a = new SpanishAnalyzer(Version.LUCENE_CURRENT);
+ // stemming
+ checkOneTermReuse(a, "chicana", "chican");
+ checkOneTermReuse(a, "chicano", "chican");
+ // stopword
+ assertAnalyzesTo(a, "los", new String[] {});
+ }
+
+ /** test use of exclusion set */
+ public void testExclude() throws IOException {
+ Set<String> exclusionSet = new HashSet<String>();
+ exclusionSet.add("chicano");
+ Analyzer a = new SpanishAnalyzer(Version.LUCENE_CURRENT,
+ SpanishAnalyzer.getDefaultStopSet(), exclusionSet);
+ checkOneTermReuse(a, "chicana", "chican");
+ checkOneTermReuse(a, "chicano", "chicano");
+ }
+}
Propchange: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/es/TestSpanishAnalyzer.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fi/TestFinnishAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fi/TestFinnishAnalyzer.java?rev=907125&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fi/TestFinnishAnalyzer.java (added)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fi/TestFinnishAnalyzer.java Fri Feb 5 23:05:46 2010
@@ -0,0 +1,54 @@
+package org.apache.lucene.analysis.fi;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.util.Version;
+
+public class TestFinnishAnalyzer extends BaseTokenStreamTestCase {
+ /** This test fails with NPE when the
+ * stopwords file is missing in classpath */
+ public void testResourcesAvailable() {
+ new FinnishAnalyzer(Version.LUCENE_CURRENT);
+ }
+
+ /** test stopwords and stemming */
+ public void testBasics() throws IOException {
+ Analyzer a = new FinnishAnalyzer(Version.LUCENE_CURRENT);
+ // stemming
+ checkOneTermReuse(a, "edeltäjiinsä", "edeltäj");
+ checkOneTermReuse(a, "edeltäjistään", "edeltäj");
+ // stopword
+ assertAnalyzesTo(a, "olla", new String[] {});
+ }
+
+ /** test use of exclusion set */
+ public void testExclude() throws IOException {
+ Set<String> exclusionSet = new HashSet<String>();
+ exclusionSet.add("edeltäjistään");
+ Analyzer a = new FinnishAnalyzer(Version.LUCENE_CURRENT,
+ FinnishAnalyzer.getDefaultStopSet(), exclusionSet);
+ checkOneTermReuse(a, "edeltäjiinsä", "edeltäj");
+ checkOneTermReuse(a, "edeltäjistään", "edeltäjistään");
+ }
+}
Propchange: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fi/TestFinnishAnalyzer.java
------------------------------------------------------------------------------
svn:eol-style = native