You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ja...@apache.org on 2012/03/20 11:57:51 UTC
svn commit: r1302833 [1/3] - in /lucene/dev/trunk: modules/analysis/
modules/analysis/common/src/java/org/apache/lucene/analysis/no/
modules/analysis/common/src/test/org/apache/lucene/analysis/no/ solr/
solr/core/src/java/org/apache/solr/analysis/ solr...
Author: janhoy
Date: Tue Mar 20 10:57:50 2012
New Revision: 1302833
URL: http://svn.apache.org/viewvc?rev=1302833&view=rev
Log:
SOLR-2764: Create a NorwegianLightStemmer and NorwegianMinimalStemmer
Added:
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianLightStemFilter.java (with props)
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianLightStemmer.java (with props)
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianMinimalStemFilter.java (with props)
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianMinimalStemmer.java (with props)
lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianLightStemFilter.java (with props)
lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianMinimalStemFilter.java (with props)
lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/no/nb_light.txt (with props)
lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/no/nb_minimal.txt (with props)
lucene/dev/trunk/solr/CHANGES.txt.orig
lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/NorwegianLightStemFilterFactory.java (with props)
lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/NorwegianMinimalStemFilterFactory.java (with props)
lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestNorwegianLightStemFilterFactory.java (with props)
lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestNorwegianMinimalStemFilterFactory.java (with props)
Modified:
lucene/dev/trunk/modules/analysis/CHANGES.txt
lucene/dev/trunk/solr/CHANGES.txt
lucene/dev/trunk/solr/example/solr/conf/schema.xml
Modified: lucene/dev/trunk/modules/analysis/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/CHANGES.txt?rev=1302833&r1=1302832&r2=1302833&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/CHANGES.txt (original)
+++ lucene/dev/trunk/modules/analysis/CHANGES.txt Tue Mar 20 10:57:50 2012
@@ -106,3 +106,6 @@ New Features
All analyzers in contrib/analyzers and contrib/icu were moved to the
analysis module. The 'smartcn' and 'stempel' components now depend on 'common'.
(Chris Male, Robert Muir)
+
+ * SOLR-2764: Create a NorwegianLightStemmer and NorwegianMinimalStemmer (janhoy)
+
\ No newline at end of file
Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianLightStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianLightStemFilter.java?rev=1302833&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianLightStemFilter.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianLightStemFilter.java Tue Mar 20 10:57:50 2012
@@ -0,0 +1,58 @@
+package org.apache.lucene.analysis.no;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+
+/**
+ * A {@link TokenFilter} that applies {@link NorwegianLightStemmer} to stem Norwegian
+ * words.
+ * <p>
+ * To prevent terms from being stemmed use an instance of
+ * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
+ * the {@link KeywordAttribute} before this {@link TokenStream}.
+ * </p>
+ */
+public final class NorwegianLightStemFilter extends TokenFilter {
+  private final NorwegianLightStemmer stemmer = new NorwegianLightStemmer();
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
+
+  /** Creates a filter that stems the tokens produced by {@code input}. */
+  public NorwegianLightStemFilter(TokenStream input) {
+    super(input);
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (!input.incrementToken()) {
+      return false; // the wrapped stream produced no further token
+    }
+    if (keywordAttr.isKeyword()) {
+      return true; // keyword-marked tokens pass through unstemmed
+    }
+    termAtt.setLength(stemmer.stem(termAtt.buffer(), termAtt.length()));
+    return true;
+  }
+}
Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianLightStemmer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianLightStemmer.java?rev=1302833&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianLightStemmer.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianLightStemmer.java Tue Mar 20 10:57:50 2012
@@ -0,0 +1,119 @@
+package org.apache.lucene.analysis.no;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * This algorithm is updated based on code located at:
+ * http://members.unine.ch/jacques.savoy/clef/
+ *
+ * Full copyright for that code follows:
+ */
+
+/*
+ * Copyright (c) 2005, Jacques Savoy
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer. Redistributions in binary
+ * form must reproduce the above copyright notice, this list of conditions and
+ * the following disclaimer in the documentation and/or other materials
+ * provided with the distribution. Neither the name of the author nor the names
+ * of its contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+import static org.apache.lucene.analysis.util.StemmerUtil.*;
+
+/**
+ * Light Stemmer for Norwegian.
+ * <p>
+ * Parts of this stemmer are adapted from SwedishLightStemFilter, except
+ * that the Swedish one has a pre-defined rule set and a corresponding
+ * corpus to validate against, whereas the Norwegian one is hand-crafted.
+ */
+public class NorwegianLightStemmer {
+
+  /**
+   * Computes the stem of the word stored in the first {@code len} chars of {@code s}.
+   * A trailing possessive -s is dropped first; after that the first matching rule wins.
+   *
+   * @return length of the stem; callers keep the first that many chars of {@code s}
+   */
+  public int stem(char s[], int len) {
+    // possessive -s (bilens -> bilen); the shortened word is checked further
+    if (len > 4 && s[len-1] == 's') {
+      len--;
+    }
+
+    // general endings: hemmelig-heter / hemmelig-heten -> hemmelig
+    if (len > 7 && (endsWith(s, len, "heter") || endsWith(s, len, "heten"))) {
+      return len - 5;
+    }
+    // general endings: kristen-dom -> kristen, hemmelig-het -> hemmelig
+    if (len > 5 && (endsWith(s, len, "dom") || endsWith(s, len, "het"))) {
+      return len - 3;
+    }
+    // general endings: føl-elser / føl-elsen -> føl
+    if (len > 7 && (endsWith(s, len, "elser") || endsWith(s, len, "elsen"))) {
+      return len - 5;
+    }
+    // sov-ende -> sov, føl-else -> føl, adjective fin-este -> fin, masculine -eren
+    if (len > 6 && (endsWith(s, len, "ende") || endsWith(s, len, "else")
+        || endsWith(s, len, "este") || endsWith(s, len, "eren"))) {
+      return len - 4;
+    }
+    // adjectives fin-ere / fin-est -> fin; masc/fem/neuter plural definite (hus-ene)
+    if (len > 5 && (endsWith(s, len, "ere") || endsWith(s, len, "est")
+        || endsWith(s, len, "ene"))) {
+      return len - 3;
+    }
+    // -er masc/fem indefinite, -en masc/fem definite, -et neuter definite,
+    // -st adjective (billig-st -> billig), -te
+    if (len > 4 && (endsWith(s, len, "er") || endsWith(s, len, "en")
+        || endsWith(s, len, "et") || endsWith(s, len, "st")
+        || endsWith(s, len, "te"))) {
+      return len - 2;
+    }
+
+    if (len > 3) {
+      final char last = s[len-1];
+      // -a is feminine definite; dropping -e gives nouns ending in -e the same
+      // stem as their inflections (kake -> kak, kaker -> kak)
+      if (last == 'a' || last == 'e' || last == 'n') {
+        return len - 1;
+      }
+    }
+
+    return len; // no rule matched
+  }
+}
Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianMinimalStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianMinimalStemFilter.java?rev=1302833&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianMinimalStemFilter.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianMinimalStemFilter.java Tue Mar 20 10:57:50 2012
@@ -0,0 +1,58 @@
+package org.apache.lucene.analysis.no;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+
+/**
+ * A {@link TokenFilter} that applies {@link NorwegianMinimalStemmer} to stem Norwegian
+ * words.
+ * <p>
+ * To prevent terms from being stemmed use an instance of
+ * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
+ * the {@link KeywordAttribute} before this {@link TokenStream}.
+ * </p>
+ */
+public final class NorwegianMinimalStemFilter extends TokenFilter {
+  private final NorwegianMinimalStemmer stemmer = new NorwegianMinimalStemmer();
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
+
+  /** Creates a filter that stems the tokens produced by {@code input}. */
+  public NorwegianMinimalStemFilter(TokenStream input) {
+    super(input);
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (!input.incrementToken()) {
+      return false; // the wrapped stream produced no further token
+    }
+    if (keywordAttr.isKeyword()) {
+      return true; // keyword-marked tokens pass through unstemmed
+    }
+    termAtt.setLength(stemmer.stem(termAtt.buffer(), termAtt.length()));
+    return true;
+  }
+}
Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianMinimalStemmer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianMinimalStemmer.java?rev=1302833&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianMinimalStemmer.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianMinimalStemmer.java Tue Mar 20 10:57:50 2012
@@ -0,0 +1,90 @@
+package org.apache.lucene.analysis.no;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * This algorithm is updated based on code located at:
+ * http://members.unine.ch/jacques.savoy/clef/
+ *
+ * Full copyright for that code follows:
+ */
+
+/*
+ * Copyright (c) 2005, Jacques Savoy
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer. Redistributions in binary
+ * form must reproduce the above copyright notice, this list of conditions and
+ * the following disclaimer in the documentation and/or other materials
+ * provided with the distribution. Neither the name of the author nor the names
+ * of its contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+import static org.apache.lucene.analysis.util.StemmerUtil.*;
+
+/**
+ * Minimal Stemmer for Norwegian bokmål (no-nb)
+ * <p>
+ * Stems known plural and definite forms of Norwegian nouns only, together with the genitive -s
+ */
+public class NorwegianMinimalStemmer {
+
+  /**
+   * Computes the stem of the word stored in the first {@code len} chars of {@code s}.
+   * A trailing genitive -s is dropped first; after that the first matching rule wins.
+   *
+   * @return length of the stem; callers keep the first that many chars of {@code s}
+   */
+  public int stem(char s[], int len) {
+    // genitive -s; the shortened word is checked further
+    if (len > 4 && s[len-1] == 's') {
+      len--;
+    }
+
+    // masc/fem/neuter plural definite (hus-ene)
+    if (len > 5 && endsWith(s, len, "ene")) {
+      return len - 3;
+    }
+
+    // -er masc/fem indefinite, -en masc/fem definite, -et neuter definite
+    if (len > 4 && (endsWith(s, len, "er") || endsWith(s, len, "en")
+        || endsWith(s, len, "et"))) {
+      return len - 2;
+    }
+
+    if (len > 3) {
+      final char last = s[len-1];
+      // -a is feminine definite; dropping -e gives nouns ending in -e the same
+      // stem as their inflections (kake -> kak, kaker -> kak)
+      if (last == 'a' || last == 'e') {
+        return len - 1;
+      }
+    }
+
+    return len; // no rule matched
+  }
+}
Added: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianLightStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianLightStemFilter.java?rev=1302833&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianLightStemFilter.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianLightStemFilter.java Tue Mar 20 10:57:50 2012
@@ -0,0 +1,53 @@
+package org.apache.lucene.analysis.no;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Tokenizer;
+
+import static org.apache.lucene.analysis.VocabularyAssert.*;
+
+/**
+ * Simple tests for {@link NorwegianLightStemFilter}
+ */
+public class TestNorwegianLightStemFilter extends BaseTokenStreamTestCase {
+  /** Analyzer under test: a whitespace-mode MockTokenizer feeding NorwegianLightStemFilter. */
+  private Analyzer analyzer = new Analyzer() {
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName,
+        Reader reader) {
+      Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+      return new TokenStreamComponents(source, new NorwegianLightStemFilter(source));
+    }
+  };
+
+  /**
+   * Test against a vocabulary file.
+   * The stream is closed in a finally block so the file handle is released
+   * even when a vocabulary assertion fails.
+   */
+  public void testVocabulary() throws IOException {
+    final FileInputStream vocabulary = new FileInputStream(getDataFile("nb_light.txt"));
+    try {
+      assertVocabulary(analyzer, vocabulary);
+    } finally {
+      vocabulary.close();
+    }
+  }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
+  }
+}
Added: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianMinimalStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianMinimalStemFilter.java?rev=1302833&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianMinimalStemFilter.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianMinimalStemFilter.java Tue Mar 20 10:57:50 2012
@@ -0,0 +1,53 @@
+package org.apache.lucene.analysis.no;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Tokenizer;
+
+import static org.apache.lucene.analysis.VocabularyAssert.*;
+
+/**
+ * Simple tests for {@link NorwegianMinimalStemFilter}
+ */
+public class TestNorwegianMinimalStemFilter extends BaseTokenStreamTestCase {
+  /** Analyzer under test: a whitespace-mode MockTokenizer feeding NorwegianMinimalStemFilter. */
+  private Analyzer analyzer = new Analyzer() {
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName,
+        Reader reader) {
+      Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+      return new TokenStreamComponents(source, new NorwegianMinimalStemFilter(source));
+    }
+  };
+
+  /**
+   * Test against a vocabulary file.
+   * The stream is closed in a finally block so the file handle is released
+   * even when a vocabulary assertion fails.
+   */
+  public void testVocabulary() throws IOException {
+    final FileInputStream vocabulary = new FileInputStream(getDataFile("nb_minimal.txt"));
+    try {
+      assertVocabulary(analyzer, vocabulary);
+    } finally {
+      vocabulary.close();
+    }
+  }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
+  }
+}
Added: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/no/nb_light.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/no/nb_light.txt?rev=1302833&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/no/nb_light.txt (added)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/no/nb_light.txt Tue Mar 20 10:57:50 2012
@@ -0,0 +1,144 @@
+#
+# Tests for norwegian Bokmål light stemmer
+# It should tackle nouns, adjectives, the genitive and some general endings
+#
+# Nouns masculine
+bil bil
+bilen bil
+biler bil
+bilene bil
+bilens bil
+bilenes bil
+sekretæren sekretær
+sekretær sekretær
+sekretærene sekretær
+kaker kak
+kaken kak
+kakene kak
+kakenes kak
+bibliotekar bibliotekar
+bibliotekarer bibliotekar
+bibliotekaren bibliotekar
+bibliotekarens bibliotekar
+bibliotekarene bibliotekar
+bibliotekarenes bibliotekar
+# Nouns feminine
+veske vesk
+veska vesk
+vesken vesk
+veskene vesk
+veskas vesk
+# Nouns neutral
+huset hus
+husene hus
+husets hus
+hus hus
+huset hus
+husene hus
+husenes hus
+flagg flagg
+flagga flagg
+flaggene flagg
+flaggets flagg
+flaggenes flagg
+politi politi
+politiet politi
+politiets politi
+politienes politi
+# General endings
+god god
+godhet god
+godheten god
+forelskelse forelsk
+forelsket forelsk
+forelskelsen forelsk
+forelske forelsk
+kristen krist
+kristendom kristen
+kristendommen kristendomm
+kristendommens kristendomm
+fattig fattig
+fattigdom fattig
+fattigdommen fattigdomm
+fattigdommens fattigdomm
+# -het (see http://no.wiktionary.org/wiki/Kategori:Ord_som_ender_p%C3%A5_%C2%AB-het%C2%BB)
+hemmelig hemmelig
+hemmelighet hemmelig
+hemmelighets hemmelig
+hemmeligheter hemmelig
+hemmeligheten hemmelig
+hemmelighetens hemmelig
+kjærlig kjærlig
+kjærlighet kjærlig
+kjærligheter kjærlig
+kjærligheten kjærlig
+forlegen forleg
+forlegenhet forlegen
+forlegenheten forlegen
+forlegenhetens forlegen
+tvetydig tvetydig
+tvetydighet tvetydig
+tvetydigheter tvetydig
+tvetydigheten tvetydig
+tvetydighetens tvetydig
+virkelig virkelig
+virkelighet virkelig
+virkeligheten virkelig
+virkelighetens virkelig
+# Adjectives
+billig billig
+billigere billig
+billigst billig
+billige billig
+frisk frisk
+friskere frisk
+friskest frisk
+syk syk
+sykere syk
+sykest syk
+#########################################
+# Words that should not be stemmed
+#
+# Irregular masculine nouns (not supposed to be handled correctly)
+# Fetched from http://no.wiktionary.org/wiki/Kategori:Substantiv_i_norsk_med_uregelrett_flertallsb%C3%B8yning
+vaffel vaffel
+vafler vafl
+vaflene vafl
+tittel tittel
+titler titl
+titlene titl
+kam kam
+kammer kamm
+kammene kamm
+kamrene kamr
+# Irregular feminine nouns, not handled
+ku ku
+ku ku
+kyr kyr
+kuer kuer
+kyrne kyrn
+kuene kuen
+datter datt
+døtre døtr
+døtrene døtr
+# Other words that should not be touched
+abc abc
+123 123
+Jens Jens
+# Adjectives
+billig billig
+billigere billig
+billigst billig
+billige billig
+frisk frisk
+friskere frisk
+friskest frisk
+# Irregular adjectives that should not be stemmed
+god god
+bedre bedr
+best best
+# Verbs, should not be stemmed
+føle føl
+følte føl
+følt følt
+
Added: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/no/nb_minimal.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/no/nb_minimal.txt?rev=1302833&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/no/nb_minimal.txt (added)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/no/nb_minimal.txt Tue Mar 20 10:57:50 2012
@@ -0,0 +1,99 @@
+#
+# Tests for norwegian Bokmål minimal stemmer
+# It only tries to stem nouns, i.e. it is not very aggressive
+#
+# Nouns masculine
+bil bil
+bilen bil
+biler bil
+bilene bil
+bilens bil
+bilenes bil
+sekretæren sekretær
+sekretær sekretær
+sekretærene sekretær
+kaker kak
+kaken kak
+kakene kak
+kakenes kak
+bibliotekar bibliotekar
+bibliotekarer bibliotekar
+bibliotekaren bibliotekar
+bibliotekarens bibliotekar
+bibliotekarene bibliotekar
+bibliotekarenes bibliotekar
+# Nouns feminine
+veske vesk
+veska vesk
+vesken vesk
+veskene vesk
+veskas vesk
+# Nouns neutral
+huset hus
+husene hus
+husets hus
+hus hus
+huset hus
+husene hus
+husenes hus
+flagg flagg
+flagga flagg
+flaggene flagg
+flaggets flagg
+flaggenes flagg
+politi politi
+politiet politi
+politiets politi
+politienes politi
+#########################################
+# Words that should not be stemmed
+#
+# Irregular masculine nouns (not supposed to be handled correctly)
+# Fetched from http://no.wiktionary.org/wiki/Kategori:Substantiv_i_norsk_med_uregelrett_flertallsb%C3%B8yning
+vaffel vaffel
+vafler vafl
+vaflene vafl
+tittel tittel
+titler titl
+titlene titl
+kam kam
+kammer kamm
+kammene kamm
+kamrene kamr
+# Irregular feminine nouns, not handled
+ku ku
+ku ku
+kyr kyr
+kuer kuer
+kyrne kyrn
+kuene kuen
+datter datt
+døtre døtr
+døtrene døtr
+# Other words that should not be touched
+abc abc
+123 123
+Jens Jens
+# Adjective, should not be stemmed
+billig billig
+billigere billiger
+billigst billigst
+billige billig
+god god
+bedre bedr
+best best
+# General endings, should not be stemmed
+god god
+godhet godh
+forelskelse forelskels
+kristendom kristendom
+# Verbs, should not be stemmed
+føle føl
+følte følt
+følt følt
+hemmelig hemmelig
+hemmelighet hemmeligh
+hemmeligheten hemmelighet
+kjærlig kjærlig
+kjærlighet kjærligh
+kjærligheten kjærlighet
\ No newline at end of file
Modified: lucene/dev/trunk/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/CHANGES.txt?rev=1302833&r1=1302832&r2=1302833&view=diff
==============================================================================
--- lucene/dev/trunk/solr/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/CHANGES.txt Tue Mar 20 10:57:50 2012
@@ -561,6 +561,8 @@ New Features
* SOLR-2826: URLClassify Update Processor (janhoy)
+* SOLR-2764: Create a NorwegianLightStemmer and NorwegianMinimalStemmer (janhoy)
+
Optimizations
----------------------
* SOLR-1931: Speedup for LukeRequestHandler and admin/schema browser. New parameter