You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2010/07/14 14:10:37 UTC
svn commit: r964019 [2/4] - in /lucene/dev/trunk: lucene/contrib/
modules/analysis/
modules/analysis/common/src/java/org/apache/lucene/analysis/ar/
modules/analysis/common/src/java/org/apache/lucene/analysis/bg/
modules/analysis/common/src/java/org/apa...
Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemmer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemmer.java?rev=964019&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemmer.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemmer.java Wed Jul 14 12:10:34 2010
@@ -0,0 +1,238 @@
+package org.apache.lucene.analysis.hu;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * This algorithm is updated based on code located at:
+ * http://members.unine.ch/jacques.savoy/clef/
+ *
+ * Full copyright for that code follows:
+ */
+
+/*
+ * Copyright (c) 2005, Jacques Savoy
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer. Redistributions in binary
+ * form must reproduce the above copyright notice, this list of conditions and
+ * the following disclaimer in the documentation and/or other materials
+ * provided with the distribution. Neither the name of the author nor the names
+ * of its contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+import static org.apache.lucene.analysis.util.StemmerUtil.*;
+
+/**
+ * Light Stemmer for Hungarian.
+ * <p>
+ * This stemmer implements the "UniNE" algorithm in:
+ * <i>Light Stemming Approaches for the French, Portuguese, German and Hungarian Languages</i>
+ * Jacques Savoy
+ */
+public class HungarianLightStemmer {
+ public int stem(char s[], int len) {
+ for (int i = 0; i < len; i++)
+ switch(s[i]) {
+ case 'á': s[i] = 'a'; break;
+ case 'ë':
+ case 'é': s[i] = 'e'; break;
+ case 'í': s[i] = 'i'; break;
+ case 'ó':
+ case 'ő':
+ case 'õ':
+ case 'ö': s[i] = 'o'; break;
+ case 'ú':
+ case 'ű':
+ case 'ũ':
+ case 'û':
+ case 'ü': s[i] = 'u'; break;
+ }
+
+ len = removeCase(s, len);
+ len = removePossessive(s, len);
+ len = removePlural(s, len);
+ return normalize(s, len);
+ }
+
+ private int removeCase(char s[], int len) {
+ if (len > 6 && endsWith(s, len, "kent"))
+ return len - 4;
+
+ if (len > 5) {
+ if (endsWith(s, len, "nak") ||
+ endsWith(s, len, "nek") ||
+ endsWith(s, len, "val") ||
+ endsWith(s, len, "vel") ||
+ endsWith(s, len, "ert") ||
+ endsWith(s, len, "rol") ||
+ endsWith(s, len, "ban") ||
+ endsWith(s, len, "ben") ||
+ endsWith(s, len, "bol") ||
+ endsWith(s, len, "nal") ||
+ endsWith(s, len, "nel") ||
+ endsWith(s, len, "hoz") ||
+ endsWith(s, len, "hez") ||
+ endsWith(s, len, "tol"))
+ return len - 3;
+
+ if (endsWith(s, len, "al") || endsWith(s, len, "el")) {
+ if (!isVowel(s[len-3]) && s[len-3] == s[len-4])
+ return len - 3;
+ }
+ }
+
+ if (len > 4) {
+ if (endsWith(s, len, "at") ||
+ endsWith(s, len, "et") ||
+ endsWith(s, len, "ot") ||
+ endsWith(s, len, "va") ||
+ endsWith(s, len, "ve") ||
+ endsWith(s, len, "ra") ||
+ endsWith(s, len, "re") ||
+ endsWith(s, len, "ba") ||
+ endsWith(s, len, "be") ||
+ endsWith(s, len, "ul") ||
+ endsWith(s, len, "ig"))
+ return len - 2;
+
+ if ((endsWith(s, len, "on") || endsWith(s, len, "en")) && !isVowel(s[len-3]))
+ return len - 2;
+
+ switch(s[len-1]) {
+ case 't':
+ case 'n': return len - 1;
+ case 'a':
+ case 'e': if (s[len-2] == s[len-3] && !isVowel(s[len-2])) return len - 2;
+ }
+ }
+
+ return len;
+ }
+
+ private int removePossessive(char s[], int len) {
+ if (len > 6) {
+ if (!isVowel(s[len-5]) &&
+ (endsWith(s, len, "atok") ||
+ endsWith(s, len, "otok") ||
+ endsWith(s, len, "etek")))
+ return len - 4;
+
+ if (endsWith(s, len, "itek") || endsWith(s, len, "itok"))
+ return len - 4;
+ }
+
+ if (len > 5) {
+ if (!isVowel(s[len-4]) &&
+ (endsWith(s, len, "unk") ||
+ endsWith(s, len, "tok") ||
+ endsWith(s, len, "tek")))
+ return len - 3;
+
+ if (isVowel(s[len-4]) && endsWith(s, len, "juk"))
+ return len - 3;
+
+ if (endsWith(s, len, "ink"))
+ return len - 3;
+ }
+
+ if (len > 4) {
+ if (!isVowel(s[len-3]) &&
+ (endsWith(s, len, "am") ||
+ endsWith(s, len, "em") ||
+ endsWith(s, len, "om") ||
+ endsWith(s, len, "ad") ||
+ endsWith(s, len, "ed") ||
+ endsWith(s, len, "od") ||
+ endsWith(s, len, "uk")))
+ return len - 2;
+
+ if (isVowel(s[len-3]) &&
+ (endsWith(s, len, "nk") ||
+ endsWith(s, len, "ja") ||
+ endsWith(s, len, "je")))
+ return len - 2;
+
+ if (endsWith(s, len, "im") ||
+ endsWith(s, len, "id") ||
+ endsWith(s, len, "ik"))
+ return len - 2;
+ }
+
+ if (len > 3)
+ switch(s[len-1]) {
+ case 'a':
+ case 'e': if (!isVowel(s[len-2])) return len - 1; break;
+ case 'm':
+ case 'd': if (isVowel(s[len-2])) return len - 1; break;
+ case 'i': return len - 1;
+ }
+
+ return len;
+ }
+
+ private int removePlural(char s[], int len) {
+ if (len > 3 && s[len-1] == 'k')
+ switch(s[len-2]) {
+ case 'a':
+ case 'o':
+ case 'e': if (len > 4) return len - 2; /* intentional fallthru */
+ default: return len - 1;
+ }
+ return len;
+ }
+
+ private int normalize(char s[], int len) {
+ if (len > 3)
+ switch(s[len-1]) {
+ case 'a':
+ case 'e':
+ case 'i':
+ case 'o': return len - 1;
+ }
+ return len;
+ }
+
+ private boolean isVowel(char ch) {
+ switch(ch) {
+ case 'a':
+ case 'e':
+ case 'i':
+ case 'o':
+ case 'u':
+ case 'y': return true;
+ default: return false;
+ }
+ }
+}
Propchange: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemmer.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianStemmer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianStemmer.java?rev=964019&r1=964018&r2=964019&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianStemmer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianStemmer.java Wed Jul 14 12:10:34 2010
@@ -17,6 +17,8 @@ package org.apache.lucene.analysis.id;
* limitations under the License.
*/
+import static org.apache.lucene.analysis.util.StemmerUtil.*;
+
/**
* Stemmer for Indonesian.
* <p>
@@ -266,39 +268,5 @@ public class IndonesianStemmer {
return length - 1;
}
return length;
- }
-
- private boolean startsWith(char s[], int len, String prefix) {
- final int prefixLen = prefix.length();
- if (prefixLen > len)
- return false;
- for (int i = 0; i < prefixLen; i++)
- if (s[i] != prefix.charAt(i))
- return false;
- return true;
- }
-
- private boolean endsWith(char s[], int len, String suffix) {
- final int suffixLen = suffix.length();
- if (suffixLen > len)
- return false;
- for (int i = suffixLen - 1; i >= 0; i--)
- if (s[len -(suffixLen - i)] != suffix.charAt(i))
- return false;
-
- return true;
- }
-
- private int deleteN(char s[], int pos, int len, int nChars) {
- for (int i = 0; i < nChars; i++)
- len = delete(s, pos, len);
- return len;
- }
-
- private int delete(char s[], int pos, int len) {
- if (pos < len)
- System.arraycopy(s, pos + 1, s, pos, len - pos - 1);
-
- return len - 1;
- }
+ }
}
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/in/IndicNormalizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/in/IndicNormalizer.java?rev=964019&r1=964018&r2=964019&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/in/IndicNormalizer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/in/IndicNormalizer.java Wed Jul 14 12:10:34 2010
@@ -20,6 +20,7 @@ package org.apache.lucene.analysis.in;
import java.util.BitSet;
import java.util.IdentityHashMap;
import static java.lang.Character.UnicodeBlock.*;
+import static org.apache.lucene.analysis.util.StemmerUtil.*;
/**
* Normalizes the Unicode representation of text in Indian languages.
@@ -290,14 +291,4 @@ public class IndicNormalizer {
return len;
}
-
- /**
- * Delete a character in-place
- */
- private int delete(char s[], int pos, int len) {
- if (pos < len)
- System.arraycopy(s, pos + 1, s, pos, len - pos - 1);
-
- return len - 1;
- }
}
Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemFilter.java?rev=964019&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemFilter.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemFilter.java Wed Jul 14 12:10:34 2010
@@ -0,0 +1,58 @@
+package org.apache.lucene.analysis.it;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+
+/**
+ * A {@link TokenFilter} that applies {@link ItalianLightStemmer} to stem Italian
+ * words.
+ * <p>
+ * To prevent terms from being stemmed use an instance of
+ * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
+ * the {@link KeywordAttribute} before this {@link TokenStream}.
+ * </p>
+ */
+public final class ItalianLightStemFilter extends TokenFilter {
+ private final ItalianLightStemmer stemmer = new ItalianLightStemmer();
+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
+
+ public ItalianLightStemFilter(TokenStream input) {
+ super(input);
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ if (!keywordAttr.isKeyword()) {
+ final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
+ termAtt.setLength(newlen);
+ }
+ return true;
+ } else {
+ return false;
+ }
+ }
+}
Propchange: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemFilter.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemmer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemmer.java?rev=964019&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemmer.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemmer.java Wed Jul 14 12:10:34 2010
@@ -0,0 +1,117 @@
+package org.apache.lucene.analysis.it;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * This algorithm is updated based on code located at:
+ * http://members.unine.ch/jacques.savoy/clef/
+ *
+ * Full copyright for that code follows:
+ */
+
+/*
+ * Copyright (c) 2005, Jacques Savoy
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer. Redistributions in binary
+ * form must reproduce the above copyright notice, this list of conditions and
+ * the following disclaimer in the documentation and/or other materials
+ * provided with the distribution. Neither the name of the author nor the names
+ * of its contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * Light Stemmer for Italian.
+ * <p>
+ * This stemmer implements the algorithm described in:
+ * <i>Report on CLEF-2001 Experiments</i>
+ * Jacques Savoy
+ */
+public class ItalianLightStemmer {
+
+ public int stem(char s[], int len) {
+ if (len < 6)
+ return len;
+
+ for (int i = 0; i < len; i++)
+ switch(s[i]) {
+ case 'à':
+ case 'á':
+ case 'â':
+ case 'ä': s[i] = 'a'; break;
+ case 'ò':
+ case 'ó':
+ case 'ô':
+ case 'ö': s[i] = 'o'; break;
+ case 'è':
+ case 'é':
+ case 'ê':
+ case 'ë': s[i] = 'e'; break;
+ case 'ù':
+ case 'ú':
+ case 'û':
+ case 'ü': s[i] = 'u'; break;
+ case 'ì':
+ case 'í':
+ case 'î':
+ case 'ï': s[i] = 'i'; break;
+ }
+
+ switch(s[len-1]) {
+ case 'e':
+ if (s[len-2] == 'i' || s[len-2] == 'h')
+ return len - 2;
+ else
+ return len - 1;
+ case 'i':
+ if (s[len-2] == 'h' || s[len-2] == 'i')
+ return len - 2;
+ else
+ return len - 1;
+ case 'a':
+ if (s[len-2] == 'i')
+ return len - 2;
+ else
+ return len - 1;
+ case 'o':
+ if (s[len-2] == 'i')
+ return len - 2;
+ else
+ return len - 1;
+ }
+
+ return len;
+ }
+}
Propchange: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemmer.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemFilter.java?rev=964019&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemFilter.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemFilter.java Wed Jul 14 12:10:34 2010
@@ -0,0 +1,58 @@
+package org.apache.lucene.analysis.pt;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+
+/**
+ * A {@link TokenFilter} that applies {@link PortugueseLightStemmer} to stem
+ * Portuguese words.
+ * <p>
+ * To prevent terms from being stemmed use an instance of
+ * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
+ * the {@link KeywordAttribute} before this {@link TokenStream}.
+ * </p>
+ */
+public final class PortugueseLightStemFilter extends TokenFilter {
+ private final PortugueseLightStemmer stemmer = new PortugueseLightStemmer();
+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
+
+ public PortugueseLightStemFilter(TokenStream input) {
+ super(input);
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ if (!keywordAttr.isKeyword()) {
+ final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
+ termAtt.setLength(newlen);
+ }
+ return true;
+ } else {
+ return false;
+ }
+ }
+}
Propchange: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemFilter.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemmer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemmer.java?rev=964019&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemmer.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemmer.java Wed Jul 14 12:10:34 2010
@@ -0,0 +1,202 @@
+package org.apache.lucene.analysis.pt;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * This algorithm is updated based on code located at:
+ * http://members.unine.ch/jacques.savoy/clef/
+ *
+ * Full copyright for that code follows:
+ */
+
+/*
+ * Copyright (c) 2005, Jacques Savoy
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer. Redistributions in binary
+ * form must reproduce the above copyright notice, this list of conditions and
+ * the following disclaimer in the documentation and/or other materials
+ * provided with the distribution. Neither the name of the author nor the names
+ * of its contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+import static org.apache.lucene.analysis.util.StemmerUtil.*;
+
+/**
+ * Light Stemmer for Portuguese
+ */
+public class PortugueseLightStemmer {
+
+ public int stem(char s[], int len) {
+ if (len < 4)
+ return len;
+
+ len = removeSuffix(s, len);
+
+ if (len > 3 && s[len-1] == 'a')
+ len = normFeminine(s, len);
+
+ if (len > 4)
+ switch(s[len-1]) {
+ case 'e':
+ case 'a':
+ case 'o': len--; break;
+ }
+
+ for (int i = 0; i < len; i++)
+ switch(s[i]) {
+ case 'à':
+ case 'á':
+ case 'â':
+ case 'ä':
+ case 'ã': s[i] = 'a'; break;
+ case 'ò':
+ case 'ó':
+ case 'ô':
+ case 'ö':
+ case 'õ': s[i] = 'o'; break;
+ case 'è':
+ case 'é':
+ case 'ê':
+ case 'ë': s[i] = 'e'; break;
+ case 'ù':
+ case 'ú':
+ case 'û':
+ case 'ü': s[i] = 'u'; break;
+ case 'ì':
+ case 'í':
+ case 'î':
+ case 'ï': s[i] = 'i'; break;
+ case 'ç': s[i] = 'c'; break;
+ }
+
+ return len;
+ }
+
+ private int removeSuffix(char s[], int len) {
+ if (len > 4 && endsWith(s, len, "es"))
+ switch(s[len-3]) {
+ case 'r':
+ case 's':
+ case 'l':
+ case 'z': return len - 2;
+ }
+
+ if (len > 3 && endsWith(s, len, "ns")) {
+ s[len - 2] = 'm';
+ return len - 1;
+ }
+
+ if (len > 4 && (endsWith(s, len, "eis") || endsWith(s, len, "éis"))) {
+ s[len - 3] = 'e';
+ s[len - 2] = 'l';
+ return len - 1;
+ }
+
+ if (len > 4 && endsWith(s, len, "ais")) {
+ s[len - 2] = 'l';
+ return len - 1;
+ }
+
+ if (len > 4 && endsWith(s, len, "óis")) {
+ s[len - 3] = 'o';
+ s[len - 2] = 'l';
+ return len - 1;
+ }
+
+ if (len > 4 && endsWith(s, len, "is")) {
+ s[len - 1] = 'l';
+ return len;
+ }
+
+ if (len > 3 &&
+ (endsWith(s, len, "ões") ||
+ endsWith(s, len, "ães"))) {
+ len--;
+ s[len - 2] = 'ã';
+ s[len - 1] = 'o';
+ return len;
+ }
+
+ if (len > 6 && endsWith(s, len, "mente"))
+ return len - 5;
+
+ if (len > 3 && s[len-1] == 's')
+ return len - 1;
+ return len;
+ }
+
+ private int normFeminine(char s[], int len) {
+ if (len > 7 &&
+ (endsWith(s, len, "inha") ||
+ endsWith(s, len, "iaca") ||
+ endsWith(s, len, "eira"))) {
+ s[len - 1] = 'o';
+ return len;
+ }
+
+ if (len > 6) {
+ if (endsWith(s, len, "osa") ||
+ endsWith(s, len, "ica") ||
+ endsWith(s, len, "ida") ||
+ endsWith(s, len, "ada") ||
+ endsWith(s, len, "iva") ||
+ endsWith(s, len, "ama")) {
+ s[len - 1] = 'o';
+ return len;
+ }
+
+ if (endsWith(s, len, "ona")) {
+ s[len - 3] = 'ã';
+ s[len - 2] = 'o';
+ return len - 1;
+ }
+
+ if (endsWith(s, len, "ora"))
+ return len - 1;
+
+ if (endsWith(s, len, "esa")) {
+ s[len - 3] = 'ê';
+ return len - 1;
+ }
+
+ if (endsWith(s, len, "na")) {
+ s[len - 1] = 'o';
+ return len;
+ }
+ }
+ return len;
+ }
+}
Propchange: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemmer.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseMinimalStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseMinimalStemFilter.java?rev=964019&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseMinimalStemFilter.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseMinimalStemFilter.java Wed Jul 14 12:10:34 2010
@@ -0,0 +1,58 @@
+package org.apache.lucene.analysis.pt;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+
+/**
+ * A {@link TokenFilter} that applies {@link PortugueseMinimalStemmer} to stem
+ * Portuguese words.
+ * <p>
+ * To prevent terms from being stemmed use an instance of
+ * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
+ * the {@link KeywordAttribute} before this {@link TokenStream}.
+ * </p>
+ */
+public final class PortugueseMinimalStemFilter extends TokenFilter {
+ private final PortugueseMinimalStemmer stemmer = new PortugueseMinimalStemmer();
+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
+
+ public PortugueseMinimalStemFilter(TokenStream input) {
+ super(input);
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ if (!keywordAttr.isKeyword()) {
+ final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
+ termAtt.setLength(newlen);
+ }
+ return true;
+ } else {
+ return false;
+ }
+ }
+}
Propchange: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseMinimalStemFilter.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseMinimalStemmer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseMinimalStemmer.java?rev=964019&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseMinimalStemmer.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseMinimalStemmer.java Wed Jul 14 12:10:34 2010
@@ -0,0 +1,119 @@
+package org.apache.lucene.analysis.pt;
+
+import java.util.Arrays;
+
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.util.Version;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Minimal Stemmer for Portuguese
+ * <p>
+ * This follows the "RSLP-S" algorithm presented in:
+ * <i>A study on the Use of Stemming for Monolingual Ad-Hoc Portuguese
+ * Information Retrieval</i> (Orengo, et al)
+ * which is just the plural reduction step of the RSLP
+ * algorithm from <i>A Stemming Algorithm for the Portuguese Language</i>,
+ * Orengo et al.
+ */
+public class PortugueseMinimalStemmer {
+
+ private static final CharArraySet excIS = new CharArraySet(Version.LUCENE_31,
+ Arrays.asList("lápis", "cais", "mais", "crúcis", "biquínis", "pois",
+ "depois","dois","leis"),
+ false);
+
+ private static final CharArraySet excS = new CharArraySet(Version.LUCENE_31,
+ Arrays.asList("aliás", "pires", "lápis", "cais", "mais", "mas", "menos",
+ "férias", "fezes", "pêsames", "crúcis", "gás", "atrás", "moisés",
+ "através", "convés", "ês", "país", "após", "ambas", "ambos",
+ "messias", "depois"),
+ false);
+
+ public int stem(char s[], int len) {
+ if (len < 3 || s[len-1] != 's')
+ return len;
+
+ if (s[len-2] == 'n') {
+ len--;
+ s[len-1] = 'm';
+ return len;
+ }
+
+ if (len >= 6 && s[len-3] == 'õ' && s[len-2] == 'e') {
+ len--;
+ s[len-2] = 'ã';
+ s[len-1] = 'o';
+ return len;
+ }
+
+ if (len >= 4 && s[len-3] == 'ã' && s[len-2] == 'e')
+ if (!(len == 4 && s[0] == 'm')) {
+ len--;
+ s[len-1] = 'o';
+ return len;
+ }
+
+ if (len >= 4 && s[len-2] == 'i') {
+ if (s[len-3] == 'a')
+ if (!(len == 4 && (s[0] == 'c' || s[0] == 'm'))) {
+ len--;
+ s[len-1] = 'l';
+ return len;
+ }
+
+ if (len >= 5 && s[len-3] == 'é') {
+ len--;
+ s[len-2] = 'e';
+ s[len-1] = 'l';
+ return len;
+ }
+
+ if (len >= 5 && s[len-3] == 'e') {
+ len--;
+ s[len-1] = 'l';
+ return len;
+ }
+
+ if (len >= 5 && s[len-3] == 'ó') {
+ len--;
+ s[len-2] = 'o';
+ s[len-1] = 'l';
+ return len;
+ }
+
+ if (!excIS.contains(s, 0, len)) {
+ s[len-1] = 'l';
+ return len;
+ }
+ }
+
+ if (len >= 6 && s[len-3] == 'l' && s[len-2] == 'e')
+ return len - 2;
+
+ if (len >= 6 && s[len-3] == 'r' && s[len-2] == 'e')
+ if (!(len == 7 && s[0] == 'á' && s[1] == 'r' && s[2] == 'v' && s[3] == 'o'))
+ return len - 2;
+
+ if (excS.contains(s, 0, len))
+ return len;
+ else
+ return len-1;
+ }
+}
Propchange: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseMinimalStemmer.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemFilter.java?rev=964019&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemFilter.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemFilter.java Wed Jul 14 12:10:34 2010
@@ -0,0 +1,58 @@
+package org.apache.lucene.analysis.ru;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+
+/**
+ * A {@link TokenFilter} that applies {@link RussianLightStemmer} to stem Russian
+ * words.
+ * <p>
+ * To prevent terms from being stemmed use an instance of
+ * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
+ * the {@link KeywordAttribute} before this {@link TokenStream}.
+ * </p>
+ */
+public final class RussianLightStemFilter extends TokenFilter {
+ private final RussianLightStemmer stemmer = new RussianLightStemmer();
+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
+
+ public RussianLightStemFilter(TokenStream input) {
+ super(input);
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ if (!keywordAttr.isKeyword()) {
+ final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
+ termAtt.setLength(newlen);
+ }
+ return true;
+ } else {
+ return false;
+ }
+ }
+}
Propchange: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemFilter.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemmer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemmer.java?rev=964019&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemmer.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemmer.java Wed Jul 14 12:10:34 2010
@@ -0,0 +1,153 @@
+package org.apache.lucene.analysis.ru;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * This algorithm is updated based on code located at:
+ * http://members.unine.ch/jacques.savoy/clef/
+ *
+ * Full copyright for that code follows:
+ */
+
+/*
+ * Copyright (c) 2005, Jacques Savoy
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer. Redistributions in binary
+ * form must reproduce the above copyright notice, this list of conditions and
+ * the following disclaimer in the documentation and/or other materials
+ * provided with the distribution. Neither the name of the author nor the names
+ * of its contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+import static org.apache.lucene.analysis.util.StemmerUtil.*;
+
+/**
+ * Light Stemmer for Russian.
+ * <p>
+ * This stemmer implements the following algorithm:
+ * <i>Indexing and Searching Strategies for the Russian Language.</i>
+ * Ljiljana Dolamic and Jacques Savoy.
+ */
+public class RussianLightStemmer {
+
+ public int stem(char s[], int len) {
+ len = removeCase(s, len);
+ return normalize(s, len);
+ }
+
+ private int normalize(char s[], int len) {
+ if (len > 3)
+ switch(s[len-1]) {
+ case 'ь':
+ case 'и': return len - 1;
+ case 'н': if (s[len-2] == 'н') return len - 1;
+ }
+ return len;
+ }
+
+ private int removeCase(char s[], int len) {
+ if (len > 6 &&
+ (endsWith(s, len, "иями") ||
+ endsWith(s, len, "оями")))
+ return len - 4;
+
+ if (len > 5 &&
+ (endsWith(s, len, "иям") ||
+ endsWith(s, len, "иях") ||
+ endsWith(s, len, "оях") ||
+ endsWith(s, len, "ями") ||
+ endsWith(s, len, "оям") ||
+ endsWith(s, len, "оьв") ||
+ endsWith(s, len, "ами") ||
+ endsWith(s, len, "его") ||
+ endsWith(s, len, "ему") ||
+ endsWith(s, len, "ери") ||
+ endsWith(s, len, "ими") ||
+ endsWith(s, len, "ого") ||
+ endsWith(s, len, "ому") ||
+ endsWith(s, len, "ыми") ||
+ endsWith(s, len, "оев")))
+ return len - 3;
+
+ if (len > 4 &&
+ (endsWith(s, len, "ая") ||
+ endsWith(s, len, "яя") ||
+ endsWith(s, len, "ях") ||
+ endsWith(s, len, "юю") ||
+ endsWith(s, len, "ах") ||
+ endsWith(s, len, "ею") ||
+ endsWith(s, len, "их") ||
+ endsWith(s, len, "ия") ||
+ endsWith(s, len, "ию") ||
+ endsWith(s, len, "ьв") ||
+ endsWith(s, len, "ою") ||
+ endsWith(s, len, "ую") ||
+ endsWith(s, len, "ям") ||
+ endsWith(s, len, "ых") ||
+ endsWith(s, len, "ея") ||
+ endsWith(s, len, "ам") ||
+ endsWith(s, len, "ем") ||
+ endsWith(s, len, "ей") ||
+ endsWith(s, len, "ём") ||
+ endsWith(s, len, "ев") ||
+ endsWith(s, len, "ий") ||
+ endsWith(s, len, "им") ||
+ endsWith(s, len, "ое") ||
+ endsWith(s, len, "ой") ||
+ endsWith(s, len, "ом") ||
+ endsWith(s, len, "ов") ||
+ endsWith(s, len, "ые") ||
+ endsWith(s, len, "ый") ||
+ endsWith(s, len, "ым") ||
+ endsWith(s, len, "ми")))
+ return len - 2;
+
+ if (len > 3)
+ switch(s[len-1]) {
+ case 'а':
+ case 'е':
+ case 'и':
+ case 'о':
+ case 'у':
+ case 'й':
+ case 'ы':
+ case 'я':
+ case 'ь': return len - 1;
+ }
+
+ return len;
+ }
+}
Propchange: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemmer.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemFilter.java?rev=964019&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemFilter.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemFilter.java Wed Jul 14 12:10:34 2010
@@ -0,0 +1,58 @@
+package org.apache.lucene.analysis.sv;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+
+/**
+ * A {@link TokenFilter} that applies {@link SwedishLightStemmer} to stem Swedish
+ * words.
+ * <p>
+ * To prevent terms from being stemmed use an instance of
+ * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
+ * the {@link KeywordAttribute} before this {@link TokenStream}.
+ * </p>
+ */
+public final class SwedishLightStemFilter extends TokenFilter {
+ private final SwedishLightStemmer stemmer = new SwedishLightStemmer();
+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
+
+ public SwedishLightStemFilter(TokenStream input) {
+ super(input);
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ if (!keywordAttr.isKeyword()) {
+ final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
+ termAtt.setLength(newlen);
+ }
+ return true;
+ } else {
+ return false;
+ }
+ }
+}
Propchange: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemFilter.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemmer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemmer.java?rev=964019&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemmer.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemmer.java Wed Jul 14 12:10:34 2010
@@ -0,0 +1,111 @@
+package org.apache.lucene.analysis.sv;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * This algorithm is updated based on code located at:
+ * http://members.unine.ch/jacques.savoy/clef/
+ *
+ * Full copyright for that code follows:
+ */
+
+/*
+ * Copyright (c) 2005, Jacques Savoy
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer. Redistributions in binary
+ * form must reproduce the above copyright notice, this list of conditions and
+ * the following disclaimer in the documentation and/or other materials
+ * provided with the distribution. Neither the name of the author nor the names
+ * of its contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+import static org.apache.lucene.analysis.util.StemmerUtil.*;
+
+/**
+ * Light Stemmer for Swedish.
+ * <p>
+ * This stemmer implements the algorithm described in:
+ * <i>Report on CLEF-2003 Monolingual Tracks</i>
+ * Jacques Savoy
+ */
+public class SwedishLightStemmer {
+
+ public int stem(char s[], int len) {
+ if (len > 4 && s[len-1] == 's')
+ len--;
+
+ if (len > 7 &&
+ (endsWith(s, len, "elser") ||
+ endsWith(s, len, "heten")))
+ return len - 5;
+
+ if (len > 6 &&
+ (endsWith(s, len, "arne") ||
+ endsWith(s, len, "erna") ||
+ endsWith(s, len, "ande") ||
+ endsWith(s, len, "else") ||
+ endsWith(s, len, "aste") ||
+ endsWith(s, len, "orna") ||
+ endsWith(s, len, "aren")))
+ return len - 4;
+
+ if (len > 5 &&
+ (endsWith(s, len, "are") ||
+ endsWith(s, len, "ast") ||
+ endsWith(s, len, "het")))
+ return len - 3;
+
+ if (len > 4 &&
+ (endsWith(s, len, "ar") ||
+ endsWith(s, len, "er") ||
+ endsWith(s, len, "or") ||
+ endsWith(s, len, "en") ||
+ endsWith(s, len, "at") ||
+ endsWith(s, len, "te") ||
+ endsWith(s, len, "et")))
+ return len - 2;
+
+ if (len > 3)
+ switch(s[len-1]) {
+ case 't':
+ case 'a':
+ case 'e':
+ case 'n': return len - 1;
+ }
+
+ return len;
+ }
+}
Propchange: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemmer.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/util/StemmerUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/util/StemmerUtil.java?rev=964019&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/util/StemmerUtil.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/util/StemmerUtil.java Wed Jul 14 12:10:34 2010
@@ -0,0 +1,89 @@
+package org.apache.lucene.analysis.util;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/** Some commonly-used stemming functions */
+public class StemmerUtil {
+ /**
+ * Returns true if the character array starts with the prefix.
+ *
+ * @param s Input Buffer
+ * @param len length of input buffer
+ * @param prefix Prefix string to test
+ * @return true if <code>s</code> starts with <code>prefix</code>
+ */
+ public static boolean startsWith(char s[], int len, String prefix) {
+ final int prefixLen = prefix.length();
+ if (prefixLen > len)
+ return false;
+ for (int i = 0; i < prefixLen; i++)
+ if (s[i] != prefix.charAt(i))
+ return false;
+ return true;
+ }
+
+ /**
+ * Returns true if the character array ends with the suffix.
+ *
+ * @param s Input Buffer
+ * @param len length of input buffer
+ * @param suffix Suffix string to test
+ * @return true if <code>s</code> ends with <code>suffix</code>
+ */
+ public static boolean endsWith(char s[], int len, String suffix) {
+ final int suffixLen = suffix.length();
+ if (suffixLen > len)
+ return false;
+ for (int i = suffixLen - 1; i >= 0; i--)
+ if (s[len -(suffixLen - i)] != suffix.charAt(i))
+ return false;
+
+ return true;
+ }
+
+ /**
+ * Delete a character in-place
+ *
+ * @param s Input Buffer
+ * @param pos Position of character to delete
+ * @param len length of input buffer
+ * @return length of input buffer after deletion
+ */
+ public static int delete(char s[], int pos, int len) {
+ if (pos < len)
+ System.arraycopy(s, pos + 1, s, pos, len - pos - 1);
+
+ return len - 1;
+ }
+
+ /**
+ * Delete n characters in-place
+ *
+ * @param s Input Buffer
+ * @param pos Position of the first character to delete
+ * @param len Length of input buffer
+ * @param nChars number of characters to delete
+ * @return length of input buffer after deletion
+ */
+ public static int deleteN(char s[], int pos, int len, int nChars) {
+ // TODO: speed up, this is silly
+ for (int i = 0; i < nChars; i++)
+ len = delete(s, pos, len);
+ return len;
+ }
+}
Propchange: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/util/StemmerUtil.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanLightStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanLightStemFilter.java?rev=964019&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanLightStemFilter.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanLightStemFilter.java Wed Jul 14 12:10:34 2010
@@ -0,0 +1,48 @@
+package org.apache.lucene.analysis.de;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
+
+import static org.apache.lucene.analysis.util.VocabularyAssert.*;
+
+/**
+ * Simple tests for {@link GermanLightStemFilter}
+ */
+public class TestGermanLightStemFilter extends BaseTokenStreamTestCase {
+ private Analyzer analyzer = new ReusableAnalyzerBase() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
+ return new TokenStreamComponents(source, new GermanLightStemFilter(source));
+ }
+ };
+
+ /** Test against a vocabulary from the reference impl */
+ public void testVocabulary() throws IOException {
+ assertVocabulary(analyzer, getDataFile("delighttestdata.zip"), "delight.txt");
+ }
+}
Propchange: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanLightStemFilter.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanMinimalStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanMinimalStemFilter.java?rev=964019&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanMinimalStemFilter.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanMinimalStemFilter.java Wed Jul 14 12:10:34 2010
@@ -0,0 +1,60 @@
+package org.apache.lucene.analysis.de;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
+
+import static org.apache.lucene.analysis.util.VocabularyAssert.*;
+
+/**
+ * Simple tests for {@link GermanMinimalStemFilter}
+ */
+public class TestGermanMinimalStemFilter extends BaseTokenStreamTestCase {
+ private Analyzer analyzer = new ReusableAnalyzerBase() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
+ return new TokenStreamComponents(source, new GermanMinimalStemFilter(source));
+ }
+ };
+
+ /** Test some examples from the paper */
+ public void testExamples() throws IOException {
+ checkOneTerm(analyzer, "sängerinnen", "sangerin");
+ checkOneTerm(analyzer, "frauen", "frau");
+ checkOneTerm(analyzer, "kenntnisse", "kenntnis");
+ checkOneTerm(analyzer, "staates", "staat");
+ checkOneTerm(analyzer, "bilder", "bild");
+ checkOneTerm(analyzer, "boote", "boot");
+ checkOneTerm(analyzer, "götter", "gott");
+ checkOneTerm(analyzer, "äpfel", "apfel");
+ }
+
+ /** Test against a vocabulary from the reference impl */
+ public void testVocabulary() throws IOException {
+ assertVocabulary(analyzer, getDataFile("deminimaltestdata.zip"), "deminimal.txt");
+ }
+}
Propchange: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanMinimalStemFilter.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java?rev=964019&r1=964018&r2=964019&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java Wed Jul 14 12:10:34 2010
@@ -17,17 +17,17 @@ package org.apache.lucene.analysis.de;
* limitations under the License.
*/
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.InputStreamReader;
-import java.io.StringReader;
+import java.io.InputStream;
+import java.io.Reader;
+import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
+
+import static org.apache.lucene.analysis.util.VocabularyAssert.*;
/**
* Test the German stemmer. The stemming algorithm is known to work less
@@ -38,25 +38,18 @@ import org.apache.lucene.analysis.core.L
public class TestGermanStemFilter extends BaseTokenStreamTestCase {
public void testStemming() throws Exception {
- Tokenizer tokenizer = new KeywordTokenizer(new StringReader(""));
- TokenFilter filter = new GermanStemFilter(new LowerCaseFilter(TEST_VERSION_CURRENT, tokenizer));
- // read test cases from external file:
- InputStreamReader isr = new InputStreamReader(getClass().getResourceAsStream("data.txt"), "iso-8859-1");
- BufferedReader breader = new BufferedReader(isr);
- while(true) {
- String line = breader.readLine();
- if (line == null)
- break;
- line = line.trim();
- if (line.startsWith("#") || line.equals(""))
- continue; // ignore comments and empty lines
- String[] parts = line.split(";");
- //System.out.println(parts[0] + " -- " + parts[1]);
- tokenizer.reset(new StringReader(parts[0]));
- filter.reset();
- assertTokenStreamContents(filter, new String[] { parts[1] });
- }
- breader.close();
- isr.close();
+ Analyzer analyzer = new ReusableAnalyzerBase() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ Tokenizer t = new KeywordTokenizer(reader);
+ return new TokenStreamComponents(t,
+ new GermanStemFilter(new LowerCaseFilter(TEST_VERSION_CURRENT, t)));
+ }
+ };
+
+ InputStream vocOut = getClass().getResourceAsStream("data.txt");
+ assertVocabulary(analyzer, vocOut);
+ vocOut.close();
}
}
Modified: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/data.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/data.txt?rev=964019&r1=964018&r2=964019&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/data.txt (original)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/data.txt Wed Jul 14 12:10:34 2010
@@ -1,48 +1,48 @@
# German special characters are replaced:
-häufig;haufig
+häufig haufig
# here the stemmer works okay, it maps related words to the same stem:
-abschließen;abschliess
-abschließender;abschliess
-abschließendes;abschliess
-abschließenden;abschliess
-
-Tisch;tisch
-Tische;tisch
-Tischen;tisch
-
-Haus;hau
-Hauses;hau
-Häuser;hau
-Häusern;hau
+abschließen abschliess
+abschließender abschliess
+abschließendes abschliess
+abschließenden abschliess
+
+Tisch tisch
+Tische tisch
+Tischen tisch
+
+Haus hau
+Hauses hau
+Häuser hau
+Häusern hau
# here's a case where overstemming occurs, i.e. a word is
# mapped to the same stem as unrelated words:
-hauen;hau
+hauen hau
# here's a case where understemming occurs, i.e. two related words
# are not mapped to the same stem. This is the case with basically
# all irregular forms:
-Drama;drama
-Dramen;dram
+Drama drama
+Dramen dram
-# replace "ß" with 'ss':
-Ausmaß;ausmass
+# replace "ß" with 'ss':
+Ausmaß ausmass
# fake words to test if suffixes are cut off:
-xxxxxe;xxxxx
-xxxxxs;xxxxx
-xxxxxn;xxxxx
-xxxxxt;xxxxx
-xxxxxem;xxxxx
-xxxxxer;xxxxx
-xxxxxnd;xxxxx
+xxxxxe xxxxx
+xxxxxs xxxxx
+xxxxxn xxxxx
+xxxxxt xxxxx
+xxxxxem xxxxx
+xxxxxer xxxxx
+xxxxxnd xxxxx
# the suffixes are also removed when combined:
-xxxxxetende;xxxxx
+xxxxxetende xxxxx
# words that are shorter than four charcters are not changed:
-xxe;xxe
+xxe xxe
# -em and -er are not removed from words shorter than five characters:
-xxem;xxem
-xxer;xxer
+xxem xxem
+xxer xxer
# -nd is not removed from words shorter than six characters:
-xxxnd;xxxnd
+xxxnd xxxnd
Added: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/delighttestdata.zip
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/delighttestdata.zip?rev=964019&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/delighttestdata.zip
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/deminimaltestdata.zip
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/deminimaltestdata.zip?rev=964019&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/de/deminimaltestdata.zip
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/en/TestEnglishMinimalStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/en/TestEnglishMinimalStemFilter.java?rev=964019&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/en/TestEnglishMinimalStemFilter.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/en/TestEnglishMinimalStemFilter.java Wed Jul 14 12:10:34 2010
@@ -0,0 +1,54 @@
+package org.apache.lucene.analysis.en;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
+
+/**
+ * Simple tests for {@link EnglishMinimalStemFilter}
+ */
+public class TestEnglishMinimalStemFilter extends BaseTokenStreamTestCase {
+ private Analyzer analyzer = new ReusableAnalyzerBase() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
+ return new TokenStreamComponents(source, new EnglishMinimalStemFilter(source));
+ }
+ };
+
+ /** Test some examples from various papers about this technique */
+ public void testExamples() throws IOException {
+ checkOneTerm(analyzer, "queries", "query");
+ checkOneTerm(analyzer, "phrases", "phrase");
+ checkOneTerm(analyzer, "corpus", "corpus");
+ checkOneTerm(analyzer, "stress", "stress");
+ checkOneTerm(analyzer, "kings", "king");
+ checkOneTerm(analyzer, "panels", "panel");
+ checkOneTerm(analyzer, "aerodynamics", "aerodynamic");
+ checkOneTerm(analyzer, "congress", "congress");
+ checkOneTerm(analyzer, "serious", "serious");
+ }
+}
Propchange: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/en/TestEnglishMinimalStemFilter.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/en/TestPorterStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/en/TestPorterStemFilter.java?rev=964019&r1=964018&r2=964019&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/en/TestPorterStemFilter.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/en/TestPorterStemFilter.java Wed Jul 14 12:10:34 2010
@@ -17,21 +17,22 @@ package org.apache.lucene.analysis.en;
* limitations under the License.
*/
-import java.io.BufferedReader;
import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
+import java.io.Reader;
import java.io.StringReader;
-import java.util.zip.ZipFile;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
+import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import static org.apache.lucene.analysis.util.VocabularyAssert.*;
+
/**
* Test the PorterStemFilter with Martin Porter's test data.
*/
@@ -41,26 +42,16 @@ public class TestPorterStemFilter extend
* The output should be the same as the string in output.txt
*/
public void testPorterStemFilter() throws Exception {
- Tokenizer tokenizer = new KeywordTokenizer(new StringReader(""));
- TokenStream filter = new PorterStemFilter(tokenizer);
- ZipFile zipFile = new ZipFile(getDataFile("porterTestData.zip"));
- InputStream voc = zipFile.getInputStream(zipFile.getEntry("voc.txt"));
- InputStream out = zipFile.getInputStream(zipFile.getEntry("output.txt"));
- BufferedReader vocReader = new BufferedReader(new InputStreamReader(
- voc, "UTF-8"));
- BufferedReader outputReader = new BufferedReader(new InputStreamReader(
- out, "UTF-8"));
- String inputWord = null;
- while ((inputWord = vocReader.readLine()) != null) {
- String expectedWord = outputReader.readLine();
- assertNotNull(expectedWord);
- tokenizer.reset(new StringReader(inputWord));
- filter.reset();
- assertTokenStreamContents(filter, new String[] { expectedWord });
- }
- vocReader.close();
- outputReader.close();
- zipFile.close();
+ Analyzer a = new ReusableAnalyzerBase() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ Tokenizer t = new KeywordTokenizer(reader);
+ return new TokenStreamComponents(t, new PorterStemFilter(t));
+ }
+ };
+
+ assertVocabulary(a, getDataFile("porterTestData.zip"), "voc.txt", "output.txt");
}
public void testWithKeywordAttribute() throws IOException {
Added: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/es/TestSpanishLightStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/es/TestSpanishLightStemFilter.java?rev=964019&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/es/TestSpanishLightStemFilter.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/es/TestSpanishLightStemFilter.java Wed Jul 14 12:10:34 2010
@@ -0,0 +1,48 @@
+package org.apache.lucene.analysis.es;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
+
+import static org.apache.lucene.analysis.util.VocabularyAssert.*;
+
+/**
+ * Simple tests for {@link SpanishLightStemFilter}
+ */
+public class TestSpanishLightStemFilter extends BaseTokenStreamTestCase {
+ private Analyzer analyzer = new ReusableAnalyzerBase() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
+ return new TokenStreamComponents(source, new SpanishLightStemFilter(source));
+ }
+ };
+
+ /** Test against a vocabulary from the reference impl */
+ public void testVocabulary() throws IOException {
+ assertVocabulary(analyzer, getDataFile("eslighttestdata.zip"), "eslight.txt");
+ }
+}
Propchange: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/es/TestSpanishLightStemFilter.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/es/eslighttestdata.zip
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/es/eslighttestdata.zip?rev=964019&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/es/eslighttestdata.zip
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/fi/TestFinnishLightStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/fi/TestFinnishLightStemFilter.java?rev=964019&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/fi/TestFinnishLightStemFilter.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/fi/TestFinnishLightStemFilter.java Wed Jul 14 12:10:34 2010
@@ -0,0 +1,48 @@
+package org.apache.lucene.analysis.fi;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
+
+import static org.apache.lucene.analysis.util.VocabularyAssert.*;
+
+/**
+ * Simple tests for {@link FinnishLightStemFilter}
+ */
+public class TestFinnishLightStemFilter extends BaseTokenStreamTestCase {
+ private Analyzer analyzer = new ReusableAnalyzerBase() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
+ return new TokenStreamComponents(source, new FinnishLightStemFilter(source));
+ }
+ };
+
+ /** Test against a vocabulary from the reference impl */
+ public void testVocabulary() throws IOException {
+ assertVocabulary(analyzer, getDataFile("filighttestdata.zip"), "filight.txt");
+ }
+}
Propchange: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/fi/TestFinnishLightStemFilter.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/fi/filighttestdata.zip
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/fi/filighttestdata.zip?rev=964019&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/fi/filighttestdata.zip
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream