You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2011/04/14 19:07:10 UTC
svn commit: r1092396 - in /lucene/dev/trunk: lucene/contrib/
modules/analysis/common/src/java/org/apache/lucene/analysis/lv/
modules/analysis/common/src/resources/org/apache/lucene/analysis/lv/
modules/analysis/common/src/test/org/apache/lucene/analysi...
Author: rmuir
Date: Thu Apr 14 17:07:10 2011
New Revision: 1092396
URL: http://svn.apache.org/viewvc?rev=1092396&view=rev
Log:
LUCENE-3016: add analyzer for Latvian
Added:
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java (with props)
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianStemFilter.java (with props)
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianStemmer.java (with props)
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/package.html (with props)
lucene/dev/trunk/modules/analysis/common/src/resources/org/apache/lucene/analysis/lv/
lucene/dev/trunk/modules/analysis/common/src/resources/org/apache/lucene/analysis/lv/stopwords.txt (with props)
lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/lv/
lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/lv/TestLatvianAnalyzer.java (with props)
lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/lv/TestLatvianStemmer.java (with props)
lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/LatvianStemFilterFactory.java (with props)
lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestLatvianStemFilterFactory.java (with props)
Modified:
lucene/dev/trunk/lucene/contrib/CHANGES.txt
Modified: lucene/dev/trunk/lucene/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/CHANGES.txt?rev=1092396&r1=1092395&r2=1092396&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/contrib/CHANGES.txt Thu Apr 14 17:07:10 2011
@@ -50,6 +50,10 @@ Bug fixes
* LUCENE-3026: SmartChineseAnalyzer's WordTokenFilter threw NullPointerException
on sentences longer than 32,767 characters. (wangzhenghang via Robert Muir)
+New Features
+
+ * LUCENE-3016: Add analyzer for Latvian. (Robert Muir)
+
======================= Lucene 3.1.0 =======================
Changes in backwards compatibility policy
Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java?rev=1092396&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java Thu Apr 14 17:07:10 2011
@@ -0,0 +1,129 @@
+package org.apache.lucene.analysis.lv;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.core.StopFilter;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.Version;
+
+/**
+ * {@link Analyzer} for Latvian.
+ */
+public final class LatvianAnalyzer extends StopwordAnalyzerBase {
+  /** Unmodifiable copy of the terms that must not be stemmed. */
+  private final Set<?> stemExclusionSet;
+
+  /** File containing default Latvian stopwords. */
+  public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
+
+  /**
+   * Returns the default stop words set, loading it on first use.
+   * @return default stop words set.
+   */
+  public static Set<?> getDefaultStopSet(){
+    return DefaultSetHolder.DEFAULT_STOP_SET;
+  }
+
+  /**
+   * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
+   * accesses the static final set the first time.
+   */
+  private static class DefaultSetHolder {
+    static final Set<?> DEFAULT_STOP_SET;
+
+    static {
+      try {
+        DEFAULT_STOP_SET = WordlistLoader.getWordSet(LatvianAnalyzer.class,
+            DEFAULT_STOPWORD_FILE);
+      } catch (IOException ex) {
+        // default set should always be present as it is part of the
+        // distribution (JAR); keep the IOException as the cause so the
+        // underlying reason is not lost.
+        throw new RuntimeException("Unable to load default stopword set", ex);
+      }
+    }
+  }
+
+  /**
+   * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
+   *
+   * @param matchVersion lucene compatibility version
+   */
+  public LatvianAnalyzer(Version matchVersion) {
+    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+  }
+
+  /**
+   * Builds an analyzer with the given stop words and no stem exclusions.
+   *
+   * @param matchVersion lucene compatibility version
+   * @param stopwords a stopword set
+   */
+  public LatvianAnalyzer(Version matchVersion, Set<?> stopwords) {
+    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+  }
+
+  /**
+   * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
+   * provided this analyzer will add a {@link KeywordMarkerFilter} before
+   * stemming.
+   *
+   * @param matchVersion lucene compatibility version
+   * @param stopwords a stopword set
+   * @param stemExclusionSet a set of terms not to be stemmed
+   */
+  public LatvianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
+    super(matchVersion, stopwords);
+    // copy first, then wrap: later changes to the caller's set do not leak in
+    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
+        matchVersion, stemExclusionSet));
+  }
+
+  /**
+   * Creates a
+   * {@link org.apache.lucene.analysis.util.ReusableAnalyzerBase.TokenStreamComponents}
+   * which tokenizes all the text in the provided {@link Reader}.
+   *
+   * @param fieldName name of the field being analyzed (not used by this analyzer)
+   * @param reader source of the text to analyze
+   * @return A
+   *         {@link org.apache.lucene.analysis.util.ReusableAnalyzerBase.TokenStreamComponents}
+   *         built from an {@link StandardTokenizer} filtered with
+   *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
+   *         , {@link KeywordMarkerFilter} if a stem exclusion set is
+   *         provided and {@link LatvianStemFilter}.
+   */
+  @Override
+  protected TokenStreamComponents createComponents(String fieldName,
+      Reader reader) {
+    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+    TokenStream result = new StandardFilter(matchVersion, source);
+    result = new LowerCaseFilter(matchVersion, result);
+    result = new StopFilter(matchVersion, result, stopwords);
+    // the keyword marker must sit in front of the stemmer to protect excluded terms
+    if (!stemExclusionSet.isEmpty()) {
+      result = new KeywordMarkerFilter(result, stemExclusionSet);
+    }
+    result = new LatvianStemFilter(result);
+    return new TokenStreamComponents(source, result);
+  }
+}
Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianStemFilter.java?rev=1092396&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianStemFilter.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianStemFilter.java Thu Apr 14 17:07:10 2011
@@ -0,0 +1,58 @@
+package org.apache.lucene.analysis.lv;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+
+/**
+ * A {@link TokenFilter} that applies {@link LatvianStemmer} to stem Latvian
+ * words.
+ * <p>
+ * To prevent terms from being stemmed use an instance of
+ * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
+ * the {@link KeywordAttribute} before this {@link TokenStream}.
+ * </p>
+ */
+public final class LatvianStemFilter extends TokenFilter {
+  private final LatvianStemmer stemmer = new LatvianStemmer();
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
+
+  /**
+   * Creates a filter that stems the terms produced by {@code input}.
+   *
+   * @param input the upstream token stream
+   */
+  public LatvianStemFilter(TokenStream input) {
+    super(input);
+  }
+
+  /**
+   * Advances the wrapped stream and, unless the token is flagged as a keyword,
+   * shortens the term to the length reported by the stemmer.
+   *
+   * @return {@code false} once the wrapped stream is exhausted, {@code true} otherwise
+   */
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (!input.incrementToken()) {
+      return false;
+    }
+    if (keywordAttr.isKeyword()) {
+      // keyword tokens pass through untouched
+      return true;
+    }
+    // the stemmer works on the term buffer in place; only the length changes here
+    termAtt.setLength(stemmer.stem(termAtt.buffer(), termAtt.length()));
+    return true;
+  }
+}
Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianStemmer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianStemmer.java?rev=1092396&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianStemmer.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianStemmer.java Thu Apr 14 17:07:10 2011
@@ -0,0 +1,174 @@
+package org.apache.lucene.analysis.lv;
+
+import static org.apache.lucene.analysis.util.StemmerUtil.*;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Light stemmer for Latvian.
+ * <p>
+ * This is a light version of the algorithm in Karlis Kreslins' PhD thesis
+ * <i>A stemming algorithm for Latvian</i> with the following modifications:
+ * <ul>
+ * <li>Only explicitly stems noun and adjective morphology
+ * <li>Stricter length/vowel checks for the resulting stems (verb etc suffix stripping is removed)
+ * <li>Removes only the primary inflectional suffixes: case and number for nouns ;
+ *   case, number, gender, and definiteness for adjectives.
+ * <li>Palatalization is only handled when a declension II,V,VI noun suffix is removed.
+ * </ul>
+ */
+public class LatvianStemmer {
+  /**
+   * Stem a Latvian word in place and return the new adjusted length.
+   * <p>
+   * The suffix table is scanned in order and the first matching entry wins.
+   * A suffix is only removed when the word holds more vowels than the suffix
+   * itself and at least three characters remain afterwards.
+   *
+   * @param s buffer holding the word; rewritten when palatalization is undone
+   * @param len number of valid characters in {@code s}
+   * @return length of the stem, or {@code len} when no suffix applies
+   */
+  public int stem(char s[], int len) {
+    final int numVowels = numVowels(s, len);
+
+    for (Affix affix : affixes) {
+      if (numVowels > affix.vc && len >= affix.affix.length + 3 && endsWith(s, len, affix.affix)) {
+        len -= affix.affix.length;
+        return affix.palatalizes ? unpalatalize(s, len) : len;
+      }
+    }
+
+    return len;
+  }
+
+  /** Suffix table; order matters because the first match is the one removed. */
+  static final Affix affixes[] = {
+    new Affix("ajiem", 3, false), new Affix("ajai",  3, false),
+    new Affix("ajam",  2, false), new Affix("ajām",  2, false),
+    new Affix("ajos",  2, false), new Affix("ajās",  2, false),
+    new Affix("iem",   2, true),  new Affix("ajā",   2, false),
+    new Affix("ais",   2, false), new Affix("ai",    2, false),
+    new Affix("ei",    2, false), new Affix("ām",    1, false),
+    new Affix("am",    1, false), new Affix("ēm",    1, false),
+    new Affix("īm",    1, false), new Affix("im",    1, false),
+    new Affix("um",    1, false), new Affix("us",    1, true),
+    new Affix("as",    1, false), new Affix("ās",    1, false),
+    new Affix("es",    1, false), new Affix("os",    1, true),
+    new Affix("ij",    1, false), new Affix("īs",    1, false),
+    new Affix("ēs",    1, false), new Affix("is",    1, false),
+    new Affix("ie",    1, false), new Affix("u",     1, true),
+    new Affix("a",     1, true),  new Affix("i",     1, true),
+    new Affix("e",     1, false), new Affix("ā",     1, false),
+    new Affix("ē",     1, false), new Affix("ī",     1, false),
+    new Affix("ū",     1, false), new Affix("o",     1, false),
+    new Affix("s",     0, false), new Affix("š",     0, false),
+  };
+
+  /** One removable suffix together with the conditions attached to it. */
+  static class Affix {
+    final char affix[];        // suffix
+    final int vc;              // vowel count of the suffix
+    final boolean palatalizes; // true if we should fire palatalization rules.
+
+    Affix(String affix, int vc, boolean palatalizes) {
+      this.affix = affix.toCharArray();
+      this.vc = vc;
+      this.palatalizes = palatalizes;
+    }
+  }
+
+  /**
+   * Undoes palatalization at the end of the stem after a suffix was removed.
+   * Most cases are handled except for the ambiguous ones:
+   * <ul>
+   *  <li> s -> š
+   *  <li> t -> š
+   *  <li> d -> ž
+   *  <li> z -> ž
+   * </ul>
+   *
+   * @param s buffer holding the word; {@code s[len]} is still the first
+   *        character of the suffix that was just removed
+   * @param len length of the stem after suffix removal
+   * @return the possibly adjusted stem length
+   */
+  private int unpalatalize(char s[], int len) {
+    // we check the character removed: if its -u then
+    // its 2,5, or 6 gen pl., and these two can only apply then.
+    if (s[len] == 'u') {
+      // kš -> kst
+      if (endsWith(s, len, "kš")) {
+        // grows by one char: the slot of the removed 'u' is reused for the 't'
+        len++;
+        s[len-2] = 's';
+        s[len-1] = 't';
+        return len;
+      }
+      // ņņ -> nn
+      if (endsWith(s, len, "ņņ")) {
+        s[len-2] = 'n';
+        s[len-1] = 'n';
+        return len;
+      }
+    }
+
+    // otherwise all other rules
+    if (endsWith(s, len, "pj") || endsWith(s, len, "bj")
+        || endsWith(s, len, "mj") || endsWith(s, len, "vj")) {
+      // labial consonant
+      return len-1;
+    } else if (endsWith(s, len, "šņ")) {
+      s[len-2] = 's';
+      s[len-1] = 'n';
+      return len;
+    } else if (endsWith(s, len, "žņ")) {
+      s[len-2] = 'z';
+      s[len-1] = 'n';
+      return len;
+    } else if (endsWith(s, len, "šļ")) {
+      s[len-2] = 's';
+      s[len-1] = 'l';
+      return len;
+    } else if (endsWith(s, len, "žļ")) {
+      s[len-2] = 'z';
+      s[len-1] = 'l';
+      return len;
+    } else if (endsWith(s, len, "ļņ")) {
+      s[len-2] = 'l';
+      s[len-1] = 'n';
+      return len;
+    } else if (endsWith(s, len, "ļļ")) {
+      s[len-2] = 'l';
+      s[len-1] = 'l';
+      return len;
+    } else if (s[len-1] == 'č') {
+      s[len-1] = 'c';
+      return len;
+    } else if (s[len-1] == 'ļ') {
+      s[len-1] = 'l';
+      return len;
+    } else if (s[len-1] == 'ņ') {
+      s[len-1] = 'n';
+      return len;
+    }
+
+    return len;
+  }
+
+  /**
+   * Count the vowels in the string, we always require at least
+   * one in the remaining stem to accept it.
+   *
+   * @param s buffer holding the word
+   * @param len number of valid characters in {@code s}
+   * @return number of vowels among the first {@code len} characters
+   */
+  private int numVowels(char s[], int len) {
+    int n = 0;
+    for (int i = 0; i < len; i++) {
+      switch(s[i]) {
+        case 'a': case 'e': case 'i':
+        case 'o': case 'u': case 'ā':
+        case 'ī': case 'ē': case 'ū':
+          n++;
+      }
+    }
+    return n;
+  }
+}
Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/package.html
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/package.html?rev=1092396&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/package.html (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/package.html Thu Apr 14 17:07:10 2011
@@ -0,0 +1,22 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html><head></head>
+<body>
+Analyzer for Latvian.
+</body>
+</html>
Added: lucene/dev/trunk/modules/analysis/common/src/resources/org/apache/lucene/analysis/lv/stopwords.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/resources/org/apache/lucene/analysis/lv/stopwords.txt?rev=1092396&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/resources/org/apache/lucene/analysis/lv/stopwords.txt (added)
+++ lucene/dev/trunk/modules/analysis/common/src/resources/org/apache/lucene/analysis/lv/stopwords.txt Thu Apr 14 17:07:10 2011
@@ -0,0 +1,172 @@
+# Set of Latvian stopwords from A Stemming Algorithm for Latvian, Karlis Kreslins
+# the original list of over 800 forms was refined:
+# pronouns, adverbs, interjections were removed
+#
+# prepositions
+aiz
+ap
+ar
+apakš
+ārpus
+augšpus
+bez
+caur
+dēļ
+gar
+iekš
+iz
+kopš
+labad
+lejpus
+līdz
+no
+otrpus
+pa
+par
+pār
+pēc
+pie
+pirms
+pret
+priekš
+starp
+šaipus
+uz
+viņpus
+virs
+virspus
+zem
+apakšpus
+# Conjunctions
+un
+bet
+jo
+ja
+ka
+lai
+tomēr
+tikko
+turpretī
+arī
+kaut
+gan
+tādēļ
+tā
+ne
+tikvien
+vien
+kā
+ir
+te
+vai
+kamēr
+# Particles
+ar
+diezin
+droši
+diemžēl
+nebūt
+ik
+it
+taču
+nu
+pat
+tiklab
+iekšpus
+nedz
+tik
+nevis
+turpretim
+jeb
+iekam
+iekām
+iekāms
+kolīdz
+līdzko
+tiklīdz
+jebšu
+tālab
+tāpēc
+nekā
+itin
+jā
+jau
+jel
+nē
+nezin
+tad
+tikai
+vis
+tak
+iekams
+vien
+# modal verbs
+būt
+biju
+biji
+bija
+bijām
+bijāt
+esmu
+esi
+esam
+esat
+būšu
+būsi
+būs
+būsim
+būsiet
+tikt
+tiku
+tiki
+tika
+tikām
+tikāt
+tieku
+tiec
+tiek
+tiekam
+tiekat
+tikšu
+tiks
+tiksim
+tiksiet
+tapt
+tapi
+tapāt
+topat
+tapšu
+tapsi
+taps
+tapsim
+tapsiet
+kļūt
+kļuvu
+kļuvi
+kļuva
+kļuvām
+kļuvāt
+kļūstu
+kļūsti
+kļūst
+kļūstam
+kļūstat
+kļūšu
+kļūsi
+kļūs
+kļūsim
+kļūsiet
+# verbs
+varēt
+varēju
+varējām
+varēšu
+varēsim
+var
+varēji
+varējāt
+varēsi
+varēsiet
+varat
+varēja
+varēs
Added: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/lv/TestLatvianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/lv/TestLatvianAnalyzer.java?rev=1092396&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/lv/TestLatvianAnalyzer.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/lv/TestLatvianAnalyzer.java Thu Apr 14 17:07:10 2011
@@ -0,0 +1,53 @@
+package org.apache.lucene.analysis.lv;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+
+public class TestLatvianAnalyzer extends BaseTokenStreamTestCase {
+  /**
+   * Constructing the analyzer forces the default stopword set to load.
+   * This test fails with NPE when the stopwords file is missing in classpath.
+   */
+  public void testResourcesAvailable() {
+    new LatvianAnalyzer(TEST_VERSION_CURRENT);
+  }
+
+  /** Checks that stemming is applied and that a stopword yields no tokens. */
+  public void testBasics() throws IOException {
+    final Analyzer analyzer = new LatvianAnalyzer(TEST_VERSION_CURRENT);
+
+    // two inflected forms reduce to the same stem
+    checkOneTermReuse(analyzer, "tirgiem", "tirg");
+    checkOneTermReuse(analyzer, "tirgus", "tirg");
+
+    // a stopword is dropped entirely
+    assertAnalyzesTo(analyzer, "un", new String[0]);
+  }
+
+  /** Checks that a term in the exclusion set is left unstemmed while others still are. */
+  public void testExclude() throws IOException {
+    final Set<String> protectedTerms = new HashSet<String>();
+    protectedTerms.add("tirgiem");
+
+    final Analyzer analyzer = new LatvianAnalyzer(TEST_VERSION_CURRENT,
+        LatvianAnalyzer.getDefaultStopSet(), protectedTerms);
+
+    checkOneTermReuse(analyzer, "tirgiem", "tirgiem");
+    checkOneTermReuse(analyzer, "tirgus", "tirg");
+  }
+}
Added: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/lv/TestLatvianStemmer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/lv/TestLatvianStemmer.java?rev=1092396&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/lv/TestLatvianStemmer.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/lv/TestLatvianStemmer.java Thu Apr 14 17:07:10 2011
@@ -0,0 +1,272 @@
+package org.apache.lucene.analysis.lv;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
+
+/**
+ * Basic tests for {@link LatvianStemmer}
+ */
+public class TestLatvianStemmer extends BaseTokenStreamTestCase {
+  /** Whitespace tokenization followed only by the stem filter, so each input word is stemmed as-is. */
+  private Analyzer a = new ReusableAnalyzerBase() {
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+      Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
+      return new TokenStreamComponents(tokenizer, new LatvianStemFilter(tokenizer));
+    }
+  };
+
+  /** Declension I noun: every case form reduces to the same stem. */
+  public void testNouns1() throws IOException {
+    // decl. I
+    checkOneTerm(a, "tēvs",   "tēv"); // nom. sing.
+    checkOneTerm(a, "tēvi",   "tēv"); // nom. pl.
+    checkOneTerm(a, "tēva",   "tēv"); // gen. sing.
+    checkOneTerm(a, "tēvu",   "tēv"); // gen. pl.
+    checkOneTerm(a, "tēvam",  "tēv"); // dat. sing.
+    checkOneTerm(a, "tēviem", "tēv"); // dat. pl.
+    checkOneTerm(a, "tēvu",   "tēv"); // acc. sing.
+    checkOneTerm(a, "tēvus",  "tēv"); // acc. pl.
+    checkOneTerm(a, "tēvā",   "tēv"); // loc. sing.
+    checkOneTerm(a, "tēvos",  "tēv"); // loc. pl.
+    checkOneTerm(a, "tēvs",   "tēv"); // voc. sing.
+    checkOneTerm(a, "tēvi",   "tēv"); // voc. pl.
+  }
+
+  /**
+   * decl II nouns with (s,t) -> š and (d,z) -> ž
+   * palatalization will generally conflate to two stems
+   * due to the ambiguity (plural and singular).
+   */
+  public void testNouns2() throws IOException {
+    // decl. II
+
+    // c -> č palatalization
+    checkOneTerm(a, "lācis",  "lāc"); // nom. sing.
+    checkOneTerm(a, "lāči",   "lāc"); // nom. pl.
+    checkOneTerm(a, "lāča",   "lāc"); // gen. sing.
+    checkOneTerm(a, "lāču",   "lāc"); // gen. pl.
+    checkOneTerm(a, "lācim",  "lāc"); // dat. sing.
+    checkOneTerm(a, "lāčiem", "lāc"); // dat. pl.
+    checkOneTerm(a, "lāci",   "lāc"); // acc. sing.
+    checkOneTerm(a, "lāčus",  "lāc"); // acc. pl.
+    checkOneTerm(a, "lācī",   "lāc"); // loc. sing.
+    checkOneTerm(a, "lāčos",  "lāc"); // loc. pl.
+    checkOneTerm(a, "lāci",   "lāc"); // voc. sing.
+    checkOneTerm(a, "lāči",   "lāc"); // voc. pl.
+
+    // n -> ņ palatalization
+    checkOneTerm(a, "akmens",   "akmen"); // nom. sing.
+    checkOneTerm(a, "akmeņi",   "akmen"); // nom. pl.
+    checkOneTerm(a, "akmens",   "akmen"); // gen. sing.
+    checkOneTerm(a, "akmeņu",   "akmen"); // gen. pl.
+    checkOneTerm(a, "akmenim",  "akmen"); // dat. sing.
+    checkOneTerm(a, "akmeņiem", "akmen"); // dat. pl.
+    checkOneTerm(a, "akmeni",   "akmen"); // acc. sing.
+    checkOneTerm(a, "akmeņus",  "akmen"); // acc. pl.
+    checkOneTerm(a, "akmenī",   "akmen"); // loc. sing.
+    checkOneTerm(a, "akmeņos",  "akmen"); // loc. pl.
+    checkOneTerm(a, "akmens",   "akmen"); // voc. sing.
+    checkOneTerm(a, "akmeņi",   "akmen"); // voc. pl.
+
+    // no palatalization
+    checkOneTerm(a, "kurmis",   "kurm"); // nom. sing.
+    checkOneTerm(a, "kurmji",   "kurm"); // nom. pl.
+    checkOneTerm(a, "kurmja",   "kurm"); // gen. sing.
+    checkOneTerm(a, "kurmju",   "kurm"); // gen. pl.
+    checkOneTerm(a, "kurmim",   "kurm"); // dat. sing.
+    checkOneTerm(a, "kurmjiem", "kurm"); // dat. pl.
+    checkOneTerm(a, "kurmi",    "kurm"); // acc. sing.
+    checkOneTerm(a, "kurmjus",  "kurm"); // acc. pl.
+    checkOneTerm(a, "kurmī",    "kurm"); // loc. sing.
+    checkOneTerm(a, "kurmjos",  "kurm"); // loc. pl.
+    checkOneTerm(a, "kurmi",    "kurm"); // voc. sing.
+    checkOneTerm(a, "kurmji",   "kurm"); // voc. pl.
+  }
+
+  /** Declension III noun: every case form reduces to the same stem. */
+  public void testNouns3() throws IOException {
+    // decl III
+    checkOneTerm(a, "lietus",  "liet"); // nom. sing.
+    checkOneTerm(a, "lieti",   "liet"); // nom. pl.
+    checkOneTerm(a, "lietus",  "liet"); // gen. sing.
+    checkOneTerm(a, "lietu",   "liet"); // gen. pl.
+    checkOneTerm(a, "lietum",  "liet"); // dat. sing.
+    checkOneTerm(a, "lietiem", "liet"); // dat. pl.
+    checkOneTerm(a, "lietu",   "liet"); // acc. sing.
+    checkOneTerm(a, "lietus",  "liet"); // acc. pl.
+    checkOneTerm(a, "lietū",   "liet"); // loc. sing.
+    checkOneTerm(a, "lietos",  "liet"); // loc. pl.
+    checkOneTerm(a, "lietus",  "liet"); // voc. sing.
+    checkOneTerm(a, "lieti",   "liet"); // voc. pl.
+  }
+
+  /** Declension IV nouns: every case form reduces to the same stem. */
+  public void testNouns4() throws IOException {
+    // decl IV
+    checkOneTerm(a, "lapa",  "lap"); // nom. sing.
+    checkOneTerm(a, "lapas", "lap"); // nom. pl.
+    checkOneTerm(a, "lapas", "lap"); // gen. sing.
+    checkOneTerm(a, "lapu",  "lap"); // gen. pl.
+    checkOneTerm(a, "lapai", "lap"); // dat. sing.
+    checkOneTerm(a, "lapām", "lap"); // dat. pl.
+    checkOneTerm(a, "lapu",  "lap"); // acc. sing.
+    checkOneTerm(a, "lapas", "lap"); // acc. pl.
+    checkOneTerm(a, "lapā",  "lap"); // loc. sing.
+    checkOneTerm(a, "lapās", "lap"); // loc. pl.
+    checkOneTerm(a, "lapa",  "lap"); // voc. sing.
+    checkOneTerm(a, "lapas", "lap"); // voc. pl.
+
+    checkOneTerm(a, "puika",  "puik"); // nom. sing.
+    checkOneTerm(a, "puikas", "puik"); // nom. pl.
+    checkOneTerm(a, "puikas", "puik"); // gen. sing.
+    checkOneTerm(a, "puiku",  "puik"); // gen. pl.
+    checkOneTerm(a, "puikam", "puik"); // dat. sing.
+    checkOneTerm(a, "puikām", "puik"); // dat. pl.
+    checkOneTerm(a, "puiku",  "puik"); // acc. sing.
+    checkOneTerm(a, "puikas", "puik"); // acc. pl.
+    checkOneTerm(a, "puikā",  "puik"); // loc. sing.
+    checkOneTerm(a, "puikās", "puik"); // loc. pl.
+    checkOneTerm(a, "puika",  "puik"); // voc. sing.
+    checkOneTerm(a, "puikas", "puik"); // voc. pl.
+  }
+
+  /**
+   * Genitive plural forms with (s,t) -> š and (d,z) -> ž
+   * will not conflate due to ambiguity.
+   */
+  public void testNouns5() throws IOException {
+    // decl V
+    // l -> ļ palatalization
+    checkOneTerm(a, "egle",  "egl"); // nom. sing.
+    checkOneTerm(a, "egles", "egl"); // nom. pl.
+    checkOneTerm(a, "egles", "egl"); // gen. sing.
+    checkOneTerm(a, "egļu",  "egl"); // gen. pl.
+    checkOneTerm(a, "eglei", "egl"); // dat. sing.
+    checkOneTerm(a, "eglēm", "egl"); // dat. pl.
+    checkOneTerm(a, "egli",  "egl"); // acc. sing.
+    checkOneTerm(a, "egles", "egl"); // acc. pl.
+    checkOneTerm(a, "eglē",  "egl"); // loc. sing.
+    checkOneTerm(a, "eglēs", "egl"); // loc. pl.
+    checkOneTerm(a, "egle",  "egl"); // voc. sing.
+    checkOneTerm(a, "egles", "egl"); // voc. pl.
+  }
+
+  /** Declension VI noun: every case form reduces to the same stem. */
+  public void testNouns6() throws IOException {
+    // decl VI
+
+    // no palatalization
+    checkOneTerm(a, "govs",  "gov"); // nom. sing.
+    checkOneTerm(a, "govis", "gov"); // nom. pl.
+    checkOneTerm(a, "govs",  "gov"); // gen. sing.
+    checkOneTerm(a, "govju", "gov"); // gen. pl.
+    checkOneTerm(a, "govij", "gov"); // dat. sing.
+    checkOneTerm(a, "govīm", "gov"); // dat. pl.
+    checkOneTerm(a, "govi ", "gov"); // acc. sing.
+    checkOneTerm(a, "govis", "gov"); // acc. pl.
+    checkOneTerm(a, "govi ", "gov"); // inst. sing.
+    checkOneTerm(a, "govīm", "gov"); // inst. pl.
+    checkOneTerm(a, "govī",  "gov"); // loc. sing.
+    checkOneTerm(a, "govīs", "gov"); // loc. pl.
+    checkOneTerm(a, "govs",  "gov"); // voc. sing.
+    checkOneTerm(a, "govis", "gov"); // voc. pl.
+  }
+
+  /** Adjective: definite and indefinite forms of every case, number and gender share one stem. */
+  public void testAdjectives() throws IOException {
+    checkOneTerm(a, "zils",     "zil"); // indef. nom. masc. sing.
+    checkOneTerm(a, "zilais",   "zil"); // def. nom. masc. sing.
+    checkOneTerm(a, "zili",     "zil"); // indef. nom. masc. pl.
+    checkOneTerm(a, "zilie",    "zil"); // def. nom. masc. pl.
+    checkOneTerm(a, "zila",     "zil"); // indef. nom. fem. sing.
+    checkOneTerm(a, "zilā",     "zil"); // def. nom. fem. sing.
+    checkOneTerm(a, "zilas",    "zil"); // indef. nom. fem. pl.
+    checkOneTerm(a, "zilās",    "zil"); // def. nom. fem. pl.
+    checkOneTerm(a, "zila",     "zil"); // indef. gen. masc. sing.
+    checkOneTerm(a, "zilā",     "zil"); // def. gen. masc. sing.
+    checkOneTerm(a, "zilu",     "zil"); // indef. gen. masc. pl.
+    checkOneTerm(a, "zilo",     "zil"); // def. gen. masc. pl.
+    checkOneTerm(a, "zilas",    "zil"); // indef. gen. fem. sing.
+    checkOneTerm(a, "zilās",    "zil"); // def. gen. fem. sing.
+    checkOneTerm(a, "zilu",     "zil"); // indef. gen. fem. pl.
+    checkOneTerm(a, "zilo",     "zil"); // def. gen. fem. pl.
+    checkOneTerm(a, "zilam",    "zil"); // indef. dat. masc. sing.
+    checkOneTerm(a, "zilajam",  "zil"); // def. dat. masc. sing.
+    checkOneTerm(a, "ziliem",   "zil"); // indef. dat. masc. pl.
+    checkOneTerm(a, "zilajiem", "zil"); // def. dat. masc. pl.
+    checkOneTerm(a, "zilai",    "zil"); // indef. dat. fem. sing.
+    checkOneTerm(a, "zilajai",  "zil"); // def. dat. fem. sing.
+    checkOneTerm(a, "zilām",    "zil"); // indef. dat. fem. pl.
+    checkOneTerm(a, "zilajām",  "zil"); // def. dat. fem. pl.
+    checkOneTerm(a, "zilu",     "zil"); // indef. acc. masc. sing.
+    checkOneTerm(a, "zilo",     "zil"); // def. acc. masc. sing.
+    checkOneTerm(a, "zilus",    "zil"); // indef. acc. masc. pl.
+    checkOneTerm(a, "zilos",    "zil"); // def. acc. masc. pl.
+    checkOneTerm(a, "zilu",     "zil"); // indef. acc. fem. sing.
+    checkOneTerm(a, "zilo",     "zil"); // def. acc. fem. sing.
+    checkOneTerm(a, "zilās",    "zil"); // indef. acc. fem. pl.
+    checkOneTerm(a, "zilās",    "zil"); // def. acc. fem. pl.
+    checkOneTerm(a, "zilā",     "zil"); // indef. loc. masc. sing.
+    checkOneTerm(a, "zilajā",   "zil"); // def. loc. masc. sing.
+    checkOneTerm(a, "zilos",    "zil"); // indef. loc. masc. pl.
+    checkOneTerm(a, "zilajos",  "zil"); // def. loc. masc. pl.
+    checkOneTerm(a, "zilā",     "zil"); // indef. loc. fem. sing.
+    checkOneTerm(a, "zilajā",   "zil"); // def. loc. fem. sing.
+    checkOneTerm(a, "zilās",    "zil"); // indef. loc. fem. pl.
+    checkOneTerm(a, "zilajās",  "zil"); // def. loc. fem. pl.
+    checkOneTerm(a, "zilais",   "zil"); // voc. masc. sing.
+    checkOneTerm(a, "zilie",    "zil"); // voc. masc. pl.
+    checkOneTerm(a, "zilā",     "zil"); // voc. fem. sing.
+    checkOneTerm(a, "zilās",    "zil"); // voc. fem. pl.
+  }
+
+  /**
+   * Note: we intentionally don't handle the ambiguous
+   * (s,t) -> š and (d,z) -> ž
+   */
+  public void testPalatalization() throws IOException {
+    checkOneTerm(a, "krāsns",   "krāsn");   // nom. sing.
+    checkOneTerm(a, "krāšņu",   "krāsn");   // gen. pl.
+    checkOneTerm(a, "zvaigzne", "zvaigzn"); // nom. sing.
+    checkOneTerm(a, "zvaigžņu", "zvaigzn"); // gen. pl.
+    checkOneTerm(a, "kāpslis",  "kāpsl");   // nom. sing.
+    checkOneTerm(a, "kāpšļu",   "kāpsl");   // gen. pl.
+    checkOneTerm(a, "zizlis",   "zizl");    // nom. sing.
+    checkOneTerm(a, "zižļu",    "zizl");    // gen. pl.
+    checkOneTerm(a, "vilnis",   "viln");    // nom. sing.
+    checkOneTerm(a, "viļņu",    "viln");    // gen. pl.
+    checkOneTerm(a, "lelle",    "lell");    // nom. sing.
+    checkOneTerm(a, "leļļu",    "lell");    // gen. pl.
+    checkOneTerm(a, "pinne",    "pinn");    // nom. sing.
+    checkOneTerm(a, "piņņu",    "pinn");    // gen. pl.
+    checkOneTerm(a, "rīkste",   "rīkst");   // nom. sing.
+    checkOneTerm(a, "rīkšu",    "rīkst");   // gen. pl.
+  }
+
+  /**
+   * Test some length restrictions, we require a 3+ char stem,
+   * with at least one vowel.
+   */
+  public void testLength() throws IOException {
+    checkOneTerm(a, "usa", "usa");   // length
+    checkOneTerm(a, "60ms", "60ms"); // vowel count
+  }
+}
Added: lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/LatvianStemFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/LatvianStemFilterFactory.java?rev=1092396&view=auto
==============================================================================
--- lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/LatvianStemFilterFactory.java (added)
+++ lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/LatvianStemFilterFactory.java Thu Apr 14 17:07:10 2011
@@ -0,0 +1,38 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.lv.LatvianStemFilter;
+
+/**
+ * Factory for {@link LatvianStemFilter}: {@link #create} wraps the given stream in a new filter. Example:
+ * <pre class="prettyprint" >
+ * &lt;fieldType name="text_lvstem" class="solr.TextField" positionIncrementGap="100"&gt;
+ * &lt;analyzer&gt;
+ * &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
+ * &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
+ * &lt;filter class="solr.LatvianStemFilterFactory"/&gt;
+ * &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ */
+public class LatvianStemFilterFactory extends BaseTokenFilterFactory {
+ public TokenStream create(TokenStream input) {
+ return new LatvianStemFilter(input);
+ }
+}
Added: lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestLatvianStemFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestLatvianStemFilterFactory.java?rev=1092396&view=auto
==============================================================================
--- lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestLatvianStemFilterFactory.java (added)
+++ lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestLatvianStemFilterFactory.java Thu Apr 14 17:07:10 2011
@@ -0,0 +1,36 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+
+/**
+ * Smoke test for the Latvian stem factory: "tirgiem" and "tirgus" must both come out as "tirg".
+ */
+public class TestLatvianStemFilterFactory extends BaseTokenTestCase {
+ public void testStemming() throws Exception {
+ final String[] expectedStems = { "tirg", "tirg" };
+ final Reader input = new StringReader("tirgiem tirgus");
+ final TokenStream tokens = new WhitespaceTokenizer(DEFAULT_VERSION, input);
+ assertTokenStreamContents(new LatvianStemFilterFactory().create(tokens), expectedStems);
+ }
+}