You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2011/04/14 19:07:10 UTC

svn commit: r1092396 - in /lucene/dev/trunk: lucene/contrib/ modules/analysis/common/src/java/org/apache/lucene/analysis/lv/ modules/analysis/common/src/resources/org/apache/lucene/analysis/lv/ modules/analysis/common/src/test/org/apache/lucene/analysi...

Author: rmuir
Date: Thu Apr 14 17:07:10 2011
New Revision: 1092396

URL: http://svn.apache.org/viewvc?rev=1092396&view=rev
Log:
LUCENE-3016: add analyzer for Latvian

Added:
    lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/
    lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java   (with props)
    lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianStemFilter.java   (with props)
    lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianStemmer.java   (with props)
    lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/package.html   (with props)
    lucene/dev/trunk/modules/analysis/common/src/resources/org/apache/lucene/analysis/lv/
    lucene/dev/trunk/modules/analysis/common/src/resources/org/apache/lucene/analysis/lv/stopwords.txt   (with props)
    lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/lv/
    lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/lv/TestLatvianAnalyzer.java   (with props)
    lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/lv/TestLatvianStemmer.java   (with props)
    lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/LatvianStemFilterFactory.java   (with props)
    lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestLatvianStemFilterFactory.java   (with props)
Modified:
    lucene/dev/trunk/lucene/contrib/CHANGES.txt

Modified: lucene/dev/trunk/lucene/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/CHANGES.txt?rev=1092396&r1=1092395&r2=1092396&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/contrib/CHANGES.txt Thu Apr 14 17:07:10 2011
@@ -50,6 +50,10 @@ Bug fixes
  * LUCENE-3026: SmartChineseAnalyzer's WordTokenFilter threw NullPointerException
    on sentences longer than 32,767 characters.  (wangzhenghang via Robert Muir)
 
+New Features
+
+ * LUCENE-3016: Add analyzer for Latvian.  (Robert Muir)
+
 ======================= Lucene 3.1.0 =======================
 
 Changes in backwards compatibility policy

Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java?rev=1092396&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java Thu Apr 14 17:07:10 2011
@@ -0,0 +1,129 @@
+package org.apache.lucene.analysis.lv;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.core.StopFilter;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.Version;
+
+/**
+ * {@link Analyzer} for Latvian.
+ * <p>
+ * Tokens are produced by a {@link StandardTokenizer} and then passed through
+ * {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}, an
+ * optional {@link KeywordMarkerFilter} (only when a non-empty stem exclusion
+ * set was given) and finally {@link LatvianStemFilter}.
+ * </p>
+ */
+public final class LatvianAnalyzer extends StopwordAnalyzerBase {
+  /** Terms that must never be stemmed; a private read-only copy of the caller's set. */
+  private final Set<?> stemExclusionSet;
+
+  /** File containing default Latvian stopwords. */
+  public static final String DEFAULT_STOPWORD_FILE = "stopwords.txt";
+
+  /**
+   * Returns the default stop words set, loading it on first use.
+   * The returned set is shared by every analyzer built with the defaults, so
+   * callers should treat it as read-only.
+   * @return default stop words set.
+   */
+  public static Set<?> getDefaultStopSet() {
+    return DefaultSetHolder.DEFAULT_STOP_SET;
+  }
+
+  /**
+   * Holder class: its static initializer runs once, the first time the outer
+   * class accesses DEFAULT_STOP_SET, so the stopword file is loaded lazily.
+   */
+  private static class DefaultSetHolder {
+    static final Set<?> DEFAULT_STOP_SET;
+
+    static {
+      try {
+        DEFAULT_STOP_SET = WordlistLoader.getWordSet(LatvianAnalyzer.class,
+            DEFAULT_STOPWORD_FILE);
+      } catch (IOException ex) {
+        // default set should always be present as it is part of the
+        // distribution (JAR); keep the cause so a broken classpath or a
+        // corrupt resource can still be diagnosed from the stack trace
+        throw new RuntimeException("Unable to load default stopword set", ex);
+      }
+    }
+  }
+
+  /**
+   * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
+   *
+   * @param matchVersion lucene compatibility version
+   */
+  public LatvianAnalyzer(Version matchVersion) {
+    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+  }
+
+  /**
+   * Builds an analyzer with the given stop words and no stem exclusions.
+   *
+   * @param matchVersion lucene compatibility version
+   * @param stopwords a stopword set
+   */
+  public LatvianAnalyzer(Version matchVersion, Set<?> stopwords) {
+    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+  }
+
+  /**
+   * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
+   * provided this analyzer will add a {@link KeywordMarkerFilter} before
+   * stemming.
+   *
+   * @param matchVersion lucene compatibility version
+   * @param stopwords a stopword set
+   * @param stemExclusionSet a set of terms not to be stemmed
+   */
+  public LatvianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
+    super(matchVersion, stopwords);
+    // copy, then wrap read-only, so later changes to the caller's set
+    // cannot alter this analyzer's behavior
+    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
+        matchVersion, stemExclusionSet));
+  }
+
+  /**
+   * Creates a
+   * {@link org.apache.lucene.analysis.util.ReusableAnalyzerBase.TokenStreamComponents}
+   * which tokenizes all the text in the provided {@link Reader}.
+   *
+   * @param fieldName name of the field being analyzed (not used by this analyzer)
+   * @param reader source of the text to analyze
+   * @return A
+   *         {@link org.apache.lucene.analysis.util.ReusableAnalyzerBase.TokenStreamComponents}
+   *         built from an {@link StandardTokenizer} filtered with
+   *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
+   *         , {@link KeywordMarkerFilter} if a stem exclusion set is
+   *         provided and {@link LatvianStemFilter}.
+   */
+  @Override
+  protected TokenStreamComponents createComponents(String fieldName,
+      Reader reader) {
+    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+    TokenStream result = new StandardFilter(matchVersion, source);
+    result = new LowerCaseFilter(matchVersion, result);
+    result = new StopFilter(matchVersion, result, stopwords);
+    // the keyword marker must sit in front of the stemmer: it flags the
+    // excluded terms so that LatvianStemFilter leaves them untouched
+    if (!stemExclusionSet.isEmpty()) {
+      result = new KeywordMarkerFilter(result, stemExclusionSet);
+    }
+    result = new LatvianStemFilter(result);
+    return new TokenStreamComponents(source, result);
+  }
+}

Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianStemFilter.java?rev=1092396&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianStemFilter.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianStemFilter.java Thu Apr 14 17:07:10 2011
@@ -0,0 +1,58 @@
+package org.apache.lucene.analysis.lv;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+
+/**
+ * {@link TokenFilter} that rewrites each token to its Latvian stem using
+ * {@link LatvianStemmer}.
+ * <p>
+ * Tokens whose {@link KeywordAttribute} is set are passed through unchanged;
+ * place a {@link KeywordMarkerFilter} (or any custom {@link TokenFilter} that
+ * sets the attribute) in front of this {@link TokenStream} to protect
+ * selected terms from stemming.
+ * </p>
+ */
+public final class LatvianStemFilter extends TokenFilter {
+  private final LatvianStemmer stemmer = new LatvianStemmer();
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
+
+  /**
+   * @param input the token stream whose terms should be stemmed
+   */
+  public LatvianStemFilter(TokenStream input) {
+    super(input);
+  }
+
+  /**
+   * Advances the wrapped stream and stems the current term in place.
+   *
+   * @return false once the wrapped stream is exhausted, true otherwise
+   */
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (!input.incrementToken()) {
+      return false;
+    }
+    if (keywordAtt.isKeyword()) {
+      // protected term: hand it on exactly as received
+      return true;
+    }
+    // the stemmer edits the term buffer directly and reports the new length
+    termAtt.setLength(stemmer.stem(termAtt.buffer(), termAtt.length()));
+    return true;
+  }
+}

Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianStemmer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianStemmer.java?rev=1092396&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianStemmer.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianStemmer.java Thu Apr 14 17:07:10 2011
@@ -0,0 +1,174 @@
+package org.apache.lucene.analysis.lv;
+
+import static org.apache.lucene.analysis.util.StemmerUtil.*;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Light stemmer for Latvian.
+ * <p>
+ * This is a light version of the algorithm in Karlis Kreslin's PhD thesis
+ * <i>A stemming algorithm for Latvian</i> with the following modifications:
+ * <ul>
+ *   <li>Only explicitly stems noun and adjective morphology
+ *   <li>Stricter length/vowel checks for the resulting stems (verb etc suffix stripping is removed)
+ *   <li>Removes only the primary inflectional suffixes: case and number for nouns ; 
+ *       case, number, gender, and definitiveness for adjectives.
+ *   <li>Palatalization is only handled when a declension II,V,VI noun suffix is removed.
+ * </ul>
+ */
+public class LatvianStemmer {
+  /**
+   * Stem a latvian word. returns the new adjusted length.
+   */
+  public int stem(char s[], int len) {
+    int numVowels = numVowels(s, len);
+    
+    for (int i = 0; i < affixes.length; i++) {
+      Affix affix = affixes[i];
+      if (numVowels > affix.vc && len >= affix.affix.length + 3 && endsWith(s, len, affix.affix)) {
+        len -= affix.affix.length;
+        return affix.palatalizes ? unpalatalize(s, len) : len;
+      }
+    }
+    
+    return len;
+  }
+  
+  static final Affix affixes[] = {
+    new Affix("ajiem", 3, false), new Affix("ajai",  3, false), 
+    new Affix("ajam",  2, false), new Affix("ajām",  2, false),
+    new Affix("ajos",  2, false), new Affix("ajās",  2, false),
+    new Affix("iem",   2, true),  new Affix("ajā",   2, false),
+    new Affix("ais",   2, false), new Affix("ai",    2, false),
+    new Affix("ei",    2, false), new Affix("ām",    1, false),
+    new Affix("am",    1, false), new Affix("ēm",    1, false),
+    new Affix("Ä«m",    1, false), new Affix("im",    1, false),
+    new Affix("um",    1, false), new Affix("us",    1, true),
+    new Affix("as",    1, false), new Affix("ās",    1, false),
+    new Affix("es",    1, false), new Affix("os",    1, true),
+    new Affix("ij",    1, false), new Affix("Ä«s",    1, false),
+    new Affix("ēs",    1, false), new Affix("is",    1, false),
+    new Affix("ie",    1, false), new Affix("u",     1, true),
+    new Affix("a",     1, true),  new Affix("i",     1, true),
+    new Affix("e",     1, false), new Affix("ā",     1, false),
+    new Affix("ē",     1, false), new Affix("ī",     1, false),
+    new Affix("Å«",     1, false), new Affix("o",     1, false),
+    new Affix("s",     0, false), new Affix("Å¡",     0, false),
+  };
+
+  static class Affix {
+    char affix[];         // suffix
+    int vc;               // vowel count of the suffix
+    boolean palatalizes;  // true if we should fire palatalization rules.
+    
+    Affix(String affix, int vc, boolean palatalizes) {
+      this.affix = affix.toCharArray();
+      this.vc = vc;
+      this.palatalizes = palatalizes;
+    }
+  }
+
+  /**
+   * Most cases are handled except for the ambiguous ones:
+   * <ul>
+   *  <li> s -> Å¡
+   *  <li> t -> Å¡
+   *  <li> d -> ž
+   *  <li> z -> ž
+   * </ul>
+   */
+  private int unpalatalize(char s[], int len) {
+    // we check the character removed: if its -u then 
+    // its 2,5, or 6 gen pl., and these two can only apply then.
+    if (s[len] == 'u') {
+      // kš -> kst
+      if (endsWith(s, len, "kš")) {
+        len++;
+        s[len-2] = 's';
+        s[len-1] = 't';
+        return len;
+      }
+      // ņņ -> nn
+      if (endsWith(s, len, "ņņ")) {
+        s[len-2] = 'n';
+        s[len-1] = 'n';
+        return len;
+      }
+    }
+    
+    // otherwise all other rules
+    if (endsWith(s, len, "pj") || endsWith(s, len, "bj") 
+        || endsWith(s, len, "mj") || endsWith(s, len, "vj")) {
+      // labial consonant
+      return len-1;
+    } else if (endsWith(s, len, "šņ")) {
+      s[len-2] = 's';
+      s[len-1] = 'n';
+      return len;
+    } else if (endsWith(s, len, "žņ")) {
+      s[len-2] = 'z';
+      s[len-1] = 'n';
+      return len;
+    } else if (endsWith(s, len, "šļ")) {
+      s[len-2] = 's';
+      s[len-1] = 'l';
+      return len;
+    } else if (endsWith(s, len, "žļ")) {
+      s[len-2] = 'z';
+      s[len-1] = 'l';
+      return len;
+    } else if (endsWith(s, len, "ļņ")) {
+      s[len-2] = 'l';
+      s[len-1] = 'n';
+      return len;
+    } else if (endsWith(s, len, "ļļ")) {
+      s[len-2] = 'l';
+      s[len-1] = 'l';
+      return len;
+    } else if (s[len-1] == 'č') {
+      s[len-1] = 'c';
+      return len;
+    } else if (s[len-1] == 'ļ') {
+      s[len-1] = 'l';
+      return len;
+    } else if (s[len-1] == 'ņ') {
+      s[len-1] = 'n';
+      return len;
+    }
+    
+    return len;
+  }
+  
+  /**
+   * Count the vowels in the string, we always require at least
+   * one in the remaining stem to accept it.
+   */
+  private int numVowels(char s[], int len) {
+    int n = 0;
+    for (int i = 0; i < len; i++) {
+      switch(s[i]) {
+        case 'a': case 'e': case 'i':  
+        case 'o': case 'u': case 'ā':  
+        case 'ī': case 'ē': case 'ū':
+          n++;
+      }
+    }
+    return n;
+  }
+}

Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/package.html
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/package.html?rev=1092396&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/package.html (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/package.html Thu Apr 14 17:07:10 2011
@@ -0,0 +1,22 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html><head></head>
+<body>
+Analyzer for Latvian.
+</body>
+</html>

Added: lucene/dev/trunk/modules/analysis/common/src/resources/org/apache/lucene/analysis/lv/stopwords.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/resources/org/apache/lucene/analysis/lv/stopwords.txt?rev=1092396&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/resources/org/apache/lucene/analysis/lv/stopwords.txt (added)
+++ lucene/dev/trunk/modules/analysis/common/src/resources/org/apache/lucene/analysis/lv/stopwords.txt Thu Apr 14 17:07:10 2011
@@ -0,0 +1,172 @@
+# Set of Latvian stopwords from A Stemming Algorithm for Latvian, Karlis Kreslins
+# the original list of over 800 forms was refined: 
+#   pronouns, adverbs, interjections were removed
+# 
+# prepositions
+aiz
+ap
+ar
+apakš
+ārpus
+augšpus
+bez
+caur
+dēļ
+gar
+iekš
+iz
+kopš
+labad
+lejpus
+līdz
+no
+otrpus
+pa
+par
+pār
+pēc
+pie
+pirms
+pret
+priekš
+starp
+šaipus
+uz
+viņpus
+virs
+virspus
+zem
+apakšpus
+# Conjunctions
+un
+bet
+jo
+ja
+ka
+lai
+tomēr
+tikko
+turpretī
+arī
+kaut
+gan
+tādēļ
+tā
+ne
+tikvien
+vien
+kā
+ir
+te
+vai
+kamēr
+# Particles
+ar
+diezin
+droši
+diemžēl
+nebūt
+ik
+it
+taču
+nu
+pat
+tiklab
+iekšpus
+nedz
+tik
+nevis
+turpretim
+jeb
+iekam
+iekām
+iekāms
+kolīdz
+līdzko
+tiklīdz
+jebšu
+tālab
+tāpēc
+nekā
+itin
+jā
+jau
+jel
+nē
+nezin
+tad
+tikai
+vis
+tak
+iekams
+vien
+# modal verbs
+būt  
+biju 
+biji
+bija
+bijām
+bijāt
+esmu
+esi
+esam
+esat 
+būšu     
+būsi
+būs
+būsim
+būsiet
+tikt
+tiku
+tiki
+tika
+tikām
+tikāt
+tieku
+tiec
+tiek
+tiekam
+tiekat
+tikšu
+tiks
+tiksim
+tiksiet
+tapt
+tapi
+tapāt
+topat
+tapšu
+tapsi
+taps
+tapsim
+tapsiet
+kļūt
+kļuvu
+kļuvi
+kļuva
+kļuvām
+kļuvāt
+kļūstu
+kļūsti
+kļūst
+kļūstam
+kļūstat
+kļūšu
+kļūsi
+kļūs
+kļūsim
+kļūsiet
+# verbs
+varēt
+varēju
+varējām
+varēšu
+varēsim
+var
+varēji
+varējāt
+varēsi
+varēsiet
+varat
+varēja
+varēs

Added: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/lv/TestLatvianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/lv/TestLatvianAnalyzer.java?rev=1092396&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/lv/TestLatvianAnalyzer.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/lv/TestLatvianAnalyzer.java Thu Apr 14 17:07:10 2011
@@ -0,0 +1,53 @@
+package org.apache.lucene.analysis.lv;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+
+public class TestLatvianAnalyzer extends BaseTokenStreamTestCase {
+  /**
+   * Constructing the analyzer loads the default stopwords; this test
+   * fails with NPE when the stopwords file is missing in classpath.
+   */
+  public void testResourcesAvailable() {
+    new LatvianAnalyzer(TEST_VERSION_CURRENT);
+  }
+
+  /** Default configuration: inflected forms are stemmed, stopwords vanish. */
+  public void testBasics() throws IOException {
+    final Analyzer analyzer = new LatvianAnalyzer(TEST_VERSION_CURRENT);
+
+    // two inflections of the same noun share one stem
+    checkOneTermReuse(analyzer, "tirgiem", "tirg");
+    checkOneTermReuse(analyzer, "tirgus", "tirg");
+
+    // a stopword yields no tokens at all
+    assertAnalyzesTo(analyzer, "un", new String[] {});
+  }
+
+  /** Terms in the stem exclusion set pass through unstemmed; others do not. */
+  public void testExclude() throws IOException {
+    final Set<String> protectedTerms = new HashSet<String>();
+    protectedTerms.add("tirgiem");
+
+    final Analyzer analyzer = new LatvianAnalyzer(
+        TEST_VERSION_CURRENT, LatvianAnalyzer.getDefaultStopSet(), protectedTerms);
+
+    checkOneTermReuse(analyzer, "tirgiem", "tirgiem");
+    checkOneTermReuse(analyzer, "tirgus", "tirg");
+  }
+}

Added: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/lv/TestLatvianStemmer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/lv/TestLatvianStemmer.java?rev=1092396&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/lv/TestLatvianStemmer.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/lv/TestLatvianStemmer.java Thu Apr 14 17:07:10 2011
@@ -0,0 +1,272 @@
+package org.apache.lucene.analysis.lv;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
+
+/**
+ * Basic tests for {@link LatvianStemmer}
+ */
+public class TestLatvianStemmer extends BaseTokenStreamTestCase {
+  // Minimal chain for these tests: whitespace tokenization followed directly
+  // by the stem filter, so each assertion sees the raw stemmer output.
+  private Analyzer a = new ReusableAnalyzerBase() {
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+      Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
+      LatvianStemFilter stemmed = new LatvianStemFilter(source);
+      return new TokenStreamComponents(source, stemmed);
+    }
+  };
+  
+  public void testNouns1() throws IOException {
+    // decl. I: every case/number form must reduce to the same stem
+    final String stem = "tēv";
+    final String[] forms = {
+      "tēvs",   // nom. sing.
+      "tēvi",   // nom. pl.
+      "tēva",   // gen. sing.
+      "tēvu",   // gen. pl.
+      "tēvam",  // dat. sing.
+      "tēviem", // dat. pl.
+      "tēvu",   // acc. sing.
+      "tēvus",  // acc. pl.
+      "tēvā",   // loc. sing.
+      "tēvos",  // loc. pl.
+      "tēvs",   // voc. sing.
+      "tēvi",   // voc. pl.
+    };
+    for (String form : forms) {
+      checkOneTerm(a, form, stem);
+    }
+  }
+  
+  /**
+   * decl II nouns with (s,t) -> š and (d,z) -> ž
+   * palatalization will generally conflate to two stems
+   * due to the ambiguity (plural and singular).
+   */
+  public void testNouns2() throws IOException {
+    // decl. II
+    
+    // c -> č palatalization
+    checkOneTerm(a, "lācis",  "lāc"); // nom. sing.
+    checkOneTerm(a, "lāči",   "lāc"); // nom. pl.
+    checkOneTerm(a, "lāča",   "lāc"); // gen. sing.
+    checkOneTerm(a, "lāču",   "lāc"); // gen. pl.
+    checkOneTerm(a, "lācim",  "lāc"); // dat. sing.
+    checkOneTerm(a, "lāčiem", "lāc"); // dat. pl.
+    checkOneTerm(a, "lāci",   "lāc"); // acc. sing.
+    checkOneTerm(a, "lāčus",  "lāc"); // acc. pl.
+    checkOneTerm(a, "lācī",   "lāc"); // loc. sing.
+    checkOneTerm(a, "lāčos",  "lāc"); // loc. pl.
+    checkOneTerm(a, "lāci",   "lāc"); // voc. sing.
+    checkOneTerm(a, "lāči",   "lāc"); // voc. pl.
+    
+    // n -> ņ palatalization
+    checkOneTerm(a, "akmens",   "akmen"); // nom. sing.
+    checkOneTerm(a, "akmeņi",   "akmen"); // nom. pl.
+    checkOneTerm(a, "akmens",   "akmen"); // gen. sing.
+    checkOneTerm(a, "akmeņu",   "akmen"); // gen. pl.
+    checkOneTerm(a, "akmenim",  "akmen"); // dat. sing.
+    checkOneTerm(a, "akmeņiem", "akmen"); // dat. pl.
+    checkOneTerm(a, "akmeni",   "akmen"); // acc. sing.
+    checkOneTerm(a, "akmeņus",  "akmen"); // acc. pl.
+    checkOneTerm(a, "akmenī",   "akmen"); // loc. sing.
+    checkOneTerm(a, "akmeņos",  "akmen"); // loc. pl.
+    checkOneTerm(a, "akmens",   "akmen"); // voc. sing.
+    checkOneTerm(a, "akmeņi",   "akmen"); // voc. pl.
+    
+    // no palatalization
+    checkOneTerm(a, "kurmis",   "kurm"); // nom. sing.
+    checkOneTerm(a, "kurmji",   "kurm"); // nom. pl.
+    checkOneTerm(a, "kurmja",   "kurm"); // gen. sing.
+    checkOneTerm(a, "kurmju",   "kurm"); // gen. pl.
+    checkOneTerm(a, "kurmim",   "kurm"); // dat. sing.
+    checkOneTerm(a, "kurmjiem", "kurm"); // dat. pl.
+    checkOneTerm(a, "kurmi",    "kurm"); // acc. sing.
+    checkOneTerm(a, "kurmjus",  "kurm"); // acc. pl.
+    checkOneTerm(a, "kurmī",    "kurm"); // loc. sing.
+    checkOneTerm(a, "kurmjos",  "kurm"); // loc. pl.
+    checkOneTerm(a, "kurmi",    "kurm"); // voc. sing.
+    checkOneTerm(a, "kurmji",   "kurm"); // voc. pl.
+  }
+  
+  public void testNouns3() throws IOException {
+    // decl III: every case/number form must reduce to the same stem
+    final String stem = "liet";
+    final String[] forms = {
+      "lietus",  // nom. sing.
+      "lieti",   // nom. pl.
+      "lietus",  // gen. sing.
+      "lietu",   // gen. pl.
+      "lietum",  // dat. sing.
+      "lietiem", // dat. pl.
+      "lietu",   // acc. sing.
+      "lietus",  // acc. pl.
+      "lietū",   // loc. sing.
+      "lietos",  // loc. pl.
+      "lietus",  // voc. sing.
+      "lieti",   // voc. pl.
+    };
+    for (String form : forms) {
+      checkOneTerm(a, form, stem);
+    }
+  }
+  
+  public void testNouns4() throws IOException {
+    // decl IV
+    checkOneTerm(a, "lapa",  "lap"); // nom. sing.
+    checkOneTerm(a, "lapas", "lap"); // nom. pl.
+    checkOneTerm(a, "lapas", "lap"); // gen. sing.
+    checkOneTerm(a, "lapu",  "lap"); // gen. pl.
+    checkOneTerm(a, "lapai", "lap"); // dat. sing.
+    checkOneTerm(a, "lapām", "lap"); // dat. pl.
+    checkOneTerm(a, "lapu",  "lap"); // acc. sing.
+    checkOneTerm(a, "lapas", "lap"); // acc. pl.
+    checkOneTerm(a, "lapā",  "lap"); // loc. sing.
+    checkOneTerm(a, "lapās", "lap"); // loc. pl.
+    checkOneTerm(a, "lapa",  "lap"); // voc. sing.
+    checkOneTerm(a, "lapas", "lap"); // voc. pl.
+    
+    checkOneTerm(a, "puika",  "puik"); // nom. sing.
+    checkOneTerm(a, "puikas", "puik"); // nom. pl.
+    checkOneTerm(a, "puikas", "puik"); // gen. sing.
+    checkOneTerm(a, "puiku",  "puik"); // gen. pl.
+    checkOneTerm(a, "puikam", "puik"); // dat. sing.
+    checkOneTerm(a, "puikām", "puik"); // dat. pl.
+    checkOneTerm(a, "puiku",  "puik"); // acc. sing.
+    checkOneTerm(a, "puikas", "puik"); // acc. pl.
+    checkOneTerm(a, "puikā",  "puik"); // loc. sing.
+    checkOneTerm(a, "puikās", "puik"); // loc. pl.
+    checkOneTerm(a, "puika",  "puik"); // voc. sing.
+    checkOneTerm(a, "puikas", "puik"); // voc. pl.
+  }
+  
+  /**
+   * Genitive plural forms with (s,t) -> š and (d,z) -> ž
+   * will not conflate due to ambiguity.
+   */
+  public void testNouns5() throws IOException {
+    // decl V
+    // l -> ļ palatalization
+    final String stem = "egl";
+    final String[] forms = {
+      "egle",  // nom. sing.
+      "egles", // nom. pl.
+      "egles", // gen. sing.
+      "egļu",  // gen. pl.
+      "eglei", // dat. sing.
+      "eglēm", // dat. pl.
+      "egli",  // acc. sing.
+      "egles", // acc. pl.
+      "eglē",  // loc. sing.
+      "eglēs", // loc. pl.
+      "egle",  // voc. sing.
+      "egles", // voc. pl.
+    };
+    for (String form : forms) {
+      checkOneTerm(a, form, stem);
+    }
+  }
+  
+  public void testNouns6() throws IOException {
+    // decl VI: every case/number form must reduce to the stem "gov"
+
+    // no palatalization
+    checkOneTerm(a, "govs",  "gov"); // nom. sing.
+    checkOneTerm(a, "govis", "gov"); // nom. pl.
+    checkOneTerm(a, "govs",  "gov"); // gen. sing.
+    checkOneTerm(a, "govju", "gov"); // gen. pl.
+    checkOneTerm(a, "govij", "gov"); // dat. sing.
+    checkOneTerm(a, "govīm", "gov"); // dat. pl.
+    // inputs below are written without a trailing space so the assertion
+    // does not depend on the tokenizer discarding whitespace
+    checkOneTerm(a, "govi",  "gov"); // acc. sing.
+    checkOneTerm(a, "govis", "gov"); // acc. pl.
+    checkOneTerm(a, "govi",  "gov"); // inst. sing.
+    checkOneTerm(a, "govīm", "gov"); // inst. pl.
+    checkOneTerm(a, "govī",  "gov"); // loc. sing.
+    checkOneTerm(a, "govīs", "gov"); // loc. pl.
+    checkOneTerm(a, "govs",  "gov"); // voc. sing.
+    checkOneTerm(a, "govis", "gov"); // voc. pl.
+  }
+  
+  public void testAdjectives() throws IOException { // all forms of "zils" must stem to "zil"
+    checkOneTerm(a, "zils",     "zil"); // indef. nom. masc. sing.
+    checkOneTerm(a, "zilais",   "zil"); // def. nom. masc. sing.
+    checkOneTerm(a, "zili",     "zil"); // indef. nom. masc. pl.
+    checkOneTerm(a, "zilie",    "zil"); // def. nom. masc. pl.
+    checkOneTerm(a, "zila",     "zil"); // indef. nom. fem. sing.
+    checkOneTerm(a, "zilā",     "zil"); // def. nom. fem. sing.
+    checkOneTerm(a, "zilas",    "zil"); // indef. nom. fem. pl.
+    checkOneTerm(a, "zilās",    "zil"); // def. nom. fem. pl.
+    checkOneTerm(a, "zila",     "zil"); // indef. gen. masc. sing.
+    checkOneTerm(a, "zilā",     "zil"); // def. gen. masc. sing.
+    checkOneTerm(a, "zilu",     "zil"); // indef. gen. masc. pl.
+    checkOneTerm(a, "zilo",     "zil"); // def. gen. masc. pl.
+    checkOneTerm(a, "zilas",    "zil"); // indef. gen. fem. sing.
+    checkOneTerm(a, "zilās",    "zil"); // def. gen. fem. sing.
+    checkOneTerm(a, "zilu",     "zil"); // indef. gen. fem. pl.
+    checkOneTerm(a, "zilo",     "zil"); // def. gen. fem. pl.
+    checkOneTerm(a, "zilam",    "zil"); // indef. dat. masc. sing.
+    checkOneTerm(a, "zilajam",  "zil"); // def. dat. masc. sing.
+    checkOneTerm(a, "ziliem",   "zil"); // indef. dat. masc. pl.
+    checkOneTerm(a, "zilajiem", "zil"); // def. dat. masc. pl.
+    checkOneTerm(a, "zilai",    "zil"); // indef. dat. fem. sing.
+    checkOneTerm(a, "zilajai",  "zil"); // def. dat. fem. sing.
+    checkOneTerm(a, "zilām",    "zil"); // indef. dat. fem. pl.
+    checkOneTerm(a, "zilajām",  "zil"); // def. dat. fem. pl.
+    checkOneTerm(a, "zilu",     "zil"); // indef. acc. masc. sing.
+    checkOneTerm(a, "zilo",     "zil"); // def. acc. masc. sing.
+    checkOneTerm(a, "zilus",    "zil"); // indef. acc. masc. pl.
+    checkOneTerm(a, "zilos",    "zil"); // def. acc. masc. pl.
+    checkOneTerm(a, "zilu",     "zil"); // indef. acc. fem. sing.
+    checkOneTerm(a, "zilo",     "zil"); // def. acc. fem. sing.
+    checkOneTerm(a, "zilas",    "zil"); // indef. acc. fem. pl.
+    checkOneTerm(a, "zilās",    "zil"); // def. acc. fem. pl.
+    checkOneTerm(a, "zilā",     "zil"); // indef. loc. masc. sing.
+    checkOneTerm(a, "zilajā",   "zil"); // def. loc. masc. sing.
+    checkOneTerm(a, "zilos",    "zil"); // indef. loc. masc. pl.
+    checkOneTerm(a, "zilajos",  "zil"); // def. loc. masc. pl.
+    checkOneTerm(a, "zilā",     "zil"); // indef. loc. fem. sing.
+    checkOneTerm(a, "zilajā",   "zil"); // def. loc. fem. sing.
+    checkOneTerm(a, "zilās",    "zil"); // indef. loc. fem. pl.
+    checkOneTerm(a, "zilajās",  "zil"); // def. loc. fem. pl.
+    checkOneTerm(a, "zilais",   "zil"); // voc. masc. sing.
+    checkOneTerm(a, "zilie",    "zil"); // voc. masc. pl.
+    checkOneTerm(a, "zilā",     "zil"); // voc. fem. sing.
+    checkOneTerm(a, "zilās",    "zil"); // voc. fem. pl.
+  }
+  
+  /**
+   * Palatalized gen. pl. stems must map back to the nom. sing. stem. Note: we
+   * intentionally don't handle the ambiguous (s,t) -> š and (d,z) -> ž
+   */
+  public void testPalatalization() throws IOException {
+    checkOneTerm(a, "krāsns", "krāsn"); // nom. sing.
+    checkOneTerm(a, "krāšņu", "krāsn"); // gen. pl. (šņ -> sn)
+    checkOneTerm(a, "zvaigzne", "zvaigzn"); // nom. sing.
+    checkOneTerm(a, "zvaigžņu", "zvaigzn"); // gen. pl. (žņ -> zn)
+    checkOneTerm(a, "kāpslis", "kāpsl"); // nom. sing.
+    checkOneTerm(a, "kāpšļu",  "kāpsl"); // gen. pl. (šļ -> sl)
+    checkOneTerm(a, "zizlis", "zizl"); // nom. sing.
+    checkOneTerm(a, "zižļu",  "zizl"); // gen. pl. (žļ -> zl)
+    checkOneTerm(a, "vilnis", "viln"); // nom. sing.
+    checkOneTerm(a, "viļņu",  "viln"); // gen. pl. (ļņ -> ln)
+    checkOneTerm(a, "lelle", "lell"); // nom. sing.
+    checkOneTerm(a, "leļļu", "lell"); // gen. pl. (ļļ -> ll)
+    checkOneTerm(a, "pinne", "pinn"); // nom. sing.
+    checkOneTerm(a, "piņņu", "pinn"); // gen. pl. (ņņ -> nn)
+    checkOneTerm(a, "rīkste", "rīkst"); // nom. sing.
+    checkOneTerm(a, "rīkšu",  "rīkst"); // gen. pl. (kš -> kst)
+  }
+  
+  /**
+   * Test some length restrictions: we require a 3+ char stem
+   * with at least one vowel, otherwise the term is left unchanged.
+   */
+  public void testLength() throws IOException {
+    checkOneTerm(a, "usa", "usa"); // length: dropping -a would leave only 2 chars
+    checkOneTerm(a, "60ms", "60ms"); // vowel count: no vowel available for a stem
+  }
+}

Added: lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/LatvianStemFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/LatvianStemFilterFactory.java?rev=1092396&view=auto
==============================================================================
--- lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/LatvianStemFilterFactory.java (added)
+++ lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/LatvianStemFilterFactory.java Thu Apr 14 17:07:10 2011
@@ -0,0 +1,38 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.lv.LatvianStemFilter;
+
+/**
+ * Factory for {@link LatvianStemFilter}; the sample below lowercases tokens before stemming.
+ * <pre class="prettyprint" >
+ * &lt;fieldType name="text_lvstem" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
+ *     &lt;filter class="solr.LatvianStemFilterFactory"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ */
+public class LatvianStemFilterFactory extends BaseTokenFilterFactory {
+  public TokenStream create(TokenStream input) {
+    return new LatvianStemFilter(input); // each call wraps the given stream in a new filter
+  }
+}

Added: lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestLatvianStemFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestLatvianStemFilterFactory.java?rev=1092396&view=auto
==============================================================================
--- lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestLatvianStemFilterFactory.java (added)
+++ lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestLatvianStemFilterFactory.java Thu Apr 14 17:07:10 2011
@@ -0,0 +1,36 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+
+/**
+ * Sanity check that {@link LatvianStemFilterFactory} yields a working stem filter.
+ */
+public class TestLatvianStemFilterFactory extends BaseTokenTestCase {
+  public void testStemming() throws Exception {
+    final String[] expectedStems = { "tirg", "tirg" };
+    WhitespaceTokenizer words =
+        new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("tirgiem tirgus"));
+    assertTokenStreamContents(new LatvianStemFilterFactory().create(words), expectedStems);
+  }
+}