You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2010/05/07 23:21:12 UTC
svn commit: r942235 - in /lucene/dev/trunk: lucene/contrib/ modules/analysis/common/src/java/org/apache/lucene/analysis/id/ modules/analysis/common/src/resources/org/apache/lucene/analysis/id/ modules/analysis/common/src/test/org/apache/lucene/analysis...

Author: rmuir
Date: Fri May  7 21:21:12 2010
New Revision: 942235

URL: http://svn.apache.org/viewvc?rev=942235&view=rev
Log:
LUCENE-2437: Indonesian Analyzer

Added:
    lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/id/
    lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianAnalyzer.java   (with props)
    lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianStemFilter.java   (with props)
    lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianStemmer.java   (with props)
    lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/id/package.html   (with props)
    lucene/dev/trunk/modules/analysis/common/src/resources/org/apache/lucene/analysis/id/
    lucene/dev/trunk/modules/analysis/common/src/resources/org/apache/lucene/analysis/id/stopwords.txt   (with props)
    lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/id/
    lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/id/TestIndonesianAnalyzer.java   (with props)
    lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/id/TestIndonesianStemmer.java   (with props)
    lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/IndonesianStemFilterFactory.java   (with props)
    lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestIndonesianStemFilterFactory.java   (with props)
Modified:
    lucene/dev/trunk/lucene/contrib/CHANGES.txt

Modified: lucene/dev/trunk/lucene/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/CHANGES.txt?rev=942235&r1=942234&r2=942235&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/contrib/CHANGES.txt Fri May  7 21:21:12 2010
@@ -152,6 +152,8 @@ New features
    of AttributeSource.cloneAttributes() instances and the new copyTo() method.
    (Steven Rowe via Uwe Schindler)
 
+ * LUCENE-2437: Add an Analyzer for Indonesian.  (Robert Muir)
+
 Build
 
  * LUCENE-2124: Moved the JDK-based collation support from contrib/collation 

Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianAnalyzer.java?rev=942235&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianAnalyzer.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianAnalyzer.java Fri May  7 21:21:12 2010
@@ -0,0 +1,130 @@
+package org.apache.lucene.analysis.id;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Set;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerFilter;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.util.Version;
+
+/**
+ * Analyzer for Indonesian (Bahasa Indonesia).
+ */
+public final class IndonesianAnalyzer extends StopwordAnalyzerBase {
+  /** File containing default Indonesian stopwords. */
+  public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
+
+  /**
+   * Returns an unmodifiable instance of the default stop-words set.
+   * @return an unmodifiable instance of the default stop-words set.
+   */
+  public static Set<?> getDefaultStopSet(){
+    return DefaultSetHolder.DEFAULT_STOP_SET;
+  }
+  
+  /**
+   * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class 
+   * accesses the static final set for the first time.
+   */
+  private static class DefaultSetHolder {
+    static final Set<?> DEFAULT_STOP_SET;
+
+    static {
+      try {
+        DEFAULT_STOP_SET = loadStopwordSet(false, IndonesianAnalyzer.class, DEFAULT_STOPWORD_FILE, "#");
+      } catch (IOException ex) {
+        // default set should always be present as it is part of the
+        // distribution (JAR); keep the IOException as the cause for diagnosis
+        throw new RuntimeException("Unable to load default stopword set", ex);
+      }
+    }
+  }
+  
+  private final Set<?> stemExclusionSet;
+
+  /**
+   * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
+   */
+  public IndonesianAnalyzer(Version matchVersion) {
+    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+  }
+  
+  /**
+   * Builds an analyzer with the given stop words and no stem exclusions.
+   * 
+   * @param matchVersion
+   *          lucene compatibility version
+   * @param stopwords
+   *          a stopword set
+   */
+  public IndonesianAnalyzer(Version matchVersion, Set<?> stopwords){
+    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+  }
+
+  /**
+   * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
+   * provided this analyzer will add a {@link KeywordMarkerFilter} before
+   * {@link IndonesianStemFilter}.
+   * 
+   * @param matchVersion
+   *          lucene compatibility version
+   * @param stopwords
+   *          a stopword set
+   * @param stemExclusionSet
+   *          a set of terms not to be stemmed
+   */
+  public IndonesianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet){
+    super(matchVersion, stopwords);
+    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
+        matchVersion, stemExclusionSet));
+  }
+
+  /**
+   * Creates
+   * {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
+   * used to tokenize all the text in the provided {@link Reader}.
+   * 
+   * @return {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
+   *         built from a {@link StandardTokenizer} filtered with
+   *         {@link StandardFilter}, {@link LowerCaseFilter},
+   *         {@link StopFilter}, {@link KeywordMarkerFilter}
+   *         if a stem exclusion set is provided and {@link IndonesianStemFilter}.
+   */
+  @Override
+  protected TokenStreamComponents createComponents(String fieldName,
+      Reader reader) {
+    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+    TokenStream result = new StandardFilter(source);
+    result = new LowerCaseFilter(matchVersion, result); // chain on StandardFilter, not the raw tokenizer
+    result = new StopFilter(matchVersion, result, stopwords);
+    if (!stemExclusionSet.isEmpty()) {
+      result = new KeywordMarkerFilter(result, stemExclusionSet);
+    }
+    return new TokenStreamComponents(source, new IndonesianStemFilter(result));
+  }
+}

Propchange: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianAnalyzer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianStemFilter.java?rev=942235&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianStemFilter.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianStemFilter.java Fri May  7 21:21:12 2010
@@ -0,0 +1,67 @@
+package org.apache.lucene.analysis.id;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+
+/**
+ * {@link TokenFilter} that stems Indonesian terms with {@link IndonesianStemmer}, skipping keywords.
+ */
+public final class IndonesianStemFilter extends TokenFilter {
+  private final CharTermAttribute term = addAttribute(CharTermAttribute.class);
+  private final KeywordAttribute keyword = addAttribute(KeywordAttribute.class);
+  private final IndonesianStemmer stemmer = new IndonesianStemmer();
+  private final boolean stemDerivational;
+
+  /**
+   * Shorthand for {@link #IndonesianStemFilter(TokenStream, boolean) IndonesianStemFilter(input, true)}
+   */
+  public IndonesianStemFilter(TokenStream input) {
+    this(input, true);
+  }
+  
+  /**
+   * Builds a stemming filter over <code>input</code>.
+   * <p>
+   * Passing <code>false</code> for <code>stemDerivational</code> restricts stemming
+   * to inflectional suffixes (particles and possessive pronouns).
+   */
+  public IndonesianStemFilter(TokenStream input, boolean stemDerivational) {
+    super(input);
+    this.stemDerivational = stemDerivational;
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (!input.incrementToken()) {
+      return false; // upstream is exhausted
+    }
+    if (!keyword.isKeyword()) { // keyword-marked terms are passed through as-is
+      // stem() edits the term buffer in place and reports the new length
+      final char[] chars = term.buffer();
+      final int stemmedLength = stemmer.stem(chars, term.length(), stemDerivational);
+      term.setLength(stemmedLength);
+    }
+    return true;
+  }
+}

Propchange: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianStemFilter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianStemmer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianStemmer.java?rev=942235&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianStemmer.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianStemmer.java Fri May  7 21:21:12 2010
@@ -0,0 +1,304 @@
+package org.apache.lucene.analysis.id;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Stemmer for Indonesian; edits a char buffer in place and keeps per-call state in instance fields.
+ * <p>
+ * Stems Indonesian words with the algorithm presented in:
+ * <i>A Study of Stemming Effects on Information Retrieval in 
+ * Bahasa Indonesia</i>, Fadillah Z Tala.
+ * http://www.illc.uva.nl/Publications/ResearchReports/MoL-2003-02.text.pdf
+ */
+public class IndonesianStemmer {
+  private int numSyllables; // vowel count of the current term; reset by stem(), decremented per removed affix
+  private int flags; // bitmask of the REMOVED_* prefixes stripped so far; reset by stem()
+  private static final int REMOVED_KE = 1; // set when "ke" is stripped
+  private static final int REMOVED_PENG = 2; // set when "peng"/"peny"/"pen"/"pem" is stripped
+  private static final int REMOVED_DI = 4; // set when "di" is stripped
+  private static final int REMOVED_MENG = 8; // set when "meng"/"meny"/"men"/"mem"/"me" is stripped
+  private static final int REMOVED_TER = 16; // set when "ter" is stripped
+  private static final int REMOVED_BER = 32; // set when "ber" (or its "be..." variants) is stripped
+  private static final int REMOVED_PE = 64; // set when plain "pe" is stripped
+  
+  /**
+   * Stem a term in place (returning its new length).
+   * <p>
+   * Use <code>stemDerivational</code> to control whether full stemming
+   * or only light inflectional stemming is done.
+   */
+  public int stem(char text[], int length, boolean stemDerivational) {
+    flags = 0;
+    numSyllables = 0;
+    for (int i = 0; i < length; i++) // every vowel character counts as one syllable
+      if (isVowel(text[i]))
+          numSyllables++;
+    
+    if (numSyllables > 2) length = removeParticle(text, length); // rules only fire while more than 2 syllables remain
+    if (numSyllables > 2) length = removePossessivePronoun(text, length);
+    
+    if (stemDerivational)
+      length = stemDerivational(text, length);
+    return length;
+  }
+  
+  private int stemDerivational(char text[], int length) { // prefix/suffix stripping; order depends on whether a first-order prefix matched
+    int oldLength = length;
+    if (numSyllables > 2) length = removeFirstOrderPrefix(text, length);
+    if (oldLength != length) { // a rule is fired
+      oldLength = length;
+      if (numSyllables > 2) length = removeSuffix(text, length);
+      if (oldLength != length) // a rule is fired
+        if (numSyllables > 2) length = removeSecondOrderPrefix(text, length);
+    } else { // fail
+      if (numSyllables > 2) length = removeSecondOrderPrefix(text, length);
+      if (numSyllables > 2) length = removeSuffix(text, length);
+    }
+    return length;
+  }
+  
+  private boolean isVowel(char ch) { // true only for lowercase a, e, i, o, u
+    switch(ch) {
+      case 'a':
+      case 'e':
+      case 'i':
+      case 'o':
+      case 'u':
+        return true;
+      default:
+        return false;
+    }
+  }
+  
+  private int removeParticle(char text[], int length) { // drops a trailing "kah", "lah" or "pun"; returns the new length
+    if (endsWith(text, length, "kah") || 
+        endsWith(text, length, "lah") || 
+        endsWith(text, length, "pun")) {
+        numSyllables--;
+        return length - 3;
+    }
+    
+    return length;
+  }
+  
+  private int removePossessivePronoun(char text[], int length) { // drops a trailing "ku", "mu" or "nya"; returns the new length
+    if (endsWith(text, length, "ku") || endsWith(text, length, "mu")) {
+      numSyllables--;
+      return length - 2;
+    }
+    
+    if (endsWith(text, length, "nya")) {
+      numSyllables--;
+      return length - 3;
+    }
+    
+    return length;
+  }
+  
+  private int removeFirstOrderPrefix(char text[], int length) { // first matching rule wins; longer prefixes are tested before shorter ones
+    if (startsWith(text, length, "meng")) {
+      flags |= REMOVED_MENG;
+      numSyllables--;
+      return deleteN(text, 0, length, 4);
+    }
+    
+    if (startsWith(text, length, "meny") && length > 4 && isVowel(text[4])) {
+      flags |= REMOVED_MENG;
+      text[3] = 's'; // overwrite the 'y' so the remainder starts with "s" once "men" is dropped
+      numSyllables--;
+      return deleteN(text, 0, length, 3);
+    }
+    
+    if (startsWith(text, length, "men")) {
+      flags |= REMOVED_MENG;
+      numSyllables--;
+      return deleteN(text, 0, length, 3);
+    }
+ 
+    if (startsWith(text, length, "mem")) {
+      flags |= REMOVED_MENG;
+      numSyllables--;
+      return deleteN(text, 0, length, 3);
+    }
+    
+    if (startsWith(text, length, "me")) {
+      flags |= REMOVED_MENG;
+      numSyllables--;
+      return deleteN(text, 0, length, 2);
+    }
+    
+    if (startsWith(text, length, "peng")) {
+      flags |= REMOVED_PENG;
+      numSyllables--;
+      return deleteN(text, 0, length, 4);
+    }
+    
+    if (startsWith(text, length, "peny") && length > 4 && isVowel(text[4])) {
+      flags |= REMOVED_PENG;
+      text[3] = 's'; // overwrite the 'y' so the remainder starts with "s" once "pen" is dropped
+      numSyllables--;
+      return deleteN(text, 0, length, 3);
+    }
+    
+    if (startsWith(text, length, "peny")) { // "peny" not followed by a vowel: drop all four characters
+      flags |= REMOVED_PENG;
+      numSyllables--;
+      return deleteN(text, 0, length, 4);
+    }
+    
+    if (startsWith(text, length, "pen") && length > 3 && isVowel(text[3])) {
+      flags |= REMOVED_PENG;
+      text[2] = 't'; // overwrite the 'n' with 't'; only "pe" is dropped below
+      numSyllables--;
+      return deleteN(text, 0, length, 2);
+    }
+
+    if (startsWith(text, length, "pen")) {
+      flags |= REMOVED_PENG;
+      numSyllables--;
+      return deleteN(text, 0, length, 3);
+    }
+    
+    if (startsWith(text, length, "pem")) {
+      flags |= REMOVED_PENG;
+      numSyllables--;
+      return deleteN(text, 0, length, 3);
+    }
+    
+    if (startsWith(text, length, "di")) {
+      flags |= REMOVED_DI;
+      numSyllables--;
+      return deleteN(text, 0, length, 2);
+    }
+    
+    if (startsWith(text, length, "ter")) {
+      flags |= REMOVED_TER;
+      numSyllables--;
+      return deleteN(text, 0, length, 3);
+    }
+    
+    if (startsWith(text, length, "ke")) {
+      flags |= REMOVED_KE;
+      numSyllables--;
+      return deleteN(text, 0, length, 2);
+    }
+    
+    return length; // no first-order prefix matched
+  }
+  
+  private int removeSecondOrderPrefix(char text[], int length) { // handles the "ber"/"per"/"pe" family; first matching rule wins
+    if (startsWith(text, length, "ber")) {
+      flags |= REMOVED_BER;
+      numSyllables--;
+      return deleteN(text, 0, length, 3);
+    }
+    
+    if (length == 7 && startsWith(text, length, "belajar")) { // special case: exactly "belajar" loses "bel"
+      flags |= REMOVED_BER;
+      numSyllables--;
+      return deleteN(text, 0, length, 3);
+    }
+    
+    if (startsWith(text, length, "be") && length > 4 
+        && !isVowel(text[2]) && text[3] == 'e' && text[4] == 'r') { // "be" + non-vowel + "er": only "be" is dropped
+      flags |= REMOVED_BER;
+      numSyllables--;
+      return deleteN(text, 0, length, 2);
+    }
+    
+    if (startsWith(text, length, "per")) { // note: no flag is recorded for "per"
+      numSyllables--;
+      return deleteN(text, 0, length, 3);
+    }
+    
+    if (length == 7 && startsWith(text, length, "pelajar")) { // special case: exactly "pelajar" loses "pel"; no flag recorded
+      numSyllables--;
+      return deleteN(text, 0, length, 3);
+    }
+    
+    if (startsWith(text, length, "pe")) {
+      flags |= REMOVED_PE;
+      numSyllables--;
+      return deleteN(text, 0, length, 2);
+    }
+
+    return length; // no second-order prefix matched
+  }
+  
+  private int removeSuffix(char text[], int length) { // each suffix is skipped when one of its listed prefixes was already removed
+    if (endsWith(text, length, "kan") 
+        && (flags & REMOVED_KE) == 0 
+        && (flags & REMOVED_PENG) == 0 
+        && (flags & REMOVED_PE) == 0) {
+      numSyllables--;
+      return length - 3;
+    }
+    
+    if (endsWith(text, length, "an") 
+        && (flags & REMOVED_DI) == 0 
+        && (flags & REMOVED_MENG) == 0 
+        && (flags & REMOVED_TER) == 0) {
+      numSyllables--;
+      return length - 2;
+    }
+    
+    if (endsWith(text, length, "i") 
+        && !endsWith(text, length, "si") 
+        && (flags & REMOVED_BER) == 0 
+        && (flags & REMOVED_KE) == 0 
+        && (flags & REMOVED_PENG) == 0) {
+      numSyllables--;
+      return length - 1;
+    }
+    return length;
+  }
+  
+  private boolean startsWith(char s[], int len, String prefix) { // true if the first len chars of s begin with prefix
+    final int prefixLen = prefix.length();
+    if (prefixLen > len)
+      return false;
+    for (int i = 0; i < prefixLen; i++)
+      if (s[i] != prefix.charAt(i)) 
+        return false;
+    return true;
+  }
+  
+  private boolean endsWith(char s[], int len, String suffix) { // true if the first len chars of s end with suffix
+    final int suffixLen = suffix.length();
+    if (suffixLen > len)
+      return false;
+    for (int i = suffixLen - 1; i >= 0; i--) // compare from the last character backwards
+      if (s[len -(suffixLen - i)] != suffix.charAt(i))
+        return false;
+    
+    return true;
+  }
+  
+  private int deleteN(char s[], int pos, int len, int nChars) { // deletes nChars characters starting at pos, one at a time
+    for (int i = 0; i < nChars; i++)
+      len = delete(s, pos, len);
+    return len;
+  }
+  
+  private int delete(char s[], int pos, int len) { // removes s[pos] by shifting the tail left; returns len - 1
+    if (pos < len) 
+      System.arraycopy(s, pos + 1, s, pos, len - pos - 1);
+    
+    return len - 1;
+  }
+}

Propchange: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianStemmer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/id/package.html
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/id/package.html?rev=942235&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/id/package.html (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/id/package.html Fri May  7 21:21:12 2010
@@ -0,0 +1,22 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html><head></head>
+<body>
+Analyzer for Indonesian.
+</body>
+</html>

Propchange: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/id/package.html
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/dev/trunk/modules/analysis/common/src/resources/org/apache/lucene/analysis/id/stopwords.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/resources/org/apache/lucene/analysis/id/stopwords.txt?rev=942235&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/resources/org/apache/lucene/analysis/id/stopwords.txt (added)
+++ lucene/dev/trunk/modules/analysis/common/src/resources/org/apache/lucene/analysis/id/stopwords.txt Fri May  7 21:21:12 2010
@@ -0,0 +1,359 @@
+# from appendix D of: A Study of Stemming Effects on Information
+# Retrieval in Bahasa Indonesia
+ada
+adanya
+adalah
+adapun
+agak
+agaknya
+agar
+akan
+akankah
+akhirnya
+aku
+akulah
+amat
+amatlah
+anda
+andalah
+antar
+diantaranya
+antara
+antaranya
+diantara
+apa
+apaan
+mengapa
+apabila
+apakah
+apalagi
+apatah
+atau
+ataukah
+ataupun
+bagai
+bagaikan
+sebagai
+sebagainya
+bagaimana
+bagaimanapun
+sebagaimana
+bagaimanakah
+bagi
+bahkan
+bahwa
+bahwasanya
+sebaliknya
+banyak
+sebanyak
+beberapa
+seberapa
+begini
+beginian
+beginikah
+beginilah
+sebegini
+begitu
+begitukah
+begitulah
+begitupun
+sebegitu
+belum
+belumlah
+sebelum
+sebelumnya
+sebenarnya
+berapa
+berapakah
+berapalah
+berapapun
+betulkah
+sebetulnya
+biasa
+biasanya
+bila
+bilakah
+bisa
+bisakah
+sebisanya
+boleh
+bolehkah
+bolehlah
+buat
+bukan
+bukankah
+bukanlah
+bukannya
+cuma
+percuma
+dahulu
+dalam
+dan
+dapat
+dari
+daripada
+dekat
+demi
+demikian
+demikianlah
+sedemikian
+dengan
+depan
+di
+dia
+dialah
+dini
+diri
+dirinya
+terdiri
+dong
+dulu
+enggak
+enggaknya
+entah
+entahlah
+terhadap
+terhadapnya
+hal
+hampir
+hanya
+hanyalah
+harus
+haruslah
+harusnya
+seharusnya
+hendak
+hendaklah
+hendaknya
+hingga
+sehingga
+ia
+ialah
+ibarat
+ingin
+inginkah
+inginkan
+ini
+inikah
+inilah
+itu
+itukah
+itulah
+jangan
+jangankan
+janganlah
+jika
+jikalau
+juga
+justru
+kala
+kalau
+kalaulah
+kalaupun
+kalian
+kami
+kamilah
+kamu
+kamulah
+kan
+kapan
+kapankah
+kapanpun
+dikarenakan
+karena
+karenanya
+ke
+kecil
+kemudian
+kenapa
+kepada
+kepadanya
+ketika
+seketika
+khususnya
+kini
+kinilah
+kiranya
+sekiranya
+kita
+kitalah
+kok
+lagi
+lagian
+selagi
+lah
+lain
+lainnya
+melainkan
+selaku
+lalu
+melalui
+terlalu
+lama
+lamanya
+selama
+selama
+selamanya
+lebih
+terlebih
+bermacam
+macam
+semacam
+maka
+makanya
+makin
+malah
+malahan
+mampu
+mampukah
+mana
+manakala
+manalagi
+masih
+masihkah
+semasih
+masing
+mau
+maupun
+semaunya
+memang
+mereka
+merekalah
+meski
+meskipun
+semula
+mungkin
+mungkinkah
+nah
+namun
+nanti
+nantinya
+nyaris
+oleh
+olehnya
+seorang
+seseorang
+pada
+padanya
+padahal
+paling
+sepanjang
+pantas
+sepantasnya
+sepantasnyalah
+para
+pasti
+pastilah
+per
+pernah
+pula
+pun
+merupakan
+rupanya
+serupa
+saat
+saatnya
+sesaat
+saja
+sajalah
+saling
+bersama
+sama
+sesama
+sambil
+sampai
+sana
+sangat
+sangatlah
+saya
+sayalah
+se
+sebab
+sebabnya
+sebuah
+tersebut
+tersebutlah
+sedang
+sedangkan
+sedikit
+sedikitnya
+segala
+segalanya
+segera
+sesegera
+sejak
+sejenak
+sekali
+sekalian
+sekalipun
+sesekali
+sekaligus
+sekarang
+sekarang
+sekitar
+sekitarnya
+sela
+selain
+selalu
+seluruh
+seluruhnya
+semakin
+sementara
+sempat
+semua
+semuanya
+sendiri
+sendirinya
+seolah
+seperti
+sepertinya
+sering
+seringnya
+serta
+siapa
+siapakah
+siapapun
+disini
+disinilah
+sini
+sinilah
+sesuatu
+sesuatunya
+suatu
+sesudah
+sesudahnya
+sudah
+sudahkah
+sudahlah
+supaya
+tadi
+tadinya
+tak
+tanpa
+setelah
+telah
+tentang
+tentu
+tentulah
+tentunya
+tertentu
+seterusnya
+tapi
+tetapi
+setiap
+tiap
+setidaknya
+tidak
+tidakkah
+tidaklah
+toh
+waduh
+wah
+wahai
+sewaktu
+walau
+walaupun
+wong
+yaitu
+yakni
+yang

Propchange: lucene/dev/trunk/modules/analysis/common/src/resources/org/apache/lucene/analysis/id/stopwords.txt
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/id/TestIndonesianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/id/TestIndonesianAnalyzer.java?rev=942235&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/id/TestIndonesianAnalyzer.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/id/TestIndonesianAnalyzer.java Fri May  7 21:21:12 2010
@@ -0,0 +1,53 @@
+package org.apache.lucene.analysis.id;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+
+public class TestIndonesianAnalyzer extends BaseTokenStreamTestCase {
+  /** Building the default analyzer needs the stopwords file;
+   * this test fails with NPE when it is missing in classpath */
+  public void testResourcesAvailable() {
+    new IndonesianAnalyzer(TEST_VERSION_CURRENT);
+  }
+  
+  /** Default analyzer: terms are stemmed and stopwords are removed. */
+  public void testBasics() throws IOException {
+    final Analyzer analyzer = new IndonesianAnalyzer(TEST_VERSION_CURRENT);
+    // affixed words come out as their stems
+    checkOneTermReuse(analyzer, "peledakan", "ledak");
+    checkOneTermReuse(analyzer, "pembunuhan", "bunuh");
+    // a lone stopword produces no tokens
+    assertAnalyzesTo(analyzer, "bahwa", new String[] {});
+  }
+  
+  /** Terms in the stem exclusion set are left unstemmed; other terms are still stemmed. */
+  public void testExclude() throws IOException {
+    final Set<String> protectedTerms = new HashSet<String>();
+    protectedTerms.add("peledakan");
+    final Set<?> stopSet = IndonesianAnalyzer.getDefaultStopSet();
+    final Analyzer analyzer = new IndonesianAnalyzer(TEST_VERSION_CURRENT, stopSet, protectedTerms);
+    checkOneTermReuse(analyzer, "peledakan", "peledakan");
+    checkOneTermReuse(analyzer, "pembunuhan", "bunuh");
+  }
+}

Propchange: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/id/TestIndonesianAnalyzer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/id/TestIndonesianStemmer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/id/TestIndonesianStemmer.java?rev=942235&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/id/TestIndonesianStemmer.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/id/TestIndonesianStemmer.java Fri May  7 21:21:12 2010
@@ -0,0 +1,136 @@
+package org.apache.lucene.analysis.id;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.KeywordTokenizer;
+import org.apache.lucene.analysis.ReusableAnalyzerBase;
+import org.apache.lucene.analysis.Tokenizer;
+
+/**
+ * Tests {@link IndonesianStemmer} by running single terms through
+ * {@link IndonesianStemFilter} and comparing the one resulting term.
+ */
+public class TestIndonesianStemmer extends BaseTokenStreamTestCase {
+  /* full stemming, no stopwords */
+  Analyzer a = new ReusableAnalyzerBase() {
+    @Override
+    public TokenStreamComponents createComponents(String fieldName, Reader reader) {
+      Tokenizer source = new KeywordTokenizer(reader);
+      IndonesianStemFilter stemmed = new IndonesianStemFilter(source);
+      return new TokenStreamComponents(source, stemmed);
+    }
+  };
+
+  /* inflectional-only stemming: the filter is built with false as its second argument */
+  Analyzer b = new ReusableAnalyzerBase() {
+    @Override
+    public TokenStreamComponents createComponents(String fieldName, Reader reader) {
+      Tokenizer source = new KeywordTokenizer(reader);
+      IndonesianStemFilter stemmed = new IndonesianStemFilter(source, false);
+      return new TokenStreamComponents(source, stemmed);
+    }
+  };
+
+  /**
+   * Runs every {input, expected} pair through the analyzer, in array order.
+   * The first pair goes through checkOneTerm and every later pair through
+   * checkOneTermReuse.
+   *
+   * @param analyzer analyzer under test
+   * @param pairs non-empty array of two-element {input, expected} arrays
+   */
+  private void checkStems(Analyzer analyzer, String[][] pairs) throws IOException {
+    checkOneTerm(analyzer, pairs[0][0], pairs[0][1]);
+    for (int i = 1; i < pairs.length; i++) {
+      checkOneTermReuse(analyzer, pairs[i][0], pairs[i][1]);
+    }
+  }
+
+  /** Some examples from the paper */
+  public void testExamples() throws IOException {
+    checkStems(a, new String[][] {
+      { "bukukah", "buku" },
+      { "adalah", "ada" },
+      { "bukupun", "buku" },
+      { "bukuku", "buku" },
+      { "bukumu", "buku" },
+      { "bukunya", "buku" },
+      { "mengukur", "ukur" },
+      { "menyapu", "sapu" },
+      { "menduga", "duga" },
+      { "menuduh", "uduh" },
+      { "membaca", "baca" },
+      { "merusak", "rusak" },
+      { "pengukur", "ukur" },
+      { "penyapu", "sapu" },
+      { "penduga", "duga" },
+      { "pembaca", "baca" },
+      { "diukur", "ukur" },
+      { "tersapu", "sapu" },
+      { "kekasih", "kasih" },
+      { "berlari", "lari" },
+      { "belajar", "ajar" },
+      { "bekerja", "kerja" },
+      { "perjelas", "jelas" },
+      { "pelajar", "ajar" },
+      { "pekerja", "kerja" },
+      { "tarikkan", "tarik" },
+      { "ambilkan", "ambil" },
+      { "mengambilkan", "ambil" },
+      { "makanan", "makan" },
+      { "janjian", "janji" },
+      { "perjanjian", "janji" },
+      { "tandai", "tanda" },
+      { "dapati", "dapat" },
+      { "mendapati", "dapat" },
+      { "pantai", "panta" },
+    });
+  }
+
+  /** Some detailed analysis examples (that might not be the best) */
+  public void testIRExamples() throws IOException {
+    checkStems(a, new String[][] {
+      { "penyalahgunaan", "salahguna" },
+      { "menyalahgunakan", "salahguna" },
+      { "disalahgunakan", "salahguna" },
+
+      { "pertanggungjawaban", "tanggungjawab" },
+      { "mempertanggungjawabkan", "tanggungjawab" },
+      { "dipertanggungjawabkan", "tanggungjawab" },
+
+      { "pelaksanaan", "laksana" },
+      { "pelaksana", "laksana" },
+      { "melaksanakan", "laksana" },
+      { "dilaksanakan", "laksana" },
+
+      { "melibatkan", "libat" },
+      { "terlibat", "libat" },
+
+      { "penculikan", "culik" },
+      { "menculik", "culik" },
+      { "diculik", "culik" },
+      { "penculik", "culik" },
+
+      { "perubahan", "ubah" },
+      { "peledakan", "ledak" },
+      { "penanganan", "tangan" },
+      { "kepolisian", "polisi" },
+      { "kenaikan", "naik" },
+      { "bersenjata", "senjata" },
+      { "penyelewengan", "seleweng" },
+      { "kecelakaan", "celaka" },
+    });
+  }
+
+  /** Test stemming only inflectional suffixes */
+  public void testInflectionalOnly() throws IOException {
+    checkStems(b, new String[][] {
+      { "bukunya", "buku" },
+      { "bukukah", "buku" },
+      { "bukunyakah", "buku" },
+      { "dibukukannya", "dibukukan" },
+    });
+  }
+
+  /**
+   * Terms whose expected output must not be reduced any further, e.g.
+   * "gigi" is expected back unchanged.
+   */
+  public void testShouldntStem() throws IOException {
+    checkStems(a, new String[][] {
+      { "bersenjata", "senjata" },
+      { "bukukah", "buku" },
+      { "gigi", "gigi" },
+    });
+  }
+}

Propchange: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/id/TestIndonesianStemmer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/IndonesianStemFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/IndonesianStemFilterFactory.java?rev=942235&view=auto
==============================================================================
--- lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/IndonesianStemFilterFactory.java (added)
+++ lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/IndonesianStemFilterFactory.java Fri May  7 21:21:12 2010
@@ -0,0 +1,37 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.id.IndonesianStemFilter;
+
+/**
+ * Factory for {@link IndonesianStemFilter}.
+ * <p>
+ * Reads one boolean init argument, <code>stemDerivational</code>, and passes
+ * its value to every filter it creates. <code>true</code> is the fallback
+ * given to <code>getBoolean</code>.
+ */
+public class IndonesianStemFilterFactory extends BaseTokenFilterFactory {
+  /** Value handed to each created filter as its second constructor argument. */
+  private boolean stemDerivational = true;
+
+  /**
+   * Initializes the superclass with <code>args</code>, then reads the
+   * <code>stemDerivational</code> argument.
+   *
+   * @param args factory arguments, forwarded unchanged to the superclass
+   */
+  // @Override makes the compiler verify that this really replaces the
+  // superclass init; otherwise a signature drift would silently skip it.
+  @Override
+  public void init(Map<String, String> args) {
+    super.init(args);
+    stemDerivational = getBoolean("stemDerivational", true);
+  }
+
+  /**
+   * Wraps the given stream in a new {@link IndonesianStemFilter}.
+   *
+   * @param input stream to be stemmed
+   * @return a new filter over <code>input</code>, configured with the
+   *         current <code>stemDerivational</code> value
+   */
+  public TokenStream create(TokenStream input) {
+    return new IndonesianStemFilter(input, stemDerivational);
+  }
+}

Propchange: lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/IndonesianStemFilterFactory.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestIndonesianStemFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestIndonesianStemFilterFactory.java?rev=942235&view=auto
==============================================================================
--- lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestIndonesianStemFilterFactory.java (added)
+++ lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestIndonesianStemFilterFactory.java Fri May  7 21:21:12 2010
@@ -0,0 +1,59 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
+
+/**
+ * Simple tests to ensure the Indonesian stem filter factory is working.
+ */
+public class TestIndonesianStemFilterFactory extends BaseTokenTestCase {
+  /**
+   * Builds the stream under test: <code>text</code> is split by a
+   * WhitespaceTokenizer, and the tokenizer is wrapped by a filter from a new
+   * factory initialized with <code>args</code>.
+   */
+  private TokenStream stemmedStream(String text, Map<String,String> args) throws Exception {
+    Reader reader = new StringReader(text);
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
+    IndonesianStemFilterFactory factory = new IndonesianStemFilterFactory();
+    factory.init(args);
+    return factory.create(tokenizer);
+  }
+
+  /**
+   * Ensure the filter actually stems text when the factory gets no arguments.
+   */
+  public void testStemming() throws Exception {
+    Map<String,String> noArgs = new HashMap<String,String>();
+    TokenStream stream = stemmedStream("dibukukannya", noArgs);
+    assertTokenStreamContents(stream, new String[] { "buku" });
+  }
+
+  /**
+   * Test inflectional-only mode, selected with stemDerivational=false.
+   */
+  public void testStemmingInflectional() throws Exception {
+    Map<String,String> inflectionalOnly = new HashMap<String,String>();
+    inflectionalOnly.put("stemDerivational", "false");
+    TokenStream stream = stemmedStream("dibukukannya", inflectionalOnly);
+    assertTokenStreamContents(stream, new String[] { "dibukukan" });
+  }
+}

Propchange: lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestIndonesianStemFilterFactory.java
------------------------------------------------------------------------------
    svn:eol-style = native