You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2020/08/26 11:49:00 UTC
[lucene-solr] branch master updated: LUCENE-9313: Analyzer for Serbian language based on Snowball stemmer

This is an automated email from the ASF dual-hosted git repository.

rmuir pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/master by this push:
     new c7029b1  LUCENE-9313: Analyzer for Serbian language based on Snowball stemmer
c7029b1 is described below

commit c7029b12541d908e9c86f1f9946a3369aea5236a
Author: Robert Muir <rm...@apache.org>
AuthorDate: Wed Aug 26 07:50:29 2020 -0400

    LUCENE-9313: Analyzer for Serbian language based on Snowball stemmer
---
 lucene/CHANGES.txt                                 |   2 +
 .../apache/lucene/analysis/sr/SerbianAnalyzer.java | 129 +++++++++++++++++
 .../org/apache/lucene/analysis/sr/stopwords.txt    | 156 +++++++++++++++++++++
 .../lucene/analysis/sr/TestSerbianAnalyzer.java    |  65 +++++++++
 4 files changed, 352 insertions(+)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 0e16e67..046bdc7 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -176,6 +176,8 @@ New Features
   small segments on getReader, subject to a configurable timeout, to improve
   search performance by reducing the number of small segments for searching. (Simon Willnauer)
 
+* LUCENE-9313: Add SerbianAnalyzer based on the snowball stemmer. (Dragan Ivanovic)
+
 Improvements
 ---------------------
 
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/SerbianAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/SerbianAnalyzer.java
new file mode 100644
index 0000000..c672725
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/SerbianAnalyzer.java
@@ -0,0 +1,129 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.sr;
+
+import org.apache.lucene.analysis.*;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+import org.apache.lucene.analysis.snowball.SnowballFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.tartarus.snowball.ext.SerbianStemmer;
+
+import java.io.IOException;
+import java.io.Reader;
+
+/**
+ * {@link Analyzer} for Serbian.
+ *
+ * @since 8.6
+ */
+public class SerbianAnalyzer extends StopwordAnalyzerBase {
+  /** Unmodifiable copy of the terms that must not be stemmed; may be empty. */
+  private final CharArraySet stemExclusionSet;
+
+  /** File containing default Serbian stopwords. */
+  public static final String DEFAULT_STOPWORD_FILE = "stopwords.txt";
+
+  /** Comment prefix in the stopwords file; lines starting with it are ignored. */
+  private static final String STOPWORDS_COMMENT = "#";
+
+  /**
+   * Returns an unmodifiable instance of the default stop words set.
+   * @return default stop words set.
+   */
+  public static CharArraySet getDefaultStopSet() {
+    return DefaultSetHolder.DEFAULT_STOP_SET;
+  }
+
+  /**
+   * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
+   * accesses the static final set the first time.
+   */
+  private static class DefaultSetHolder {
+    static final CharArraySet DEFAULT_STOP_SET;
+
+    static {
+      try {
+        DEFAULT_STOP_SET = loadStopwordSet(false, SerbianAnalyzer.class,
+                                           DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
+      } catch (IOException ex) {
+        // The default set is part of the distribution (JAR) and should always be
+        // present; chain the cause so the underlying I/O failure is not lost.
+        throw new RuntimeException("Unable to load default stopword set", ex);
+      }
+    }
+  }
+
+  /**
+   * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
+   */
+  public SerbianAnalyzer() {
+    this(DefaultSetHolder.DEFAULT_STOP_SET);
+  }
+
+  /**
+   * Builds an analyzer with the given stop words.
+   *
+   * @param stopwords a stopword set
+   */
+  public SerbianAnalyzer(CharArraySet stopwords) {
+    this(stopwords, CharArraySet.EMPTY_SET);
+  }
+
+  /**
+   * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
+   * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
+   * stemming.
+   *
+   * @param stopwords a stopword set
+   * @param stemExclusionSet a set of terms not to be stemmed
+   */
+  public SerbianAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
+    super(stopwords);
+    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
+  }
+
+  /**
+   * Creates a {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
+   * which tokenizes all the text in the provided {@link Reader}.
+   *
+   * @return A {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
+   *         built from a {@link StandardTokenizer} filtered with
+   *         {@link LowerCaseFilter}, {@link StopFilter},
+   *         {@link SetKeywordMarkerFilter} if a stem exclusion set is provided,
+   *         {@link SnowballFilter} ({@link SerbianStemmer}, see
+   *         https://snowballstem.org/algorithms/serbian/stemmer.html),
+   *         and {@link SerbianNormalizationFilter}.
+   */
+  @Override
+  protected TokenStreamComponents createComponents(String fieldName) {
+    final Tokenizer source = new StandardTokenizer();
+    TokenStream result = new LowerCaseFilter(source);
+    result = new StopFilter(result, stopwords);
+    if (!stemExclusionSet.isEmpty()) {
+      result = new SetKeywordMarkerFilter(result, stemExclusionSet);
+    }
+    result = new SnowballFilter(result, new SerbianStemmer());
+    result = new SerbianNormalizationFilter(result);
+    return new TokenStreamComponents(source, result);
+  }
+
+  /** Wraps the stream in a {@link LowerCaseFilter} only; no other filter is applied here. */
+  @Override
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    return new LowerCaseFilter(in);
+  }
+}
diff --git a/lucene/analysis/common/src/resources/org/apache/lucene/analysis/sr/stopwords.txt b/lucene/analysis/common/src/resources/org/apache/lucene/analysis/sr/stopwords.txt
new file mode 100644
index 0000000..17cea56
--- /dev/null
+++ b/lucene/analysis/common/src/resources/org/apache/lucene/analysis/sr/stopwords.txt
@@ -0,0 +1,156 @@
+i
+ili
+a
+ali
+pa
+biti
+ne
+jesam
+sam
+jesi
+si
+je
+jesmo
+smo
+jeste
+ste
+jesu
+su
+nijesam
+nisam
+nijesi
+nisi
+nije
+nijesmo
+nismo
+nijeste
+niste
+nijesu
+nisu
+budem
+budeš
+bude
+budemo
+budete
+budu
+budes
+bih
+bi
+bismo
+biste
+biše
+bise
+bio
+bili
+budimo
+budite
+bila
+bilo
+bile
+ću
+ćeš
+će
+ćemo
+ćete
+neću
+nećeš
+neće
+nećemo
+nećete
+cu
+ces
+ce
+cemo
+cete
+necu
+neces
+nece
+necemo
+necete
+mogu
+možeš
+može
+možemo
+možete
+mozes
+moze
+mozemo
+mozete
+и
+или
+а
+али
+па
+бити
+не
+јесам
+сам
+јеси
+си
+је
+јесмо
+смо
+јесте
+сте
+јесу
+су
+нијесам
+нисам
+нијеси
+ниси
+није
+нијесмо
+нисмо
+нијесте
+нисте
+нијесу
+нису
+будем
+будеш
+буде
+будемо
+будете
+буду
+будес
+бих
+би
+бисмо
+бисте
+бише
+бисе
+био
+били
+будимо
+будите
+била
+било
+биле
+ћу
+ћеш
+ће
+ћемо
+ћете
+нећу
+нећеш
+неће
+нећемо
+нећете
+цу
+цес
+це
+цемо
+цете
+нецу
+нецес
+неце
+нецемо
+нецете
+могу
+можеш
+може
+можемо
+можете
+мозес
+мозе
+моземо
+мозете
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/sr/TestSerbianAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/sr/TestSerbianAnalyzer.java
new file mode 100644
index 0000000..c649baf
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/sr/TestSerbianAnalyzer.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.sr;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
+
+import java.io.IOException;
+
+/**
+ * Test the SerbianAnalyzer
+ *
+ */
+public class TestSerbianAnalyzer extends BaseTokenStreamTestCase {
+  /** Constructing the analyzer loads the default stopword set, so this
+   * test fails when the stopwords file is missing in classpath */
+  public void testResourcesAvailable() {
+    new SerbianAnalyzer().close();
+  }
+
+  /** test stopwords and stemming */
+  public void testBasics() throws IOException {
+    try (Analyzer a = new SerbianAnalyzer()) {
+      // stemming
+      checkOneTerm(a, "abdiciraće", "abdicirac");
+      checkOneTerm(a, "decimalnim", "decimaln");
+      checkOneTerm(a, "đubrište", "djubrist");
+
+      // stopword
+      assertAnalyzesTo(a, "ili", new String[] {});
+    }
+  }
+
+  /** test use of exclusion set: an excluded term must pass through unstemmed */
+  public void testExclude() throws IOException {
+    CharArraySet exclusionSet = new CharArraySet(asSet("decimalnim"), false);
+    try (Analyzer a = new SerbianAnalyzer(
+        SerbianAnalyzer.getDefaultStopSet(), exclusionSet)) {
+      checkOneTerm(a, "decimalnim", "decimalnim");
+      checkOneTerm(a, "decimalni", "decimaln");
+    }
+  }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    try (Analyzer analyzer = new SerbianAnalyzer()) {
+      checkRandomData(random(), analyzer, 200 * RANDOM_MULTIPLIER);
+    }
+  }
+}