You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2020/08/26 11:49:00 UTC
[lucene-solr] branch master updated: LUCENE-9313: Analyzer for
Serbian language based on Snowball stemmer
This is an automated email from the ASF dual-hosted git repository.
rmuir pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/master by this push:
new c7029b1 LUCENE-9313: Analyzer for Serbian language based on Snowball stemmer
c7029b1 is described below
commit c7029b12541d908e9c86f1f9946a3369aea5236a
Author: Robert Muir <rm...@apache.org>
AuthorDate: Wed Aug 26 07:50:29 2020 -0400
LUCENE-9313: Analyzer for Serbian language based on Snowball stemmer
---
lucene/CHANGES.txt | 2 +
.../apache/lucene/analysis/sr/SerbianAnalyzer.java | 129 +++++++++++++++++
.../org/apache/lucene/analysis/sr/stopwords.txt | 156 +++++++++++++++++++++
.../lucene/analysis/sr/TestSerbianAnalyzer.java | 65 +++++++++
4 files changed, 352 insertions(+)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 0e16e67..046bdc7 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -176,6 +176,8 @@ New Features
small segments on getReader, subject to a configurable timeout, to improve
search performance by reducing the number of small segments for searching. (Simon Willnauer)
+* LUCENE-9313: Add SerbianAnalyzer based on the snowball stemmer. (Dragan Ivanovic)
+
Improvements
---------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/SerbianAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/SerbianAnalyzer.java
new file mode 100644
index 0000000..c672725
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/SerbianAnalyzer.java
@@ -0,0 +1,129 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.sr;
+
+import org.apache.lucene.analysis.*;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+import org.apache.lucene.analysis.snowball.SnowballFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.tartarus.snowball.ext.SerbianStemmer;
+
+import java.io.IOException;
+import java.io.Reader;
+
+/**
+ * {@link Analyzer} for Serbian.
+ *
+ * @since 8.6
+ */
+public class SerbianAnalyzer extends StopwordAnalyzerBase {
+  private final CharArraySet stemExclusionSet;
+
+  /** File containing default Serbian stopwords. */
+  public static final String DEFAULT_STOPWORD_FILE = "stopwords.txt";
+
+  /** Comment prefix of the stopwords file; lines starting with it are ignored. */
+  private static final String STOPWORDS_COMMENT = "#";
+
+  /**
+   * Returns an unmodifiable instance of the default stop words set.
+   *
+   * @return default stop words set.
+   */
+  public static CharArraySet getDefaultStopSet() {
+    return DefaultSetHolder.DEFAULT_STOP_SET;
+  }
+
+  /**
+   * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
+   * accesses the static final set the first time.
+   */
+  private static class DefaultSetHolder {
+    static final CharArraySet DEFAULT_STOP_SET;
+
+    static {
+      try {
+        DEFAULT_STOP_SET = loadStopwordSet(false, SerbianAnalyzer.class,
+            DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
+      } catch (IOException ex) {
+        // default set should always be present as it is part of the
+        // distribution (JAR); keep the cause so a broken classpath is diagnosable
+        throw new RuntimeException("Unable to load default stopword set", ex);
+      }
+    }
+  }
+
+  /**
+   * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
+   */
+  public SerbianAnalyzer() {
+    this(DefaultSetHolder.DEFAULT_STOP_SET);
+  }
+
+  /**
+   * Builds an analyzer with the given stop words.
+   *
+   * @param stopwords a stopword set
+   */
+  public SerbianAnalyzer(CharArraySet stopwords) {
+    this(stopwords, CharArraySet.EMPTY_SET);
+  }
+
+  /**
+   * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
+   * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
+   * stemming.
+   *
+   * @param stopwords a stopword set
+   * @param stemExclusionSet a set of terms not to be stemmed
+   */
+  public SerbianAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
+    super(stopwords);
+    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
+  }
+
+  /**
+   * Creates the {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
+   * used to tokenize text.
+   *
+   * @return components built from a {@link StandardTokenizer} filtered with
+   *         {@link LowerCaseFilter}, {@link StopFilter}, {@link SetKeywordMarkerFilter}
+   *         if a stem exclusion set is provided, {@link SnowballFilter} using
+   *         {@link SerbianStemmer} (https://snowballstem.org/algorithms/serbian/stemmer.html),
+   *         and {@link SerbianNormalizationFilter}.
+   */
+  @Override
+  protected TokenStreamComponents createComponents(String fieldName) {
+    final Tokenizer source = new StandardTokenizer();
+    TokenStream result = new LowerCaseFilter(source);
+    result = new StopFilter(result, stopwords);
+    if (!stemExclusionSet.isEmpty()) {
+      result = new SetKeywordMarkerFilter(result, stemExclusionSet);
+    }
+    result = new SnowballFilter(result, new SerbianStemmer());
+    result = new SerbianNormalizationFilter(result);
+    return new TokenStreamComponents(source, result);
+  }
+
+  /**
+   * Wraps the stream in a {@link LowerCaseFilter}; no stop word removal or stemming here.
+   */
+  @Override
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    return new LowerCaseFilter(in);
+  }
+}
diff --git a/lucene/analysis/common/src/resources/org/apache/lucene/analysis/sr/stopwords.txt b/lucene/analysis/common/src/resources/org/apache/lucene/analysis/sr/stopwords.txt
new file mode 100644
index 0000000..17cea56
--- /dev/null
+++ b/lucene/analysis/common/src/resources/org/apache/lucene/analysis/sr/stopwords.txt
@@ -0,0 +1,156 @@
+i
+ili
+a
+ali
+pa
+biti
+ne
+jesam
+sam
+jesi
+si
+je
+jesmo
+smo
+jeste
+ste
+jesu
+su
+nijesam
+nisam
+nijesi
+nisi
+nije
+nijesmo
+nismo
+nijeste
+niste
+nijesu
+nisu
+budem
+budeš
+bude
+budemo
+budete
+budu
+budes
+bih
+bi
+bismo
+biste
+biše
+bise
+bio
+bili
+budimo
+budite
+bila
+bilo
+bile
+ću
+ćeš
+će
+ćemo
+ćete
+neću
+nećeš
+neće
+nećemo
+nećete
+cu
+ces
+ce
+cemo
+cete
+necu
+neces
+nece
+necemo
+necete
+mogu
+možeš
+može
+možemo
+možete
+mozes
+moze
+mozemo
+mozete
+и
+или
+а
+али
+па
+бити
+не
+јесам
+сам
+јеси
+си
+је
+јесмо
+смо
+јесте
+сте
+јесу
+су
+нијесам
+нисам
+нијеси
+ниси
+није
+нијесмо
+нисмо
+нијесте
+нисте
+нијесу
+нису
+будем
+будеш
+буде
+будемо
+будете
+буду
+будес
+бих
+би
+бисмо
+бисте
+бише
+бисе
+био
+били
+будимо
+будите
+била
+било
+биле
+ћу
+ћеш
+ће
+ћемо
+ћете
+нећу
+нећеш
+неће
+нећемо
+нећете
+цу
+цес
+це
+цемо
+цете
+нецу
+нецес
+неце
+нецемо
+нецете
+могу
+можеш
+може
+можемо
+можете
+мозес
+мозе
+моземо
+мозете
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/sr/TestSerbianAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/sr/TestSerbianAnalyzer.java
new file mode 100644
index 0000000..c649baf
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/sr/TestSerbianAnalyzer.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.sr;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
+
+import java.io.IOException;
+
+/**
+ * Tests for {@link SerbianAnalyzer}: default resources, stemming, stop words,
+ * stem exclusion and random input.
+ */
+public class TestSerbianAnalyzer extends BaseTokenStreamTestCase {
+  /** This test fails with NPE when the
+   * stopwords file is missing in classpath */
+  public void testResourcesAvailable() {
+    new SerbianAnalyzer().close();
+  }
+
+  /** test stopwords and stemming */
+  public void testBasics() throws IOException {
+    try (Analyzer a = new SerbianAnalyzer()) {
+      // stemming
+      checkOneTerm(a, "abdiciraće", "abdicirac");
+      checkOneTerm(a, "decimalnim", "decimaln");
+      checkOneTerm(a, "đubrište", "djubrist");
+
+      // stopword
+      assertAnalyzesTo(a, "ili", new String[] {});
+    }
+  }
+
+  /** test use of exclusion set */
+  public void testExclude() throws IOException {
+    CharArraySet exclusionSet = new CharArraySet(asSet("decimalnim"), false);
+    try (Analyzer a = new SerbianAnalyzer(
+        SerbianAnalyzer.getDefaultStopSet(), exclusionSet)) {
+      checkOneTerm(a, "decimalnim", "decimalnim");
+      checkOneTerm(a, "decimalni", "decimaln");
+    }
+  }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    try (Analyzer analyzer = new SerbianAnalyzer()) {
+      checkRandomData(random(), analyzer, 200 * RANDOM_MULTIPLIER);
+    }
+  }
+}