You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2021/11/30 20:53:14 UTC
[lucene] branch main updated: LUCENE-10248: Spanish Plural Stemmer (#461)
This is an automated email from the ASF dual-hosted git repository.
rmuir pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/lucene.git
The following commit(s) were added to refs/heads/main by this push:
new edb936f LUCENE-10248: Spanish Plural Stemmer (#461)
edb936f is described below
commit edb936f09055cc16a71b5b037bef77177786759b
Author: Xavier Sanchez Loro <xa...@wallapop.com>
AuthorDate: Tue Nov 30 21:51:10 2021 +0100
LUCENE-10248: Spanish Plural Stemmer (#461)
Adds a new Spanish stemmer just for stemming plural to singular whilst maintaining gender: the SpanishPluralStemmer. The goal is to provide a lightweight algorithmic approach with better precision and recall than current approaches.
See blog post for more details: https://medium.com/inside-wallapop/spanish-plural-stemmer-matching-plural-and-singular-forms-in-spanish-using-lucene-93e005e38373
This approach is based on the rules specified in Wikilengua: http://www.wikilengua.org/index.php/Plural_(formaci%C3%B3n)
Some characteristics:
* Designed to stem just plural to singular form
* Distinguishes between masculine and feminine forms
* It will increase recall but precision can be reduced depending on the use case/information need
* Stems plural words of foreign origin: e.g. complots, bits, punks, robots
* Support for invariant words (the plural and singular forms are the same, or a plural does not make sense): e.g. crisis, jueves, lapsus, abrebotellas, etc.
* Support for special cases: e.g. yoes, clubes, itemes, faralaes
* Use it when the distinction between singular and plural is not relevant but gender is relevant
* Produces meaningful tokens in form of singular
* No strange stems like “amig”: it’s true that stemmers are not required to generate grammatically correct tokens, but if we generate correct stems we decrease the possibility of collisions with other words
---
.../analysis/es/SpanishPluralStemFilter.java | 54 ++++
.../es/SpanishPluralStemFilterFactory.java | 58 +++++
.../lucene/analysis/es/SpanishPluralStemmer.java | 285 +++++++++++++++++++++
.../org.apache.lucene.analysis.TokenFilterFactory | 1 +
.../analysis/es/TestSpanishPluralStemFilter.java | 73 ++++++
.../es/TestSpanishPluralStemFilterFactory.java | 46 ++++
.../apache/lucene/analysis/es/espluraltestdata.zip | Bin 0 -> 133046 bytes
7 files changed, 517 insertions(+)
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishPluralStemFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishPluralStemFilter.java
new file mode 100644
index 0000000..78e189e
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishPluralStemFilter.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.es;
+
+import java.io.IOException;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+
+/**
+ * A {@link TokenFilter} that runs every token through {@link SpanishPluralStemmer}, reducing
+ * Spanish plural forms to their singular form.
+ *
+ * <p>Tokens flagged as keywords are passed through untouched. To protect a term from stemming,
+ * place a {@link SetKeywordMarkerFilter}, or any custom {@link TokenFilter} that sets the {@link
+ * KeywordAttribute}, before this {@link TokenStream} in the analysis chain.
+ */
+public final class SpanishPluralStemFilter extends TokenFilter {
+  private final SpanishPluralStemmer stemmer = new SpanishPluralStemmer();
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
+
+  /**
+   * Creates a filter that stems the tokens produced by {@code input}.
+   *
+   * @param input the upstream token stream
+   */
+  public SpanishPluralStemFilter(TokenStream input) {
+    super(input);
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (!input.incrementToken()) {
+      return false; // upstream is exhausted
+    }
+    if (keywordAttr.isKeyword()) {
+      return true; // keywords are emitted as-is
+    }
+    // The stemmer edits the term buffer in place and reports the new term length.
+    final int stemmedLength = stemmer.stem(termAtt.buffer(), termAtt.length());
+    termAtt.setLength(stemmedLength);
+    return true;
+  }
+}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishPluralStemFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishPluralStemFilterFactory.java
new file mode 100644
index 0000000..6063601
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishPluralStemFilterFactory.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.es;
+
+import java.util.Map;
+import org.apache.lucene.analysis.TokenFilterFactory;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * Factory for {@link SpanishPluralStemFilter}.
+ *
+ * <pre class="prettyprint">
+ * &lt;fieldType name="text_eslgtstem" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
+ *     &lt;filter class="solr.SpanishPluralStemFilterFactory"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ *
+ * @lucene.spi {@value #NAME}
+ */
+public class SpanishPluralStemFilterFactory extends TokenFilterFactory {
+
+  /** SPI name */
+  public static final String NAME = "spanishPluralStem";
+
+  /** Default ctor for compatibility with SPI */
+  public SpanishPluralStemFilterFactory() {
+    throw defaultCtorException();
+  }
+
+  /**
+   * Creates a new SpanishPluralStemFilterFactory.
+   *
+   * <p>This factory defines no parameters of its own: any entry still present in {@code args}
+   * after the superclass constructor has run is rejected.
+   *
+   * @param args the factory arguments
+   * @throws IllegalArgumentException if {@code args} is not empty after the superclass
+   *     constructor has run
+   */
+  public SpanishPluralStemFilterFactory(Map<String, String> args) {
+    super(args);
+    if (!args.isEmpty()) {
+      throw new IllegalArgumentException("Unknown parameters: " + args);
+    }
+  }
+
+  /**
+   * Wraps {@code input} in a new {@link SpanishPluralStemFilter}.
+   *
+   * @param input the token stream to stem
+   * @return a {@link SpanishPluralStemFilter} reading from {@code input}
+   */
+  @Override
+  public TokenStream create(TokenStream input) {
+    return new SpanishPluralStemFilter(input);
+  }
+}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishPluralStemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishPluralStemmer.java
new file mode 100644
index 0000000..cc76a24
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishPluralStemmer.java
@@ -0,0 +1,285 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.es;
+
+import java.util.Arrays;
+import java.util.List;
+import org.apache.lucene.analysis.CharArraySet;
+
+/**
+ * Plural Stemmer for Spanish
+ *
+ * <p>This stemmer implements the rules described in:
+ * <i>http://www.wikilengua.org/index.php/Plural_(formación)</i>
+ *
+ * <p>Only plural suffixes are removed. Words listed as invariant are returned unchanged, and a
+ * small list of irregular plurals is handled by dropping the trailing two characters. The suffix
+ * rules compare against lowercase letters only.
+ *
+ * <p>This class has no instance state.
+ */
+public class SpanishPluralStemmer {
+
+  private static final CharArraySet invariants;
+  private static final CharArraySet specialCases;
+
+  // Words that are returned unchanged: either singular and plural share one form, or the word
+  // merely looks like a plural. Entries are stored without accents because stem() strips
+  // accents before the lookup.
+  private static final List<String> invariantsList =
+      Arrays.asList(
+          "abrebotellas",
+          "abrecartas",
+          "abrelatas",
+          "afueras",
+          "albatros",
+          "albricias",
+          "aledaños",
+          "alexis",
+          "alicates",
+          "analisis",
+          "andurriales",
+          "antitesis",
+          "añicos",
+          "apendicitis",
+          "apocalipsis",
+          "arcoiris",
+          "aries",
+          "bilis",
+          "boletus",
+          "boris",
+          "brindis",
+          "cactus",
+          "canutas",
+          "caries",
+          "cascanueces",
+          "cascarrabias",
+          "ciempies",
+          "cifosis",
+          "cortaplumas",
+          "corpus",
+          "cosmos",
+          "cosquillas",
+          "creces",
+          "crisis",
+          "cuatrocientas",
+          "cuatrocientos",
+          "cuelgacapas",
+          "cuentacuentos",
+          "cuentapasos",
+          "cumpleaños",
+          "doscientas",
+          "doscientos",
+          "dosis",
+          "enseres",
+          "entonces",
+          "esponsales",
+          "estatus",
+          "exequias",
+          "fauces",
+          "forceps",
+          "fotosintesis",
+          "gafas",
+          "gafotas",
+          "gargaras",
+          "gris",
+          "honorarios",
+          "ictus",
+          "jueves",
+          "lapsus",
+          "lavacoches",
+          "lavaplatos",
+          "limpiabotas",
+          "lunes",
+          "maitines",
+          "martes",
+          "mondadientes",
+          "novecientas",
+          "novecientos",
+          "nupcias",
+          "ochocientas",
+          "ochocientos",
+          "pais",
+          "paris",
+          "parabrisas",
+          "paracaidas",
+          "parachoques",
+          "paraguas",
+          "pararrayos",
+          "pisapapeles",
+          "piscis",
+          "portaaviones",
+          "portamaletas",
+          "portamantas",
+          "quinientas",
+          "quinientos",
+          "quitamanchas",
+          "recogepelotas",
+          "rictus",
+          "rompeolas",
+          "sacacorchos",
+          "sacapuntas",
+          "saltamontes",
+          "salvavidas",
+          "seis",
+          "seiscientas",
+          "seiscientos",
+          "setecientas",
+          "setecientos",
+          "sintesis",
+          "tenis",
+          "tifus",
+          "trabalenguas",
+          "vacaciones",
+          "venus",
+          "versus",
+          "viacrucis",
+          "virus",
+          "viveres",
+          "volandas");
+
+  static {
+    final CharArraySet invariantSet = new CharArraySet(invariantsList, true);
+    invariants = CharArraySet.unmodifiableSet(invariantSet);
+
+    // Irregular plurals whose singular is obtained by dropping the last two characters ("es").
+    // Every entry must therefore be the plural form.
+    final List<String> specialCasesList =
+        Arrays.asList(
+            "yoes",
+            "noes",
+            "sies",
+            "clubes",
+            "faralaes",
+            "albalaes",
+            "itemes",
+            "albumes",
+            "sandwiches",
+            "relojes",
+            "bojes",
+            "contrarrelojes",
+            "carcajes");
+    final CharArraySet specialSet = new CharArraySet(specialCasesList, true);
+    specialCases = CharArraySet.unmodifiableSet(specialSet);
+  }
+
+  /**
+   * Stems a Spanish plural to its singular form.
+   *
+   * <p>The buffer is modified in place: accented vowels in the first {@code len} characters are
+   * replaced with their unaccented form (even when no suffix is removed), and some rules rewrite
+   * a single character (for example {@code voces} becomes {@code voz}).
+   *
+   * @param s buffer holding the term
+   * @param len number of valid characters in {@code s}
+   * @return the length of the stemmed term, always less than or equal to {@code len}
+   */
+  public int stem(char[] s, int len) {
+    if (len < 4) {
+      return len; // plurals have at least 4 letters (ases, eses, etc.)
+    }
+    removeAccents(s, len);
+    if (invariants.contains(s, 0, len)) {
+      return len;
+    }
+    if (specialCases.contains(s, 0, len)) {
+      return len - 2;
+    }
+    if (s[len - 1] != 's') {
+      return len; // every remaining rule applies to words ending in 's' only
+    }
+
+    // len >= 4 is guaranteed above, so the three characters before the final 's' exist.
+    final char fourthLast = s[len - 4];
+    final char thirdLast = s[len - 3];
+    final char secondLast = s[len - 2];
+
+    if (!isVowel(secondLast)) { // consonant + s: complots, bits, punks, robots
+      return len - 1;
+    }
+    // The parentheses matter: '&&' binds tighter than '||', so without them a lone 'q'
+    // would satisfy the whole condition.
+    if ((fourthLast == 'q' || fourthLast == 'g')
+        && thirdLast == 'u'
+        && (secondLast == 'i' || secondLast == 'e')) { // maniquis, caquis, parques
+      return len - 1;
+    }
+    if (isVowel(fourthLast)
+        && thirdLast == 'r'
+        && secondLast == 'e') { // escaneres, alfileres, amores
+      return len - 2;
+    }
+    if (isVowel(fourthLast)
+        && (thirdLast == 'd' || thirdLast == 'l' || thirdLast == 'n' || thirdLast == 'x')
+        && secondLast == 'e') { // abades, comerciales, faxes
+      return len - 2;
+    }
+    if ((thirdLast == 'y' || thirdLast == 'u') && secondLast == 'e') { // bambues, leyes
+      return len - 2;
+    }
+    if ((fourthLast == 'u'
+            || fourthLast == 'l'
+            || fourthLast == 'r'
+            || fourthLast == 't'
+            || fourthLast == 'n')
+        && thirdLast == 'i'
+        && secondLast == 'e') { // jabalies, israelies, maniquies
+      return len - 2;
+    }
+    if (thirdLast == 's' && secondLast == 'e') { // reses
+      return len - 2;
+    }
+    if (isVowel(thirdLast) && secondLast == 'i') { // jerseis --> jersey
+      s[len - 2] = 'y';
+      return len - 1;
+    }
+    if (thirdLast == 'd' && secondLast == 'i') { // brandis --> brandy
+      s[len - 2] = 'y';
+      return len - 1;
+    }
+    if (thirdLast == 'c' && secondLast == 'e') { // voces --> voz
+      s[len - 3] = 'z';
+      return len - 2;
+    }
+    // Vowel + s with no more specific rule: drop the final 's' (jabalis, casas, coches).
+    return len - 1;
+  }
+
+  /** Returns true if {@code c} is one of the unaccented lowercase vowels a, e, i, o, u. */
+  private static boolean isVowel(char c) {
+    return c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u';
+  }
+
+  /**
+   * Replaces, in place, lowercase vowels carrying a grave, acute, circumflex or diaeresis mark
+   * with the plain vowel in the first {@code len} characters of {@code s}. Other characters,
+   * including 'ñ', are left untouched.
+   */
+  private static void removeAccents(char[] s, int len) {
+    for (int i = 0; i < len; i++) {
+      switch (s[i]) {
+        case 'à':
+        case 'á':
+        case 'â':
+        case 'ä':
+          s[i] = 'a';
+          break;
+        case 'ò':
+        case 'ó':
+        case 'ô':
+        case 'ö':
+          s[i] = 'o';
+          break;
+        case 'è':
+        case 'é':
+        case 'ê':
+        case 'ë':
+          s[i] = 'e';
+          break;
+        case 'ù':
+        case 'ú':
+        case 'û':
+        case 'ü':
+          s[i] = 'u';
+          break;
+        case 'ì':
+        case 'í':
+        case 'î':
+        case 'ï':
+          s[i] = 'i';
+          break;
+        default:
+          break;
+      }
+    }
+  }
+}
diff --git a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory
index 2328b87..27fac27 100644
--- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory
+++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory
@@ -48,6 +48,7 @@ org.apache.lucene.analysis.en.KStemFilterFactory
org.apache.lucene.analysis.en.PorterStemFilterFactory
org.apache.lucene.analysis.es.SpanishLightStemFilterFactory
org.apache.lucene.analysis.es.SpanishMinimalStemFilterFactory
+org.apache.lucene.analysis.es.SpanishPluralStemFilterFactory
org.apache.lucene.analysis.fa.PersianNormalizationFilterFactory
org.apache.lucene.analysis.fi.FinnishLightStemFilterFactory
org.apache.lucene.analysis.fr.FrenchLightStemFilterFactory
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/es/TestSpanishPluralStemFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/es/TestSpanishPluralStemFilter.java
new file mode 100644
index 0000000..9381b90
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/es/TestSpanishPluralStemFilter.java
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.es;
+
+import static org.apache.lucene.analysis.VocabularyAssert.assertVocabulary;
+
+import java.io.IOException;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
+
+/** Simple tests for {@link SpanishPluralStemFilter} */
+public class TestSpanishPluralStemFilter extends BaseTokenStreamTestCase {
+  // Whitespace-tokenizing analyzer shared by the tests; created in setUp, closed in tearDown.
+  private Analyzer analyzer;
+
+  @Override
+  public void setUp() throws Exception {
+    super.setUp();
+    analyzer =
+        new Analyzer() {
+          @Override
+          protected TokenStreamComponents createComponents(String fieldName) {
+            Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+            return new TokenStreamComponents(source, new SpanishPluralStemFilter(source));
+          }
+        };
+  }
+
+  @Override
+  public void tearDown() throws Exception {
+    analyzer.close();
+    super.tearDown();
+  }
+
+  /** Test against a vocabulary from the reference impl */
+  public void testVocabulary() throws IOException {
+    assertVocabulary(analyzer, getDataPath("espluraltestdata.zip"), "esplural.txt");
+  }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random(), analyzer, 200 * RANDOM_MULTIPLIER);
+  }
+
+  /** An empty term must pass through the filter unchanged. */
+  public void testEmptyTerm() throws IOException {
+    // try-with-resources closes the analyzer even when the assertion fails.
+    try (Analyzer a =
+        new Analyzer() {
+          @Override
+          protected TokenStreamComponents createComponents(String fieldName) {
+            Tokenizer tokenizer = new KeywordTokenizer();
+            return new TokenStreamComponents(tokenizer, new SpanishPluralStemFilter(tokenizer));
+          }
+        }) {
+      checkOneTerm(a, "", "");
+    }
+  }
+}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/es/TestSpanishPluralStemFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/es/TestSpanishPluralStemFilterFactory.java
new file mode 100644
index 0000000..8abe712
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/es/TestSpanishPluralStemFilterFactory.java
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.es;
+
+import java.io.Reader;
+import java.io.StringReader;
+import org.apache.lucene.analysis.BaseTokenStreamFactoryTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+
+/** Checks that the Spanish plural stem filter can be created and used through its factory. */
+public class TestSpanishPluralStemFilterFactory extends BaseTokenStreamFactoryTestCase {
+  /** A plural fed through the factory-built filter comes out in its singular form. */
+  public void testStemming() throws Exception {
+    Reader reader = new StringReader("sociedades");
+    Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+    tokenizer.setReader(reader);
+    TokenStream stemmed = tokenFilterFactory("SpanishPluralStem").create(tokenizer);
+    assertTokenStreamContents(stemmed, new String[] {"sociedad"});
+  }
+
+  /** Test that bogus arguments result in exception */
+  public void testBogusArguments() throws Exception {
+    IllegalArgumentException thrown =
+        expectThrows(
+            IllegalArgumentException.class,
+            () -> tokenFilterFactory("SpanishPluralStem", "bogusArg", "bogusValue"));
+    assertTrue(thrown.getMessage().contains("Unknown parameters"));
+  }
+}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/es/espluraltestdata.zip b/lucene/analysis/common/src/test/org/apache/lucene/analysis/es/espluraltestdata.zip
new file mode 100644
index 0000000..05ffd65
Binary files /dev/null and b/lucene/analysis/common/src/test/org/apache/lucene/analysis/es/espluraltestdata.zip differ